#!/usr/bin/env python3
"""Run a single description trigger evaluation.

Tests whether a given prompt would trigger a skill based on its description.
Used by run_loop.py for description optimization.

Usage:
    python3 run_eval.py --description "Skill description text" --query "User prompt to test"

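Output:
    JSON with the trigger prediction ("would_trigger"), a "confidence" value,
    and supporting details: "matching_keywords" and "score" for the heuristic,
    or "raw_answer" when the claude CLI produced the answer (--use-claude).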
"""

import argparse
import json
import subprocess
import sys
import tempfile
from pathlib import Path


# Words too common to signal intent; ignored when scoring keyword overlap.
_STOP_WORDS = frozenset({
    "a", "an", "the", "is", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "do", "does", "did", "will",
    "would", "could", "should", "may", "might", "can", "to",
    "of", "in", "for", "on", "with", "at", "by", "from", "or",
    "and", "not", "no", "but", "if", "this", "that", "it", "i",
    "you", "use", "when",
})

# Stripped from both ends of every token so that "files." and "files?" both
# compare equal to "files". Interior punctuation (e.g. "don't") is kept.
_TOKEN_EDGE_PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"


def _keywords(text):
    """Return the lowercase, punctuation-trimmed, non-stop-word tokens of text as a set."""
    trimmed = (raw.strip(_TOKEN_EDGE_PUNCTUATION) for raw in text.lower().split())
    # Tokens made only of punctuation trim down to "" and are dropped.
    return {token for token in trimmed if token and token not in _STOP_WORDS}


def evaluate_trigger(description, query):
    """Evaluate whether a skill description would trigger for a given query.

    Uses a simple keyword-overlap heuristic: the description is narrowed to
    the text starting at its first "when" (the whole description if there is
    none), both texts are reduced to lowercase keywords with surrounding
    punctuation and stop words removed, and the score is the fraction of
    description keywords that also appear in the query. For more accurate
    results, use evaluate_trigger_with_claude.

    Args:
        description: Skill description text.
        query: User prompt to test against the description.

    Returns:
        A dict with:
            "would_trigger": True when the score exceeds 0.15.
            "confidence": twice the score, capped at 1.0.
            "matching_keywords": sorted list of keywords found in both texts.
            "score": overlap fraction in [0.0, 1.0]; 0.0 when the description
                yields no keywords.
    """
    desc_lower = description.lower()

    # Focus on the "when ..." clause, which states the trigger conditions.
    when_idx = desc_lower.find("when")
    when_clause = desc_lower[when_idx:] if when_idx >= 0 else desc_lower

    desc_words = _keywords(when_clause)
    query_words = _keywords(query)

    overlap = desc_words & query_words
    # max(..., 1) avoids dividing by zero for an empty/stop-word-only description.
    score = len(overlap) / max(len(desc_words), 1)

    return {
        "would_trigger": score > 0.15,
        "confidence": min(score * 2, 1.0),
        # Sorted so the JSON output is deterministic across runs.
        "matching_keywords": sorted(overlap),
        "score": score
    }


def evaluate_trigger_with_claude(description, query):
    """Use claude CLI to evaluate trigger accuracy (more accurate but slower).

    Runs `claude -p` with a yes/no prompt and interprets the reply. Falls back
    to the keyword heuristic in evaluate_trigger when the CLI cannot be
    launched, times out (30 seconds), exits with a non-zero status, or
    prints nothing, so a failed CLI run is never reported as a "no".

    Args:
        description: Skill description text.
        query: User prompt to test against the description.

    Returns:
        On a usable CLI reply, a dict with:
            "would_trigger": True when the reply starts with "yes".
            "confidence": 0.9 for a plain "yes"/"no" reply, otherwise 0.5.
            "raw_answer": the stripped, lowercased CLI output.
        Otherwise, the result dict of evaluate_trigger.
    """
    prompt = f"""Given this skill description:
"{description}"

Would this user message trigger the skill? Answer with ONLY "yes" or "no":
"{query}"
"""
    try:
        result = subprocess.run(
            ["claude", "-p", prompt, "--output-format", "text"],
            capture_output=True, text=True, timeout=30
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers a missing binary (FileNotFoundError) as well as
        # other launch failures such as PermissionError.
        return evaluate_trigger(description, query)

    answer = result.stdout.strip().lower()
    if result.returncode != 0 or not answer:
        # The CLI failed or said nothing; there is no answer to interpret.
        return evaluate_trigger(description, query)

    # Tolerate decoration around the verdict, e.g. "yes.", '"no"', "**yes**".
    verdict = answer.strip(" \t\r\n.,;:!?\"'*`_")
    return {
        "would_trigger": verdict.startswith("yes"),
        "confidence": 0.9 if verdict in ("yes", "no") else 0.5,
        "raw_answer": answer
    }


def main():
    """Parse command-line options, run the chosen evaluator, and print its result.

    The heuristic evaluator is used unless --use-claude is passed, in which
    case the claude CLI evaluator is used. The result dict is written to
    stdout as indented JSON.
    """
    cli = argparse.ArgumentParser(description="Evaluate a single description trigger")
    cli.add_argument("--description", required=True, help="Skill description text")
    cli.add_argument("--query", required=True, help="User prompt to test")
    cli.add_argument(
        "--use-claude",
        action="store_true",
        help="Use claude CLI for evaluation (slower but more accurate)",
    )
    options = cli.parse_args()

    # Pick the evaluator once, then call it with the same two arguments.
    evaluator = evaluate_trigger_with_claude if options.use_claude else evaluate_trigger
    outcome = evaluator(options.description, options.query)

    print(json.dumps(outcome, indent=2))


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
