#!/usr/bin/env -S npx tsx
/**
 * Run evaluation scenarios using Claude Agent SDK with shellwright MCP.
 * Iterates through scenarios and generates recordings from prompt.md files.
 */

import dotenv from "dotenv";
// Load .env before anything else runs; `override: true` lets values from .env
// replace variables that are already set in the environment.
dotenv.config({ override: true });

import { query } from "@anthropic-ai/claude-agent-sdk";
import * as fs from "fs/promises";
import * as path from "path";
import { execSync } from "child_process";

const SCENARIOS_DIR = path.join(import.meta.dirname, "scenarios");
const LOGS_DIR = path.join(import.meta.dirname, "logs");
const SHELLWRIGHT_LOG = path.join(LOGS_DIR, "shellwright.jsonl");
const AGENT_LOG = path.join(LOGS_DIR, "agent.jsonl");

/**
 * Resolve the shellwright repo root - allows us to run the evaluations code and
 * specify exactly where to find the raw code for the MCP server (as we
 * evaluate against the local code).
 *
 * @returns SHELLWRIGHT_ROOT resolved against this file's directory when the
 *   variable is set, otherwise the parent directory of this file's directory.
 */
function getShellwrightRoot(): string {
  const envPath = process.env.SHELLWRIGHT_ROOT;
  if (envPath) {
    return path.resolve(import.meta.dirname, envPath);
  }
  // Default: parent directory of evaluations
  return path.resolve(import.meta.dirname, "..");
}

const ROOT_DIR = getShellwrightRoot();

/** A file downloaded into a scenario directory. */
interface Artifact {
  filename: string;
  localPath: string;
}

/** Outcome of running a single scenario. */
interface ScenarioResult {
  name: string;
  success: boolean;
  artifacts: Artifact[];
  error?: string;
}

/** Download location and target file name extracted from a tool result. */
interface ArtifactRef {
  downloadUrl: string;
  filename: string;
}

/** Turn any thrown value into a printable message. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Fetch `url` and write the response body to `destPath`.
 *
 * @throws Error when the response status is not OK.
 */
async function downloadArtifact(url: string, destPath: string): Promise<void> {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to download artifact: ${response.status}`);
  }
  const buffer = await response.arrayBuffer();
  await fs.writeFile(destPath, Buffer.from(buffer));
}

/** Flatten the content of a tool_result block into a single string. */
function toolResultText(content: unknown): string {
  if (typeof content === "string") {
    return content;
  }
  if (Array.isArray(content)) {
    return content
      .map((part: unknown) => {
        if (typeof part !== "object" || part === null) {
          return "";
        }
        const text = (part as { text?: unknown }).text;
        return typeof text === "string" ? text : "";
      })
      .join("");
  }
  // JSON.stringify(undefined) yields undefined rather than a string.
  return content === undefined ? "" : JSON.stringify(content);
}

/**
 * Parse tool result text as JSON and pull out `download_url` / `filename`.
 *
 * @returns undefined when the text is not JSON, is not a JSON object, or does
 *   not carry both fields as non-empty strings.
 */
function parseArtifactRef(text: string): ArtifactRef | undefined {
  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    return undefined; // Not JSON, skip
  }
  // JSON.parse can return null or a primitive; only objects can hold the fields.
  if (typeof parsed !== "object" || parsed === null) {
    return undefined;
  }
  const { download_url: downloadUrl, filename: rawName } = parsed as Record<string, unknown>;
  if (typeof downloadUrl !== "string" || downloadUrl === "") {
    return undefined;
  }
  if (typeof rawName !== "string" || rawName === "") {
    return undefined;
  }
  // The name comes from tool output: keep only its final path component so the
  // artifact is always written inside the scenario directory.
  const filename = path.basename(rawName);
  if (filename === "" || filename === "." || filename === "..") {
    return undefined;
  }
  return { downloadUrl, filename };
}

/**
 * Download every artifact referenced by the tool_result blocks of one user
 * message into `scenarioPath`.
 *
 * @param content The `content` field of the user message; ignored unless it is an array.
 * @returns The artifacts saved for this message, in block order.
 */
async function saveArtifactsFromToolResults(
  content: unknown,
  scenarioPath: string,
): Promise<Artifact[]> {
  const saved: Artifact[] = [];
  if (!Array.isArray(content)) {
    return saved;
  }
  for (const block of content as Array<{ type?: unknown; content?: unknown }>) {
    if (typeof block !== "object" || block === null || block.type !== "tool_result") {
      continue;
    }
    const ref = parseArtifactRef(toolResultText(block.content));
    if (!ref) {
      continue;
    }
    const dest = path.join(scenarioPath, ref.filename);
    console.log(` Downloading: ${ref.downloadUrl} → ${ref.filename}`);
    await downloadArtifact(ref.downloadUrl, dest);
    saved.push({ filename: ref.filename, localPath: dest });
    console.log(` ✓ Artifact saved: ${dest}`);
  }
  return saved;
}

/**
 * Run the scenario stored in `scenarioPath`: feed its prompt.md to the agent
 * with the shellwright MCP server attached and download any artifacts the
 * tool results point to.
 *
 * Never rejects: every failure (missing prompt, agent error, failed download,
 * no tools called, no artifacts) is reported through the returned result.
 */
async function runScenario(scenarioPath: string): Promise<ScenarioResult> {
  const scenarioName = path.basename(scenarioPath);
  const promptPath = path.join(scenarioPath, "prompt.md");

  console.log(`\n=== Running scenario: ${scenarioName} ===`);

  try {
    // Read inside the try so a directory without prompt.md fails only this
    // scenario instead of aborting the whole run.
    const prompt = await fs.readFile(promptPath, "utf-8");

    let toolsCalled = 0;
    const artifacts: Artifact[] = [];
    const mcpScript = path.join(ROOT_DIR, "dist/index.js");

    console.log(` Starting agent with MCP server: ${mcpScript}`);

    for await (const message of query({
      prompt: `You have access to shellwright MCP tools for terminal recording and screenshots. Use the shellwright tools (mcp__shellwright__shell_start, mcp__shellwright__shell_send, mcp__shellwright__shell_record_start, mcp__shellwright__shell_record_stop, mcp__shellwright__shell_screenshot, etc.) to execute the following scenario. ${prompt}`,
      options: {
        permissionMode: "bypassPermissions",
        allowDangerouslySkipPermissions: true,
        mcpServers: {
          shellwright: {
            command: "node",
            args: [mcpScript, "--log-path", SHELLWRIGHT_LOG],
          },
        },
      },
    })) {
      if (message.type === "assistant") {
        for (const block of message.message.content) {
          if (block.type === "tool_use") {
            toolsCalled++;
            console.log(` Tool: ${block.name}`);
          } else if (block.type === "text") {
            console.log(` Assistant: ${block.text.slice(0, 100)}...`);
          }
        }
      } else if (message.type === "user") {
        // Capture tool results to extract download_url
        const saved = await saveArtifactsFromToolResults(message.message.content, scenarioPath);
        artifacts.push(...saved);
      } else if (message.type === "result") {
        console.log(` Result: ${message.subtype} (${toolsCalled} tools called)`);
        if (toolsCalled === 0) {
          throw new Error("No tools were called - check API key and MCP server configuration");
        }
      }
    }

    if (artifacts.length > 0) {
      return { name: scenarioName, success: true, artifacts };
    }
    return { name: scenarioName, success: false, artifacts: [], error: "No artifacts generated" };
  } catch (err: unknown) {
    const message = errorMessage(err);
    console.error(` ✗ Error: ${message}`);
    return { name: scenarioName, success: false, artifacts: [], error: message };
  }
}

/**
 * Entry point: build shellwright, run every scenario directory (optionally
 * filtered by a substring given as the first CLI argument), print a summary
 * and exit with status 1 if any scenario failed.
 */
async function main(): Promise<void> {
  if (!process.env.ANTHROPIC_API_KEY) {
    console.error("Error: ANTHROPIC_API_KEY environment variable required");
    process.exit(1);
  }

  // Create logs directory
  await fs.mkdir(LOGS_DIR, { recursive: true });

  // Set agent log path
  process.env.CLAUDE_AGENT_LOG = AGENT_LOG;

  // Build shellwright first
  console.log(`Shellwright root: ${ROOT_DIR}`);
  console.log("Building shellwright...");
  execSync("npm run build", { stdio: "inherit", cwd: ROOT_DIR });

  // Find all scenarios, optionally filtered by command line argument
  const filterArg = process.argv[2];
  let scenarios = await fs.readdir(SCENARIOS_DIR);
  if (filterArg) {
    scenarios = scenarios.filter((s) => s.includes(filterArg));
    if (scenarios.length === 0) {
      console.error(`No scenarios matching "${filterArg}"`);
      process.exit(1);
    }
  }

  const results: ScenarioResult[] = [];

  // Scenarios run one at a time; they all pass the same shellwright log path
  // to the MCP server.
  for (const scenario of scenarios) {
    const scenarioPath = path.join(SCENARIOS_DIR, scenario);
    const stat = await fs.stat(scenarioPath);
    if (stat.isDirectory()) {
      const result = await runScenario(scenarioPath);
      results.push(result);
    }
  }

  // Print summary
  console.log("\n=== Results ===");
  for (const r of results) {
    const status = r.success ? "✓" : "✗";
    const detail = r.success ? r.artifacts.map((a) => a.filename).join(", ") : r.error;
    console.log(`${status} ${r.name}: ${detail}`);
  }

  console.log("\n=== Logs ===");
  console.log(`Shellwright: ${SHELLWRIGHT_LOG}`);
  console.log(`Agent: ${AGENT_LOG}`);

  if (results.some((r) => !r.success)) {
    process.exit(1);
  }
}

main().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});