/**
 * runner.ts — AgentEvalRunner: drives the agent under test programmatically.
 *
 * Key design decisions:
 *
 * 1. ISOLATED HISTORY: Each test case gets a fresh QueryEngine instance, so
 *    conversation history never bleeds between test cases.
 *
 * 2. SAFE TOOL EXECUTION DURING EVAL:
 *    - READ tools: auto-approved (tracker_list_jobs, get_resume, search_jobs, etc.)
 *    - WRITE tools (tracker_add_job, tracker_update_job): auto-denied by default.
 *      Tests marked write-op use a TEMP COPY of jobs.csv so they can test
 *      write operations safely without modifying the real CSV.
 *
 * 3. LATENCY MEASUREMENT: wall-clock time summed across all turns for a test.
 *
 * 4. TOOL TRACKING: `toolsCalled` list is populated via onToolCall hook,
 *    then used by the scorer for deterministic TIA scoring.
 *
 * 5. TIMEOUT: each test case has a configurable timeout (default 120s).
 *    On timeout, the test is scored as a FAIL with latencyScore=0.
 */
import type { IDataLogger } from "./storage/IDataLogger.js";
import type { TestCase, EvalResult, RunSummary, RunnerOptions } from "./types.js";

/**
 * Type declaration for the evaluation runner.
 *
 * This is the declaration side only (see the trailing source-map comment);
 * the implementation lives in the corresponding runner source module. Private
 * members are listed without types, as is usual for emitted declarations.
 */
export declare class AgentEvalRunner {
  private readonly runId;
  private readonly opts;
  private readonly logger;

  /**
   * @param logger - Data logger handed to the runner.
   *   NOTE(review): presumably the sink for per-test results — confirm
   *   against the implementation.
   * @param opts - Runner configuration.
   */
  constructor(logger: IDataLogger, opts: RunnerOptions);

  /**
   * Run a specific list of TestCases and return the run summary.
   *
   * @param tests - Test cases to execute.
   * @param suiteName - Name of the suite the tests belong to.
   * @returns A promise resolving to the summary of the run.
   */
  runSuite(tests: TestCase[], suiteName: string): Promise<RunSummary>;

  /**
   * Run a single TestCase. Handles write-op isolation automatically.
   *
   * @param tc - The test case to execute.
   * @param suite - Name of the suite the test case belongs to.
   * @param current - NOTE(review): presumably the position of this test
   *   within the run, used for progress reporting — confirm.
   * @param total - NOTE(review): presumably the total number of tests in
   *   the run — confirm.
   * @returns A promise resolving to the result of this test case.
   *   NOTE(review): `EvalResult` is restored from the otherwise-unused
   *   import; confirm against the implementation.
   */
  runTest(tc: TestCase, suite: string, current: number, total: number): Promise<EvalResult>;

  private executeTest;
  private makeErrorResult;
  private buildSummary;
  private printSummary;
}
//# sourceMappingURL=runner.d.ts.map