/**
 * scorer.ts — Two-layer scoring engine for the CareerVivid Agent Eval Framework.
 *
 * Layer 1 — Deterministic (instant, free):
 *   - Tool Invocation Accuracy (TIA): expected vs actual tool calls
 *   - Latency Score (LAT): inverted bucket based on ms
 *   - Keyword bonus/penalty applied on top of the LLM IRA score
 *
 * Layer 2 — LLM-as-Judge (cheap, ~0.01 credit per test):
 *   - Calls Gemini with a structured rubric prompt
 *   - Scores IRA, RQ, CR, RC, HR on 0–10 scales
 *   - Returns a one-sentence rationale per dimension
 *
 * When --no-judge is set, qualitative dims default to 5.0 (neutral baseline)
 * and only deterministic dims are computed.
 *
 * NOTE(review): this looks like compiler-emitted declaration output (see the
 * sourceMappingURL trailer at the end of the file). Presumably any lasting
 * documentation change belongs in scorer.ts — confirm before hand-editing.
 */
import type { TestCase, DimensionScores, EvalResult } from "./types.js";

/**
 * Convert response latency to a 0–10 score.
 * Very fast responses (< 2s) get a perfect 10; anything ≥ 30s gets 0.
 *
 * @param ms - Response latency in milliseconds.
 * @returns The latency score on the 0–10 scale. The bucket boundaries between
 *   2s and 30s are defined in the implementation and are not visible here.
 */
export declare function latencyToScore(ms: number): number;

/**
 * Score tool invocation accuracy by comparing expected vs actual tool calls.
 *
 * Scoring rationale:
 *   - All expected tools called → 10
 *   - Partial → proportional credit (floor 2 per expected tool)
 *   - Zero expected tools → 10 (not applicable)
 *   - Forbidden tool called → penalty applied (−2 per forbidden tool)
 *
 * @param expectedTools - Tools the test case expects the agent to call.
 * @param forbiddenTools - Tools whose invocation incurs the penalty above.
 * @param actualTools - Tools the agent actually called.
 * @returns The TIA score. NOTE(review): presumably clamped to 0–10 after the
 *   forbidden-tool penalty — confirm against the implementation.
 */
export declare function scoreTia(expectedTools: string[], forbiddenTools: string[], actualTools: string[]): number;

/**
 * Returns true when a test required specific tools but NONE of them were called.
 * Used to apply a hard composite cap in the scorer — bypassing the 5.0 neutral
 * heuristics baseline that would otherwise mask the tool-invocation failure.
 *
 * @param expectedTools - Tools the test case required.
 * @param actualTools - Tools the agent actually called.
 */
export declare function didMissAllRequiredTools(expectedTools: string[], actualTools: string[]): boolean;

/**
 * Apply a small keyword-based bonus/penalty to an existing IRA score.
 *   Missing expected keywords: −0.5 per keyword (max −2)
 *   All keywords present: +0.5 bonus (capped at 10)
 *
 * @param iraBase - The IRA score to adjust.
 * @param expectedKeywords - Keywords the response is expected to contain.
 * @param responseText - The agent response text that is searched for keywords.
 * @returns The adjusted IRA score. NOTE(review): matching rules (case
 *   sensitivity, substring vs whole word) and any lower bound are not visible
 *   in this declaration — confirm against the implementation.
 */
export declare function applyKeywordBonus(iraBase: number, expectedKeywords: string[], responseText: string): number;

/**
 * Reduce a set of per-dimension scores to a single composite number.
 *
 * NOTE(review): the weighting of dimensions and the output range are defined
 * in the implementation and cannot be determined from this declaration.
 *
 * @param scores - The per-dimension scores to combine.
 */
export declare function computeComposite(scores: DimensionScores): number;

/**
 * Configuration passed to {@link score}.
 *
 * - `geminiApiKey`: presumably the credential for the Gemini judge call
 *   described in the file header — confirm against the implementation.
 * - `judgeModel`: presumably the model identifier used for the LLM judge.
 * - `noJudge`: presumably mirrors the `--no-judge` flag from the file header,
 *   under which qualitative dimensions default to 5.0.
 */
export interface ScorerOptions { geminiApiKey: string; judgeModel: string; noJudge: boolean; }

/**
 * Score a test case result.
 *
 * Accepts the raw agent response, latency, and tool calls, then returns
 * the fully populated DimensionScores + composite + pass/fail + rationale.
 *
 * Fields of `opts`:
 *   - `tc`: the test case being scored.
 *   - `agentResponse`: the raw agent response text.
 *   - `latencyMs`: the response latency in milliseconds.
 *   - `toolsCalled`: the tools the agent called.
 *   - `scorerOpts`: judge configuration, see {@link ScorerOptions}.
 *
 * NOTE(review): the return annotation below reads `Promise>`, which is not
 * valid TypeScript. The generic argument appears to have been lost (likely a
 * type derived from the imported `EvalResult`, which is otherwise unused in
 * this file). Regenerate the declaration from scorer.ts to restore it; do not
 * guess the type by hand.
 */
export declare function score(opts: { tc: TestCase; agentResponse: string; latencyMs: number; toolsCalled: string[]; scorerOpts: ScorerOptions; }): Promise>;
//# sourceMappingURL=scorer.d.ts.map