// NOTE(review): the generic parameter/argument lists in this module were
// reconstructed after having been stripped from the text. The parameter
// order and defaults follow the surviving fragments; the `Harness<...>`
// arguments, the `JudgeContext<any, any, any, any>` constraints and the
// `Record<string, unknown>` value type are best guesses — confirm against
// '../harness.js' and the callers before relying on them.
import type { HarnessMetadata, Harness } from '../harness.js';
import type { RunJudge, JudgeHarness } from './judgeHarness.js';
import type { JsonValue, ToolCallRecord, HarnessRun } from '@vitest-evals/core';

/**
 * Score payload returned by a judge.
 *
 * @example
 * ```ts
 * const result: JudgeResult = {
 *   score: 1,
 *   metadata: {
 *     rationale: "Output matched the expected refund status.",
 *   },
 * };
 * ```
 */
type JudgeResult = {
  /** Numeric score. `null` records an intentionally unscored result. */
  score: number | null;
  /** JSON-like judge metadata shown by assertions and reporters. */
  metadata?: {
    /** Human-readable explanation for the score. */
    rationale?: string;
    /** Optional judge-side output or diagnostic payload. */
    output?: JsonValue;
  } & Record<string, unknown>;
};

/**
 * Full normalized context passed to every judge.
 *
 * Scenario-owned judge criteria should live on `input`. Use `metadata` for
 * per-run expectations or harness configuration that are not part of the
 * scenario payload.
 *
 * @typeParam TInput - Type of the `input` member.
 * @typeParam TOutput - Type of the `output` member.
 * @typeParam TMetadata - Type wrapped by the read-only `metadata` member.
 * @typeParam THarness - Type of the `harness` member; may be `undefined`.
 *
 * @example
 * ```ts
 * type RefundContext = JudgeContext<
 *   string,
 *   { status: "approved" | "denied" },
 *   { expected: { status: "approved" | "denied" } }
 * >;
 *
 * const RefundStatusJudge = createJudge(
 *   "RefundStatusJudge",
 *   ({ output, metadata }: RefundContext) => ({
 *     score: output.status === metadata.expected.status ? 1 : 0,
 *   }),
 * );
 * ```
 */
interface JudgeContext<
  TInput = unknown,
  TOutput extends JsonValue | undefined = JsonValue | undefined,
  TMetadata extends HarnessMetadata = HarnessMetadata,
  THarness extends Harness<TInput, TOutput, TMetadata> | undefined =
    | Harness<TInput, TOutput, TMetadata>
    | undefined,
> {
  /** Original eval input passed to the harness. */
  input: TInput;
  /** App-facing output returned by the harness. */
  output: TOutput;
  /** Flattened tool calls observed in the normalized session. */
  toolCalls: ToolCallRecord[];
  /** Per-run expectations or configuration passed to `run(input, { metadata })`. */
  metadata: Readonly<TMetadata>;
  /** Complete normalized harness run being judged. */
  run: HarnessRun;
  /** Normalized transcript associated with the harness run. */
  session: HarnessRun["session"];
  /** Harness associated with this judge context. */
  harness: THarness;
  /** Runs the configured matcher, judge, or suite judge harness with run-scoped context. */
  runJudge?: RunJudge;
}

/**
 * Convenience helper for judges that accept explicit per-call params.
 *
 * The result is the judge context intersected with `TParams`, so the extra
 * params sit next to the context members on the same object.
 *
 * @typeParam TParams - Extra per-call params merged into the context.
 */
type JudgeOptions<
  TParams extends Record<string, unknown> = Record<string, unknown>,
  TInput = unknown,
  TOutput extends JsonValue | undefined = JsonValue | undefined,
  TMetadata extends HarnessMetadata = HarnessMetadata,
  THarness extends Harness<TInput, TOutput, TMetadata> | undefined =
    | Harness<TInput, TOutput, TMetadata>
    | undefined,
> = JudgeContext<TInput, TOutput, TMetadata, THarness> & TParams;

/**
 * Function that assesses a normalized judge context.
 *
 * May return the result directly or as a promise.
 *
 * @typeParam TOptions - Context (or context-plus-params) type the function receives.
 */
type JudgeAssessFn<
  TOptions extends JudgeContext<any, any, any, any> = JudgeContext,
> = (opts: TOptions) => Promise<JudgeResult> | JudgeResult;

/**
 * Runtime options supplied by core when calling a legacy judge-side assessor.
 *
 * @deprecated Prefer `RunJudgeOptions` with `ctx.runJudge(...)`.
 */
type JudgeAssessorOptions = {
  /** Abort signal from the current eval run when available. */
  signal?: AbortSignal;
};

/**
 * Legacy provider/model helper that a judge can use without running the app
 * harness.
 *
 * New LLM-backed judges should use `createJudgeHarness(...)` plus
 * `ctx.runJudge(...)` instead. The judge harness path supports response
 * formats, matcher/suite/judge-level configuration, and keeps provider
 * adapters outside core judge implementations.
 *
 * @deprecated Prefer `createJudgeHarness(...)` and `ctx.runJudge(...)` for
 * LLM-backed judges.
 *
 * @typeParam TInput - Value handed to `assess`.
 * @typeParam TOutput - Value produced by `assess`, directly or via a promise.
 *
 * @example
 * ```ts
 * const assessor: JudgeAssessor = {
 *   assess: async (prompt, { signal }) => runRubricModel(prompt, { signal }),
 * };
 * ```
 */
type JudgeAssessor<TInput = string, TOutput = string> = {
  /** Runs the judge-side model/provider call. */
  assess: (
    input: TInput,
    options: JudgeAssessorOptions,
  ) => Promise<TOutput> | TOutput;
};

/**
 * Legacy judge-side assessor after core binds run-scoped options such as abort
 * signal.
 *
 * @deprecated Prefer `RunJudge` from `ctx.runJudge(...)` for LLM-backed judges.
 *
 * @typeParam TInput - Value handed to `assess`.
 * @typeParam TOutput - Value the returned promise resolves to.
 */
type BoundJudgeAssessor<TInput = string, TOutput = string> = {
  /** Runs the judge-side model/provider call with run-scoped options already bound. */
  assess: (input: TInput) => Promise<TOutput>;
};

/**
 * Legacy function that assesses a context with a prebound judge-side assessor.
 *
 * @deprecated Prefer `JudgeAssessFn` with `ctx.runJudge(...)`.
 *
 * @typeParam TOptions - Context type the function receives.
 * @typeParam TInput - Input type of the bound assessor.
 * @typeParam TOutput - Output type of the bound assessor.
 */
type JudgeAssessWithAssessorFn<
  TOptions extends JudgeContext<any, any, any, any> = JudgeContext,
  TInput = string,
  TOutput = string,
> = (
  opts: TOptions,
  assessor: BoundJudgeAssessor<TInput, TOutput>,
) => Promise<JudgeResult> | JudgeResult;

/**
 * Named judge object consumed by suite-level judges and explicit assertions.
 *
 * @typeParam TOptions - Context type passed to `assess`.
 *
 * @example
 * ```ts
 * type RefundOutput = { status: "approved" | "denied" };
 * type RefundMetadata = { expected: { status: RefundOutput["status"] } };
 *
 * const judge: Judge<JudgeContext<string, RefundOutput, RefundMetadata>> = {
 *   name: "RefundStatusJudge",
 *   assess: ({ output, metadata }) => ({
 *     score: output.status === metadata.expected.status ? 1 : 0,
 *   }),
 * };
 * ```
 */
interface Judge<
  TOptions extends JudgeContext<any, any, any, any> = JudgeContext,
> {
  /** Stable judge name used in assertion messages and reports. */
  name: string;
  /** Default judge-side harness used when matcher options do not provide one. */
  judgeHarness?: JudgeHarness;
  /** Scores one normalized judge context. */
  assess: JudgeAssessFn<TOptions>;
}

export type {
  BoundJudgeAssessor,
  Judge,
  JudgeAssessFn,
  JudgeAssessWithAssessorFn,
  JudgeAssessor,
  JudgeAssessorOptions,
  JudgeContext,
  JudgeOptions,
  JudgeResult,
};