/*
 * Public type surface of the eval package: re-exports of the harness, judge,
 * matcher and core modules, plus the declarations for `describeEval`,
 * `createJudge`, `formatScores` and the `toSatisfyJudge` matcher.
 *
 * NOTE(review): several declarations below look like their generic parameter
 * and argument lists were lost (for example `type HarnessInput> = ...`,
 * `Promise>;` and `Omit;`). The code tokens are reproduced unchanged here;
 * regenerate this file from the build output before relying on it.
 */
import * as vitest from 'vitest';
import { TestAPI } from 'vitest';
import { HarnessMetadata, Harness } from './harness.js';
export { CreateHarnessOptions, CreateHarnessRunArgs, CreateToolCallSpansOptions, EnsureRunTraceOptions, HarnessContext, HarnessResultLike, MaybePromise, SimpleHarnessResult, SimpleSpanEvent, SimpleSpanRecord, SimpleToolCallRecord, SimpleTraceRecord, attachHarnessRunToError, createFailedHarnessRun, createGenAiUsageAttributes, createHarness, createToolCallSpans, ensureRunTrace, getHarnessRunFromError, normalizeHarnessRun, normalizeSpanAttributes, normalizeSpanError, toJsonValue } from './harness.js';
import { JudgeContext, Judge, JudgeResult, JudgeAssessFn, JudgeAssessor, JudgeAssessWithAssessorFn } from './judges/types.js';
export { BoundJudgeAssessor, JudgeAssessorOptions, JudgeOptions } from './judges/types.js';
import { JudgeHarness } from './judges/judgeHarness.js';
export { CreateJudgeHarnessOptions, CreateJudgeHarnessRunOptions, JudgeHarnessInput, JudgeHarnessOutput, RunJudge, RunJudgeOptions, createJudgeHarness, runJudgeHarness } from './judges/judgeHarness.js';
export { wrapText } from './wrapText.js';
export { FactualityJudge, FactualityJudgeChoice, FactualityJudgeConfig, FactualityJudgeExpected, FactualityJudgeOptions, FactualityJudgePrompt, FactualityJudgeVerdict } from './judges/factualityJudge.js';
export { StructuredOutputJudge, StructuredOutputJudgeConfig, StructuredOutputJudgeExpected, StructuredOutputJudgeOptions } from './judges/structuredOutputJudge.js';
export { ToolCallJudge, ToolCallJudgeConfig, ToolCallJudgeExpectedTool, ToolCallJudgeOptions } from './judges/toolCallJudge.js';
export { BaseMatcherConfig, FuzzyMatchOptions, MatchStrategy } from './internal/matchers.js';
import { JsonValue, HarnessRun, NormalizedSession, ToolCallRecord } from '@vitest-evals/core';
export { GenAiOperationName, GenAiOutputType, GenAiProviderName, GenAiSemanticAttributeKey, GenAiSemanticAttributes, GenAiTokenType, GenAiToolType, HarnessRun, HarnessRunError, JsonPrimitive, JsonValue, NormalizedMessage, NormalizedSession, NormalizedSpan, NormalizedSpanAttributeKey, NormalizedSpanAttributes, NormalizedSpanEvent, NormalizedTrace, OpenTelemetrySemanticAttributeKey, OpenTelemetrySemanticAttributes, TimingSummary, ToolCallRecord, UsageSummary, assistantMessages, failedSpans, latestAssistantMessageContent, messagesByRole, spans, spansByKind, systemMessages, toolCalls, toolMessages, userMessages } from '@vitest-evals/core';
// Side-effect-only imports of the internal scorer modules.
import './internal/structuredOutputScorer.js';
import './internal/scoring.js';
import './internal/toolCallScorer.js';

/**
 * Extra fields merged into Vitest's `TaskMeta` by the `declare module "vitest"`
 * augmentation further down in this file.
 *
 * - `eval` holds the named judge results, their average score and optional
 *   output / tool-call / threshold details.
 * - `harness` holds the harness name and the `HarnessRun` it produced.
 */
type EvalTaskMeta = {
    eval?: {
        scores: (JudgeResult & { name: string; })[];
        avgScore: number;
        output?: unknown;
        toolCalls?: ToolCallRecord[];
        thresholdFailed?: boolean;
    };
    harness?: {
        name: string;
        run: HarnessRun;
    };
};

/**
 * Conditional helper: resolves to `TInput` when `THarness` matches `Harness`,
 * otherwise `unknown`.
 *
 * NOTE(review): the type-parameter list and the `Harness` arguments are not
 * visible here — confirm against the generated declaration.
 */
type HarnessInput> = THarness extends Harness ? TInput : unknown;

/**
 * Conditional helper: resolves to `TMetadata` when `THarness` matches
 * `Harness`, otherwise `HarnessMetadata`.
 *
 * NOTE(review): generic arguments are not visible here — confirm.
 */
type HarnessMetadataFor> = THarness extends Harness ? TMetadata : HarnessMetadata;

/**
 * Conditional helper: resolves to `TOutput` when `THarness` matches `Harness`,
 * otherwise `JsonValue | undefined`.
 *
 * NOTE(review): generic arguments are not visible here — confirm.
 */
type HarnessOutput> = THarness extends Harness ? TOutput : JsonValue | undefined;

/**
 * Object form accepted by the `createJudge(config)` overload: a judge name,
 * an optional judge harness and the assessment function.
 *
 * NOTE(review): the generic parameter list looks truncated — confirm.
 */
type CreateJudgeConfig = JudgeContext> = {
    name: string;
    judgeHarness?: JudgeHarness;
    assess: JudgeAssessFn;
};

/** Unique symbol used as the brand key on `EvalHarnessRun`; declared only, no value is shown here. */
declare const evalHarnessRunBrand: unique symbol;

/**
 * Harness run returned by the fixture-backed `run(...)` API.
 *
 * The type is `HarnessRun` intersected with a readonly branded property that
 * carries the input, metadata, output and harness types.
 *
 * @example
 * ```ts
 * it("approves a refund", async ({ run }) => {
 *   const result = await run("Refund invoice inv_123");
 *
 *   expect(result.output.status).toBe("approved");
 * });
 * ```
 */
type EvalHarnessRun = Harness> = HarnessRun & {
    readonly [evalHarnessRunBrand]: {
        readonly input: TInput;
        readonly metadata: TMetadata;
        readonly output: TOutput;
        readonly harness: THarness;
    };
};

/**
 * Per-run metadata forwarded to the harness alongside the test input.
 *
 * @example
 * ```ts
 * await run("Refund invoice inv_123", {
 *   metadata: {
 *     expected: { status: "approved" },
 *     expectedTools: ["lookupInvoice", "createRefund"],
 *   },
 * });
 * ```
 */
interface EvalRunOptions {
    /** Per-run expectations or configuration forwarded to harnesses and judges. */
    metadata?: TMetadata;
}

/**
 * Explicit harness execution primitive exposed to each eval test.
 *
 * Takes the test input plus optional `EvalRunOptions` and returns a promise.
 *
 * @example
 * ```ts
 * const result = await run("Refund invoice inv_123", {
 *   metadata: { expected: { status: "approved" } },
 * });
 * ```
 */
type EvalRun = Harness> = (input: TInput, options?: EvalRunOptions) => Promise>;

/**
 * Fixture-backed Vitest context exposed inside `describeEval(...)` tests.
 *
 * @example
 * ```ts
 * type RefundOutput = { status: "approved" | "denied" };
 *
 * it("approves a refund", async ({ run }: EvalTestContext) => {
 *   const result = await run("Refund invoice inv_123");
 *
 *   expect(result.output.status).toBe("approved");
 * });
 * ```
 */
interface EvalTestContext = Harness> {
    run: EvalRun;
}

/** Fixture-backed Vitest test API exposed inside `describeEval(...)`. */
type EvalTestAPI = Harness> = TestAPI>;

/**
 * Suite-level configuration for a harness-backed eval block.
 *
 * @example
 * ```ts
 * const options: DescribeEvalOptions<
 *   string,
 *   { status: "approved" | "denied" },
 *   { expected: { status: "approved" | "denied" } }
 * > = {
 *   harness: refundHarness,
 *   judges: [ToolCallJudge(), StructuredOutputJudge()],
 *   judgeThreshold: 1,
 * };
 * ```
 */
interface DescribeEvalOptions = Harness> {
    /** Harness used for every explicit `run(...)` call in the suite. */
    harness: THarness;
    /** Automatic judges applied after each successful `run(...)`. */
    judges?: Array>>;
    /** Optional judge-side harness used only by judges that call `ctx.runJudge(...)`. */
    judgeHarness?: JudgeHarness;
    /** Passing threshold for automatic suite-level judges. `null` disables fail-on-score. */
    judgeThreshold?: number | null;
    /** Skips the entire eval suite when the predicate returns true. */
    skipIf?: () => boolean;
}

/** Extracts the `input` member type from `TJudgeOptions` via `infer`; falls back to `unknown`. */
type JudgeAssertionInput> = TJudgeOptions extends { input: infer TInput; } ? TInput : unknown;

/** Extracts the `output` member type from `TJudgeOptions` via `infer`; falls back to `JsonValue | undefined`. */
type JudgeAssertionOutput> = TJudgeOptions extends { output: infer TOutput; } ? TOutput : JsonValue | undefined;

/** Extracts the `metadata` member type from `TJudgeOptions` via `infer`; falls back to `HarnessMetadata`. */
type JudgeAssertionMetadata> = TJudgeOptions extends { metadata: infer TMetadata; } ? TMetadata : HarnessMetadata;

/**
 * Extracts the `harness` member type from `TJudgeOptions` via `infer`, passed
 * through `Exclude`; otherwise falls back to a `Harness` built from the other
 * `JudgeAssertion*` helpers.
 *
 * NOTE(review): the `Exclude` and `Harness` arguments are not visible here —
 * confirm against the generated declaration.
 */
type JudgeAssertionHarness> = TJudgeOptions extends { harness: infer THarness; } ? Exclude : Harness, JudgeAssertionOutput, JudgeAssertionMetadata>;

/** Keys of `JudgeContext` plus `"judgeHarness"`, `"signal"` and `"threshold"`. */
type JudgeAssertionReservedKey = keyof JudgeContext | "judgeHarness" | "signal" | "threshold";

/**
 * `Omit`-based view of the judge options.
 *
 * NOTE(review): presumably omits `JudgeAssertionReservedKey` from
 * `TJudgeOptions`, but the `Omit` arguments are not visible here — confirm.
 */
type JudgeAssertionParams> = Omit;

/**
 * Mapped-type helper indexed by `keyof T`; each key maps to either `never` or
 * the key itself.
 *
 * NOTE(review): presumably yields the union of required keys of `T`; the
 * `Record` / `Pick` arguments are not visible here — confirm.
 */
type RequiredKeys = { [K in keyof T]-?: Record extends Pick ? never : K; }[keyof T];

/**
 * Rest-argument tuple for `toSatisfyJudge(...)`: the `options` element is
 * optional when the `RequiredKeys` check resolves to `never`, and mandatory
 * otherwise.
 */
type JudgeAssertionArgs> = RequiredKeys> extends never ? [options?: JudgeAssertionOptions] : [options: JudgeAssertionOptions];

/**
 * Output type derived from the value passed to `expect(...)`, checked in order:
 * `EvalHarnessRun` -> `TOutput`, `HarnessRun` -> `TOutput`,
 * `NormalizedSession` -> `JsonValue | undefined`, `JsonValue` -> the received
 * type itself, anything else -> `JsonValue | undefined`.
 */
type MatcherOutput = TReceived extends EvalHarnessRun ? TOutput : TReceived extends HarnessRun ? TOutput : TReceived extends NormalizedSession ? JsonValue | undefined : TReceived extends JsonValue ? TReceived : JsonValue | undefined;

/**
 * Resolves to `Judge` when `MatcherOutput` is assignable to
 * `JudgeAssertionOutput`, otherwise `never`.
 */
type JudgeForReceived> = MatcherOutput extends JudgeAssertionOutput ? Judge : never;

/**
 * Optional overrides passed to `expect(...).toSatisfyJudge(...)`.
 *
 * @example
 * ```ts
 * await expect(result).toSatisfyJudge(RefundStatusJudge, {
 *   threshold: null,
 * });
 * ```
 */
type JudgeAssertionOptions = JudgeContext> = JudgeAssertionParams & {
    /** Override or provide the original eval input for the judge. */
    input?: JudgeAssertionInput;
    /** Override or provide the app-facing output for the judge. */
    output?: JudgeAssertionOutput;
    /** Override or provide per-run judge metadata. */
    metadata?: JudgeAssertionMetadata;
    /** Override or provide flattened tool calls for the judge. */
    toolCalls?: ToolCallRecord[];
    /** Override or provide the complete normalized harness run. */
    run?: HarnessRun>;
    /** Override or provide the normalized session transcript. */
    session?: HarnessRun["session"];
    /** Override or provide the harness associated with the judge context. */
    harness?: JudgeAssertionHarness;
    /** Override or provide the judge harness for judges that call `ctx.runJudge(...)`. */
    judgeHarness?: JudgeHarness;
    /** Passing threshold for the explicit matcher. `null` records the score without failing. */
    threshold?: number | null;
};

/** Function type installed as the `toSatisfyJudge(...)` matcher. */
type ToSatisfyJudge = = JudgeContext>(judge: JudgeForReceived, ...args: JudgeAssertionArgs) => Promise;

/**
 * Vitest matcher extension surface added by `vitest-evals`.
 *
 * @example
 * ```ts
 * await expect(result).toSatisfyJudge(RefundStatusJudge);
 * ```
 */
interface EvalMatchers {
    toSatisfyJudge: ToSatisfyJudge;
}

// Module augmentation: adds the eval matchers to Vitest's assertion interfaces
// and the eval fields to its task metadata.
declare module "vitest" {
    interface Assertion extends EvalMatchers { }
    interface AsymmetricMatchersContaining extends EvalMatchers { }
    interface TaskMeta extends EvalTaskMeta { }
}

/**
 * Creates a harness-backed eval suite on top of a fixture-backed Vitest test API.
 *
 * @param name - Suite name shown by Vitest and reporters.
 * @param options - Harness, automatic judges, threshold, and suite skip settings.
 * @param define - Callback that receives the eval-aware `it` API.
 * @returns The Vitest `SuiteCollector` for the suite.
 *
 * @example
 * ```ts
 * import { piAiHarness } from "@vitest-evals/harness-pi-ai";
 * import { getModel } from "@mariozechner/pi-ai";
 * import { piAiJudgeHarness } from "@vitest-evals/harness-pi-ai";
 * import { expect } from "vitest";
 * import {
 *   describeEval,
 *   FactualityJudge,
 *   ToolCallJudge,
 *   toolCalls,
 * } from "vitest-evals";
 * import { createRefundAgent } from "../src/refundAgent";
 *
 * const judgeHarness = piAiJudgeHarness({
 *   model: getModel("anthropic", "claude-sonnet-4-5"),
 *   temperature: 0,
 * });
 *
 * describeEval("refund agent", {
 *   harness: piAiHarness({
 *     agent: () => createRefundAgent(),
 *   }),
 *   judgeHarness,
 *   judges: [ToolCallJudge()],
 * }, (it) => {
 *   it("approves a refundable invoice", async ({ run }) => {
 *     const result = await run("Refund invoice inv_123", {
 *       metadata: {
 *         expected: "Invoice inv_123 should be refunded.",
 *       },
 *     });
 *
 *     expect(result.output).toMatchObject({ status: "approved" });
 *     expect(toolCalls(result.session)).toHaveLength(2);
 *     await expect(result).toSatisfyJudge(FactualityJudge(), {
 *       threshold: 0.6,
 *     });
 *   });
 * });
 * ```
 */
declare function describeEval>(name: string, options: DescribeEvalOptions, HarnessOutput, HarnessMetadataFor, THarness>, define: (it: EvalTestAPI, HarnessOutput, HarnessMetadataFor, THarness>) => void): vitest.SuiteCollector;

/**
 * Formats judge results for reporter and assertion output.
 *
 * @param scores - Named judge results to sort and format.
 * @returns The formatted text.
 *
 * @example
 * ```ts
 * const message = formatScores([
 *   {
 *     name: "RefundStatusJudge",
 *     score: 0,
 *     metadata: { rationale: "Expected approved, got denied" },
 *   },
 * ]);
 * ```
 */
declare function formatScores(scores: (JudgeResult & { name: string; })[]): string;

/**
 * Creates a named judge object from an assessment function.
 *
 * @param name - Stable judge name shown in assertion messages and reports.
 * @param assess - Function that scores one normalized judge context.
 *
 * @example
 * ```ts
 * import { createJudge, type JudgeContext } from "vitest-evals";
 *
 * type RefundOutput = { status: "approved" | "denied" };
 * type RefundMetadata = { expected: { status: RefundOutput["status"] } };
 *
 * export const RefundStatusJudge = createJudge(
 *   "RefundStatusJudge",
 *   async ({ output, metadata }: JudgeContext) => ({
 *     score: output.status === metadata.expected.status ? 1 : 0,
 *     metadata: {
 *       rationale: `Expected ${metadata.expected.status}, got ${output.status}`,
 *     },
 *   }),
 * );
 * ```
 *
 * For LLM-backed judges, prefer the object form with `ctx.runJudge(...)` so
 * provider-specific model configuration stays in the judge harness.
 */
declare function createJudge>(name: string, assess: JudgeAssessFn): Judge;
declare function createJudge>(config: CreateJudgeConfig): Judge;
/**
 * @deprecated Prefer `createJudge({ name, judgeHarness, assess })` and call
 * `ctx.runJudge(...)` from LLM-backed judges.
 */
declare function createJudge, TInput, TOutput>(name: string, assessor: JudgeAssessor, assess: JudgeAssessWithAssessorFn): Judge;

export { type DescribeEvalOptions, type EvalHarnessRun, type EvalMatchers, type EvalRun, type EvalRunOptions, type EvalTestAPI, type EvalTestContext, Harness, HarnessMetadata, Judge, type JudgeAssertionOptions, JudgeAssessFn, JudgeAssessWithAssessorFn, JudgeAssessor, JudgeContext, JudgeHarness, JudgeResult, type ToSatisfyJudge, createJudge, describeEval, formatScores };