// NOTE(review): several generic type-argument lists in this declaration file
// appear to have been lost (bare `Record`, `Promise`, `z.infer`, `Scorer`,
// `ScorerWithPartial`, and stray `>` / `}>` fragments). The declarations are
// kept verbatim below; regenerate this file from the build output before
// relying on the affected signatures — TODO confirm.
import { ChatCompletionMessageParam, ChatCompletionTool, ChatCompletionToolChoiceOption, ChatCompletion } from 'openai/resources';
import { ReasoningEffort } from 'openai/resources/shared';
import { OpenAI } from 'openai';
import { z } from 'zod/v3';

/**
 * Result produced by a scorer.
 */
interface Score {
  /** Name identifying the score. */
  name: string;
  /** Numeric score; the type also allows `null` (presumably "no score" — confirm). */
  score: number | null;
  /** Optional additional data attached to the score. */
  metadata?: Record;
  /**
   * @deprecated
   */
  error?: unknown;
}

/**
 * Arguments passed to a scorer: a required `output`, an optional `expected`
 * value of the same type, intersected with the scorer-specific `Extra` fields.
 */
type ScorerArgs = {
  output: Output;
  expected?: Output;
} & Extra;

/**
 * A scorer: a function that takes scorer arguments and returns a {@link Score},
 * either directly or through a promise.
 */
type Scorer = (args: ScorerArgs) => Score | Promise;

/**
 * Parameters describing a single LLM request. Also the argument type of
 * {@link ChatCache} `get` / `set`.
 */
interface CachedLLMParams {
  /**
   * Model to use for the completion. Note: If using Azure OpenAI, this should
   * be the deployment name.
   */
  model: string;
  messages: ChatCompletionMessageParam[];
  tools?: ChatCompletionTool[];
  tool_choice?: ChatCompletionToolChoiceOption;
  temperature?: number;
  max_tokens?: number;
  reasoning_effort?: ReasoningEffort;
  /**
   * Force the request to use the Responses API, even when the model name does
   * not start with "gpt-5". Useful for proxy/internal setups that serve a
   * Responses-only model under a name that doesn't match {@link isGPT5Model}.
   */
  use_responses_api?: boolean;
  span_info?: {
    spanAttributes?: Record;
  };
}

/**
 * Asynchronous cache of chat completions, looked up and stored by the
 * request parameters.
 */
interface ChatCache {
  get(params: CachedLLMParams): Promise;
  set(params: CachedLLMParams, response: ChatCompletion): Promise;
}

/**
 * Authentication / client configuration. Two mutually exclusive forms:
 * either the deprecated individual `openAi*` / `azureOpenAi` options (with
 * `client` typed as `never`), or a `client` instance (with every deprecated
 * option typed as `never`).
 */
type OpenAIAuth = {
  /** @deprecated Use the `client` option instead */
  openAiApiKey?: string;
  /** @deprecated Use the `client` option instead */
  openAiOrganizationId?: string;
  /** @deprecated Use the `client` option instead */
  openAiBaseUrl?: string;
  /** @deprecated Use the `client` option instead */
  openAiDefaultHeaders?: Record;
  /** @deprecated Use the `client` option instead */
  openAiDangerouslyAllowBrowser?: boolean;
  /** @deprecated Use the `client` option instead */
  azureOpenAi?: AzureOpenAiAuth;
  client?: never;
} | {
  client: OpenAI;
  /** @deprecated Use the `client` option instead */
  openAiApiKey?: never;
  /** @deprecated Use the `client` option instead */
  openAiOrganizationId?: never;
  /** @deprecated Use the `client` option instead */
  openAiBaseUrl?: never;
  /** @deprecated Use the `client` option instead */
  openAiDefaultHeaders?: never;
  /** @deprecated Use the `client` option instead */
  openAiDangerouslyAllowBrowser?: never;
  /** @deprecated Use the `client` option instead */
  azureOpenAi?: never;
};

/**
 * Settings accepted by the deprecated `azureOpenAi` option of {@link OpenAIAuth}.
 */
interface AzureOpenAiAuth {
  apiKey: string;
  endpoint: string;
  apiVersion: string;
}

// Global variables declared by this package.
// NOTE(review): presumably `__client`, `__defaultModel` and
// `__defaultEmbeddingModel` hold the values configured through `init` —
// confirm against the implementation.
declare global {
  var __inherited_braintrust_wrap_openai: ((openai: any) => any) | undefined;
  var __client: OpenAI | undefined;
  var __defaultModel: string | undefined;
  var __defaultEmbeddingModel: string | undefined;
}

/**
 * Options accepted by {@link init}.
 */
interface InitOptions {
  /**
   * An OpenAI-compatible client to use for all evaluations.
   * This can be an OpenAI client, or any client that implements the OpenAI API
   * (e.g., configured to use the Braintrust Gateway with Anthropic, Gemini, etc.)
   */
  client?: OpenAI;
  /**
   * The default model(s) to use for evaluations when not specified per-call.
   *
   * Can be either:
   * - A string (for backward compatibility): Sets the default completion model only.
   *   Defaults to "gpt-5-mini" if not set.
   * - An object with `completion` and/or `embedding` properties: Allows setting
   *   default models for different evaluation types. Only the specified models
   *   are updated; others remain unchanged.
   *
   * When using non-OpenAI providers via the Braintrust Gateway, set this to
   * the appropriate model string (e.g., "claude-3-5-sonnet-20241022").
   *
   * @example
   * // String form (backward compatible)
   * init({ defaultModel: "gpt-4-turbo" })
   *
   * @example
   * // Object form: set both models
   * init({
   *   defaultModel: {
   *     completion: "claude-3-5-sonnet-20241022",
   *     embedding: "text-embedding-3-large"
   *   }
   * })
   *
   * @example
   * // Object form: set only embedding model
   * init({
   *   defaultModel: {
   *     embedding: "text-embedding-3-large"
   *   }
   * })
   */
  defaultModel?: string | {
    /**
     * Default model for LLM-as-a-judge evaluations (completion).
     * Defaults to "gpt-5-mini" if not set.
     */
    completion?: string;
    /**
     * Default model for embedding-based evaluations.
     * Defaults to "text-embedding-ada-002" if not set.
     */
    embedding?: string;
  };
}

/**
 * Initialize autoevals with a custom client and/or default models.
 *
 * @example
 * // Using with OpenAI (default)
 * import { init } from "autoevals";
 * import { OpenAI } from "openai";
 *
 * init({ client: new OpenAI() });
 *
 * @example
 * // Using with Anthropic via Braintrust Gateway
 * import { init } from "autoevals";
 * import { OpenAI } from "openai";
 *
 * init({
 *   client: new OpenAI({
 *     apiKey: process.env.BRAINTRUST_API_KEY,
 *     baseURL: process.env.BRAINTRUST_AI_GATEWAY_URL || "https://gateway.braintrust.dev",
 *   }),
 *   defaultModel: {
 *     completion: "claude-3-5-sonnet-20241022",
 *     embedding: "text-embedding-3-large",
 *   },
 * });
 *
 * @example
 * // String form (backward compatible)
 * init({ defaultModel: "gpt-4-turbo" });
 */
declare const init: ({ client, defaultModel }?: InitOptions) => void;

/**
 * Get the configured default completion model, or "gpt-5-mini" if not set.
 */
declare const getDefaultModel: () => string;

/**
 * Zod schema for a model-graded spec. `prompt` and `choice_scores` are
 * required; `model`, `use_cot`, `temperature` and `max_tokens` are optional.
 * The schema uses Zod's "strip" mode for unknown keys.
 */
declare const modelGradedSpecSchema: z.ZodObject<{
  prompt: z.ZodString;
  choice_scores: z.ZodRecord;
  model: z.ZodOptional;
  use_cot: z.ZodOptional;
  temperature: z.ZodOptional;
  max_tokens: z.ZodOptional;
}, "strip", z.ZodTypeAny, {
  prompt: string;
  choice_scores: Record;
  model?: string | undefined;
  temperature?: number | undefined;
  max_tokens?: number | undefined;
  use_cot?: boolean | undefined;
}, {
  prompt: string;
  choice_scores: Record;
  model?: string | undefined;
  temperature?: number | undefined;
  max_tokens?: number | undefined;
  use_cot?: boolean | undefined;
}>;

/**
 * Type of a model-graded spec.
 * NOTE(review): presumably inferred from `modelGradedSpecSchema`; the type
 * argument of `z.infer` is missing here — confirm.
 */
type ModelGradedSpec = z.infer;

/**
 * Read-only map from built-in template name to a string.
 * NOTE(review): presumably the raw template source for each name — confirm.
 */
declare const templateStrings: {
  readonly battle: string;
  readonly closed_q_a: string;
  readonly factuality: string;
  readonly humor: string;
  readonly possible: string;
  readonly security: string;
  readonly sql: string;
  readonly summary: string;
  readonly translation: string;
};

/**
 * Record of templates; its keys are the accepted `templateName` values of
 * `LLMClassifierFromSpecFile`.
 */
declare const templates: Record;

/**
 * A scorer that additionally exposes `partial`, which accepts a subset of the
 * scorer's extra arguments and returns a scorer.
 */
interface ScorerWithPartial extends Scorer {
  partial: (args: {
    [K in T]: Extra[K];
  }) => Scorer & Partial>>;
}

/**
 * Turn a scorer function (with an optional `name`) into a {@link ScorerWithPartial}.
 */
declare function makePartial(fn: Scorer, name?: string): ScorerWithPartial;

/**
 * Minimal interface for a Trace object that can provide thread data.
 * This is compatible with the Trace interface from the braintrust SDK.
 */
interface TraceForScorer {
  getThread(options?: {
    preprocessor?: string;
  }): Promise;
}

/**
 * List of thread-related variable names.
 * NOTE(review): presumably the names matched by `templateUsesThreadVariables` — confirm.
 */
declare const THREAD_VARIABLE_NAMES: string[];

/**
 * Regular expression related to thread variables.
 * NOTE(review): presumably the pattern used by `templateUsesThreadVariables` — confirm.
 */
declare const THREAD_VARIABLE_PATTERN: RegExp;

/**
 * Check if a template string might use thread-related template variables.
 * This is a heuristic - looks for variable names after `{{` or `{%` syntax.
 */
declare function templateUsesThreadVariables(template: string): boolean;

/**
 * Common LLM options (token limit, temperature, reasoning settings and the
 * Responses API flag) combined with the {@link OpenAIAuth} options.
 */
type LLMArgs = {
  maxTokens?: number;
  temperature?: number;
  reasoningEffort?: ReasoningEffort;
  reasoningEnabled?: boolean;
  reasoningBudget?: number;
  /**
   * Force the request to use the Responses API, even when the model name does
   * not start with "gpt-5". Useful for proxy/internal setups that serve a
   * Responses-only model under a non-matching name.
   */
  useResponsesApi?: boolean;
} & OpenAIAuth;

/**
 * The default model to use for LLM-based evaluations.
 * @deprecated Use `init({ defaultModel: "..." })` to configure the default model instead.
 */
declare const DEFAULT_MODEL = "gpt-5-mini";

/**
 * Build the tool definitions used for classification from the list of choice
 * strings.
 *
 * @param useCoT - Flag selecting the tool variant (presumably chain-of-thought — confirm).
 * @param choiceStrings - The choices the classifier may select from.
 * @returns The chat-completion tool definitions.
 */
declare function buildClassificationTools(useCoT: boolean, choiceStrings: string[]): ChatCompletionTool[];

/**
 * Arguments for {@link OpenAIClassifier}: the scorer name, model, messages,
 * per-choice scores, classification tools and an optional cache, combined
 * with {@link LLMArgs} and the render arguments.
 */
type OpenAIClassifierArgs = {
  name: string;
  model: string;
  messages: ChatCompletionMessageParam[];
  choiceScores: Record;
  classificationTools: ChatCompletionTool[];
  cache?: ChatCache;
} & LLMArgs & RenderArgs;

/**
 * Asynchronous classifier scorer driven by {@link OpenAIClassifierArgs}.
 */
declare function OpenAIClassifier(args: ScorerArgs>): Promise;

/**
 * Arguments shared by the LLM classifier scorers.
 */
type LLMClassifierArgs = {
  model?: string;
  useCoT?: boolean;
  /**
   * Optional trace object for multi-turn scoring.
   * When provided, thread template variables (thread_text, thread_count, etc.)
   * are automatically computed and made available in the template.
   *
   * NOTE(review): the `ThreadTemplateVars` documentation in this file says a
   * separate `thread_text` is not needed and the interface declares no such
   * field — confirm whether `thread_text` is still provided.
   */
  trace?: TraceForScorer;
} & LLMArgs & RenderArgs;

/**
 * Create a classifier scorer from a prompt template and a map of choice scores.
 * The remaining options (model, chain-of-thought flag, temperature, token
 * limit, reasoning settings, Responses API flag) are optional.
 */
declare function LLMClassifierFromTemplate({ name, promptTemplate, choiceScores, model: modelArg, useCoT: useCoTArg, temperature, maxTokens: maxTokensArg, reasoningEffort, reasoningEnabled, reasoningBudget, useResponsesApi, }: {
  name: string;
  promptTemplate: string;
  choiceScores: Record;
  model?: string;
  useCoT?: boolean;
  temperature?: number;
  maxTokens?: number;
  reasoningEffort?: ReasoningEffort;
  reasoningEnabled?: boolean;
  reasoningBudget?: number;
  useResponsesApi?: boolean;
}): Scorer>;

/**
 * Create a classifier scorer from a name and a {@link ModelGradedSpec}.
 */
declare function LLMClassifierFromSpec(name: string, spec: ModelGradedSpec): Scorer>;

/**
 * Create a classifier scorer from a name and the key of one of the entries
 * in `templates`.
 */
declare function LLMClassifierFromSpecFile(name: string, templateName: keyof typeof templates): Scorer>;

/**
 * Test whether an output _better_ performs the `instructions` than the original
 * (expected) value.
 */
declare const Battle: ScorerWithPartial>;

/**
 * Test whether an output answers the `input` using knowledge built into the model.
 * You can specify `criteria` to further constrain the answer.
 */
declare const ClosedQA: ScorerWithPartial>;

/**
 * Test whether an output is funny.
 */
declare const Humor: ScorerWithPartial>;

/**
 * Test whether an output is factual, compared to an original (`expected`) value.
 */
declare const Factuality: ScorerWithPartial>;

/**
 * Test whether an output is a possible solution to the challenge posed in the input.
 */
declare const Possible: ScorerWithPartial>;

/**
 * Test whether an output is malicious.
 */
declare const Security: ScorerWithPartial>;

/**
 * Test whether a SQL query is semantically the same as a reference (output) query.
 *
 * NOTE(review): "reference (output)" may be intended as "reference (expected)",
 * matching the wording of the sibling scorers — confirm.
 */
declare const Sql: ScorerWithPartial>;

/**
 * Test whether an output is a better summary of the `input` than the original (`expected`) value.
 */
declare const Summary: ScorerWithPartial>;

/**
 * Test whether an `output` is as good of a translation of the `input` in the specified `language`
 * as an expert (`expected`) value.
 */
declare const Translation: ScorerWithPartial>;

/**
 * A simple scorer that uses the Levenshtein distance to compare two strings.
 */
declare const Levenshtein: ScorerWithPartial;

/**
 * Declared with the same type as {@link Levenshtein}.
 * NOTE(review): presumably an alias of `Levenshtein` — confirm.
 */
declare const LevenshteinScorer: ScorerWithPartial;

/**
 * A scorer that uses cosine similarity to compare two strings.
 *
 * @param args
 * @param args.prefix A prefix to prepend to the prompt. This is useful for specifying the domain of the inputs.
 * @param args.model The model to use for the embedding distance. Defaults to "text-embedding-ada-002".
 * @param args.expectedMin The minimum expected score. Defaults to 0.7. Values below this will be scored as 0, and
 * values between this and 1 will be scaled linearly.
 * @returns A score between 0 and 1, where 1 is a perfect match.
 */
declare const EmbeddingSimilarity: ScorerWithPartial;

/**
 * A scorer that semantically evaluates the overlap between two lists of strings. It works by
 * computing the pairwise similarity between each element of the output and the expected value,
 * and then using Linear Sum Assignment to find the best matching pairs.
 */
declare const ListContains: ScorerWithPartial; allowExtraEntities?: boolean; }>;

/**
 * A scorer that uses OpenAI's moderation API to determine if AI response contains ANY flagged content.
 *
 * @param args
 * @param args.threshold Optional. Threshold to use to determine whether content has exceeded threshold. By
 * default, it uses OpenAI's default. (Using `flagged` from the response payload.)
 * @param args.categories Optional. Specific categories to look for. If not set, all categories will
 * be considered.
 * @returns A score between 0 and 1, where 1 means content passed all moderation checks.
 */
declare const Moderation: ScorerWithPartial;

/**
 * A simple scorer that compares numbers by normalizing their difference.
 */
declare const NumericDiff: ScorerWithPartial;

/**
 * JSON evaluation scorers for comparing and validating JSON data.
 *
 * This module provides scorers for working with JSON data:
 *
 * - **JSONDiff**: Compare JSON objects for structural and content similarity
 * - **ValidJSON**: Validate if a value is valid JSON and matches an optional schema
 *
 * ## Creating Custom JSON Scorers
 *
 * You can create custom JSON scorers by composing existing scorers or building new ones:
 *
 * @example
 * ```typescript
 * import { Scorer } from "autoevals";
 * import { JSONDiff, ValidJSON } from "autoevals/json";
 * import { EmbeddingSimilarity } from "autoevals/string";
 *
 * // Custom scorer that validates JSON schema then compares semantically
 * const myJSONScorer: Scorer = async ({ output, expected, schema }) => {
 *   // First, validate both outputs against schema
 *   const outputValid = await ValidJSON({ output, schema });
 *   const expectedValid = await ValidJSON({ output: expected, schema });
 *
 *   if (outputValid.score === 0 || expectedValid.score === 0) {
 *     return {
 *       name: "CustomJSONScorer",
 *       score: 0,
 *       error: "Invalid JSON format"
 *     };
 *   }
 *
 *   // Then compare using semantic similarity for strings
 *   return JSONDiff({
 *     output,
 *     expected,
 *     stringScorer: EmbeddingSimilarity
 *   });
 * };
 *
 * // Custom scorer for specific JSON structure validation
 * const apiResponseScorer: Scorer = async ({ output }) => {
 *   const parsed = typeof output === "string" ? JSON.parse(output) : output;
 *
 *   let score = 0;
 *   const errors: string[] = [];
 *
 *   // Check required fields
 *   if (parsed.status) score += 0.3;
 *   else errors.push("Missing status field");
 *
 *   if (parsed.data) score += 0.3;
 *   else errors.push("Missing data field");
 *
 *   // Check data structure
 *   if (parsed.data?.items && Array.isArray(parsed.data.items)) {
 *     score += 0.4;
 *   } else {
 *     errors.push("data.items must be an array");
 *   }
 *
 *   return {
 *     name: "APIResponseScorer",
 *     score: Math.min(score, 1),
 *     metadata: { errors }
 *   };
 * };
 * ```
 */
/**
 * Compare JSON objects for structural and content similarity.
 *
 * This scorer recursively compares JSON objects, handling:
 * - Nested dictionaries and arrays
 * - String similarity using Levenshtein distance (or custom scorer)
 * - Numeric value comparison (or custom scorer)
 * - Automatic parsing of JSON strings
 *
 * @example
 * ```typescript
 * import { JSONDiff } from "autoevals";
 * import { EmbeddingSimilarity } from "autoevals/string";
 *
 * // Basic comparison
 * const result = await JSONDiff({
 *   output: {
 *     name: "John Smith",
 *     age: 30,
 *     skills: ["python", "javascript"]
 *   },
 *   expected: {
 *     name: "John A. Smith",
 *     age: 31,
 *     skills: ["python", "typescript"]
 *   }
 * });
 * console.log(result.score); // Similarity score between 0-1
 *
 * // With custom string scorer using embeddings
 * const semanticResult = await JSONDiff({
 *   output: { description: "A fast car" },
 *   expected: { description: "A quick automobile" },
 *   stringScorer: EmbeddingSimilarity
 * });
 * ```
 *
 * @param output - The JSON object or string to evaluate
 * @param expected - The expected JSON object or string to compare against
 * @param stringScorer - Optional custom scorer for string comparisons (default: LevenshteinScorer)
 * @param numberScorer - Optional custom scorer for number comparisons (default: NumericDiff)
 * @param preserveStrings - Don't attempt to parse strings as JSON (default: false)
 * @returns Score object with similarity score between 0-1
 */
declare const JSONDiff: ScorerWithPartial; numberScorer?: Scorer; preserveStrings?: boolean; }>;

/**
 * Validate if a value is valid JSON and optionally matches a JSON Schema.
 *
 * This scorer checks if:
 * - The input can be parsed as valid JSON (if it's a string)
 * - The parsed JSON matches an optional JSON Schema
 * - Handles both string inputs and pre-parsed JSON objects
 *
 * @example
 * ```typescript
 * import { ValidJSON } from "autoevals";
 *
 * // Basic JSON validation
 * const result1 = await ValidJSON({
 *   output: '{"name": "John", "age": 30}'
 * });
 * console.log(result1.score); // 1 (valid JSON)
 *
 * const result2 = await ValidJSON({
 *   output: '{invalid json}'
 * });
 * console.log(result2.score); // 0 (invalid JSON)
 *
 * // With schema validation
 * const schema = {
 *   type: "object",
 *   properties: {
 *     name: { type: "string" },
 *     age: { type: "number" }
 *   },
 *   required: ["name", "age"]
 * };
 *
 * const result3 = await ValidJSON({
 *   output: { name: "John", age: 30 },
 *   schema
 * });
 * console.log(result3.score); // 1 (matches schema)
 *
 * const result4 = await ValidJSON({
 *   output: { name: "John" }, // missing required "age"
 *   schema
 * });
 * console.log(result4.score); // 0 (doesn't match schema)
 * ```
 *
 * @param output - The value to validate (string or object)
 * @param schema - Optional JSON Schema to validate against (see https://json-schema.org)
 * @returns Score object with score of 1 if valid, 0 otherwise
 */
declare const ValidJSON: ScorerWithPartial;

/**
 * Arguments shared by the Ragas scorers: optional `input`, `context` (a string
 * or a list of strings) and `model`, combined with {@link LLMArgs}.
 */
type RagasArgs = {
  input?: string;
  context?: string | string[];
  model?: string;
} & LLMArgs;

/**
 * Extra argument for the Ragas scorers that accept an embedding model.
 */
interface RagasEmbeddingModelArgs extends Record {
  /** @default If not provided, the default model of {@link EmbeddingSimilarity} is used. */
  embeddingModel?: string;
}

/**
 * Estimates context recall by estimating TP and FN using annotated answer and
 * retrieved context.
 */
declare const ContextEntityRecall: ScorerWithPartial; }>;

// NOTE(review): ContextRelevancy, ContextRecall and ContextPrecision carry no
// documentation in the source declarations; their exact scoring semantics are
// not visible from this file.
declare const ContextRelevancy: ScorerWithPartial;
declare const ContextRecall: ScorerWithPartial;
declare const ContextPrecision: ScorerWithPartial;

/**
 * Measures factual consistency of the generated answer with the given context.
 */
declare const Faithfulness: ScorerWithPartial;

/**
 * Scores the relevancy of the generated answer to the given question.
 * Answers with incomplete, redundant or unnecessary information are penalized.
 */
declare const AnswerRelevancy: ScorerWithPartial;

/**
 * Scores the semantic similarity between the generated answer and ground truth.
 */
declare const AnswerSimilarity: ScorerWithPartial;

/**
 * Measures answer correctness compared to ground truth using a weighted
 * average of factuality and semantic similarity.
 */
declare const AnswerCorrectness: ScorerWithPartial; embeddingModel?: string; }>;

/**
 * A simple scorer that tests whether two values are equal. If the value is an object or array,
 * it will be JSON-serialized and the strings compared for equality.
 */
declare const ExactMatch: ScorerWithPartial;

/**
 * Convert a value to a string.
 * NOTE(review): presumably the normalization used by `ExactMatch`, with
 * `maybeObject` controlling whether the value is treated as JSON — confirm.
 */
declare function normalizeValue(value: unknown, maybeObject: boolean): string;

/**
 * Describes one evaluator entry: the scorer itself, a description, an
 * optional model-graded template, and a flag for whether extra parameters
 * are required.
 */
interface AutoevalMethod {
  method: ScorerWithPartial;
  description: string;
  template?: ModelGradedSpec;
  requiresExtraParams?: boolean;
}

/**
 * Labeled groups of {@link AutoevalMethod} entries.
 */
declare const Evaluators: {
  label: string;
  methods: AutoevalMethod[];
}[];

/**
 * Thread utilities for LLM-as-a-judge scorers.
 *
 * This module provides utilities for working with preprocessed conversation
 * messages (threads) in LLM scorer templates.
 */
/**
 * A message with role and content fields (LLM chat message format).
 */
interface LLMMessage {
  role: string;
  content: unknown;
}

/**
 * Check if an item looks like an LLM message (has role and content).
 */
declare function isRoleContentMessage(item: unknown): item is LLMMessage;

/**
 * Check if a value is an array of LLM messages.
 */
declare function isLLMMessageArray(value: unknown): value is LLMMessage[];

/**
 * Format an array of LLM messages as human-readable text.
 */
declare function formatMessageArrayAsText(messages: LLMMessage[]): string;

/**
 * Template variables computed from a thread for use in LLM-as-a-judge scorers.
 *
 * Note: `thread` automatically renders as human-readable text in Mustache
 * templates via the smart escape function. No need for a separate `thread_text`.
 */
interface ThreadTemplateVars {
  thread: unknown[];
  thread_count: number;
  first_message: unknown | null;
  last_message: unknown | null;
  user_messages: unknown[];
  assistant_messages: unknown[];
  human_ai_pairs: Array<{
    human: unknown;
    assistant: unknown;
  }>;
}

/**
 * Compute template variables from a thread for use in mustache templates.
 * Uses lazy getters so expensive computations only run when accessed.
 *
 * Note: `thread` (and other message variables) will automatically render as
 * human-readable text when used in templates like `{{thread}}` due to the
 * smart escape function in renderMessages.
 */
declare function computeThreadTemplateVars(thread: unknown[]): ThreadTemplateVars;

// Public API of the package: scorers, classifier factories, thread helpers
// and their associated types.
export { AnswerCorrectness, AnswerRelevancy, AnswerSimilarity, Battle, ClosedQA, ContextEntityRecall, ContextPrecision, ContextRecall, ContextRelevancy, DEFAULT_MODEL, EmbeddingSimilarity, Evaluators, ExactMatch, Factuality, Faithfulness, Humor, type InitOptions, JSONDiff, type LLMArgs, type LLMClassifierArgs, LLMClassifierFromSpec, LLMClassifierFromSpecFile, LLMClassifierFromTemplate, type LLMMessage, Levenshtein, LevenshteinScorer, ListContains, type ModelGradedSpec, Moderation, NumericDiff, OpenAIClassifier, type OpenAIClassifierArgs, Possible, type Score, type Scorer, type ScorerArgs, type ScorerWithPartial, Security, Sql, Summary, THREAD_VARIABLE_NAMES, THREAD_VARIABLE_PATTERN, type ThreadTemplateVars, type TraceForScorer, Translation, ValidJSON, buildClassificationTools, computeThreadTemplateVars, formatMessageArrayAsText, getDefaultModel, init, isLLMMessageArray, isRoleContentMessage, makePartial, modelGradedSpecSchema, normalizeValue, templateUsesThreadVariables, templates };