{"version":3,"sources":["../../src/harness.ts","../../src/internal/matchers.ts","../../src/internal/toolCallScorer.ts"],"sourcesContent":["import {\n  assistantMessages,\n  failedSpans,\n  latestAssistantMessageContent,\n  messagesByRole,\n  spans,\n  spansByKind,\n  systemMessages,\n  toolCalls,\n  toolMessages,\n  userMessages,\n} from \"@vitest-evals/core\";\nimport type {\n  GenAiOperationName,\n  HarnessRun,\n  HarnessRunError,\n  JsonPrimitive,\n  JsonValue,\n  NormalizedMessage,\n  NormalizedSession,\n  NormalizedSpan,\n  NormalizedSpanAttributes,\n  NormalizedSpanEvent,\n  NormalizedTrace,\n  TimingSummary,\n  ToolCallRecord,\n  UsageSummary,\n} from \"@vitest-evals/core\";\n\nexport {\n  assistantMessages,\n  failedSpans,\n  latestAssistantMessageContent,\n  messagesByRole,\n  spans,\n  spansByKind,\n  systemMessages,\n  toolCalls,\n  toolMessages,\n  userMessages,\n} from \"@vitest-evals/core\";\nexport type {\n  GenAiOperationName,\n  GenAiOutputType,\n  GenAiProviderName,\n  GenAiSemanticAttributeKey,\n  GenAiSemanticAttributes,\n  GenAiTokenType,\n  GenAiToolType,\n  HarnessRun,\n  HarnessRunError,\n  JsonPrimitive,\n  JsonValue,\n  NormalizedMessage,\n  NormalizedSession,\n  NormalizedSpan,\n  NormalizedSpanAttributeKey,\n  NormalizedSpanAttributes,\n  NormalizedSpanEvent,\n  NormalizedTrace,\n  OpenTelemetrySemanticAttributeKey,\n  OpenTelemetrySemanticAttributes,\n  TimingSummary,\n  ToolCallRecord,\n  UsageSummary,\n} from \"@vitest-evals/core\";\n\n/** Options for converting normalized tool calls into trace spans. */\nexport type CreateToolCallSpansOptions = {\n  /** Trace id to attach to each generated tool span. */\n  traceId?: string;\n  /** Parent span id to attach to each generated tool span. */\n  parentId?: string;\n  /** Prefix used to create internal span ids instead of reusing tool-call ids. */\n  spanIdPrefix?: string;\n};\n\n/** Options for attaching a fallback run trace to a harness result. */\nexport type EnsureRunTraceOptions = {\n  /** Human-readable run or harness name. */\n  name: string;\n  /** Wall-clock start time for the harness run. */\n  startedAt: Date;\n  /** Wall-clock finish time for the harness run. */\n  finishedAt: Date;\n  /** Optional trace id. A generated id is used when omitted. */\n  id?: string;\n  /** GenAI operation name to place on the root run span. */\n  operationName?: GenAiOperationName;\n  /** Optional JSON-safe source marker for the trace metadata. */\n  source?: string;\n};\n\ntype OutputField<TOutput extends JsonValue | undefined> =\n  undefined extends TOutput ? { output?: TOutput } : { output: TOutput };\n\n/** Per-run metadata shape accepted by harnesses and eval tests. */\nexport type HarnessMetadata = Record<string, unknown>;\n\n/**\n * Runtime context passed from the eval fixture into a harness run.\n *\n * @example\n * ```ts\n * const harness: Harness<string> = {\n *   name: \"refund-agent\",\n *   async run(input, context) {\n *     context.setArtifact(\"inputLength\", input.length);\n *\n *     return {\n *       output: undefined,\n *       session: { messages: [{ role: \"user\", content: input }] },\n *       usage: {},\n *       errors: [],\n *     };\n *   },\n * };\n * ```\n */\nexport type HarnessContext<\n  TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n  /** Per-run metadata passed through `run(input, { metadata })`. */\n  metadata: Readonly<TMetadata>;\n  /** Abort signal from Vitest when available. */\n  signal?: AbortSignal;\n  /** Mutable JSON-safe artifact bag shared with the harness. */\n  artifacts: Record<string, JsonValue>;\n  /** Stores one JSON-safe artifact on the current run. */\n  setArtifact: (name: string, value: JsonValue) => void;\n};\n\n/**\n * Adapter that executes the system under test and returns a normalized run.\n *\n * @example\n * ```ts\n * const harness: Harness<string, { status: \"approved\" | \"denied\" }> = {\n *   name: \"refund-agent\",\n *   async run(input, context) {\n *     return normalizeHarnessRun(input, await runRefundFlow(input), context);\n *   },\n * };\n * ```\n */\nexport type Harness<\n  TInput = unknown,\n  TOutput extends JsonValue | undefined = JsonValue | undefined,\n  TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n  /** Stable harness name used in reports. */\n  name: string;\n  /** Executes the system under test and returns a normalized run. */\n  run: (\n    input: TInput,\n    context: HarnessContext<TMetadata>,\n  ) => Promise<HarnessRun<TOutput>>;\n};\n\n/** Value or promise accepted by lightweight harness callbacks. */\nexport type MaybePromise<T> = T | Promise<T>;\n\n/** Lightweight tool-call record accepted by `createHarness(...)` results. */\nexport type SimpleToolCallRecord = Omit<\n  ToolCallRecord,\n  \"arguments\" | \"result\" | \"error\" | \"metadata\"\n> & {\n  /** Raw tool arguments accepted by `createHarness(...)` before normalization. */\n  arguments?: unknown;\n  /** Raw tool result accepted by `createHarness(...)` before normalization. */\n  result?: unknown;\n  /** Raw tool error accepted by `createHarness(...)` before normalization. */\n  error?: unknown;\n  /** Raw tool metadata accepted by `createHarness(...)` before normalization. */\n  metadata?: Record<string, unknown>;\n};\n\n/** Lightweight span event accepted by `createHarness(...)` results. */\nexport type SimpleSpanEvent = Omit<NormalizedSpanEvent, \"attributes\"> & {\n  /** Raw event attributes accepted by `createHarness(...)` before normalization. */\n  attributes?: Record<string, unknown>;\n};\n\n/** Lightweight span record accepted by `createHarness(...)` results. */\nexport type SimpleSpanRecord = Omit<\n  NormalizedSpan,\n  \"attributes\" | \"error\" | \"events\"\n> & {\n  /** Raw span attributes accepted by `createHarness(...)` before normalization. */\n  attributes?: Record<string, unknown>;\n  /** Raw span error accepted by `createHarness(...)` before normalization. */\n  error?: unknown;\n  /** Raw span events accepted by `createHarness(...)` before normalization. */\n  events?: SimpleSpanEvent[];\n};\n\n/** Lightweight trace record accepted by `createHarness(...)` results. */\nexport type SimpleTraceRecord = Omit<NormalizedTrace, \"metadata\" | \"spans\"> & {\n  /** Raw trace metadata accepted by `createHarness(...)` before normalization. */\n  metadata?: Record<string, unknown>;\n  /** Lightweight spans to normalize into the trace. */\n  spans: SimpleSpanRecord[];\n};\n\n/**\n * Lightweight result shape normalized by `createHarness(...)`.\n *\n * @example\n * ```ts\n * const result: SimpleHarnessResult<{ status: \"approved\" }> = {\n *   output: { status: \"approved\" },\n *   toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n *   usage: { totalTokens: 260 },\n * };\n * ```\n */\nexport type SimpleHarnessResult<\n  TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = OutputField<TOutput> & {\n  /** Pre-normalized transcript messages. When omitted, a default user/assistant transcript is created. */\n  messages?: NormalizedMessage[];\n  /** Lightweight tool-call records to normalize into the session. */\n  toolCalls?: SimpleToolCallRecord[];\n  /** Usage summary to attach to the run. */\n  usage?: UsageSummary;\n  /** Timing summary to attach to the run. */\n  timings?: TimingSummary;\n  /** Raw artifact values to normalize and merge into the run. */\n  artifacts?: Record<string, unknown>;\n  /** Lightweight traces and spans to normalize into the run. */\n  traces?: SimpleTraceRecord[];\n  /** Raw session metadata to normalize into the session. */\n  metadata?: Record<string, unknown>;\n  /** Raw errors to normalize into the run. */\n  errors?: unknown[];\n};\n\n/** Either a complete normalized run or a lightweight result to normalize. */\nexport type HarnessResultLike<\n  TOutput extends JsonValue | undefined = JsonValue | undefined,\n> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;\n\n/** Arguments passed to the `createHarness(...)` convenience callback. */\nexport type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {\n  /** Original input passed to `run(input)`. */\n  input: TInput;\n  /** Read-only metadata passed to `run(input, { metadata })`. */\n  metadata: Readonly<TMetadata>;\n  /** Abort signal from Vitest when available. */\n  signal?: AbortSignal;\n  /** Mutable run artifact bag. */\n  artifacts: HarnessContext<TMetadata>[\"artifacts\"];\n  /** Stores one JSON-safe artifact on the current run. */\n  setArtifact: HarnessContext<TMetadata>[\"setArtifact\"];\n};\n\n/**\n * Options for creating a lightweight custom application harness.\n *\n * @example\n * ```ts\n * const options: CreateHarnessOptions<string, { status: \"approved\" }> = {\n *   name: \"refund-agent\",\n *   run: async ({ input }) => ({\n *     output: await classifyRefund(input),\n *   }),\n * };\n * ```\n */\nexport type CreateHarnessOptions<\n  TInput = unknown,\n  TOutput extends JsonValue | undefined = JsonValue | undefined,\n  TMetadata extends HarnessMetadata = HarnessMetadata,\n> = {\n  /** Stable harness name used in reports. */\n  name: string;\n  /** Executes application code and returns either a lightweight result or full `HarnessRun`. */\n  run: (\n    args: CreateHarnessRunArgs<TInput, TMetadata>,\n  ) => MaybePromise<HarnessResultLike<TOutput>>;\n};\n\nfunction isJsonPrimitive(value: unknown): value is JsonPrimitive {\n  return (\n    value === null ||\n    typeof value === \"string\" ||\n    typeof value === \"boolean\" ||\n    (typeof value === \"number\" && Number.isFinite(value))\n  );\n}\n\nfunction isJsonRecord(value: unknown): value is Record<string, unknown> {\n  return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\nfunction normalizeJsonArray(value: unknown[], seen: WeakSet<object>) {\n  if (seen.has(value)) {\n    return undefined;\n  }\n\n  seen.add(value);\n  const normalized = value.map((item) => {\n    const normalized = toJsonValueInternal(item, seen);\n    return normalized === undefined ? null : normalized;\n  });\n  seen.delete(value);\n\n  return normalized;\n}\n\nfunction normalizeJsonObject(\n  value: Record<string, unknown>,\n  seen: WeakSet<object>,\n): Record<string, JsonValue> {\n  const normalized: Record<string, JsonValue> = {};\n\n  if (seen.has(value)) {\n    return normalized;\n  }\n\n  seen.add(value);\n  try {\n    for (const [key, entryValue] of Object.entries(value)) {\n      const entry = toJsonValueInternal(entryValue, seen);\n      if (entry !== undefined) {\n        normalized[key] = entry;\n      }\n    }\n  } finally {\n    seen.delete(value);\n  }\n\n  return normalized;\n}\n\n/** Returns true when a value exposes a callable method with the given name. */\nexport function hasCallableMethod(value: unknown, methodName: string) {\n  return (\n    value !== null &&\n    (typeof value === \"object\" || typeof value === \"function\") &&\n    methodName in value &&\n    typeof (value as Record<string, unknown>)[methodName] === \"function\"\n  );\n}\n\n/** Normalizes an unknown value into the JSON-safe shape used by harness runs. */\nexport function toJsonValue(value: unknown): JsonValue | undefined {\n  return toJsonValueInternal(value, new WeakSet());\n}\n\nfunction toJsonValueInternal(\n  value: unknown,\n  seen: WeakSet<object>,\n): JsonValue | undefined {\n  if (isJsonPrimitive(value)) {\n    return value;\n  }\n\n  if (\n    value !== null &&\n    typeof value === \"object\" &&\n    seen.has(value as object)\n  ) {\n    return undefined;\n  }\n\n  if (Array.isArray(value)) {\n    return normalizeJsonArray(value, seen);\n  }\n\n  if (isJsonRecord(value)) {\n    return normalizeJsonObject(value, seen);\n  }\n\n  return undefined;\n}\n\n/** Drops non-JSON properties from a record while preserving valid values. */\nexport function normalizeRecord(\n  value: Record<string, unknown>,\n): Record<string, JsonValue> {\n  return normalizeJsonObject(value, new WeakSet());\n}\n\n/** Normalizes metadata and omits the field entirely when nothing survives. */\nexport function normalizeMetadata(\n  value: Record<string, unknown>,\n): Record<string, JsonValue> | undefined {\n  const normalized = normalizeRecord(value);\n  return Object.keys(normalized).length > 0 ? normalized : undefined;\n}\n\n/** Converts arbitrary content into the JSON-safe message content shape. */\nexport function normalizeContent(value: unknown): JsonValue {\n  const normalized = toJsonValue(value);\n  return normalized !== undefined ? normalized : String(value);\n}\n\n/**\n * Creates a harness from the common \"run app code and return output\" shape.\n *\n * @param options - Harness name plus the callback that executes app code.\n *\n * @example\n * ```ts\n * import { createHarness } from \"vitest-evals\";\n *\n * export const refundHarness = createHarness<\n *   string,\n *   { status: \"approved\" | \"denied\" },\n *   { expected: { status: \"approved\" | \"denied\" } }\n * >({\n *   name: \"refund-agent\",\n *   run: async ({ input, metadata, setArtifact }) => {\n *     const result = await runRefundFlow(input, metadata);\n *     const output = { status: result.status };\n *\n *     setArtifact(\"case\", { expected: metadata.expected.status });\n *\n *     return {\n *       output,\n *       toolCalls: result.toolCalls,\n *       usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n *     };\n *   },\n * });\n * ```\n */\nexport function createHarness<\n  TInput = unknown,\n  TOutput extends JsonValue | undefined = JsonValue | undefined,\n  TMetadata extends HarnessMetadata = HarnessMetadata,\n>(\n  options: CreateHarnessOptions<TInput, TOutput, TMetadata>,\n): Harness<TInput, TOutput, TMetadata>;\nexport function createHarness<\n  TInput = unknown,\n  TOutput extends JsonValue | undefined = JsonValue | undefined,\n  TMetadata extends HarnessMetadata = HarnessMetadata,\n>(\n  options: CreateHarnessOptions<TInput, TOutput, TMetadata>,\n): Harness<TInput, TOutput, TMetadata> {\n  const harness: Harness<TInput, TOutput, TMetadata> = {\n    name: options.name,\n    run: async (input, context) => {\n      const startedAt = new Date();\n\n      try {\n        const result = await options.run({\n          input,\n          metadata: context.metadata,\n          signal: context.signal,\n          artifacts: context.artifacts,\n          setArtifact: context.setArtifact,\n        });\n        const run = normalizeHarnessRun(input, result, context);\n        ensureRunTrace(run, {\n          name: options.name,\n          startedAt,\n          finishedAt: new Date(),\n        });\n\n        return run;\n      } catch (error) {\n        const partialRun = getHarnessRunFromError(error);\n        if (partialRun) {\n          if (\n            Object.keys(context.artifacts).length > 0 &&\n            !partialRun.artifacts\n          ) {\n            partialRun.artifacts = context.artifacts;\n          }\n          ensureRunTrace(partialRun, {\n            name: options.name,\n            startedAt,\n            finishedAt: new Date(),\n          });\n          throw attachHarnessRunToError(error, partialRun);\n        }\n\n        const failedRun = createFailedHarnessRun(input, error, {\n          artifacts: context.artifacts,\n        });\n        ensureRunTrace(failedRun, {\n          name: options.name,\n          startedAt,\n          finishedAt: new Date(),\n        });\n\n        throw attachHarnessRunToError(error, failedRun);\n      }\n    },\n  };\n\n  return harness;\n}\n\n/**\n * Normalizes a lightweight harness result into the reporter-facing run shape.\n *\n * @param input - Original input passed to the harness.\n * @param result - Lightweight result or pre-normalized harness run.\n * @param context - Optional per-run context used to merge artifacts.\n *\n * @example\n * ```ts\n * const run = normalizeHarnessRun(\"Refund invoice inv_123\", {\n *   output: { status: \"approved\" },\n *   toolCalls: [{ name: \"lookupInvoice\", arguments: { invoiceId: \"inv_123\" } }],\n *   usage: { provider: \"openai\", model: \"gpt-4o-mini\" },\n * });\n *\n * expect(toolCalls(run.session)).toHaveLength(1);\n * ```\n */\nexport function normalizeHarnessRun<\n  TInput = unknown,\n  TMetadata extends HarnessMetadata = HarnessMetadata,\n  TOutput extends JsonValue | undefined = JsonValue | undefined,\n>(\n  input: TInput,\n  result: HarnessResultLike<TOutput>,\n  context?: HarnessContext<TMetadata>,\n): HarnessRun<TOutput> {\n  if (isHarnessRun(result)) {\n    if (\n      context &&\n      Object.keys(context.artifacts).length > 0 &&\n      !result.artifacts\n    ) {\n      return {\n        ...result,\n        artifacts: context.artifacts,\n      };\n    }\n\n    return result;\n  }\n\n  const output = result.output;\n  const toolCalls = normalizeSimpleToolCalls(result.toolCalls);\n  const usage = result.usage ?? {};\n  const messages =\n    result.messages ??\n    createDefaultSessionMessages({\n      input,\n      output,\n      toolCalls,\n    });\n  const metadata = result.metadata\n    ? normalizeMetadata(result.metadata)\n    : undefined;\n  const artifacts = normalizeMergedArtifacts(\n    context?.artifacts,\n    result.artifacts,\n  );\n  const traces = normalizeSimpleTraces(result.traces);\n\n  return {\n    session: {\n      messages,\n      ...(usage.provider ? { provider: usage.provider } : {}),\n      ...(usage.model ? { model: usage.model } : {}),\n      ...(metadata ? { metadata } : {}),\n    },\n    ...(output !== undefined ? { output } : {}),\n    usage,\n    ...(result.timings ? { timings: result.timings } : {}),\n    ...(artifacts ? { artifacts } : {}),\n    ...(traces ? { traces } : {}),\n    errors: normalizeSimpleErrors(result.errors),\n  } as HarnessRun<TOutput>;\n}\n\n/**\n * Builds a JSON-safe failed run for errors that happen before a harness can return.\n *\n * @param input - Original input passed to the harness.\n * @param error - Error thrown by setup or execution.\n * @param options - Optional artifacts to preserve on the failed run.\n */\nexport function createFailedHarnessRun(\n  input: unknown,\n  error: unknown,\n  options: { artifacts?: Record<string, JsonValue> } = {},\n): HarnessRun {\n  const artifacts = options.artifacts;\n\n  return {\n    session: {\n      messages: [\n        {\n          role: \"user\",\n          content: normalizeContent(input),\n        },\n      ],\n    },\n    usage: {},\n    ...(artifacts && Object.keys(artifacts).length > 0 ? { artifacts } : {}),\n    errors: [serializeError(error)],\n  };\n}\n\nfunction createDefaultSessionMessages<TInput>({\n  input,\n  output,\n  toolCalls: normalizedToolCalls,\n}: {\n  input: TInput;\n  output: JsonValue | undefined;\n  toolCalls: ToolCallRecord[];\n}): NormalizedMessage[] {\n  const messages: NormalizedMessage[] = [\n    {\n      role: \"user\",\n      content: normalizeContent(input),\n    },\n  ];\n\n  if (output !== undefined || normalizedToolCalls.length > 0) {\n    messages.push({\n      role: \"assistant\",\n      ...(output !== undefined ? { content: normalizeContent(output) } : {}),\n      ...(normalizedToolCalls.length > 0\n        ? { toolCalls: normalizedToolCalls }\n        : {}),\n    });\n  }\n\n  return messages;\n}\n\nfunction normalizeSimpleToolCalls(\n  calls: SimpleToolCallRecord[] | undefined,\n): ToolCallRecord[] {\n  return (calls ?? []).map((call) => {\n    const {\n      arguments: rawArguments,\n      result: rawResult,\n      error: rawError,\n      metadata: rawMetadata,\n      ...toolCall\n    } = call;\n    const args = normalizeToolCallArguments(rawArguments);\n    const result = toJsonValue(rawResult);\n    const error = normalizeToolCallError(rawError);\n    const metadata = rawMetadata ? normalizeMetadata(rawMetadata) : undefined;\n\n    return {\n      ...toolCall,\n      ...(args ? { arguments: args } : {}),\n      ...(result !== undefined ? { result } : {}),\n      ...(error ? { error } : {}),\n      ...(metadata ? { metadata } : {}),\n    };\n  });\n}\n\nfunction normalizeToolCallArguments(\n  value: unknown,\n): Record<string, JsonValue> | undefined {\n  if (value === undefined) {\n    return undefined;\n  }\n\n  const normalized = toJsonValue(value);\n  return normalized &&\n    typeof normalized === \"object\" &&\n    !Array.isArray(normalized)\n    ? normalized\n    : undefined;\n}\n\nfunction normalizeToolCallError(\n  value: unknown,\n): ToolCallRecord[\"error\"] | undefined {\n  if (value === undefined) {\n    return undefined;\n  }\n\n  const serialized = serializeError(value);\n  const { message, type, ...details } = serialized;\n\n  return {\n    ...details,\n    message: typeof message === \"string\" ? message : String(message),\n    ...(typeof type === \"string\" ? { type } : {}),\n  };\n}\n\nfunction normalizeMergedArtifacts(\n  contextArtifacts: Record<string, JsonValue> | undefined,\n  resultArtifacts: Record<string, unknown> | undefined,\n) {\n  const artifacts = {\n    ...(contextArtifacts ?? {}),\n    ...(resultArtifacts ? normalizeRecord(resultArtifacts) : {}),\n  };\n\n  return Object.keys(artifacts).length > 0 ? artifacts : undefined;\n}\n\nfunction normalizeSimpleErrors(\n  errors: unknown[] | undefined,\n): Array<Record<string, JsonValue>> {\n  return (errors ?? []).map((error) => {\n    const normalized = toJsonValue(error);\n\n    if (\n      normalized &&\n      typeof normalized === \"object\" &&\n      !Array.isArray(normalized) &&\n      Object.keys(normalized).length > 0\n    ) {\n      return normalized;\n    }\n\n    return serializeError(error);\n  });\n}\n\nfunction normalizeSimpleTraces(\n  traces: SimpleTraceRecord[] | undefined,\n): NormalizedTrace[] | undefined {\n  if (!Array.isArray(traces)) {\n    return undefined;\n  }\n\n  const normalized = traces\n    .map(normalizeSimpleTrace)\n    .filter((trace): trace is NormalizedTrace => Boolean(trace));\n\n  return normalized.length > 0 ? normalized : undefined;\n}\n\nfunction normalizeSimpleTrace(trace: unknown): NormalizedTrace | undefined {\n  if (!isJsonRecord(trace)) {\n    return undefined;\n  }\n\n  const {\n    metadata: rawMetadata,\n    spans: rawSpans,\n    ...traceFields\n  } = trace as Partial<SimpleTraceRecord>;\n  const spans = (Array.isArray(rawSpans) ? rawSpans : [])\n    .map((span) => normalizeSimpleSpan(span))\n    .filter((span): span is NormalizedSpan => Boolean(span));\n  const metadata = isJsonRecord(rawMetadata)\n    ? normalizeMetadata(rawMetadata)\n    : undefined;\n\n  if (spans.length === 0 && !traceFields.id && !traceFields.name) {\n    return undefined;\n  }\n\n  return {\n    ...traceFields,\n    ...(metadata ? { metadata } : {}),\n    spans,\n  };\n}\n\nfunction normalizeSimpleSpan(span: unknown): NormalizedSpan | undefined {\n  if (!isJsonRecord(span) || typeof span.name !== \"string\" || !span.name) {\n    return undefined;\n  }\n\n  const {\n    attributes: rawAttributes,\n    error: rawError,\n    events: rawEvents,\n    ...spanFields\n  } = span as Partial<SimpleSpanRecord> & { name: string };\n  const attributes = rawAttributes\n    ? isJsonRecord(rawAttributes)\n      ? normalizeMetadata(rawAttributes)\n      : undefined\n    : undefined;\n  const error = normalizeSpanError(rawError);\n  const events = normalizeSimpleSpanEvents(rawEvents);\n\n  return {\n    ...spanFields,\n    ...(attributes\n      ? { attributes: attributes as NormalizedSpanAttributes }\n      : {}),\n    ...(error ? { error } : {}),\n    ...(events ? { events } : {}),\n  };\n}\n\nfunction normalizeSimpleSpanEvents(\n  events: unknown,\n): NormalizedSpanEvent[] | undefined {\n  if (!Array.isArray(events)) {\n    return undefined;\n  }\n\n  const normalized = events\n    .map(normalizeSimpleSpanEvent)\n    .filter((event): event is NormalizedSpanEvent => Boolean(event));\n\n  return normalized.length > 0 ? normalized : undefined;\n}\n\nfunction normalizeSimpleSpanEvent(\n  event: unknown,\n): NormalizedSpanEvent | undefined {\n  if (!isJsonRecord(event) || typeof event.name !== \"string\" || !event.name) {\n    return undefined;\n  }\n\n  const { attributes: rawAttributes, ...eventFields } =\n    event as Partial<SimpleSpanEvent> & { name: string };\n  const attributes = rawAttributes\n    ? isJsonRecord(rawAttributes)\n      ? normalizeMetadata(rawAttributes)\n      : undefined\n    : undefined;\n\n  return {\n    ...eventFields,\n    ...(attributes\n      ? { attributes: attributes as NormalizedSpanAttributes }\n      : {}),\n  };\n}\n\n/** Normalizes arbitrary span errors while preserving object-shaped messages. */\nexport function normalizeSpanError(\n  error: unknown,\n): NormalizedSpan[\"error\"] | undefined {\n  if (error === undefined) {\n    return undefined;\n  }\n\n  if (error instanceof Error) {\n    const details = normalizeMetadata(\n      error as unknown as Record<string, unknown>,\n    );\n\n    return {\n      ...(details ?? {}),\n      type: error.name,\n      message: error.message,\n    };\n  }\n\n  if (\n    error &&\n    typeof error === \"object\" &&\n    !Array.isArray(error) &&\n    typeof (error as { message?: unknown }).message === \"string\"\n  ) {\n    const normalized = normalizeMetadata(error as Record<string, unknown>);\n    const { message, type, ...details } = normalized ?? {};\n\n    return {\n      ...details,\n      message: message as string,\n      ...(typeof type === \"string\" ? { type } : {}),\n    };\n  }\n\n  const serialized = serializeError(error);\n  const { message, type, ...details } = serialized;\n\n  return {\n    ...details,\n    message: typeof message === \"string\" ? message : String(message),\n    ...(typeof type === \"string\" ? { type } : {}),\n  };\n}\n\n/** Normalizes raw span attributes into the JSON-safe span attribute shape. */\nexport function normalizeSpanAttributes(\n  attributes: Record<string, unknown>,\n): NormalizedSpanAttributes | undefined {\n  return normalizeMetadata(attributes) as NormalizedSpanAttributes | undefined;\n}\n\n/** Builds common OpenTelemetry GenAI usage attributes from a usage summary. */\nexport function createGenAiUsageAttributes(\n  usage: UsageSummary | undefined,\n  options: { provider?: string } = {},\n) {\n  return {\n    \"gen_ai.provider.name\": usage?.provider ?? options.provider,\n    \"gen_ai.request.model\": usage?.model,\n    \"gen_ai.response.model\": usage?.model,\n    \"gen_ai.usage.input_tokens\": usage?.inputTokens,\n    \"gen_ai.usage.output_tokens\": usage?.outputTokens,\n    \"gen_ai.usage.reasoning.output_tokens\": usage?.reasoningTokens,\n  } satisfies Record<string, unknown>;\n}\n\n/**\n * Converts normalized tool-call records into trace spans.\n *\n * Tool-call ids are preserved as GenAI attributes. Pass `spanIdPrefix` when the\n * spans belong to a known trace so span ids stay internally unique.\n */\nexport function createToolCallSpans(\n  calls: ToolCallRecord[],\n  options: CreateToolCallSpansOptions = {},\n): NormalizedSpan[] {\n  return calls.map((call, index) => {\n    const spanError = call.error ? normalizeSpanError(call.error) : undefined;\n    const spanId = options.spanIdPrefix\n      ? `${options.spanIdPrefix}:${index + 1}`\n      : call.id;\n\n    return {\n      ...(spanId ? { id: spanId } : {}),\n      ...(options.traceId ? { traceId: options.traceId } : {}),\n      ...(options.parentId ? { parentId: options.parentId } : {}),\n      name: call.name,\n      kind: \"tool\",\n      ...(call.startedAt ? { startedAt: call.startedAt } : {}),\n      ...(call.finishedAt ? { finishedAt: call.finishedAt } : {}),\n      ...(call.durationMs !== undefined ? { durationMs: call.durationMs } : {}),\n      status: spanError ? \"error\" : \"ok\",\n      ...(spanError ? { error: spanError } : {}),\n      attributes: normalizeSpanAttributes({\n        \"gen_ai.operation.name\": \"execute_tool\",\n        \"gen_ai.tool.name\": call.name,\n        \"gen_ai.tool.type\": \"function\",\n        ...(call.id ? { \"gen_ai.tool.call.id\": call.id } : {}),\n        ...(call.arguments !== undefined\n          ? { \"gen_ai.tool.call.arguments\": call.arguments }\n          : {}),\n        ...(call.result !== undefined\n          ? { \"gen_ai.tool.call.result\": call.result }\n          : {}),\n      }),\n    } satisfies NormalizedSpan;\n  });\n}\n\n/**\n * Attaches a fallback run trace when a harness result does not already contain spans.\n *\n * This keeps custom harnesses inspectable while first-party harness packages\n * remain free to attach richer native traces.\n */\nexport function ensureRunTrace(\n  run: HarnessRun,\n  options: EnsureRunTraceOptions,\n): NormalizedTrace | undefined {\n  if (spans(run).length > 0) {\n    return undefined;\n  }\n\n  const traceId = options.id ?? createGeneratedTraceId();\n  const rootSpanId = `${traceId}:run`;\n  const durationMs = options.finishedAt.getTime() - options.startedAt.getTime();\n  const rootError =\n    run.errors.length > 0 ? normalizeSpanError(run.errors[0]) : undefined;\n  const runSpan: NormalizedSpan = {\n    id: rootSpanId,\n    traceId,\n    name: options.name,\n    kind: \"run\",\n    startedAt: options.startedAt.toISOString(),\n    finishedAt: options.finishedAt.toISOString(),\n    durationMs,\n    status: rootError ? \"error\" : \"ok\",\n    ...(rootError ? { error: rootError } : {}),\n    attributes: normalizeSpanAttributes({\n      \"gen_ai.operation.name\": options.operationName ?? \"invoke_workflow\",\n      \"gen_ai.workflow.name\": options.name,\n      ...createGenAiUsageAttributes(run.usage),\n    }),\n  };\n  const toolSpans = createToolCallSpans(toolCalls(run.session), {\n    traceId,\n    parentId: rootSpanId,\n    spanIdPrefix: `${traceId}:tool`,\n  });\n  const trace: NormalizedTrace = {\n    id: traceId,\n    name: options.name,\n    startedAt: options.startedAt.toISOString(),\n    finishedAt: options.finishedAt.toISOString(),\n    durationMs,\n    ...(options.source ? { metadata: { source: options.source } } : {}),\n    spans: [runSpan, ...toolSpans],\n  };\n\n  run.traces = [trace];\n  return trace;\n}\n\nlet nextGeneratedTraceId = 0;\n\nfunction createGeneratedTraceId() {\n  nextGeneratedTraceId += 1;\n  return `trace_${nextGeneratedTraceId}`;\n}\n\n/**\n * Attaches a partial or complete harness run to an arbitrary thrown error.\n *\n * @param error - Thrown value to wrap.\n * @param run - Partial or complete normalized harness run to preserve.\n *\n * @example\n * ```ts\n * try {\n *   return await runAgent(input);\n * } catch (error) {\n *   throw attachHarnessRunToError(error, partialRun);\n * }\n * ```\n */\nexport function attachHarnessRunToError(\n  error: unknown,\n  run: HarnessRun,\n): HarnessRunError {\n  const baseError =\n    error instanceof Error\n      ? error\n      : new Error(String(error ?? \"Unknown error\"));\n  return Object.assign(baseError, {\n    vitestEvalsRun: run,\n  });\n}\n\n/**\n * Reads an attached harness run back off a previously wrapped error value.\n *\n * @param error - Unknown thrown value that may contain a harness run.\n *\n * @example\n * ```ts\n * const partialRun = getHarnessRunFromError(error);\n *\n * if (partialRun) {\n *   console.log(toolCalls(partialRun.session));\n * }\n * ```\n */\nexport function getHarnessRunFromError(error: unknown): HarnessRun | undefined {\n  if (\n    error &&\n    typeof error === \"object\" &&\n    \"vitestEvalsRun\" in error &&\n    isHarnessRun((error as { vitestEvalsRun?: unknown }).vitestEvalsRun)\n  ) {\n    return (error as { vitestEvalsRun: HarnessRun }).vitestEvalsRun;\n  }\n\n  return undefined;\n}\n\n/** Returns true when a value matches the normalized `HarnessRun` contract. */\nexport function isHarnessRun(value: unknown): value is HarnessRun {\n  if (!value || typeof value !== \"object\") {\n    return false;\n  }\n\n  const candidate = value as {\n    session?: unknown;\n    usage?: unknown;\n    errors?: unknown;\n  };\n\n  return (\n    isNormalizedSession(candidate.session) &&\n    Boolean(candidate.usage) &&\n    typeof candidate.usage === \"object\" &&\n    !Array.isArray(candidate.usage) &&\n    Array.isArray(candidate.errors)\n  );\n}\n\n/** Returns true when a value matches the normalized session contract. */\nexport function isNormalizedSession(\n  value: unknown,\n): value is NormalizedSession {\n  return (\n    Boolean(value) &&\n    typeof value === \"object\" &&\n    value !== null &&\n    \"messages\" in value &&\n    Array.isArray((value as { messages?: unknown }).messages)\n  );\n}\n\n/** Reuses pre-normalized harness errors when a runtime already returns them. */\nexport function resolveHarnessRunErrors(\n  result: unknown,\n): Array<Record<string, JsonValue>> {\n  if (\n    result &&\n    typeof result === \"object\" &&\n    Array.isArray((result as Record<string, unknown>).errors)\n  ) {\n    return (result as { errors: Array<Record<string, JsonValue>> }).errors;\n  }\n\n  return [];\n}\n\n/** Serializes an arbitrary thrown value into the normalized error shape. */\nexport function serializeError(error: unknown): Record<string, JsonValue> {\n  if (error instanceof Error) {\n    return {\n      type: error.name,\n      message: error.message,\n    };\n  }\n\n  return {\n    type: \"Error\",\n    message: String(error),\n  };\n}\n","/**\n * Shared configuration for built-in value matchers.\n *\n * @example\n * ```ts\n * const config: BaseMatcherConfig = {\n *   requireAll: true,\n *   allowExtras: false,\n * };\n * ```\n */\nexport interface BaseMatcherConfig {\n  /** Require every expected field or tool to match. */\n  requireAll?: boolean;\n  /** Allow actual output/tool calls to contain extra fields or calls. */\n  allowExtras?: boolean;\n  /** Emit matcher debug details to the console. */\n  debug?: boolean;\n}\n\n/**\n * Matching strategy used by structured-output and tool-call judges.\n *\n * @example\n * ```ts\n * const exact: MatchStrategy = \"strict\";\n * const custom: MatchStrategy = (expected, actual) =>\n *   String(actual).includes(String(expected));\n * ```\n */\nexport type MatchStrategy<T = unknown> =\n  | \"strict\"\n  | \"fuzzy\"\n  | ((expected: T, actual: T, context?: string) => boolean);\n\n/**\n * Options controlling fuzzy matcher behavior.\n *\n * @example\n * ```ts\n * const fuzzyOptions: FuzzyMatchOptions = {\n *   caseInsensitive: true,\n *   substring: true,\n *   numericTolerance: 0.01,\n * };\n * ```\n */\nexport interface FuzzyMatchOptions {\n  /** Compare strings without case sensitivity. */\n  caseInsensitive?: boolean;\n  /** Treat either string containing the other as a match. */\n  substring?: boolean;\n  /** Relative and absolute tolerance used for numeric comparisons. */\n  numericTolerance?: number;\n  /** Match arrays without requiring the same order. */\n  ignoreArrayOrder?: boolean;\n  /** Allow simple string/number/boolean coercions before failing. */\n  coerceTypes?: boolean;\n}\n\ntype Mismatch = {\n  key: string;\n  expected: unknown;\n  actual: unknown;\n};\n\n/** Debug logger interface accepted by matcher helpers. */\nexport interface Logger {\n  log: (message: string, ...args: unknown[]) => void;\n}\n\nfunction isRecord(value: unknown): value is Record<string, unknown> {\n  return typeof value === \"object\" && value !== null && !Array.isArray(value);\n}\n\n/** Compares two values with strict deep equality semantics. */\nexport function strictEquals(\n  expected: unknown,\n  actual: unknown,\n  context?: string,\n): boolean {\n  void context;\n\n  if (expected === actual) {\n    return true;\n  }\n\n  if (\n    expected === null ||\n    expected === undefined ||\n    actual === null ||\n    actual === undefined\n  ) {\n    return false;\n  }\n\n  if (typeof expected !== typeof actual) {\n    return false;\n  }\n\n  if (Array.isArray(expected)) {\n    return (\n      Array.isArray(actual) &&\n      expected.length === actual.length &&\n      expected.every((item, index) =>\n        strictEquals(item, actual[index], context),\n      )\n    );\n  }\n\n  if (isRecord(expected) && isRecord(actual)) {\n    const expectedKeys = Object.keys(expected);\n    const actualKeys = Object.keys(actual);\n\n    return (\n      expectedKeys.length === actualKeys.length &&\n      expectedKeys.every(\n        (key) =>\n          key in actual && strictEquals(expected[key], actual[key], context),\n      )\n    );\n  }\n\n  return false;\n}\n\nfunction fuzzyMatchString(\n  expected: string,\n  actual: string,\n  options: FuzzyMatchOptions,\n): boolean {\n  const { caseInsensitive = true, substring = false } = options;\n  const expectedText = caseInsensitive ? expected.toLowerCase() : expected;\n  const actualText = caseInsensitive ? actual.toLowerCase() : actual;\n\n  return substring\n    ? actualText.includes(expectedText) || expectedText.includes(actualText)\n    : expectedText === actualText;\n}\n\nfunction fuzzyMatchNumber(\n  expected: number,\n  actual: number,\n  options: FuzzyMatchOptions,\n): boolean {\n  const { numericTolerance = 0.001 } = options;\n  const tolerance = Math.max(\n    Math.abs(expected) * numericTolerance,\n    numericTolerance,\n  );\n\n  return Math.abs(expected - actual) <= tolerance;\n}\n\nfunction fuzzyMatchArray(\n  expected: unknown[],\n  actual: unknown[],\n  options: FuzzyMatchOptions,\n  context?: string,\n): boolean {\n  const { ignoreArrayOrder = true } = options;\n\n  if (!ignoreArrayOrder) {\n    return (\n      expected.length === actual.length &&\n      expected.every((item, index) =>\n        fuzzyMatch(item, actual[index], options, context),\n      )\n    );\n  }\n\n  const actualUsed = actual.map(() => false);\n  return expected.every((expectedItem) => {\n    for (const [index, actualItem] of actual.entries()) {\n      if (actualUsed[index]) {\n        continue;\n      }\n\n      if (fuzzyMatch(expectedItem, actualItem, options, context)) {\n        actualUsed[index] = true;\n        return true;\n      }\n    }\n\n    return false;\n  });\n}\n\nfunction fuzzyMatchObject(\n  expected: Record<string, unknown>,\n  actual: Record<string, unknown>,\n  options: FuzzyMatchOptions,\n  context?: string,\n): boolean {\n  return Object.entries(expected).every(\n    ([key, value]) =>\n      key in actual && fuzzyMatch(value, actual[key], options, context),\n  );\n}\n\nfunction fuzzyMatchWithCoercion(expected: unknown, actual: unknown): boolean {\n  if (typeof expected === \"boolean\" && typeof actual === \"string\") {\n    return expected === (actual.toLowerCase() === \"true\" || actual === \"1\");\n  }\n\n  if (typeof expected === \"string\" && typeof actual === \"number\") {\n    return Number.parseFloat(expected) === actual;\n  }\n\n  if (typeof expected === \"number\" && typeof actual === \"string\") {\n    return expected === Number.parseFloat(actual);\n  }\n\n  return false;\n}\n\n/** Compares two values with the configured fuzzy matching rules. */\nexport function fuzzyMatch(\n  expected: unknown,\n  actual: unknown,\n  options: FuzzyMatchOptions = {},\n  context?: string,\n): boolean {\n  const { coerceTypes = false } = options;\n\n  if (expected instanceof RegExp) {\n    return typeof actual === \"string\" && expected.test(actual);\n  }\n\n  if (typeof expected === \"function\") {\n    return Boolean((expected as (value: unknown) => unknown)(actual));\n  }\n\n  if (\n    expected === null ||\n    expected === undefined ||\n    actual === null ||\n    actual === undefined\n  ) {\n    return expected === actual;\n  }\n\n  if (typeof expected === \"string\" && typeof actual === \"string\") {\n    return fuzzyMatchString(expected, actual, options);\n  }\n\n  if (typeof expected === \"number\" && typeof actual === \"number\") {\n    return fuzzyMatchNumber(expected, actual, options);\n  }\n\n  if (Array.isArray(expected) && Array.isArray(actual)) {\n    return fuzzyMatchArray(expected, actual, options, context);\n  }\n\n  if (isRecord(expected) && isRecord(actual)) {\n    return fuzzyMatchObject(expected, actual, options, context);\n  }\n\n  if (coerceTypes && fuzzyMatchWithCoercion(expected, actual)) {\n    return true;\n  }\n\n  return expected === actual;\n}\n\n/** Builds a reusable matcher function from a strict, fuzzy, or custom strategy. */\nexport function createMatcher<T = unknown>(\n  strategy: MatchStrategy<T>,\n  options?: FuzzyMatchOptions,\n): (expected: T, actual: T, context?: string) => boolean {\n  if (typeof strategy === \"function\") {\n    return (expected, actual, context) => strategy(expected, actual, context);\n  }\n\n  if (strategy === \"strict\") {\n    return (expected, actual, context) =>\n      strictEquals(expected, actual, context);\n  }\n\n  return (expected, actual, context) =>\n    fuzzyMatch(expected, actual, options ?? {}, context);\n}\n\n/** Formats a value for scorer rationale and debug output. */\nexport function formatValue(value: unknown): string {\n  if (value === undefined) {\n    return \"undefined\";\n  }\n\n  if (value === null) {\n    return \"null\";\n  }\n\n  if (value instanceof RegExp) {\n    return value.toString();\n  }\n\n  if (typeof value === \"string\") {\n    return `\"${value}\"`;\n  }\n\n  if (typeof value === \"object\") {\n    try {\n      return JSON.stringify(value);\n    } catch {\n      return String(value);\n    }\n  }\n\n  return String(value);\n}\n\n/** Returns a normalized partial-credit score for match-based scorers. */\nexport function calculatePartialScore(\n  matched: number,\n  total: number,\n  requireAll: boolean,\n): number {\n  if (requireAll && matched < total) {\n    return 0;\n  }\n\n  return total > 0 ? matched / total : 1;\n}\n\nconst defaultLogger: Logger = {\n  log: (message: string, ...args: unknown[]) => {\n    if (\n      process.env.NODE_ENV === \"development\" ||\n      process.env.NODE_ENV === \"test\" ||\n      process.env.VITEST_EVALS_DEBUG === \"true\"\n    ) {\n      console.log(message, ...args);\n    }\n  },\n};\n\n/** Emits scorer debug details through the provided logger. */\nexport function debugLog(\n  context: string,\n  data: {\n    expected: unknown;\n    actual: unknown;\n    matches?: string[];\n    mismatches?: Mismatch[];\n    extras?: string[];\n  },\n  logger: Logger = defaultLogger,\n): void {\n  logger.log(`${context} debug:`);\n  logger.log(\"Expected:\", data.expected);\n  logger.log(\"Actual:\", data.actual);\n  if (data.matches) {\n    logger.log(\"Matches:\", data.matches);\n  }\n  if (data.mismatches) {\n    logger.log(\"Mismatches:\", data.mismatches);\n  }\n  if (data.extras) {\n    logger.log(\"Extras:\", data.extras);\n  }\n}\n","import type { BaseScorerOptions, ScoredResult, ToolCallLike } from \"./scoring\";\nimport { normalizeContent } from \"../harness\";\nimport {\n  createMatcher,\n  type BaseMatcherConfig,\n  type FuzzyMatchOptions,\n  type MatchStrategy,\n} from \"./matchers\";\n\nexport type ExpectedTool = {\n  /** Expected tool name. */\n  name: string;\n  /** Expected tool arguments matched according to the configured strategy. */\n  arguments?: unknown;\n};\n\nexport interface ToolCallScorerOptions extends BaseScorerOptions {\n  /** Expected tool calls for the run. */\n  expectedTools?: ExpectedTool[];\n}\n\nexport interface ToolCallScorerConfig extends BaseMatcherConfig {\n  /** Require expected tools to appear in the configured order. */\n  ordered?: boolean;\n  /** Matching strategy for expected tool arguments. */\n  params?: MatchStrategy;\n  /** Options used when argument matching is fuzzy. */\n  fuzzyOptions?: FuzzyMatchOptions;\n}\n\n/** Creates a tool-call scorer used by both harness judges and legacy wrappers. */\nexport function ToolCallScorer(config: ToolCallScorerConfig = {}) {\n  const {\n    ordered = false,\n    requireAll = true,\n    allowExtras = true,\n    params = \"strict\",\n    fuzzyOptions: userFuzzyOptions,\n  } = config;\n\n  const fuzzyOptions: FuzzyMatchOptions = {\n    substring: true,\n    caseInsensitive: true,\n    ignoreArrayOrder: true,\n    numericTolerance: 0.001,\n    coerceTypes: false,\n    ...userFuzzyOptions,\n  };\n  const argMatcher = createMatcher(params, fuzzyOptions);\n\n  const scorer = async (opts: ToolCallScorerOptions): Promise<ScoredResult> => {\n    const expectedTools = opts.expectedTools ?? [];\n    const actualCalls = opts.toolCalls ?? [];\n\n    if (expectedTools.length === 0) {\n      return {\n        score: 1,\n        metadata: {\n          rationale: \"No tool calls expected\",\n        },\n      };\n    }\n\n    if (actualCalls.length === 0) {\n      return {\n        score: 0,\n        metadata: {\n          rationale: `Expected ${expectedTools.length} tool(s) but none were called`,\n        },\n      };\n    }\n\n    return ordered\n      ? evaluateOrderedTools(expectedTools, actualCalls, {\n          argMatcher,\n          allowExtras,\n          requireAllTools: requireAll,\n        })\n      : evaluateUnorderedTools(expectedTools, actualCalls, {\n          argMatcher,\n          requireAllTools: requireAll,\n          allowExtras,\n        });\n  };\n\n  Object.defineProperty(scorer, \"name\", {\n    value: \"ToolCallScorer\",\n  });\n\n  return scorer;\n}\n\nfunction evaluateOrderedTools(\n  expected: ExpectedTool[],\n  actual: ToolCallLike[],\n  options: {\n    argMatcher: (expected: unknown, actual: unknown) => boolean;\n    allowExtras: boolean;\n    requireAllTools: boolean;\n  },\n): ScoredResult {\n  let expectedIndex = 0;\n  let actualIndex = 0;\n\n  while (expectedIndex < expected.length && actualIndex < actual.length) {\n    const expectedTool = expected[expectedIndex];\n    const actualTool = actual[actualIndex];\n\n    if (expectedTool.name === actualTool.name) {\n      if (\n        expectedTool.arguments !== undefined &&\n        !options.argMatcher(expectedTool.arguments, actualTool.arguments ?? {})\n      ) {\n        return {\n          score: expectedIndex / expected.length,\n          metadata: {\n            rationale: `Tool '${expectedTool.name}' called with incorrect arguments at position ${expectedIndex + 1} (${expectedIndex}/${expected.length} tools matched correctly)`,\n            expected: normalizeContent(expectedTool.arguments),\n            actual:\n              actualTool.arguments === undefined\n                ? undefined\n                : normalizeContent(actualTool.arguments),\n            matched: expectedIndex,\n            total: expected.length,\n          },\n        };\n      }\n\n      expectedIndex++;\n      actualIndex++;\n      continue;\n    }\n\n    if (options.allowExtras) {\n      actualIndex++;\n      continue;\n    }\n\n    return {\n      score: 0,\n      metadata: {\n        rationale: `Expected '${expectedTool.name}' at position ${expectedIndex + 1} but found '${actualTool.name}'`,\n      },\n    };\n  }\n\n  if (expectedIndex < expected.length) {\n    const missing = expected.slice(expectedIndex).map((tool) => tool.name);\n    if (options.requireAllTools) {\n      return {\n        score: 0,\n        metadata: {\n          rationale: `Missing required tools in sequence: ${missing.join(\", \")}`,\n        },\n      };\n    }\n\n    const matchedCount = expectedIndex;\n    const totalCount = expected.length;\n    return {\n      score: totalCount > 0 ? matchedCount / totalCount : 1,\n      metadata: {\n        rationale: `Partial match: ${matchedCount}/${totalCount} tools called in order (missing: ${missing.join(\", \")})`,\n        matched: matchedCount,\n        total: totalCount,\n      },\n    };\n  }\n\n  if (!options.allowExtras && actualIndex < actual.length) {\n    const extra = actual.slice(actualIndex).map((tool) => tool.name);\n    return {\n      score: 0,\n      metadata: {\n        rationale: `Unexpected extra tools: ${extra.join(\", \")}`,\n      },\n    };\n  }\n\n  return {\n    score: 1,\n    metadata: {\n      rationale: \"All tools called in expected order with correct arguments\",\n    },\n  };\n}\n\nfunction evaluateUnorderedTools(\n  expected: ExpectedTool[],\n  actual: ToolCallLike[],\n  options: {\n    argMatcher: (expected: unknown, actual: unknown) => boolean;\n    requireAllTools: boolean;\n    allowExtras: boolean;\n  },\n): ScoredResult {\n  const remainingExpected = [...expected];\n  const remainingActual = [...actual];\n  const issues: string[] = [];\n\n  for (let index = remainingExpected.length - 1; index >= 0; index--) {\n    const expectedTool = remainingExpected[index];\n    const matchIndex = remainingActual.findIndex((actualTool) => {\n      if (expectedTool.name !== actualTool.name) {\n        return false;\n      }\n\n      return expectedTool.arguments === undefined\n        ? true\n        : options.argMatcher(\n            expectedTool.arguments,\n            actualTool.arguments ?? {},\n          );\n    });\n\n    if (matchIndex !== -1) {\n      remainingExpected.splice(index, 1);\n      remainingActual.splice(matchIndex, 1);\n    }\n  }\n\n  for (const missingTool of remainingExpected) {\n    if (missingTool.arguments !== undefined) {\n      const wrongArgsCalls = actual.filter(\n        (call) => call.name === missingTool.name,\n      );\n      issues.push(\n        wrongArgsCalls.length > 0\n          ? `Tool '${missingTool.name}' called but with incorrect arguments`\n          : `Missing required tool: ${missingTool.name}`,\n      );\n    } else {\n      issues.push(`Missing required tool: ${missingTool.name}`);\n    }\n  }\n\n  const extraTools = remainingActual.map((tool) => tool.name);\n  if (!options.allowExtras && extraTools.length > 0) {\n    issues.push(`Unexpected extra tools: ${extraTools.join(\", \")}`);\n  }\n\n  const expectedMatched = expected.length - remainingExpected.length;\n  const score = expected.length > 0 ? expectedMatched / expected.length : 1;\n  if (issues.length > 0 && (options.requireAllTools || !options.allowExtras)) {\n    return {\n      score: 0,\n      metadata: {\n        rationale: issues.join(\"; \"),\n      },\n    };\n  }\n\n  if (score === 1) {\n    const extraInfo =\n      extraTools.length > 0 ? ` (plus extra: ${extraTools.join(\", \")})` : \"\";\n\n    return {\n      score: 1,\n      metadata: {\n        rationale: `All expected tools were called${extraInfo}`,\n      },\n    };\n  }\n\n  return {\n    score,\n    metadata: {\n      rationale: issues.join(\"; \"),\n      matched: expectedMatched,\n      total: expected.length,\n    },\n  };\n}\n"],"mappings":";AAAA;AAAA,EAKE;AAAA,EAGA;AAAA,OAGK;AAkBP;AAAA,EACE,qBAAAA;AAAA,EACA,eAAAC;AAAA,EACA,iCAAAC;AAAA,EACA,kBAAAC;AAAA,EACA,SAAAC;AAAA,EACA,eAAAC;AAAA,EACA,kBAAAC;AAAA,EACA,aAAAC;AAAA,EACA,gBAAAC;AAAA,EACA,gBAAAC;AAAA,OACK;AAkPP,SAAS,gBAAgB,OAAwC;AAC/D,SACE,UAAU,QACV,OAAO,UAAU,YACjB,OAAO,UAAU,aAChB,OAAO,UAAU,YAAY,OAAO,SAAS,KAAK;AAEvD;AAEA,SAAS,aAAa,OAAkD;AACtE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;AAEA,SAAS,mBAAmB,OAAkB,MAAuB;AACnE,MAAI,KAAK,IAAI,KAAK,GAAG;AACnB,WAAO;AAAA,EACT;AAEA,OAAK,IAAI,KAAK;AACd,QAAM,aAAa,MAAM,IAAI,CAAC,SAAS;AACrC,UAAMC,cAAa,oBAAoB,MAAM,IAAI;AACjD,WAAOA,gBAAe,SAAY,OAAOA;AAAA,EAC3C,CAAC;AACD,OAAK,OAAO,KAAK;AAEjB,SAAO;AACT;AAEA,SAAS,oBACP,OACA,MAC2B;AAC3B,QAAM,aAAwC,CAAC;AAE/C,MAAI,KAAK,IAAI,KAAK,GAAG;AACnB,WAAO;AAAA,EACT;AAEA,OAAK,IAAI,KAAK;AACd,MAAI;AACF,eAAW,CAAC,KAAK,UAAU,KAAK,OAAO,QAAQ,KAAK,GAAG;AACrD,YAAM,QAAQ,oBAAoB,YAAY,IAAI;AAClD,UAAI,UAAU,QAAW;AACvB,mBAAW,GAAG,IAAI;AAAA,MACpB;AAAA,IACF;AAAA,EACF,UAAE;AACA,SAAK,OAAO,KAAK;AAAA,EACnB;AAEA,SAAO;AACT;AAaO,SAAS,YAAY,OAAuC;AACjE,SAAO,oBAAoB,OAAO,oBAAI,QAAQ,CAAC;AACjD;AAEA,SAAS,oBACP,OACA,MACuB;AACvB,MAAI,gBAAgB,KAAK,GAAG;AAC1B,WAAO;AAAA,EACT;AAEA,MACE,UAAU,QACV,OAAO,UAAU,YACjB,KAAK,IAAI,KAAe,GACxB;AACA,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,KAAK,GAAG;AACxB,WAAO,mBAAmB,OAAO,IAAI;AAAA,EACvC;AAEA,MAAI,aAAa,KAAK,GAAG;AACvB,WAAO,oBAAoB,OAAO,IAAI;AAAA,EACxC;AAEA,SAAO;AACT;AAkBO,SAAS,iBAAiB,OAA2B;AAC1D,QAAM,aAAa,YAAY,KAAK;AACpC,SAAO,eAAe,SAAY,aAAa,OAAO,KAAK;AAC7D;;;ACrUA,SAAS,SAAS,OAAkD;AAClE,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,CAAC,MAAM,QAAQ,KAAK;AAC5E;AAGO,SAAS,aACd,UACA,QACA,SACS;AACT,OAAK;AAEL,MAAI,aAAa,QAAQ;AACvB,WAAO;AAAA,EACT;AAEA,MACE,aAAa,QACb,aAAa,UACb,WAAW,QACX,WAAW,QACX;AACA,WAAO;AAAA,EACT;AAEA,MAAI,OAAO,aAAa,OAAO,QAAQ;AACrC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,QAAQ,GAAG;AAC3B,WACE,MAAM,QAAQ,MAAM,KACpB,SAAS,WAAW,OAAO,UAC3B,SAAS;AAAA,MAAM,CAAC,MAAM,UACpB,aAAa,MAAM,OAAO,KAAK,GAAG,OAAO;AAAA,IAC3C;AAAA,EAEJ;AAEA,MAAI,SAAS,QAAQ,KAAK,SAAS,MAAM,GAAG;AAC1C,UAAM,eAAe,OAAO,KAAK,QAAQ;AACzC,UAAM,aAAa,OAAO,KAAK,MAAM;AAErC,WACE,aAAa,WAAW,WAAW,UACnC,aAAa;AAAA,MACX,CAAC,QACC,OAAO,UAAU,aAAa,SAAS,GAAG,GAAG,OAAO,GAAG,GAAG,OAAO;AAAA,IACrE;AAAA,EAEJ;AAEA,SAAO;AACT;AAEA,SAAS,iBACP,UACA,QACA,SACS;AACT,QAAM,EAAE,kBAAkB,MAAM,YAAY,MAAM,IAAI;AACtD,QAAM,eAAe,kBAAkB,SAAS,YAAY,IAAI;AAChE,QAAM,aAAa,kBAAkB,OAAO,YAAY,IAAI;AAE5D,SAAO,YACH,WAAW,SAAS,YAAY,KAAK,aAAa,SAAS,UAAU,IACrE,iBAAiB;AACvB;AAEA,SAAS,iBACP,UACA,QACA,SACS;AACT,QAAM,EAAE,mBAAmB,KAAM,IAAI;AACrC,QAAM,YAAY,KAAK;AAAA,IACrB,KAAK,IAAI,QAAQ,IAAI;AAAA,IACrB;AAAA,EACF;AAEA,SAAO,KAAK,IAAI,WAAW,MAAM,KAAK;AACxC;AAEA,SAAS,gBACP,UACA,QACA,SACA,SACS;AACT,QAAM,EAAE,mBAAmB,KAAK,IAAI;AAEpC,MAAI,CAAC,kBAAkB;AACrB,WACE,SAAS,WAAW,OAAO,UAC3B,SAAS;AAAA,MAAM,CAAC,MAAM,UACpB,WAAW,MAAM,OAAO,KAAK,GAAG,SAAS,OAAO;AAAA,IAClD;AAAA,EAEJ;AAEA,QAAM,aAAa,OAAO,IAAI,MAAM,KAAK;AACzC,SAAO,SAAS,MAAM,CAAC,iBAAiB;AACtC,eAAW,CAAC,OAAO,UAAU,KAAK,OAAO,QAAQ,GAAG;AAClD,UAAI,WAAW,KAAK,GAAG;AACrB;AAAA,MACF;AAEA,UAAI,WAAW,cAAc,YAAY,SAAS,OAAO,GAAG;AAC1D,mBAAW,KAAK,IAAI;AACpB,eAAO;AAAA,MACT;AAAA,IACF;AAEA,WAAO;AAAA,EACT,CAAC;AACH;AAEA,SAAS,iBACP,UACA,QACA,SACA,SACS;AACT,SAAO,OAAO,QAAQ,QAAQ,EAAE;AAAA,IAC9B,CAAC,CAAC,KAAK,KAAK,MACV,OAAO,UAAU,WAAW,OAAO,OAAO,GAAG,GAAG,SAAS,OAAO;AAAA,EACpE;AACF;AAEA,SAAS,uBAAuB,UAAmB,QAA0B;AAC3E,MAAI,OAAO,aAAa,aAAa,OAAO,WAAW,UAAU;AAC/D,WAAO,cAAc,OAAO,YAAY,MAAM,UAAU,WAAW;AAAA,EACrE;AAEA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,WAAO,OAAO,WAAW,QAAQ,MAAM;AAAA,EACzC;AAEA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,WAAO,aAAa,OAAO,WAAW,MAAM;AAAA,EAC9C;AAEA,SAAO;AACT;AAGO,SAAS,WACd,UACA,QACA,UAA6B,CAAC,GAC9B,SACS;AACT,QAAM,EAAE,cAAc,MAAM,IAAI;AAEhC,MAAI,oBAAoB,QAAQ;AAC9B,WAAO,OAAO,WAAW,YAAY,SAAS,KAAK,MAAM;AAAA,EAC3D;AAEA,MAAI,OAAO,aAAa,YAAY;AAClC,WAAO,QAAS,SAAyC,MAAM,CAAC;AAAA,EAClE;AAEA,MACE,aAAa,QACb,aAAa,UACb,WAAW,QACX,WAAW,QACX;AACA,WAAO,aAAa;AAAA,EACtB;AAEA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,WAAO,iBAAiB,UAAU,QAAQ,OAAO;AAAA,EACnD;AAEA,MAAI,OAAO,aAAa,YAAY,OAAO,WAAW,UAAU;AAC9D,WAAO,iBAAiB,UAAU,QAAQ,OAAO;AAAA,EACnD;AAEA,MAAI,MAAM,QAAQ,QAAQ,KAAK,MAAM,QAAQ,MAAM,GAAG;AACpD,WAAO,gBAAgB,UAAU,QAAQ,SAAS,OAAO;AAAA,EAC3D;AAEA,MAAI,SAAS,QAAQ,KAAK,SAAS,MAAM,GAAG;AAC1C,WAAO,iBAAiB,UAAU,QAAQ,SAAS,OAAO;AAAA,EAC5D;AAEA,MAAI,eAAe,uBAAuB,UAAU,MAAM,GAAG;AAC3D,WAAO;AAAA,EACT;AAEA,SAAO,aAAa;AACtB;AAGO,SAAS,cACd,UACA,SACuD;AACvD,MAAI,OAAO,aAAa,YAAY;AAClC,WAAO,CAAC,UAAU,QAAQ,YAAY,SAAS,UAAU,QAAQ,OAAO;AAAA,EAC1E;AAEA,MAAI,aAAa,UAAU;AACzB,WAAO,CAAC,UAAU,QAAQ,YACxB,aAAa,UAAU,QAAQ,OAAO;AAAA,EAC1C;AAEA,SAAO,CAAC,UAAU,QAAQ,YACxB,WAAW,UAAU,QAAQ,WAAW,CAAC,GAAG,OAAO;AACvD;;;AC1PO,SAAS,eAAe,SAA+B,CAAC,GAAG;AAChE,QAAM;AAAA,IACJ,UAAU;AAAA,IACV,aAAa;AAAA,IACb,cAAc;AAAA,IACd,SAAS;AAAA,IACT,cAAc;AAAA,EAChB,IAAI;AAEJ,QAAM,eAAkC;AAAA,IACtC,WAAW;AAAA,IACX,iBAAiB;AAAA,IACjB,kBAAkB;AAAA,IAClB,kBAAkB;AAAA,IAClB,aAAa;AAAA,IACb,GAAG;AAAA,EACL;AACA,QAAM,aAAa,cAAc,QAAQ,YAAY;AAErD,QAAM,SAAS,OAAO,SAAuD;AAC3E,UAAM,gBAAgB,KAAK,iBAAiB,CAAC;AAC7C,UAAM,cAAc,KAAK,aAAa,CAAC;AAEvC,QAAI,cAAc,WAAW,GAAG;AAC9B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW;AAAA,QACb;AAAA,MACF;AAAA,IACF;AAEA,QAAI,YAAY,WAAW,GAAG;AAC5B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,YAAY,cAAc,MAAM;AAAA,QAC7C;AAAA,MACF;AAAA,IACF;AAEA,WAAO,UACH,qBAAqB,eAAe,aAAa;AAAA,MAC/C;AAAA,MACA;AAAA,MACA,iBAAiB;AAAA,IACnB,CAAC,IACD,uBAAuB,eAAe,aAAa;AAAA,MACjD;AAAA,MACA,iBAAiB;AAAA,MACjB;AAAA,IACF,CAAC;AAAA,EACP;AAEA,SAAO,eAAe,QAAQ,QAAQ;AAAA,IACpC,OAAO;AAAA,EACT,CAAC;AAED,SAAO;AACT;AAEA,SAAS,qBACP,UACA,QACA,SAKc;AACd,MAAI,gBAAgB;AACpB,MAAI,cAAc;AAElB,SAAO,gBAAgB,SAAS,UAAU,cAAc,OAAO,QAAQ;AACrE,UAAM,eAAe,SAAS,aAAa;AAC3C,UAAM,aAAa,OAAO,WAAW;AAErC,QAAI,aAAa,SAAS,WAAW,MAAM;AACzC,UACE,aAAa,cAAc,UAC3B,CAAC,QAAQ,WAAW,aAAa,WAAW,WAAW,aAAa,CAAC,CAAC,GACtE;AACA,eAAO;AAAA,UACL,OAAO,gBAAgB,SAAS;AAAA,UAChC,UAAU;AAAA,YACR,WAAW,SAAS,aAAa,IAAI,iDAAiD,gBAAgB,CAAC,KAAK,aAAa,IAAI,SAAS,MAAM;AAAA,YAC5I,UAAU,iBAAiB,aAAa,SAAS;AAAA,YACjD,QACE,WAAW,cAAc,SACrB,SACA,iBAAiB,WAAW,SAAS;AAAA,YAC3C,SAAS;AAAA,YACT,OAAO,SAAS;AAAA,UAClB;AAAA,QACF;AAAA,MACF;AAEA;AACA;AACA;AAAA,IACF;AAEA,QAAI,QAAQ,aAAa;AACvB;AACA;AAAA,IACF;AAEA,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,aAAa,aAAa,IAAI,iBAAiB,gBAAgB,CAAC,eAAe,WAAW,IAAI;AAAA,MAC3G;AAAA,IACF;AAAA,EACF;AAEA,MAAI,gBAAgB,SAAS,QAAQ;AACnC,UAAM,UAAU,SAAS,MAAM,aAAa,EAAE,IAAI,CAAC,SAAS,KAAK,IAAI;AACrE,QAAI,QAAQ,iBAAiB;AAC3B,aAAO;AAAA,QACL,OAAO;AAAA,QACP,UAAU;AAAA,UACR,WAAW,uCAAuC,QAAQ,KAAK,IAAI,CAAC;AAAA,QACtE;AAAA,MACF;AAAA,IACF;AAEA,UAAM,eAAe;AACrB,UAAM,aAAa,SAAS;AAC5B,WAAO;AAAA,MACL,OAAO,aAAa,IAAI,eAAe,aAAa;AAAA,MACpD,UAAU;AAAA,QACR,WAAW,kBAAkB,YAAY,IAAI,UAAU,oCAAoC,QAAQ,KAAK,IAAI,CAAC;AAAA,QAC7G,SAAS;AAAA,QACT,OAAO;AAAA,MACT;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,QAAQ,eAAe,cAAc,OAAO,QAAQ;AACvD,UAAM,QAAQ,OAAO,MAAM,WAAW,EAAE,IAAI,CAAC,SAAS,KAAK,IAAI;AAC/D,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,2BAA2B,MAAM,KAAK,IAAI,CAAC;AAAA,MACxD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO;AAAA,IACP,UAAU;AAAA,MACR,WAAW;AAAA,IACb;AAAA,EACF;AACF;AAEA,SAAS,uBACP,UACA,QACA,SAKc;AACd,QAAM,oBAAoB,CAAC,GAAG,QAAQ;AACtC,QAAM,kBAAkB,CAAC,GAAG,MAAM;AAClC,QAAM,SAAmB,CAAC;AAE1B,WAAS,QAAQ,kBAAkB,SAAS,GAAG,SAAS,GAAG,SAAS;AAClE,UAAM,eAAe,kBAAkB,KAAK;AAC5C,UAAM,aAAa,gBAAgB,UAAU,CAAC,eAAe;AAC3D,UAAI,aAAa,SAAS,WAAW,MAAM;AACzC,eAAO;AAAA,MACT;AAEA,aAAO,aAAa,cAAc,SAC9B,OACA,QAAQ;AAAA,QACN,aAAa;AAAA,QACb,WAAW,aAAa,CAAC;AAAA,MAC3B;AAAA,IACN,CAAC;AAED,QAAI,eAAe,IAAI;AACrB,wBAAkB,OAAO,OAAO,CAAC;AACjC,sBAAgB,OAAO,YAAY,CAAC;AAAA,IACtC;AAAA,EACF;AAEA,aAAW,eAAe,mBAAmB;AAC3C,QAAI,YAAY,cAAc,QAAW;AACvC,YAAM,iBAAiB,OAAO;AAAA,QAC5B,CAAC,SAAS,KAAK,SAAS,YAAY;AAAA,MACtC;AACA,aAAO;AAAA,QACL,eAAe,SAAS,IACpB,SAAS,YAAY,IAAI,0CACzB,0BAA0B,YAAY,IAAI;AAAA,MAChD;AAAA,IACF,OAAO;AACL,aAAO,KAAK,0BAA0B,YAAY,IAAI,EAAE;AAAA,IAC1D;AAAA,EACF;AAEA,QAAM,aAAa,gBAAgB,IAAI,CAAC,SAAS,KAAK,IAAI;AAC1D,MAAI,CAAC,QAAQ,eAAe,WAAW,SAAS,GAAG;AACjD,WAAO,KAAK,2BAA2B,WAAW,KAAK,IAAI,CAAC,EAAE;AAAA,EAChE;AAEA,QAAM,kBAAkB,SAAS,SAAS,kBAAkB;AAC5D,QAAM,QAAQ,SAAS,SAAS,IAAI,kBAAkB,SAAS,SAAS;AACxE,MAAI,OAAO,SAAS,MAAM,QAAQ,mBAAmB,CAAC,QAAQ,cAAc;AAC1E,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,OAAO,KAAK,IAAI;AAAA,MAC7B;AAAA,IACF;AAAA,EACF;AAEA,MAAI,UAAU,GAAG;AACf,UAAM,YACJ,WAAW,SAAS,IAAI,iBAAiB,WAAW,KAAK,IAAI,CAAC,MAAM;AAEtE,WAAO;AAAA,MACL,OAAO;AAAA,MACP,UAAU;AAAA,QACR,WAAW,iCAAiC,SAAS;AAAA,MACvD;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA,UAAU;AAAA,MACR,WAAW,OAAO,KAAK,IAAI;AAAA,MAC3B,SAAS;AAAA,MACT,OAAO,SAAS;AAAA,IAClB;AAAA,EACF;AACF;","names":["assistantMessages","failedSpans","latestAssistantMessageContent","messagesByRole","spans","spansByKind","systemMessages","toolCalls","toolMessages","userMessages","normalized"]}