{"version":3,"sources":["../../../src/legacy/evaluate/index.ts","../../../src/legacy/format.ts"],"sourcesContent":["/**\n * Temporary compatibility wrapper for the old `evaluate(...)` helper.\n *\n * Keep this module isolated from the harness-first API so legacy suites can be\n * removed cleanly in a later deletion pass.\n */\nimport { generateObject } from \"ai\";\nimport { z } from \"zod\";\nimport { assert, test } from \"vitest\";\nimport { wrapText } from \"../format\";\n\ntype LanguageModel = Parameters<typeof generateObject>[0][\"model\"];\n\nlet defaultModel: LanguageModel | undefined;\n\n/** Configures the default language model used by legacy `evaluate(...)`. */\nexport function configure(opts: { model: LanguageModel }) {\n  defaultModel = opts.model;\n}\n\nconst EVAL_SYSTEM = `You are assessing a submitted output based on a given criterion. Ignore differences in style, grammar, punctuation, or length. Focus only on whether the criterion is met.`;\n\nconst EVAL_PROMPT = (output: string, criteria: string) => `<submission>\n${output}\n</submission>\n\n<criteria>\n${criteria}\n</criteria>\n\nDoes the submission meet the criteria? Select one option:\n(A) The criteria is fully met with no issues\n(B) The criteria is mostly met with minor gaps\n(C) The criteria is partially met with notable gaps\n(D) The criteria is barely met or only tangentially addressed\n(E) The criteria is not met at all`;\n\nconst CHOICE_SCORES: Record<string, number> = {\n  A: 1.0,\n  B: 0.75,\n  C: 0.5,\n  D: 0.25,\n  E: 0.0,\n};\n\ninterface EvaluateOptions {\n  task: () => Promise<string>;\n  criteria: string;\n  threshold?: number;\n}\n\ninterface TestTaskContext {\n  task: { meta: Record<string, any> };\n}\n\n/** @internal Core evaluation logic, exported for testing. */\nexport async function _evaluate(\n  ctx: TestTaskContext,\n  opts: EvaluateOptions,\n): Promise<void> {\n  if (!defaultModel) {\n    throw new Error(\n      \"No model configured. Call configure({ model }) before using evaluate.\",\n    );\n  }\n\n  let output: string;\n  try {\n    output = await opts.task();\n  } catch (error) {\n    const errorMessage = error instanceof Error ? error.message : String(error);\n    ctx.task.meta.eval = {\n      scores: [\n        {\n          score: 0,\n          name: \"evaluate\",\n          metadata: { rationale: `Task failed: ${errorMessage}` },\n        },\n      ],\n      avgScore: 0,\n    };\n    throw error;\n  }\n\n  let object: { answer: string; rationale: string };\n  try {\n    ({ object } = await generateObject({\n      model: defaultModel,\n      schema: z.object({\n        answer: z.enum([\"A\", \"B\", \"C\", \"D\", \"E\"]),\n        rationale: z.string(),\n      }),\n      system: EVAL_SYSTEM,\n      prompt: EVAL_PROMPT(output, opts.criteria),\n    }));\n  } catch (error) {\n    const errorMessage = error instanceof Error ? error.message : String(error);\n    ctx.task.meta.eval = {\n      scores: [\n        {\n          score: 0,\n          name: \"evaluate\",\n          metadata: { rationale: `Judge failed: ${errorMessage}` },\n        },\n      ],\n      avgScore: 0,\n    };\n    throw error;\n  }\n\n  const score = CHOICE_SCORES[object.answer];\n  const threshold = opts.threshold ?? 1.0;\n\n  ctx.task.meta.eval = {\n    scores: [\n      {\n        score,\n        name: \"evaluate\",\n        metadata: { rationale: object.rationale, answer: object.answer },\n      },\n    ],\n    avgScore: score,\n  };\n\n  if (score < threshold) {\n    assert(\n      false,\n      `Score: ${score} (${object.answer}) below threshold: ${threshold}\\n\\n## Output:\\n${wrapText(output)}\\n\\n## Rationale:\\n${wrapText(object.rationale)}`,\n    );\n  }\n}\n\n/** Defines a legacy model-graded eval test. Prefer harness-backed suites. */\nexport function evaluate(\n  name: string,\n  opts: EvaluateOptions & { timeout?: number },\n) {\n  test(name, { timeout: opts.timeout ?? 60000 }, async ({ task: testTask }) => {\n    await _evaluate({ task: testTask }, opts);\n  });\n}\n","import type { Score } from \"./shared\";\n\n/**\n * Temporary legacy formatter helpers.\n *\n * Keep these local to the scorer-first compatibility layer so legacy can be\n * deleted without touching the harness-first entrypoint.\n */\n\n/** Wraps scorer output into fixed-width lines for legacy failure messages. */\nexport function wrapText(text: string, width = 80): string {\n  if (!text || text.length <= width) {\n    return text;\n  }\n\n  const words = text.split(/\\s+/);\n  const lines: string[] = [];\n  let currentLine = \"\";\n\n  for (const word of words) {\n    if (currentLine.length + word.length + 1 > width) {\n      lines.push(currentLine.trim());\n      currentLine = word;\n    } else {\n      currentLine += (currentLine ? \" \" : \"\") + word;\n    }\n  }\n\n  if (currentLine) {\n    lines.push(currentLine);\n  }\n\n  return lines.join(\"\\n\");\n}\n\n/** Formats legacy scorer results for matcher and assertion output. */\nexport function formatScores(scores: (Score & { name: string })[]) {\n  return scores\n    .map((score) => {\n      const scoreLine = `${score.name || \"Unknown\"} [${(score.score ?? 0).toFixed(1)}]`;\n      if (\n        ((score.score ?? 0) < 1.0 && score.metadata?.rationale) ||\n        score.metadata?.output\n      ) {\n        let formattedOutput = \"\";\n        if (score.metadata?.output !== undefined) {\n          const output = score.metadata.output;\n          formattedOutput =\n            typeof output === \"string\"\n              ? `\\noutput  ${wrapText(output)}`\n              : `\\noutput  ${wrapText(JSON.stringify(output, null, 2))}`;\n        }\n\n        return `${scoreLine}${\n          score.metadata?.rationale\n            ? `\\nreason  ${wrapText(score.metadata.rationale)}`\n            : \"\"\n        }${formattedOutput}`;\n      }\n      return scoreLine;\n    })\n    .join(\"\\n\\n\");\n}\n"],"mappings":";AAMA,SAAS,sBAAsB;AAC/B,SAAS,SAAS;AAClB,SAAS,QAAQ,YAAY;;;ACEtB,SAAS,SAAS,MAAc,QAAQ,IAAY;AACzD,MAAI,CAAC,QAAQ,KAAK,UAAU,OAAO;AACjC,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,KAAK,MAAM,KAAK;AAC9B,QAAM,QAAkB,CAAC;AACzB,MAAI,cAAc;AAElB,aAAW,QAAQ,OAAO;AACxB,QAAI,YAAY,SAAS,KAAK,SAAS,IAAI,OAAO;AAChD,YAAM,KAAK,YAAY,KAAK,CAAC;AAC7B,oBAAc;AAAA,IAChB,OAAO;AACL,sBAAgB,cAAc,MAAM,MAAM;AAAA,IAC5C;AAAA,EACF;AAEA,MAAI,aAAa;AACf,UAAM,KAAK,WAAW;AAAA,EACxB;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;;;ADpBA,IAAI;AAGG,SAAS,UAAU,MAAgC;AACxD,iBAAe,KAAK;AACtB;AAEA,IAAM,cAAc;AAEpB,IAAM,cAAc,CAAC,QAAgB,aAAqB;AAAA,EACxD,MAAM;AAAA;AAAA;AAAA;AAAA,EAIN,QAAQ;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAUV,IAAM,gBAAwC;AAAA,EAC5C,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AACL;AAaA,eAAsB,UACpB,KACA,MACe;AACf,MAAI,CAAC,cAAc;AACjB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,MAAI;AACJ,MAAI;AACF,aAAS,MAAM,KAAK,KAAK;AAAA,EAC3B,SAAS,OAAO;AACd,UAAM,eAAe,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1E,QAAI,KAAK,KAAK,OAAO;AAAA,MACnB,QAAQ;AAAA,QACN;AAAA,UACE,OAAO;AAAA,UACP,MAAM;AAAA,UACN,UAAU,EAAE,WAAW,gBAAgB,YAAY,GAAG;AAAA,QACxD;AAAA,MACF;AAAA,MACA,UAAU;AAAA,IACZ;AACA,UAAM;AAAA,EACR;AAEA,MAAI;AACJ,MAAI;AACF,KAAC,EAAE,OAAO,IAAI,MAAM,eAAe;AAAA,MACjC,OAAO;AAAA,MACP,QAAQ,EAAE,OAAO;AAAA,QACf,QAAQ,EAAE,KAAK,CAAC,KAAK,KAAK,KAAK,KAAK,GAAG,CAAC;AAAA,QACxC,WAAW,EAAE,OAAO;AAAA,MACtB,CAAC;AAAA,MACD,QAAQ;AAAA,MACR,QAAQ,YAAY,QAAQ,KAAK,QAAQ;AAAA,IAC3C,CAAC;AAAA,EACH,SAAS,OAAO;AACd,UAAM,eAAe,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1E,QAAI,KAAK,KAAK,OAAO;AAAA,MACnB,QAAQ;AAAA,QACN;AAAA,UACE,OAAO;AAAA,UACP,MAAM;AAAA,UACN,UAAU,EAAE,WAAW,iBAAiB,YAAY,GAAG;AAAA,QACzD;AAAA,MACF;AAAA,MACA,UAAU;AAAA,IACZ;AACA,UAAM;AAAA,EACR;AAEA,QAAM,QAAQ,cAAc,OAAO,MAAM;AACzC,QAAM,YAAY,KAAK,aAAa;AAEpC,MAAI,KAAK,KAAK,OAAO;AAAA,IACnB,QAAQ;AAAA,MACN;AAAA,QACE;AAAA,QACA,MAAM;AAAA,QACN,UAAU,EAAE,WAAW,OAAO,WAAW,QAAQ,OAAO,OAAO;AAAA,MACjE;AAAA,IACF;AAAA,IACA,UAAU;AAAA,EACZ;AAEA,MAAI,QAAQ,WAAW;AACrB;AAAA,MACE;AAAA,MACA,UAAU,KAAK,KAAK,OAAO,MAAM,sBAAsB,SAAS;AAAA;AAAA;AAAA,EAAmB,SAAS,MAAM,CAAC;AAAA;AAAA;AAAA,EAAsB,SAAS,OAAO,SAAS,CAAC;AAAA,IACrJ;AAAA,EACF;AACF;AAGO,SAAS,SACd,MACA,MACA;AACA,OAAK,MAAM,EAAE,SAAS,KAAK,WAAW,IAAM,GAAG,OAAO,EAAE,MAAM,SAAS,MAAM;AAC3E,UAAM,UAAU,EAAE,MAAM,SAAS,GAAG,IAAI;AAAA,EAC1C,CAAC;AACH;","names":[]}