/**
 * Panic signal calibration — measure accuracy against a labeled ground-truth corpus.
 *
 * The observe-mode gate says the signal's accuracy must be proven by data. This is that proof, in
 * code: a corpus of behavioral traces each labeled with ground truth — 'coherent' (focused, healthy
 * work that must NOT trip an intervention) or 'confused' (thrashing/drift that SHOULD). Each trace is
 * replayed through the REAL engine (see panic-replay.ts) and scored. The result is a measured
 * false-positive rate and sensitivity (true-positive rate) over known-labeled behavior.
 *
 * This does not replace real observe-mode telemetry — it complements it: a deterministic, CI-runnable
 * baseline that fails loudly if a change to the engine degrades its discrimination. Contributors can
 * extend CALIBRATION_CORPUS with real recorded sessions (labeled by a human) to harden it further.
 *
 * The intervention threshold under test is L2 (the advisory-injection floor): a 'coherent' trace
 * tripping L2+ is a false positive; a 'confused' trace reaching L2+ is a true positive.
 */
// NOTE(review): the trailing sourceMappingURL comment suggests this declaration file is compiler
// output; lasting documentation changes probably belong in the .ts source — confirm before editing.
import { type ReplayStep } from './panic-replay.js';

/**
 * Ground-truth label attached to a calibration trace.
 *
 * - 'coherent': focused, healthy work that must NOT trip an intervention.
 * - 'confused': thrashing/drift that SHOULD trip one.
 */
export type GroundTruth = 'coherent' | 'confused';

/** One labeled behavioral trace in the calibration corpus. */
export interface CalibrationScenario {
    /** Scenario identifier; ScenarioResult carries a field of the same name. */
    name: string;
    /** Human-assigned ground truth the engine's verdict is scored against. */
    label: GroundTruth;
    /** Free-text explanation of what the trace represents. */
    description: string;
    /** The trace itself, as replay steps (type declared in panic-replay). */
    steps: ReplayStep[];
    /**
     * Optional source roots for the replay.
     * NOTE(review): how the replay uses these is not visible in this declaration — confirm.
     */
    sourceRoots?: string[];
}

/** The labeled ground-truth corpus; contributors can extend it with human-labeled recorded sessions. */
export declare const CALIBRATION_CORPUS: CalibrationScenario[];

/**
 * Known sensitivities — borderline traces where the signal is over/under-sensitive. These are NOT
 * asserted as "correct"; they are documented evidence (the gate must weigh them) and a regression
 * pin on current behavior, so any future engine change that shifts them is noticed. This is the
 * honest output of validating accuracy against data: where the signal is weak, in the open.
 */
export interface SensitivityScenario {
    /** Scenario identifier; SensitivityResult carries a field of the same name. */
    name: string;
    /** Free-text explanation of what the trace represents. */
    description: string;
    /** Why the signal mis-judges this — the mechanism. */
    note: string;
    /** The trace itself, as replay steps (type declared in panic-replay). */
    steps: ReplayStep[];
    /**
     * Optional source roots for the replay.
     * NOTE(review): how the replay uses these is not visible in this declaration — confirm.
     */
    sourceRoots?: string[];
    /** Whether this trace trips L2+ today (current, possibly-undesirable behavior). */
    trips_today: boolean;
}

/** The documented borderline traces that evaluateSensitivities is typed to accept. */
export declare const KNOWN_SENSITIVITIES: SensitivityScenario[];

/** Per-scenario outcome contained in a CalibrationReport. */
export interface ScenarioResult {
    /** Scenario identifier (same field name as on CalibrationScenario). */
    name: string;
    /** Ground-truth label of the scenario. */
    label: GroundTruth;
    /** Presumably the highest intervention level observed during the replay — confirm in the implementation. */
    peakLevel: number;
    /** Presumably the highest signal score observed during the replay — confirm in the implementation. */
    peakScore: number;
    /** Whether the trace reached the L2 intervention threshold (L2+). */
    trippedL2: boolean;
    /** Did the engine classify this trace the way its ground-truth label expects? */
    correct: boolean;
}

/**
 * Aggregate discrimination metrics at the L2 threshold.
 *
 * NOTE(review): the exact formulas (denominators, handling of an empty class) are not visible in
 * this declaration; the per-field notes below follow the module header's definitions.
 */
export interface CalibrationReport {
    /** Per-scenario results. */
    scenarios: ScenarioResult[];
    /** Presumably the number of scenarios labeled 'coherent' — confirm. */
    coherent_total: number;
    /** 'coherent' traces that tripped L2+ (the module header's definition of a false positive). */
    false_positives: number;
    /** Measured false-positive rate; presumably false_positives / coherent_total — confirm. */
    false_positive_rate: number;
    /** Presumably the number of scenarios labeled 'confused' — confirm. */
    confused_total: number;
    /** 'confused' traces that reached L2+ (the module header's definition of a true positive). */
    true_positives: number;
    /** Sensitivity (true-positive rate); presumably true_positives / confused_total — confirm. */
    true_positive_rate: number;
    /** Overall accuracy; presumably the fraction of scenarios with `correct === true` — confirm. */
    accuracy: number;
}

/** Outcome of checking one documented sensitivity against current engine behavior. */
export interface SensitivityResult {
    /** Scenario identifier (same field name as on SensitivityScenario). */
    name: string;
    /** Scenario description (same field name as on SensitivityScenario). */
    description: string;
    /** Mechanism note (same field name as on SensitivityScenario). */
    note: string;
    /** Presumably the highest intervention level observed during the replay — confirm in the implementation. */
    peakLevel: number;
    /** Whether the trace reached the L2 intervention threshold (L2+). */
    trippedL2: boolean;
    /** true if current behavior still matches the documented expectation (regression pin). */
    matchesDocumented: boolean;
}

/**
 * Evaluate the documented sensitivities against current engine behavior (regression pin).
 *
 * @param corpus - Scenarios to evaluate. Optional; presumably defaults to KNOWN_SENSITIVITIES, but
 *   the default is not visible in this declaration — confirm in the implementation.
 * @returns A list of SensitivityResult values.
 */
export declare function evaluateSensitivities(corpus?: SensitivityScenario[]): SensitivityResult[];

/**
 * Replay every scenario through the real engine and measure discrimination at the L2 threshold.
 *
 * @param corpus - Labeled scenarios to replay. Optional; presumably defaults to CALIBRATION_CORPUS,
 *   but the default is not visible in this declaration — confirm in the implementation.
 * @returns Per-scenario results plus aggregate false-positive rate, sensitivity and accuracy.
 */
export declare function computeCalibration(corpus?: CalibrationScenario[]): CalibrationReport;
//# sourceMappingURL=panic-calibration.d.ts.map