/**
 * Panic signal calibration — measure accuracy against a labeled ground-truth corpus.
 *
 * The observe-mode gate says the signal's accuracy must be proven by data. This is that proof, in
 * code: a corpus of behavioral traces each labeled with ground truth — 'coherent' (focused, healthy
 * work that must NOT trip an intervention) or 'confused' (thrashing/drift that SHOULD). Each trace is
 * replayed through the REAL engine (see panic-replay.ts) and scored. The result is a measured
 * false-positive rate and sensitivity (true-positive rate) over known-labeled behavior.
 *
 * This does not replace real observe-mode telemetry — it complements it: a deterministic, CI-runnable
 * baseline that fails loudly if a change to the engine degrades its discrimination. Contributors can
 * extend CALIBRATION_CORPUS with real recorded sessions (labeled by a human) to harden it further.
 *
 * The intervention threshold under test is L2 (the advisory-injection floor): a 'coherent' trace
 * tripping L2+ is a false positive; a 'confused' trace reaching L2+ is a true positive.
 */
import { type ReplayStep } from './panic-replay.js';
/**
 * Ground-truth label attached to a calibration trace.
 *
 * - `'coherent'` — focused, healthy work that must NOT trip an intervention.
 * - `'confused'` — thrashing/drift that SHOULD trip one.
 */
export type GroundTruth = 'coherent' | 'confused';
/** One labeled behavioral trace in the calibration corpus. */
export interface CalibrationScenario {
    /** Identifier for the scenario; echoed back on the matching ScenarioResult. */
    name: string;
    /** Ground truth for this trace: whether it should ('confused') or must not ('coherent') trip L2+. */
    label: GroundTruth;
    /** Human-readable summary of the behavior the trace represents. */
    description: string;
    /** The trace itself, as the sequence of replay steps fed through the engine. */
    steps: ReplayStep[];
    /**
     * Optional source roots for the replay.
     * NOTE(review): semantics are defined by the replay/engine code, which is not visible in this
     * declaration file — confirm how these are interpreted and what is used when omitted.
     */
    sourceRoots?: string[];
}
/**
 * The labeled ground-truth corpus the signal is scored against. Contributors can extend it with
 * real recorded sessions (labeled by a human) to harden the baseline.
 * NOTE(review): presumably the default input of computeCalibration when no corpus is passed —
 * the default is not visible in this declaration; confirm against the implementation.
 */
export declare const CALIBRATION_CORPUS: CalibrationScenario[];
/**
 * Known sensitivities — borderline traces where the signal is over/under-sensitive. These are NOT
 * asserted as "correct"; they are documented evidence (the gate must weigh them) and a regression
 * pin on current behavior, so any future engine change that shifts them is noticed. This is the
 * honest output of validating accuracy against data: where the signal is weak, in the open.
 *
 * Unlike a CalibrationScenario, a sensitivity scenario carries no ground-truth label; it records
 * what the engine does today (`trips_today`) rather than what it ought to do.
 */
export interface SensitivityScenario {
    /** Identifier for the scenario; echoed back on the matching SensitivityResult. */
    name: string;
    /** Human-readable summary of the behavior the trace represents. */
    description: string;
    /** Why the signal mis-judges this — the mechanism. */
    note: string;
    /** The trace itself, as the sequence of replay steps fed through the engine. */
    steps: ReplayStep[];
    /**
     * Optional source roots for the replay.
     * NOTE(review): semantics are defined by the replay/engine code, which is not visible in this
     * declaration file — confirm how these are interpreted and what is used when omitted.
     */
    sourceRoots?: string[];
    /** Whether this trace trips L2+ today (current, possibly-undesirable behavior). */
    trips_today: boolean;
}
/**
 * The documented borderline traces, each pinned to its current L2+ outcome via `trips_today`.
 * NOTE(review): presumably the default input of evaluateSensitivities when no corpus is passed —
 * the default is not visible in this declaration; confirm against the implementation.
 */
export declare const KNOWN_SENSITIVITIES: SensitivityScenario[];
/** Outcome of replaying a single labeled CalibrationScenario through the engine. */
export interface ScenarioResult {
    /** Name of the scenario this result belongs to. */
    name: string;
    /** The scenario's ground-truth label, carried over for scoring. */
    label: GroundTruth;
    /** Peak intervention level for the trace. NOTE(review): presumably the maximum over all replay steps — confirm. */
    peakLevel: number;
    /** Peak signal score for the trace. NOTE(review): scale and units are defined by the engine — confirm. */
    peakScore: number;
    /** Whether the trace reached the intervention threshold under test (L2 or above). */
    trippedL2: boolean;
    /**
     * Did the engine classify this trace the way its ground-truth label expects?
     * Per the file header: a 'coherent' trace tripping L2+ is a false positive, and a 'confused'
     * trace reaching L2+ is a true positive.
     */
    correct: boolean;
}
/**
 * Aggregate discrimination metrics over a labeled corpus, measured at the L2 threshold.
 * NOTE(review): the rate fields are presumably fractions in [0, 1]; how they are reported when the
 * corresponding total is 0 is not visible in this declaration — confirm against the implementation.
 */
export interface CalibrationReport {
    /** Per-scenario outcomes the aggregates are derived from. */
    scenarios: ScenarioResult[];
    /** Count of scenarios labeled 'coherent'. */
    coherent_total: number;
    /** Count of 'coherent' scenarios that tripped L2+. */
    false_positives: number;
    /** False-positive rate. NOTE(review): presumably false_positives / coherent_total — confirm. */
    false_positive_rate: number;
    /** Count of scenarios labeled 'confused'. */
    confused_total: number;
    /** Count of 'confused' scenarios that reached L2+. */
    true_positives: number;
    /** Sensitivity (true-positive rate). NOTE(review): presumably true_positives / confused_total — confirm. */
    true_positive_rate: number;
    /** Overall accuracy. NOTE(review): presumably the share of scenarios whose `correct` is true — confirm. */
    accuracy: number;
}
/** Outcome of replaying a single SensitivityScenario, compared against its documented behavior. */
export interface SensitivityResult {
    /** Name of the scenario this result belongs to. */
    name: string;
    /** The scenario's description, carried over for reporting. */
    description: string;
    /** The scenario's mechanism note, carried over for reporting. */
    note: string;
    /** Peak intervention level for the trace. NOTE(review): presumably the maximum over all replay steps — confirm. */
    peakLevel: number;
    /** Whether the trace reached L2 or above in this run. */
    trippedL2: boolean;
    /** true if current behavior still matches the documented expectation (regression pin). */
    matchesDocumented: boolean;
}
/**
 * Evaluate the documented sensitivities against current engine behavior (regression pin).
 *
 * @param corpus Sensitivity scenarios to evaluate. Optional; NOTE(review): presumably defaults to
 *   KNOWN_SENSITIVITIES — the default is not visible in this declaration, confirm.
 * @returns Evaluation results; `matchesDocumented` on each flags whether the engine still behaves
 *   as the scenario documents. NOTE(review): presumably one result per input scenario — confirm.
 */
export declare function evaluateSensitivities(corpus?: SensitivityScenario[]): SensitivityResult[];
/**
 * Replay every scenario through the real engine and measure discrimination at the L2 threshold.
 *
 * @param corpus Labeled scenarios to score. Optional; NOTE(review): presumably defaults to
 *   CALIBRATION_CORPUS — the default is not visible in this declaration, confirm.
 * @returns A report with per-scenario results plus the false-positive rate, true-positive rate
 *   (sensitivity) and overall accuracy.
 */
export declare function computeCalibration(corpus?: CalibrationScenario[]): CalibrationReport;
//# sourceMappingURL=panic-calibration.d.ts.map