/**
 * SKILL.md Benchmark Harness
 *
 * Evaluates the ATR scanSkill() method against a labeled corpus of
 * malicious and benign SKILL.md files. Produces per-layer recall,
 * overall precision, and a detailed per-sample report.
 *
 * Corpus: data/skill-benchmark/manifest.json
 * Samples: data/skill-benchmark/malicious/ and data/skill-benchmark/benign/
 *
 * Layers:
 *   A = obvious payload (curl|bash, base64 exec, reverse shell)
 *   B = obfuscated (bash expansion, paste service relay, encoded)
 *   C = semantic (natural language instructions, social engineering)
 *
 * @module agent-threat-rules/eval/skill-benchmark
 */

/**
 * Outcome of scanning a single corpus sample.
 *
 * NOTE(review): field semantics below are inferred from the names and the
 * module header only; the implementation is not part of this declaration
 * file — confirm against the source module.
 */
interface SampleResult {
  /** Sample file identifier (presumably a path relative to the corpus directory). */
  readonly file: string;
  /** Ground-truth label of the sample. */
  readonly label: 'malicious' | 'benign';
  /** Layer of the sample; the module header lists A, B and C. */
  readonly layer: string;
  readonly attack_type: string;
  /** Whether the scan flagged the sample. */
  readonly detected: boolean;
  /** Rules that fired for this sample (presumably rule identifiers). */
  readonly rules_fired: readonly string[];
  /** Presumably true when `detected` agrees with `label` — TODO confirm. */
  readonly correct: boolean;
  /** Scan latency for this sample, in milliseconds. */
  readonly latency_ms: number;
  readonly expected_rules_matched: boolean;
  readonly category_correct: boolean;
}

/** Detection counts and recall for one layer of the corpus. */
interface LayerMetrics {
  readonly total: number;
  readonly detected: number;
  /** Presumably `detected / total` — TODO confirm handling of `total === 0`. */
  readonly recall: number;
}

/**
 * Aggregated benchmark result: corpus counts, overall and per-layer metrics,
 * the confusion matrix, latency statistics, and the per-sample results.
 */
interface SkillBenchmarkReport {
  readonly timestamp: string;
  readonly corpus_size: number;
  readonly malicious_count: number;
  readonly benign_count: number;
  readonly overall_recall: number;
  readonly overall_precision: number;
  readonly overall_f1: number;
  readonly fp_rate: number;
  readonly layer_a: LayerMetrics;
  readonly layer_b: LayerMetrics;
  readonly layer_c: LayerMetrics;
  readonly true_positives: number;
  readonly false_positives: number;
  readonly true_negatives: number;
  readonly false_negatives: number;
  readonly expected_rules_accuracy: number;
  readonly category_accuracy: number;
  readonly avg_latency_ms: number;
  readonly max_latency_ms: number;
  /** One entry per evaluated sample. */
  readonly results: readonly SampleResult[];
  /** Presumably the malicious samples that were not detected — TODO confirm. */
  readonly missed_attacks: readonly SampleResult[];
  /** Presumably the benign samples that were flagged — TODO confirm. */
  readonly false_alarms: readonly SampleResult[];
}

/**
 * Run the SKILL.md benchmark and resolve with the aggregated report.
 *
 * @param options - Optional overrides. Defaults are defined by the
 *   implementation and are not visible in this declaration file.
 *   `rulesDir` and `corpusDir` presumably select the rule set and the
 *   labeled corpus; `outputPath` presumably selects where a report is
 *   written — TODO confirm against the source module.
 * @returns A promise resolving to the benchmark report.
 */
export declare function runSkillBenchmark(options?: {
  readonly rulesDir?: string;
  readonly corpusDir?: string;
  readonly outputPath?: string;
}): Promise<SkillBenchmarkReport>;

/**
 * Write the standardized version-pinned Measurement file for a SkillBenchmark
 * report. Separated from runSkillBenchmark so unit tests can exercise the
 * benchmark logic without mutating data/measurements/ on disk (which would
 * make the CI 'sync-stats --check' drift gate flake).
 *
 * Called from the CLI block at the bottom of the source module and from any
 * external script that wants to persist the measurement. Safe to call
 * repeatedly the same day — uses force=true.
 *
 * @param report - The benchmark report to persist.
 */
export declare function writeSkillBenchmarkMeasurement(report: SkillBenchmarkReport): void;

/**
 * Print a benchmark report.
 *
 * NOTE(review): the output destination and format are not visible in this
 * declaration file (presumably a human-readable console summary) — confirm.
 *
 * @param report - The benchmark report to print.
 */
export declare function printReport(report: SkillBenchmarkReport): void;

export {};
//# sourceMappingURL=skill-benchmark.d.ts.map