/**
 * Type declarations for the drift detection evaluation framework.
 *
 * The framework exists to measure, in a systematic way, how accurate
 * semantic comparison is: precision, recall, and confidence calibration.
 */

/**
 * One labeled example used to evaluate semantic comparison accuracy.
 */
export interface GoldenTestCase {
    /** Identifier that is unique to this test case. */
    id: string;
    /** Which category of comparison this case exercises. */
    category: 'security' | 'limitation' | 'assertion';
    /** The first of the two texts being compared. */
    text1: string;
    /** The second of the two texts being compared. */
    text2: string;
    /** Name of the tool, supplied as context (affects fingerprinting). */
    toolName: string;
    /** True when the two texts should count as semantically equivalent. */
    expectedMatch: boolean;
    /** Optional range the returned confidence is expected to fall in. */
    expectedConfidence?: {
        min: number;
        max: number;
    };
    /** Human-written justification for the expected outcome. */
    reasoning: string;
    /** Where this test case came from. */
    source: 'manual' | 'llm-generated' | 'production' | 'user-feedback';
    /** Optional tags used for filtering and analysis. */
    tags?: string[];
}

/**
 * Outcome of running a single golden test case.
 */
export interface TestCaseResult {
    /** The test case this result belongs to. */
    testCase: GoldenTestCase;
    /** Whether the comparison reported a match. */
    actualMatch: boolean;
    /** Confidence score reported by the comparison. */
    actualConfidence: number;
    /** Whether the test case passed. */
    passed: boolean;
    /** Kind of failure; only meaningful when the case did not pass. */
    failureType?: 'false_positive' | 'false_negative' | 'confidence_out_of_range';
    /** How long this comparison took, in milliseconds. */
    durationMs: number;
    /** Optional detailed breakdown of the confidence factors. */
    confidenceFactors?: {
        name: string;
        weight: number;
        value: number;
        description: string;
    }[];
}

/**
 * Aggregated metrics for one category of comparisons.
 */
export interface CategoryMetrics {
    /** Name of the category. */
    category: string;
    /** How many test cases belong to this category. */
    totalCases: number;
    /** Accuracy within this category. */
    accuracy: number;
    /** Precision within this category. */
    precision: number;
    /** Recall within this category. */
    recall: number;
    /** F1 score within this category. */
    f1Score: number;
    /** Confusion matrix count: true positives. */
    truePositives: number;
    /** Confusion matrix count: true negatives. */
    trueNegatives: number;
    /** Confusion matrix count: false positives. */
    falsePositives: number;
    /** Confusion matrix count: false negatives. */
    falseNegatives: number;
}

/**
 * One bucket used when analyzing how accurate confidence scores are.
 */
export interface CalibrationBucket {
    /** Range of predicted confidence scores covered by this bucket. */
    predictedRange: {
        min: number;
        max: number;
    };
    /** Observed accuracy of the predictions that fall in this range. */
    actualAccuracy: number;
    /** How many samples landed in this bucket. */
    sampleCount: number;
    /** Calibration error for the bucket: |predicted - actual|. */
    calibrationError: number;
}

/**
 * Full evaluation result for a drift detection algorithm.
 *
 * NOTE(review): the descriptions of the confusion-matrix fields below are
 * carried over from the original declarations. They describe true
 * positives in terms of "matching" but false positives in terms of
 * "flagged as different", so which outcome is the positive class cannot
 * be determined from these types alone - confirm against the evaluator.
 */
export interface EvaluationResult {
    /** When the evaluation took place. */
    timestamp: Date;
    /** Version of the algorithm under evaluation. */
    algorithmVersion: string;
    /** Version of the dataset that was used. */
    datasetVersion: string;
    /** Total count of test cases. */
    totalCases: number;
    /** Overall accuracy, (TP + TN) / Total. */
    accuracy: number;
    /** Precision, TP / (TP + FP); high precision means few false positives. */
    precision: number;
    /** Recall, TP / (TP + FN); high recall means real drift is caught. */
    recall: number;
    /** F1 score, the harmonic mean of precision and recall. */
    f1Score: number;
    /** Cases correctly identified as matching. */
    truePositives: number;
    /** Cases correctly identified as different. */
    trueNegatives: number;
    /** Cases incorrectly flagged as different (noise / false alarm). */
    falsePositives: number;
    /** Real differences that were missed (dangerous). */
    falseNegatives: number;
    /** Average of |predicted_confidence - actual_accuracy|. */
    calibrationError: number;
    /** Mean squared error of the probabilistic predictions. */
    brierScore: number;
    /** Per-bucket calibration data for detailed analysis. */
    calibrationBuckets: CalibrationBucket[];
    /** Metrics broken down by category. */
    categoryMetrics: CategoryMetrics[];
    /** Every individual test result. */
    testResults: TestCaseResult[];
    /** The failed test cases, kept for analysis. */
    failures: TestCaseResult[];
    /** Total time spent evaluating, in milliseconds. */
    totalDurationMs: number;
    /** Average time per comparison, in milliseconds. */
    averageComparisonMs: number;
}

/**
 * Condensed view of an evaluation, intended for display and reporting.
 * The metric fields are strings; the case counts are numbers.
 */
export interface EvaluationSummary {
    accuracy: string;
    precision: string;
    recall: string;
    f1Score: string;
    falsePositiveRate: string;
    falseNegativeRate: string;
    calibrationError: string;
    totalCases: number;
    passedCases: number;
    failedCases: number;
}

/**
 * Options accepted when running an evaluation. Every field is optional.
 */
export interface EvaluationOptions {
    /** Restrict the run to these categories. */
    categories?: ('security' | 'limitation' | 'assertion')[];
    /** Restrict the run to these tags. */
    tags?: string[];
    /** Produce verbose output. */
    verbose?: boolean;
    /** Include the detailed confidence factors in the results. */
    includeFactors?: boolean;
}

/**
 * Contract a semantic comparison algorithm implements so that it can be
 * plugged into the evaluation framework.
 */
export interface SemanticComparator {
    /**
     * Compares two texts and reports whether they match, together with a
     * confidence value and, optionally, the factors behind it.
     */
    compare(
        text1: string,
        text2: string,
        toolName: string,
        category: 'security' | 'limitation' | 'assertion'
    ): {
        matches: boolean;
        confidence: number;
        factors?: {
            name: string;
            weight: number;
            value: number;
            description: string;
        }[];
    };
}
//# sourceMappingURL=types.d.ts.map