/**
 * Performance Regression Detection
 *
 * Tracks response times across tool executions and detects performance regressions.
 * Provides percentile-based metrics (p50, p95, p99) for comprehensive latency analysis.
 *
 * NOTE(review): the `Map` types in this declaration file carry explicit type
 * arguments (`Map<string, ...>`); a bare `Map` does not type-check. The string
 * key is presumed to be the tool name - confirm against the implementation.
 */
import type { BehavioralBaseline, ChangeSeverity, PerformanceConfidence } from './types.js';

/**
 * Latency trend direction.
 */
export type LatencyTrend = 'improving' | 'stable' | 'degrading';

/**
 * Performance metrics for a single tool.
 */
export interface ToolPerformanceMetrics {
    /** Tool name */
    toolName: string;
    /** 50th percentile latency in milliseconds */
    p50Ms: number;
    /** 95th percentile latency in milliseconds */
    p95Ms: number;
    /** 99th percentile latency in milliseconds */
    p99Ms: number;
    /** Success rate (0-1) */
    successRate: number;
    /** Total number of executions */
    sampleCount: number;
    /** Average latency in milliseconds */
    avgMs: number;
    /** Minimum latency in milliseconds */
    minMs: number;
    /** Maximum latency in milliseconds */
    maxMs: number;
    /** Standard deviation of latency */
    stdDevMs: number;
    /** Timestamp of when metrics were collected */
    collectedAt: Date;
    /** Statistical confidence metrics */
    confidence?: PerformanceConfidence;
}

/**
 * Performance baseline for a tool (stored in baseline file).
 */
export interface PerformanceBaseline {
    /** Tool name */
    toolName: string;
    /** Baseline 50th percentile latency */
    baselineP50: number;
    /** Baseline 95th percentile latency */
    baselineP95: number;
    /** Baseline 99th percentile latency */
    baselineP99: number;
    /** Baseline success rate */
    baselineSuccessRate: number;
    /** Maximum allowed regression percentage (default from config) */
    maxAllowedRegression: number;
    /** When the baseline was established */
    establishedAt: Date;
}

/**
 * Performance comparison result for a single tool.
 */
export interface PerformanceComparison {
    /** Tool name */
    toolName: string;
    /** Current metrics */
    current: ToolPerformanceMetrics;
    /** Baseline metrics (if available) */
    baseline?: PerformanceBaseline;
    /** Latency trend */
    trend: LatencyTrend;
    /** Regression percentage for p50 (positive = slower, negative = faster) */
    p50RegressionPercent: number | null;
    /** Regression percentage for p95 */
    p95RegressionPercent: number | null;
    /** Regression percentage for p99 */
    p99RegressionPercent: number | null;
    /** Whether this tool has regressed beyond threshold */
    hasRegression: boolean;
    /** Severity of the regression */
    severity: ChangeSeverity;
    /** Human-readable summary */
    summary: string;
    /** Current confidence level */
    confidence?: PerformanceConfidence;
    /** Whether the regression is statistically reliable (based on confidence) */
    isReliable: boolean;
}

/**
 * Overall performance report for a baseline comparison.
 */
export interface PerformanceReport {
    /** Individual tool comparisons */
    toolComparisons: PerformanceComparison[];
    /** Number of tools with performance regressions */
    regressionCount: number;
    /** Number of tools with improved performance */
    improvementCount: number;
    /** Number of tools with stable performance */
    stableCount: number;
    /** Overall performance trend */
    overallTrend: LatencyTrend;
    /** Overall severity */
    overallSeverity: ChangeSeverity;
    /** Human-readable summary */
    summary: string;
    /** Number of tools with low confidence */
    lowConfidenceCount: number;
    /** Tools with low confidence (names) */
    lowConfidenceTools: string[];
    /** Number of reliable regressions (regressions with good confidence) */
    reliableRegressionCount: number;
}

/**
 * Raw latency sample for calculating metrics.
 */
export interface LatencySample {
    /** Name of the tool this sample belongs to */
    toolName: string;
    /** Measured duration of the execution, in milliseconds */
    durationMs: number;
    /** Whether the execution succeeded */
    success: boolean;
    /** Timestamp associated with the sample (presumably when it was recorded) */
    timestamp: Date;
    /**
     * Expected outcome of this test.
     * - 'success': Happy path test, expects tool to succeed
     * - 'error': Validation test, expects tool to reject/fail
     * - 'either': Edge case, either outcome is acceptable
     */
    expectedOutcome?: 'success' | 'error' | 'either';
    /**
     * Whether the outcome was correct based on expectations.
     * True if: (expected success && got success) OR (expected error && got error)
     */
    outcomeCorrect?: boolean;
}

/**
 * Calculate statistical confidence for performance metrics.
 *
 * Confidence is determined by:
 * 1. Sample count - more samples = higher confidence
 * 2. Coefficient of variation (CV) - lower variability = higher confidence
 *
 * Key insight: For confidence calculation, we only count happy_path tests that
 * expect success. Validation tests (expectedOutcome: 'error') are tracked
 * separately because their failure doesn't indicate tool problems.
 *
 * Note: The first sample is excluded from variance calculation because it includes
 * cold-start overhead (JIT compilation, connection establishment, cache warming).
 * This gives more accurate confidence scores for steady-state performance.
 *
 * @param samples - The latency samples to analyze
 * @param options - Optional configuration; `excludeWarmup` presumably toggles the
 *   first-sample exclusion described above (confirm against the implementation)
 * @returns Performance confidence metrics
 */
export declare function calculatePerformanceConfidence(
    samples: LatencySample[],
    options?: {
        excludeWarmup?: boolean;
    }
): PerformanceConfidence;

/**
 * Calculate performance confidence from ToolPerformanceMetrics.
 * Use this when you already have calculated metrics but need confidence.
 *
 * Note: This function assumes the metrics are from happy path tests only.
 * For full validation/success separation, use calculatePerformanceConfidence with raw samples.
 *
 * @param metrics - Previously calculated metrics for a single tool
 * @param options - Optional counts (`validationSamples`, `totalTests`) supplied by the caller
 * @returns Performance confidence metrics
 */
export declare function calculateConfidenceFromMetrics(
    metrics: ToolPerformanceMetrics,
    options?: {
        validationSamples?: number;
        totalTests?: number;
    }
): PerformanceConfidence;

/**
 * Format confidence level for display.
 *
 * @param confidence - Confidence metrics to format
 * @param includeIndicator - Optional flag controlling whether an indicator is included in the output
 * @returns Display string
 */
export declare function formatConfidenceLevel(
    confidence: PerformanceConfidence,
    includeIndicator?: boolean
): string;

/**
 * Check if performance data has sufficient confidence for reliable comparisons.
 *
 * @param confidence - Confidence metrics to check
 * @returns Whether the confidence is sufficient for reliable comparisons
 */
export declare function hasReliableConfidence(confidence: PerformanceConfidence): boolean;

/**
 * Calculate performance metrics from raw latency samples.
 *
 * @param samples - Raw latency samples
 * @returns The calculated metrics, or `null` when no metrics are produced
 */
export declare function calculateMetrics(samples: LatencySample[]): ToolPerformanceMetrics | null;

/**
 * Create a performance baseline from metrics.
 *
 * @param metrics - Metrics to derive the baseline from
 * @param maxAllowedRegression - Optional maximum allowed regression for the baseline
 * @returns The new performance baseline
 */
export declare function createPerformanceBaseline(
    metrics: ToolPerformanceMetrics,
    maxAllowedRegression?: number
): PerformanceBaseline;

/**
 * Extract performance baselines from a behavioral baseline.
 * Uses the performance metrics stored in tool fingerprints.
 *
 * @param baseline - Behavioral baseline to read from
 * @param regressionThreshold - Optional regression threshold
 * @returns Map of performance baselines (presumably keyed by tool name)
 */
export declare function extractPerformanceBaselines(
    baseline: BehavioralBaseline,
    regressionThreshold?: number
): Map<string, PerformanceBaseline>;

/**
 * Compare current metrics against baseline.
 *
 * @param current - Current metrics for a tool
 * @param baseline - Baseline for the same tool, or `undefined` when none is available
 * @param regressionThreshold - Optional regression threshold
 * @returns The comparison result
 */
export declare function comparePerformance(
    current: ToolPerformanceMetrics,
    baseline: PerformanceBaseline | undefined,
    regressionThreshold?: number
): PerformanceComparison;

/**
 * Generate a complete performance report comparing current and baseline.
 *
 * @param currentMetrics - Map of current metrics (presumably keyed by tool name)
 * @param baselines - Map of baselines (presumably keyed by tool name)
 * @param regressionThreshold - Optional regression threshold
 * @returns The overall performance report
 */
export declare function generatePerformanceReport(
    currentMetrics: Map<string, ToolPerformanceMetrics>,
    baselines: Map<string, PerformanceBaseline>,
    regressionThreshold?: number
): PerformanceReport;

/**
 * Format performance metrics for display.
 *
 * @param metrics - Metrics to format
 * @returns Display string
 */
export declare function formatMetrics(metrics: ToolPerformanceMetrics): string;

/**
 * Format performance comparison for display.
 *
 * @param comparison - Comparison to format
 * @returns Display string
 */
export declare function formatComparison(comparison: PerformanceComparison): string;

/**
 * Check if metrics indicate acceptable performance.
 *
 * @param comparison - Comparison result to evaluate
 * @param failOnRegression - Optional flag; presumably controls whether a regression
 *   counts as unacceptable (confirm against the implementation)
 * @returns Whether performance is acceptable
 */
export declare function isPerformanceAcceptable(
    comparison: PerformanceComparison,
    failOnRegression?: boolean
): boolean;

/**
 * Aggregate multiple samples into metrics grouped by tool.
 *
 * @param samples - Raw latency samples, possibly spanning several tools
 * @returns Map of metrics per tool (presumably keyed by tool name)
 */
export declare function aggregateSamplesByTool(
    samples: LatencySample[]
): Map<string, ToolPerformanceMetrics>;
//# sourceMappingURL=performance-tracker.d.ts.map