/**
 * Dual Tournament Engine
 *
 * Shared scoring/ranking utilities used by dual tournaments in both /upgrade and /attack flows.
 * Encodes policy vs evaluator tournaments with human-like reward heuristics and multi-evaluator
 * aggregation to approximate human code review preferences.
 */

/** Identifier of a policy taking part in a tournament. */
export type PolicyId = string;

/** A policy (candidate producer) participating in a tournament. */
export interface TournamentPolicy {
  id: PolicyId;
  label?: string;
  kind?: 'primary' | 'refiner' | 'checkpoint' | 'attack';
  elo?: number;
}

/** An evaluator whose scores/rankings are combined with those of other evaluators. */
export interface TournamentEvaluator {
  id: string;
  label?: string;
  /** Weight used when combining evaluator rankings (can be influenced by historical ELO). */
  weight?: number;
  kind?: 'hard' | 'soft' | 'hybrid';
  elo?: number;
}

/** Description of the task the candidates are competing on. */
export interface TournamentTask {
  id: string;
  goal: string;
  repoSnapshot?: string;
  tests?: string[];
  constraints?: string[];
  /**
   * Free-form task metadata.
   * NOTE(review): the type arguments of this `Record` were missing from the declaration;
   * `Record<string, unknown>` is a reconstruction - confirm against the implementation file.
   */
  metadata?: Record<string, unknown>;
}

/** Optional numeric metrics recorded for a candidate. */
export interface CandidateMetrics {
  executionSuccess?: number;
  testsPassed?: number;
  testsFailed?: number;
  staticAnalysis?: number;
  codeQuality?: number;
  blastRadius?: number;
  diffSize?: number;
  complexityDelta?: number;
  dependenciesAdded?: number;
  speedBonus?: number;
  toolSuccesses?: number;
  toolFailures?: number;
  warnings?: number;
}

/** Optional soft signals attached to a candidate. */
export interface CandidateSignals {
  /** Learned reward model / preference score (0-1) */
  rewardModelScore?: number;
  /** Self-assessed confidence from the agent (0-1) */
  selfAssessment?: number;
  /** Optional human preference label (0-1) */
  humanPreference?: number;
}

/** A single evaluator's score for a candidate. */
export interface EvaluatorScore {
  evaluatorId: string;
  score: number;
  weight?: number;
  notes?: string;
}

/** A candidate entered into the tournament, tied to the policy that produced it. */
export interface TournamentCandidate {
  id: string;
  policyId: PolicyId;
  patchSummary?: string;
  diffSummary?: string;
  metrics?: CandidateMetrics;
  signals?: CandidateSignals;
  evaluatorScores?: EvaluatorScore[];
  rawOutput?: string;
}

/** Weights of the three components of the human-like reward. */
export interface HumanRewardWeights {
  /** Correctness weight */
  alpha: number;
  /** Code quality / robustness weight */
  beta: number;
  /** Learned reward / human preference weight */
  gamma: number;
}

/** Default reward weights; the concrete values live in the implementation file. */
export declare const DEFAULT_HUMAN_REWARD_WEIGHTS: HumanRewardWeights;

/** A candidate's scores and position in the final ranking. */
export interface RankedCandidate {
  candidateId: string;
  aggregateScore: number;
  /** Relative human-like accuracy (1 = best rank, 0 = worst rank) */
  humanAccuracy: number;
  rank: number;
  correctnessScore: number;
  qualityScore: number;
  learnedScore: number;
  evaluatorScore: number;
}

/**
 * Pairwise win table.
 * NOTE(review): the type arguments were lost from the declaration (`Record>`); the nested
 * `Record<string, Record<string, number>>` shape (id -> opponent id -> number) is a
 * reconstruction - confirm against the implementation file.
 */
export type PairwiseWins = Record<string, Record<string, number>>;

/** Result returned by {@link runDualTournament}. */
export interface TournamentOutcome {
  task: TournamentTask;
  ranked: RankedCandidate[];
  pairwise: PairwiseWins;
  /**
   * Per-key evaluator details.
   * NOTE(review): the type arguments of this `Record` were missing from the declaration;
   * `Record<string, EvaluatorScore[]>` is a reconstruction - confirm against the
   * implementation file.
   */
  evaluatorBreakdown: Record<string, EvaluatorScore[]>;
}

/** Optional settings accepted by {@link runDualTournament}. */
export interface TournamentOptions {
  rewardWeights?: HumanRewardWeights;
  evaluators?: TournamentEvaluator[];
  /** When true, prefer smaller diffs by default if diffSize is provided */
  preferSmallerDiff?: boolean;
  /** Maximum candidates to evaluate (caps O(n^2) work) */
  maxCandidates?: number;
}

/**
 * Run a dual tournament over candidate patches/agents, combining hard metrics,
 * human-like reward heuristics, and evaluator rankings.
 *
 * @param task - The task the candidates were produced for.
 * @param candidates - Candidates to score and rank.
 * @param options - Optional reward weights, evaluators and limits.
 * @returns The task together with the ranked candidates, the pairwise table and the
 *          evaluator breakdown.
 */
export declare function runDualTournament(
  task: TournamentTask,
  candidates: TournamentCandidate[],
  options?: TournamentOptions,
): TournamentOutcome;
//# sourceMappingURL=dualTournament.d.ts.map