/**
 * Result of scoring one candidate commit subject against the change context.
 */
type CandidateEvaluation = {
  /** The candidate subject line (trimmed by `dedupe`). */
  subject: string
  /** Heuristic score; higher is better. */
  score: number
  /** True when the subject mentions none of the top dominant tokens. */
  hardRejected: boolean
  /** Dominant tokens derived from the changed paths and diff lines. */
  dominantTokens: string[]
  /** Those of the top two dominant tokens that the subject does not mention. */
  missingTopTokens: string[]
}

/** Tokens ignored when ranking dominant terms (common English words and code keywords). */
const TOKEN_STOPWORDS = new Set<string>([
  "a",
  "an",
  "and",
  "are",
  "as",
  "at",
  "be",
  "by",
  "const",
  "else",
  "false",
  "for",
  "from",
  "function",
  "if",
  "import",
  "in",
  "is",
  "it",
  "let",
  "new",
  "null",
  "of",
  "on",
  "or",
  "return",
  "set",
  "src",
  "that",
  "the",
  "this",
  "to",
  "true",
  "type",
  "undefined",
  "use",
  "var",
  "with",
])

/** Leading verbs that are penalized when they open a subject description. */
const VAGUE_LEAD_VERBS = new Set<string>(["support", "update", "improve", "change"])

/**
 * Word stems whose presence in the diff lines counts as evidence of a defect.
 * "segfault" is listed explicitly because terms are matched at word starts only,
 * so it would otherwise no longer be found through "fault".
 */
const DEFECT_EVIDENCE_TERMS = [
  "bug",
  "broken",
  "crash",
  "defect",
  "fault",
  "incorrect",
  "panic",
  "regression",
  "segfault",
  "wrong",
]

/**
 * Matches a defect term only when it begins a word (no letter directly before it),
 * so "default" does not count as "fault" and "debug" does not count as "bug".
 * Suffixes are still allowed ("bugs", "crashed", "wrongly").
 * Applied to lowercased text whose camelCase boundaries have been split.
 */
const DEFECT_EVIDENCE_PATTERN = new RegExp(`(?<![a-z])(?:${DEFECT_EVIDENCE_TERMS.join("|")})`)

/**
 * Scores every distinct candidate subject and returns the best one.
 *
 * @param candidates - Candidate subject lines; blank entries and duplicates are dropped.
 * @param context - Change context text; diff lines and the `- likely_new_surface: yes`
 *   marker are read from it.
 * @param selectedPaths - Paths of the changed files, used to weight dominant tokens.
 * @returns The highest-scoring candidate that is not hard-rejected; if all are
 *   hard-rejected, the highest-scoring one; `null` when there are no candidates.
 */
export function selectBestCommitSubject({
  candidates,
  context,
  selectedPaths,
}: {
  candidates: string[]
  context: string
  selectedPaths: string[]
}): CandidateEvaluation | null {
  const deduped = dedupe(candidates)
  if (deduped.length === 0) {
    return null
  }

  const dominantTokens = collectDominantTokens(context, selectedPaths)
  const evaluations = deduped.map((subject) => evaluateCandidate(subject, context, dominantTokens))
  // Highest score first; the sort is stable, so ties keep candidate order.
  evaluations.sort((a, b) => b.score - a.score)

  const firstNonRejected = evaluations.find((entry) => !entry.hardRejected)
  return firstNonRejected ?? evaluations[0] ?? null
}

/**
 * Builds a hint naming up to three dominant tokens that the subject omits.
 *
 * @param context - Change context text containing the diff lines.
 * @param selectedPaths - Paths of the changed files.
 * @param subject - The subject line to check.
 * @returns The hint text, or `null` when there are no dominant tokens or none is missing.
 */
export function buildDominantTokenRetryHint({
  context,
  selectedPaths,
  subject,
}: {
  context: string
  selectedPaths: string[]
  subject: string
}): string | null {
  const dominantTokens = collectDominantTokens(context, selectedPaths)
  if (dominantTokens.length === 0) {
    return null
  }

  const subjectTokens = tokenize(subject)
  const missing = dominantTokens.filter((token) => !subjectTokens.has(token)).slice(0, 3)
  if (missing.length === 0) {
    return null
  }

  return `include dominant changed terms when relevant: ${missing.join(", ")}`
}

/**
 * Scores a single candidate subject.
 *
 * The score combines dominant-token overlap with a plausibility check of the
 * conventional-commit type against the context: `feat` is rewarded only when the
 * context marks a likely new surface, `fix` only when the diff shows defect
 * evidence, and `refactor` when either signal is absent.
 */
function evaluateCandidate(
  subject: string,
  context: string,
  dominantTokens: string[],
): CandidateEvaluation {
  const parsed = parseSubject(subject)
  const subjectTokens = tokenize(subject)
  const defectEvidence = hasDefectEvidence(context)
  const likelyNewSurface = /- likely_new_surface: yes/.test(context)

  const topDominant = dominantTokens.slice(0, 2)
  const missingTopTokens = topDominant.filter((token) => !subjectTokens.has(token))
  const overlap = dominantTokens.filter((token) => subjectTokens.has(token)).length
  // With no dominant tokens there is nothing to miss, so treat overlap as complete.
  const overlapRatio = dominantTokens.length > 0 ? overlap / dominantTokens.length : 1
  // Rejected only when every one of the top tokens is absent from the subject.
  const hardRejected = topDominant.length > 0 && missingTopTokens.length === topDominant.length

  let score = 0
  score += hardRejected ? -80 : 24
  score += overlap * 9
  score += Math.round(overlapRatio * 16)

  if (parsed.type === "feat") {
    score += likelyNewSurface ? 6 : -8
  }
  if (parsed.type === "fix") {
    score += defectEvidence ? 8 : -14
  }
  if (parsed.type === "refactor") {
    if (!defectEvidence) score += 6
    if (!likelyNewSurface) score += 3
  }

  const leadVerb = parsed.description.split(/\s+/)[0] ?? ""
  if (VAGUE_LEAD_VERBS.has(leadVerb)) {
    score -= 8
  }
  if (/\band\b/.test(parsed.description)) {
    score += 2
  }

  return {
    subject,
    score,
    hardRejected,
    dominantTokens,
    missingTopTokens,
  }
}

/**
 * Ranks the tokens that dominate the change.
 *
 * Each distinct token in a path adds 2 per path; each occurrence in a diff line
 * adds 1. Tokens shorter than three characters and stopwords are ignored.
 *
 * @returns Up to eight tokens with a count of at least 4, or, when none reaches
 *   that count, the six highest-ranked tokens. Ties are ordered alphabetically.
 */
function collectDominantTokens(context: string, selectedPaths: string[]): string[] {
  const counts = new Map<string, number>()
  const isSignificant = (token: string): boolean =>
    token.length >= 3 && !TOKEN_STOPWORDS.has(token)

  for (const path of selectedPaths) {
    // A token is counted once per path, however often it repeats inside that path.
    const uniquePathTokens = new Set(splitTokens(path).map(normalizeToken).filter(isSignificant))
    for (const token of uniquePathTokens) {
      counts.set(token, (counts.get(token) ?? 0) + 2)
    }
  }

  for (const line of extractDiffLines(context)) {
    for (const token of splitTokens(line).map(normalizeToken).filter(isSignificant)) {
      counts.set(token, (counts.get(token) ?? 0) + 1)
    }
  }

  const ranked = Array.from(counts.entries())
    .sort((a, b) => (b[1] === a[1] ? a[0].localeCompare(b[0]) : b[1] - a[1]))
    .map(([token, count]) => ({ token, count }))

  const dominant = ranked
    .filter((entry) => entry.count >= 4)
    .slice(0, 8)
    .map((entry) => entry.token)
  if (dominant.length > 0) {
    return dominant
  }

  return ranked.slice(0, 6).map((entry) => entry.token)
}

/**
 * Returns the added/removed lines of the context.
 *
 * Only text from the "Diff highlights:" marker onward is considered when the
 * marker is present; otherwise the whole context is scanned. Lines starting
 * with "+++ " or "--- " (file headers) are excluded.
 */
function extractDiffLines(context: string): string[] {
  const diffStart = context.indexOf("Diff highlights:")
  const section = diffStart >= 0 ? context.slice(diffStart) : context
  return section
    .split("\n")
    .filter((line) => line.startsWith("+") || line.startsWith("-"))
    .filter((line) => !line.startsWith("+++ ") && !line.startsWith("--- "))
}

/**
 * Reports whether the diff lines mention a defect-related term.
 *
 * Terms must start a word: identifiers are split at camelCase boundaries and
 * then lowercased, so `handleCrash` and `has_bug` match, while `default` and
 * `debug` do not.
 */
function hasDefectEvidence(context: string): boolean {
  const diffText = extractDiffLines(context)
    .join("\n")
    // Split camelCase so an embedded term still begins a word after lowercasing.
    .replace(/([a-z0-9])([A-Z])/g, "$1 $2")
    .toLowerCase()
  return DEFECT_EVIDENCE_PATTERN.test(diffText)
}

/**
 * Splits a conventional-commit style subject (`type(scope)!: description`).
 *
 * @returns The type and the lowercased description; both are empty strings when
 *   the subject does not match the pattern.
 */
function parseSubject(subject: string): { type: string; description: string } {
  const match = subject.match(/^([a-z]+)(?:\([^)]*\))?!?:\s*(.+)$/)
  return {
    type: (match?.[1] ?? "").trim(),
    description: (match?.[2] ?? "").trim().toLowerCase(),
  }
}

/** Returns the set of normalized tokens of `text` that have at least two characters. */
function tokenize(text: string): Set<string> {
  return new Set(
    splitTokens(text)
      .map(normalizeToken)
      .filter((token) => token.length >= 2),
  )
}

/** Lowercases `text` and splits it on every run of characters outside `a-z0-9`. */
function splitTokens(text: string): string[] {
  return text
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter(Boolean)
}

/**
 * Reduces simple plurals of words longer than four characters to a singular
 * form ("queries" -> "query", "tokens" -> "token").
 *
 * Words ending in "ss" are left intact so that "class" or "process" are not
 * truncated to "clas" / "proces", which would otherwise appear in retry hints.
 */
function normalizeToken(token: string): string {
  const value = token.toLowerCase()
  if (value.length <= 4) {
    return value
  }
  if (value.endsWith("ies")) {
    return `${value.slice(0, -3)}y`
  }
  if (value.endsWith("s") && !value.endsWith("ss")) {
    return value.slice(0, -1)
  }
  return value
}

/** Trims each value and drops empty strings and repeats, keeping first-seen order. */
function dedupe(values: string[]): string[] {
  const seen = new Set<string>()
  const output: string[] = []
  for (const value of values) {
    const normalized = value.trim()
    if (!normalized || seen.has(normalized)) continue
    seen.add(normalized)
    output.push(normalized)
  }
  return output
}