import type { Decision } from "./types.ts";

/** Words that `tokenize` discards; they are too common to be useful query terms. */
export const STOP_WORDS = new Set([
  "the",
  "and",
  "for",
  "with",
  "that",
  "this",
  "from",
  "have",
  "has",
  "was",
  "were",
  "are",
  "not",
  "you",
  "your",
  "but",
  "all",
  "can",
  "into",
  "use",
  "using",
  "any",
  "what",
  "when",
  "where",
  "why",
  "how",
  "who",
  "did",
  "does",
  "their",
  "there",
]);

/**
 * Query terms that still count as strong matches in `scoreDecision` but do not
 * count toward the "meaningful" match total used by the injection gates.
 */
const GENERIC_INJECTION_TERMS = new Set([
  "decision",
  "decided",
  "preference",
  "workflow",
  "implementation",
  "manual",
  "important",
  "pinned",
]);

/** Tags of which a non-manual decision must carry at least one to be injected. */
const INJECTABLE_TAGS = new Set(["decision", "preference", "workflow"]);

/** Per-token multiplier applied to lexical match scores; see `buildTokenWeights`. */
export type TokenWeights = ReadonlyMap<string, number>;

/** Matches titles that start with an operational verb ("Read …", "Ran …", …). */
const OPERATIONAL_TITLE_RE = /^(read|ran|executed|wrote|edited|listed|searched|fetched|opened)\b/i;

/**
 * Splits text into lowercase search tokens.
 *
 * A token is a run of at least three characters from `[a-z0-9_./:-]`. Leading and
 * trailing `.`, `:` and `-` are trimmed afterwards; tokens that end up shorter than
 * three characters, or that are in `STOP_WORDS`, are dropped. Duplicates are kept.
 *
 * @param text Arbitrary input text.
 * @returns Tokens in order of appearance.
 */
export function tokenize(text: string): string[] {
  const matches = text.toLowerCase().match(/[a-z0-9_./:-]{3,}/g) ?? [];
  return matches
    .map((token) => token.replace(/^[.:-]+|[.:-]+$/g, ""))
    .filter((token) => token.length >= 3 && !STOP_WORDS.has(token));
}

/**
 * Derives a project root from a working directory.
 *
 * When a `/`-separated segment equals "repos" (case-insensitive) and is followed by
 * another segment, the result is the path up to and including that following
 * segment, with a leading slash. Otherwise `cwd` is returned unchanged.
 *
 * @param cwd Working directory path; only `/` is treated as a separator.
 * @returns The derived project root, or `cwd` itself.
 */
export function projectRoot(cwd: string): string {
  const parts = cwd.split("/").filter(Boolean);
  const reposIndex = parts.findIndex((part) => part.toLowerCase() === "repos");
  if (reposIndex >= 0 && parts[reposIndex + 1]) {
    return `/${parts.slice(0, reposIndex + 2).join("/")}`;
  }
  return cwd;
}

/**
 * Returns the last non-empty segment of `projectRoot(cwd)`, or `cwd` itself when
 * the root has no non-empty segment (for example an empty string or "/").
 */
export function projectName(cwd: string): string {
  return projectRoot(cwd).split("/").filter(Boolean).at(-1) ?? cwd;
}

/**
 * Reports whether two working directories belong to the same project: either their
 * project roots are identical or their project names are.
 */
export function sameProjectScope(a: string, b: string): boolean {
  return projectRoot(a) === projectRoot(b) || projectName(a) === projectName(b);
}

/**
 * Infers category tags from keyword patterns in the title, text and existing tags.
 *
 * @param title Decision title.
 * @param text Decision body.
 * @param existing Tags already present; empty entries are dropped, the rest are kept.
 * @returns The sorted, de-duplicated union of the existing and inferred tags.
 */
export function inferTags(title: string, text: string, existing: string[] = []): string[] {
  const result = new Set(existing.filter(Boolean));
  const body = `${title}\n${text}\n${existing.join(" ")}`.toLowerCase();

  if (/\b(decision|decided|choose|chose|use .+ because|architectural decision)\b/.test(body)) {
    result.add("decision");
  }
  if (/\b(preference|prefer|always|never|do not|don't|avoid|standing instruction)\b/.test(body)) {
    result.add("preference");
  }
  if (/\b(workflow|process|procedure|steps|runbook|playbook|command sequence)\b/.test(body)) {
    result.add("workflow");
  }
  if (/\b(bug|bugfix|fix|fixed|error|failed|failure|root cause|regression|traceback)\b/.test(body)) {
    result.add("bugfix");
  }
  if (
    /\b(implemented|added|updated|created|refactored|commit|committed|push|pushed|migration|phase)\b/.test(
      body,
    )
  ) {
    result.add("implementation");
  }

  return [...result].sort();
}

/**
 * Builds per-token weights from the non-archived decisions.
 *
 * Document frequency counts each token at most once per decision, across its title,
 * tags and text.
 *
 * @param decisions The full decision corpus; archived entries are ignored.
 * @returns A map from token to weight in the range [0.75, 2.5]; empty when there are
 *   no active decisions.
 */
export function buildTokenWeights(decisions: Decision[]): Map<string, number> {
  const active = decisions.filter((d) => !d.archived);
  const docCount = active.length;
  const docFreq = new Map<string, number>();

  for (const decision of active) {
    const tokens = new Set([
      ...tokenize(decision.title),
      ...decision.tags.flatMap((tag) => tokenize(tag)),
      ...tokenize(decision.text),
    ]);
    for (const token of tokens) docFreq.set(token, (docFreq.get(token) ?? 0) + 1);
  }

  const weights = new Map<string, number>();
  if (docCount === 0) return weights;

  for (const [token, df] of docFreq) {
    // Tiny deterministic IDF-style weighting: common corpus terms are slightly
    // damped, rare terms get a capped boost. No embeddings or model calls.
    weights.set(token, 0.75 + Math.min(1.75, Math.log((docCount + 1) / (df + 1))));
  }
  return weights;
}

/**
 * Lightweight scoring tuned for a small, curated decision corpus and local-LLM use.
 *
 * `now` is injected so tests can pin the clock. Lexical overlap on title+tags+text is
 * the floor; zero overlap → score 0 so unrelated memories never inject. Bonuses cover
 * pinning, project locality, prior usage, and recency.
 *
 * @param decision The decision to score. Archived decisions always score 0.
 * @param query Free-text query; it is tokenized with `tokenize`.
 * @param cwd Current working directory, used for the locality bonus; may be undefined.
 * @param options `now` overrides the clock in epoch milliseconds; `forInjection`
 *   enables the stricter auto-injection gates; `tokenWeights` scales each matched
 *   term (a missing term weighs 1).
 * @returns A non-negative, rounded score; 0 means "do not surface".
 */
export function scoreDecision(
  decision: Decision,
  query: string,
  cwd: string | undefined,
  options: { now?: number; forInjection?: boolean; tokenWeights?: TokenWeights } = {},
): number {
  const forInjection = Boolean(options.forInjection);

  if (decision.archived) return 0;
  if (forInjection && decision.supersededBy) return 0;

  const q = tokenize(query);
  if (q.length === 0) return 0;

  const titleTokens = new Set(tokenize(decision.title));
  const tagTokens = new Set(decision.tags.flatMap((tag) => tokenize(tag)));
  const bodyTokens = new Set(tokenize(decision.text));
  const allDecisionTokens = [...bodyTokens, ...titleTokens, ...tagTokens];

  let lexical = 0;
  let strongMatches = 0;
  let meaningfulMatches = 0;
  const weighted = (term: string, base: number) => base * (options.tokenWeights?.get(term) ?? 1);

  for (const term of q) {
    // Exact matches are ranked by field: title beats tags, tags beat body text.
    const exactBase = titleTokens.has(term)
      ? 6
      : tagTokens.has(term)
        ? 4
        : bodyTokens.has(term)
          ? 3
          : 0;

    if (exactBase > 0) {
      lexical += weighted(term, exactBase);
      strongMatches += 1;
      if (!GENERIC_INJECTION_TERMS.has(term)) meaningfulMatches += 1;
    } else if (allDecisionTokens.some((token) => token.includes(term) || term.includes(token))) {
      // Substring overlap in either direction is a weak hint; it never counts as strong.
      lexical += weighted(term, 1);
    }
  }

  if (lexical === 0) return 0;

  if (forInjection) {
    // Auto-injection is paid every turn, so it should prefer clear multi-token
    // evidence. Manual search can still surface weaker one-token hints.
    if (strongMatches < 2) return 0;
    if (meaningfulMatches < 1) return 0;
    if (q.length >= 5 && meaningfulMatches < 3) return 0;
    if (decision.source === "turn") return 0;
    if (decision.source !== "manual") {
      if (OPERATIONAL_TITLE_RE.test(decision.title)) return 0;
      if (!decision.tags.some((tag) => INJECTABLE_TAGS.has(tag))) return 0;
    }
    if (q.length > 8) {
      // Long prompts contain many incidental words. Damp broad lexical totals so a
      // verbose user request does not turn common terms into false-positive memory.
      lexical = Math.max(1, Math.round(lexical * Math.sqrt(8 / q.length)));
    }
  }

  let score = lexical;

  // Locality: exact directory beats same project scope, which beats a project-name match.
  if (cwd && decision.cwd === cwd) score += 6;
  else if (cwd && sameProjectScope(decision.cwd, cwd)) score += 4;
  else if (cwd && decision.project === projectName(cwd)) score += 2;

  if (decision.important) score += forInjection ? 4 : 8;
  if (decision.source === "manual") score += forInjection ? 2 : 4;
  if (decision.kbPath) score += 3;

  if (!forInjection) {
    // Usage bonuses are capped; superseded decisions are penalised here, whereas
    // for injection they were already rejected above.
    score += Math.min(decision.retrievalCount, 6);
    score += Math.min(decision.injectionCount, 4);
    if (decision.supersededBy) score -= 4;
  }

  // An unparseable createdAt makes ageDays NaN, which fails every comparison below,
  // so no recency adjustment is applied in that case.
  const now = options.now ?? Date.now();
  const ageDays = (now - Date.parse(decision.createdAt)) / 86_400_000;
  if (ageDays < 1) score += 2;
  else if (ageDays < 14) score += 1;
  else if (ageDays > 180) score -= 2;

  return Math.max(0, Math.round(score));
}