import type { Decision } from "./types.ts";
import { tokenize } from "./scoring.ts";

// Lightweight lexical duplicate detection used to warn on near-identical adds.
// Deliberately deterministic and cheap — no embeddings, just significant-word
// overlap on title and the opening of the body.

/**
 * Tokens that are dropped before comparing entries, so they never count
 * towards an overlap score.
 *
 * NOTE(review): every entry is lower-case, so this presumably relies on
 * `tokenize` emitting lower-case tokens — confirm against `scoring.ts`.
 */
const DEDUP_STOP_WORDS = new Set([
  "about",
  "after",
  "again",
  "agent",
  "because",
  "before",
  "current",
  "decision",
  "decided",
  "default",
  "during",
  "entry",
  "implementation",
  "memory",
  "pi",
  "project",
  "should",
  "summary",
  "that",
  "then",
  "there",
  "these",
  "this",
  "using",
  "when",
  "with",
  "workflow",
]);

/** Number of leading body characters that take part in the body comparison. */
const BODY_PREFIX_LENGTH = 200;

/**
 * Returns the distinct significant tokens of `text`, in first-seen order.
 *
 * A token is significant when it is at least four characters long and is not
 * listed in `DEDUP_STOP_WORDS`.
 *
 * @param text - Raw text to tokenize (via `tokenize` from `scoring.ts`).
 * @returns De-duplicated significant tokens.
 */
export function significantWords(text: string): string[] {
  const significant = tokenize(text).filter(
    (token) => token.length >= 4 && !DEDUP_STOP_WORDS.has(token),
  );
  return [...new Set(significant)];
}

/**
 * Overlap coefficient of two token lists: the number of distinct tokens they
 * share divided by the size of the smaller distinct-token set.
 *
 * Both inputs are treated as sets, so repeated tokens cannot push the result
 * above 1 and the result does not depend on argument order.
 *
 * @param a - First token list.
 * @param b - Second token list.
 * @returns A value in `[0, 1]`; `0` when either list is empty.
 */
export function overlapRatio(a: string[], b: string[]): number {
  const aSet = new Set(a);
  const bSet = new Set(b);
  if (aSet.size === 0 || bSet.size === 0) return 0;

  let shared = 0;
  for (const token of aSet) {
    if (bSet.has(token)) shared += 1;
  }
  return shared / Math.min(aSet.size, bSet.size);
}

/** Significant words of the title plus the first `BODY_PREFIX_LENGTH` characters of the body. */
function titleAndBodyWords(title: string, text: string): string[] {
  return significantWords(`${title} ${text.slice(0, BODY_PREFIX_LENGTH)}`);
}

/**
 * Finds the existing, non-archived decision that most closely resembles a new
 * entry, if the resemblance is strong enough.
 *
 * Each candidate is scored as the larger of two overlap ratios: title against
 * title, and title-plus-body-prefix against title-plus-body-prefix.
 *
 * @param title - Title of the entry about to be added.
 * @param text - Body of the entry about to be added.
 * @param existing - Decisions to compare against; archived ones are skipped.
 * @param threshold - Score the best candidate must strictly exceed.
 * @returns The best-scoring decision and its score, or `undefined` when no
 *   candidate scores strictly above `threshold`.
 */
export function findPotentialDuplicate(
  title: string,
  text: string,
  existing: Decision[],
  threshold = 0.6,
): { decision: Decision; score: number } | undefined {
  const newTitleWords = significantWords(title);
  const newBodyWords = titleAndBodyWords(title, text);

  let best: { decision: Decision; score: number } | undefined;
  for (const decision of existing) {
    if (decision.archived) continue;

    const titleScore = overlapRatio(newTitleWords, significantWords(decision.title));
    const bodyScore = overlapRatio(newBodyWords, titleAndBodyWords(decision.title, decision.text));
    const score = Math.max(titleScore, bodyScore);

    // Strict comparison: on a tie the earliest candidate is kept, and
    // zero-score candidates are never recorded.
    if (score > (best?.score ?? 0)) best = { decision, score };
  }

  return best !== undefined && best.score > threshold ? best : undefined;
}