/**
 * @file Pure-TS cosine similarity / TF-IDF relevance scoring. Given a query and text content,
 * splits into blocks, builds TF-IDF vectors, ranks by cosine similarity to the query, and returns
 * the top-N most relevant blocks. No ML/runtime dependencies — pure string/vector math.
 */

/** Block size (in characters) used when the caller passes none or an unusable one. */
const DEFAULT_BLOCK_SIZE = 512;

/** Blank-line paragraph separator looked for just past the target block end. */
const PARAGRAPH_BREAK = /\n\s*\n/u;

/** Sentence terminator followed by whitespace, looked for just past the target block end. */
const SENTENCE_BREAK = /[.!?]\s/u;

// ─── Tokenizer ──────────────────────────────────────────────────────────────

/**
 * Split text into normalized tokens.
 *
 * The text is lowercased; every character other than `a-z`, `0-9`, whitespace, apostrophe and
 * hyphen is replaced by a space; the result is split on whitespace and tokens of length <= 1 are
 * dropped. Note that non-ASCII letters are therefore treated as separators.
 */
function tokenize(text: string): string[] {
  return text
    .toLowerCase()
    .replace(/[^a-z0-9\s'-]/gu, " ")
    .split(/\s+/u)
    .filter((t) => t.length > 1);
}

// ─── Term frequency (within a single document/block) ────────────────────────

/**
 * Relative term frequency of each distinct token: occurrences divided by the total token count.
 * An empty token list yields an empty map.
 */
function termFreq(tokens: string[]): Map<string, number> {
  const tf = new Map<string, number>();
  for (const t of tokens) {
    tf.set(t, (tf.get(t) ?? 0) + 1);
  }
  // Normalize by total tokens so longer blocks don't dominate
  for (const [k, v] of tf) {
    tf.set(k, v / tokens.length);
  }
  return tf;
}

// ─── Inverse document frequency (across all blocks) ─────────────────────────

/**
 * Smoothed inverse document frequency for every term that occurs in at least one block:
 * `ln((n + 1) / (df + 1)) + 1`, where `n` is the number of blocks and `df` the number of blocks
 * containing the term. Terms absent from every block get no entry.
 */
function inverseDocFreq(blockTokens: string[][]): Map<string, number> {
  const n = blockTokens.length;
  const df = new Map<string, number>();
  for (const tokens of blockTokens) {
    // Count each term at most once per block.
    for (const t of new Set(tokens)) {
      df.set(t, (df.get(t) ?? 0) + 1);
    }
  }
  const idf = new Map<string, number>();
  for (const [term, count] of df) {
    idf.set(term, Math.log((n + 1) / (count + 1)) + 1);
  }
  return idf;
}

// ─── TF-IDF vector ──────────────────────────────────────────────────────────

/**
 * Build a dense TF-IDF vector with one component per `vocab` entry, in `vocab` order.
 * Terms missing from `tokens` or from `idf` contribute 0.
 */
function buildVector(
  tokens: string[],
  idf: Map<string, number>,
  vocab: string[],
): Float64Array {
  const vec = new Float64Array(vocab.length);
  const tf = termFreq(tokens);
  vocab.forEach((term, i) => {
    vec[i] = (tf.get(term) ?? 0) * (idf.get(term) ?? 0);
  });
  return vec;
}

// ─── Cosine similarity ──────────────────────────────────────────────────────

/**
 * Cosine of the angle between two vectors; 0 when either vector has zero magnitude.
 * Both vectors are expected to have the same length (iteration runs over `a.length`).
 */
function cosineSim(a: Float64Array, b: Float64Array): number {
  let dot = 0;
  let magA = 0;
  let magB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    magA += a[i] * a[i];
    magB += b[i] * b[i];
  }
  const denom = Math.sqrt(magA) * Math.sqrt(magB);
  return denom === 0 ? 0 : dot / denom;
}

// ─── Public API ─────────────────────────────────────────────────────────────

export interface ScoredBlock {
  /** Block index in the original text. */
  index: number;
  /** Block text content (whitespace-trimmed when the text was split into several blocks). */
  text: string;
  /** Cosine similarity score (0–1). */
  score: number;
  /** Approximate start offset in the original; refers to the untrimmed slice. */
  charStart: number;
  /** Approximate end offset (exclusive) in the original; refers to the untrimmed slice. */
  charEnd: number;
}

export interface SimilarityResult {
  /** Top-N scored blocks. */
  blocks: ScoredBlock[];
  /** Total blocks scored. */
  totalBlocks: number;
  /** Query used for scoring. */
  query: string;
}

/**
 * Split text into blocks and score each against query using TF-IDF cosine similarity.
 *
 * @param text — raw text content to search within
 * @param query — search query
 * @param topN — max results (default 5); negative values are treated as 0
 * @param minScore — minimum score to include (default 0.0)
 * @param blockSize — approximate block size in chars (default 512); values below 1 or NaN fall
 *   back to the default
 * @returns the highest-scoring blocks in descending score order (ties keep original block order).
 *   When the text yields no blocks or the query yields no tokens, nothing is scored and
 *   `totalBlocks` is 0.
 */
export function scoreTextByCosine(
  text: string,
  query: string,
  topN = 5,
  minScore = 0.0,
  blockSize = DEFAULT_BLOCK_SIZE,
): SimilarityResult {
  const entries = splitBlocks(text, blockSize).map((block) => ({
    block,
    tokens: tokenize(block.text),
  }));
  const queryTokens = tokenize(query);

  if (entries.length === 0 || queryTokens.length === 0) {
    return { blocks: [], totalBlocks: 0, query };
  }

  const blockTokens = entries.map((e) => e.tokens);
  const idf = inverseDocFreq(blockTokens);
  // Freshly built array, so sorting it in place is safe. Sorting fixes the component order and
  // therefore the floating-point summation order.
  const vocab = [...new Set([...queryTokens, ...blockTokens.flat()])].sort();
  const queryVec = buildVector(queryTokens, idf, vocab);

  const scored: ScoredBlock[] = [];
  entries.forEach(({ block, tokens }, index) => {
    const score = cosineSim(queryVec, buildVector(tokens, idf, vocab));
    if (score >= minScore) {
      scored.push({
        index,
        text: block.text,
        score,
        charStart: block.start,
        charEnd: block.end,
      });
    }
  });

  scored.sort((a, b) => b.score - a.score);
  // A negative end index would make slice() count from the end; clamp so it means "no results".
  const top = scored.slice(0, Math.max(0, topN));

  return { blocks: top, totalBlocks: entries.length, query };
}

// ─── Block splitting ────────────────────────────────────────────────────────

interface TextBlock {
  text: string;
  start: number;
  end: number;
}

/**
 * Cut `text` into consecutive blocks of roughly `targetSize` characters.
 *
 * Each block end is pushed forward to a nearby paragraph break (within 100 chars) or, failing
 * that, a sentence end (within 50 chars) when one exists. Block text is trimmed and blocks that
 * are empty after trimming are dropped; `start`/`end` keep the untrimmed slice offsets. Text that
 * fits in a single block is returned as-is, untrimmed.
 */
function splitBlocks(text: string, targetSize: number): TextBlock[] {
  if (!text) return [];

  // A size below 1 (or NaN) would never advance `start` and loop forever; use the default instead.
  const size = targetSize >= 1 ? Math.floor(targetSize) : DEFAULT_BLOCK_SIZE;

  if (text.length <= size) {
    return [{ text, start: 0, end: text.length }];
  }

  const blocks: TextBlock[] = [];
  let start = 0;

  while (start < text.length) {
    let end = Math.min(start + size, text.length);

    // Try to break at paragraph or sentence boundary
    if (end < text.length) {
      const after = text.slice(end, Math.min(end + 200, text.length));
      const paraBreak = after.search(PARAGRAPH_BREAK);
      if (paraBreak >= 0 && paraBreak < 100) {
        end += paraBreak + 2;
      } else {
        const sentenceBreak = after.search(SENTENCE_BREAK);
        if (sentenceBreak >= 0 && sentenceBreak < 50) {
          // +1 keeps the terminating punctuation inside this block.
          end += sentenceBreak + 1;
        }
      }
    }

    blocks.push({
      text: text.slice(start, end).trim(),
      start,
      end,
    });
    start = end;
  }

  return blocks.filter((b) => b.text.length > 0);
}