/**
 * Anchor matching primitives shared between sync (insertion) and
 * verify-anchors (drift reporting). The functions are pure: given an
 * anchor string and surrounding context, locate candidate positions in
 * a target text using progressively looser strategies.
 */

/** Name of the strategy that produced (or failed to produce) a match. */
export type AnchorStrategy =
  | 'direct'
  | 'normalized'
  | 'stripped'
  | 'partial-start'
  | 'partial-start-stripped'
  | 'context-both'
  | 'context-before'
  | 'context-after'
  | 'split-match'
  | 'empty-anchor'
  | 'failed';

/** Outcome of an anchor search. */
export interface AnchorSearchResult {
  /** Candidate start offsets; empty when nothing matched. */
  occurrences: number[];
  /** The anchor text (or fragment of it) that matched; null for context-only hits and failures. */
  matchedAnchor: string | null;
  /** Which strategy produced this result. */
  strategy: AnchorStrategy;
  /**
   * True when `occurrences` are offsets into the CriticMarkup-stripped
   * copy of the text rather than into the text as given.
   */
  stripped?: boolean;
}

/** Characters of `before`/`after` context compared when both sides are available. */
const CONTEXT_WINDOW_BOTH = 50;
/** Characters of context compared when only one side is used. */
const CONTEXT_WINDOW_SINGLE = 30;
/** Upper bound (exclusive) on the gap between the end of `before` and the start of `after`. */
const CONTEXT_MAX_GAP = 500;
/** Single whitespace character, same class as the `\s+` used for normalization. */
const WHITESPACE_CHAR = /\s/;

/**
 * Strip CriticMarkup so the matcher sees plain prose instead of
 * `{++inserted++}`/`{--deleted--}`/etc. Used when an anchor lives
 * underneath previously imported track changes.
 *
 * All delimited forms are matched lazily so their content may contain
 * the delimiter characters themselves (e.g. the hyphen in
 * `{--well-known--}`).
 *
 * @param text - Text that may contain CriticMarkup.
 * @returns The text with insertions/substitutions resolved to their new
 *   content, deletions and comments removed, and marks unwrapped.
 */
export function stripCriticMarkup(text: string): string {
  return text
    .replace(/\{\+\+([\s\S]*?)\+\+\}/g, '$1') // insertions: keep new text
    .replace(/\{--([\s\S]*?)--\}/g, '') // deletions: remove old text
    .replace(/\{~~([\s\S]*?)~>([\s\S]*?)~~\}/g, '$2') // substitutions: keep new text
    .replace(/\{>>[\s\S]*?<<\}/g, '') // comments: remove (comment text may contain '<')
    .replace(/\[([^\]]*)\]\{\.mark\}/g, '$1'); // marked text: keep text
}

/**
 * Return every starting index where `needle` occurs in `haystack`.
 * Overlapping occurrences are included. Empty needles return no
 * occurrences (empty matches are not useful for anchor placement).
 *
 * @param haystack - Text to search.
 * @param needle - Substring to look for.
 * @returns Ascending list of start offsets.
 */
export function findAllOccurrences(haystack: string, needle: string): number[] {
  if (!needle) return [];
  const occurrences: number[] = [];
  let idx = 0;
  while ((idx = haystack.indexOf(needle, idx)) !== -1) {
    occurrences.push(idx);
    idx += 1; // advance by one (not needle.length) so overlapping hits are reported
  }
  return occurrences;
}

/**
 * Translate an offset in the whitespace-collapsed form of `text`
 * (every `\s+` run replaced by one space) back to the corresponding
 * offset in `text` itself. An offset that lands on a collapsed space
 * maps to the first character of the original whitespace run.
 */
function toOriginalOffset(text: string, collapsedOffset: number): number {
  let pos = 0;
  for (let emitted = 0; emitted < collapsedOffset && pos < text.length; emitted++) {
    if (WHITESPACE_CHAR.test(text.charAt(pos))) {
      // A whole whitespace run accounts for exactly one collapsed character.
      while (pos < text.length && WHITESPACE_CHAR.test(text.charAt(pos))) pos++;
    } else {
      pos++;
    }
  }
  return pos;
}

/**
 * Locate an insertion point purely from surrounding context.
 * Tries, in order: both sides (tail of `before` followed within
 * CONTEXT_MAX_GAP characters by head of `after`), then the last
 * occurrence of the tail of `before`, then the first occurrence of the
 * head of `after`. Returns null when none of them hit.
 */
function locateByContext(textLower: string, before: string, after: string): AnchorSearchResult | null {
  const beforeLower = (before || '').toLowerCase();
  const afterLower = (after || '').toLowerCase();

  if (before && after) {
    const beforeTail = beforeLower.slice(-CONTEXT_WINDOW_BOTH);
    const beforeIdx = textLower.indexOf(beforeTail);
    if (beforeIdx !== -1) {
      const searchStart = beforeIdx + beforeTail.length;
      const afterIdx = textLower.indexOf(afterLower.slice(0, CONTEXT_WINDOW_BOTH), searchStart);
      if (afterIdx !== -1 && afterIdx - searchStart < CONTEXT_MAX_GAP) {
        return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
      }
    }
  }

  if (before) {
    const beforeTail = beforeLower.slice(-CONTEXT_WINDOW_SINGLE);
    const beforeIdx = textLower.lastIndexOf(beforeTail);
    if (beforeIdx !== -1) {
      return {
        occurrences: [beforeIdx + beforeTail.length],
        matchedAnchor: null,
        strategy: 'context-before',
      };
    }
  }

  if (after) {
    const afterIdx = textLower.indexOf(afterLower.slice(0, CONTEXT_WINDOW_SINGLE));
    if (afterIdx !== -1) {
      return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
    }
  }

  return null;
}

/**
 * Find candidate positions for `anchor` in `text`, falling back through
 * progressively looser strategies (whitespace normalization, stripped
 * CriticMarkup, partial-prefix, surrounding context, word splitting).
 *
 * The returned `strategy` lets callers distinguish a clean direct hit
 * from a fuzzy approximation — useful for drift reporting.
 *
 * Offsets refer to `text` as given, except when the result carries
 * `stripped: true`, in which case they refer to `stripCriticMarkup(text)`.
 * Matching is case-insensitive via lower-cased copies; for the rare
 * characters whose lower-case form has a different length, offsets may
 * be shifted accordingly.
 *
 * @param anchor - Text the comment was attached to; may be empty.
 * @param text - Document text to search.
 * @param before - Text that preceded the anchor in the source document.
 * @param after - Text that followed the anchor in the source document.
 * @returns Candidate offsets together with the strategy that found them;
 *   `strategy` is `'failed'` (or `'empty-anchor'`) with no occurrences
 *   when nothing matched.
 */
export function findAnchorInText(
  anchor: string,
  text: string,
  before: string = '',
  after: string = ''
): AnchorSearchResult {
  // Empty anchor: skip directly to context-based matching
  if (!anchor || anchor.trim().length === 0) {
    const contextHit = before || after ? locateByContext(text.toLowerCase(), before, after) : null;
    return contextHit ?? { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
  }

  const anchorLower = anchor.toLowerCase();
  const textLower = text.toLowerCase();

  // Strategy 1: direct match
  let occurrences = findAllOccurrences(textLower, anchorLower);
  if (occurrences.length > 0) {
    return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
  }

  // Strategy 2: normalized whitespace (first hit only)
  const normalizedAnchor = anchor.replace(/\s+/g, ' ').toLowerCase();
  const normalizedText = text.replace(/\s+/g, ' ').toLowerCase();
  const normalizedIdx = normalizedText.indexOf(normalizedAnchor);
  if (normalizedIdx !== -1) {
    // The hit is an offset into the collapsed copy; report it in terms of `text`.
    return {
      occurrences: [toOriginalOffset(text, normalizedIdx)],
      matchedAnchor: anchor,
      strategy: 'normalized',
    };
  }

  // Strategy 3: match in stripped CriticMarkup version
  const strippedLower = stripCriticMarkup(text).toLowerCase();
  occurrences = findAllOccurrences(strippedLower, anchorLower);
  if (occurrences.length > 0) {
    return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
  }

  // Strategy 4: first N words of anchor (long anchors), longest prefix first
  const words = anchor.split(/\s+/);
  if (words.length > 3) {
    for (let n = Math.min(6, words.length); n >= 3; n--) {
      const prefix = words.slice(0, n).join(' ');
      const prefixLower = prefix.toLowerCase();
      // Very short prefixes are too ambiguous to be useful.
      if (prefixLower.length < 15) continue;

      occurrences = findAllOccurrences(textLower, prefixLower);
      if (occurrences.length > 0) {
        return { occurrences, matchedAnchor: prefix, strategy: 'partial-start' };
      }

      occurrences = findAllOccurrences(strippedLower, prefixLower);
      if (occurrences.length > 0) {
        return {
          occurrences,
          matchedAnchor: prefix,
          strategy: 'partial-start-stripped',
          stripped: true,
        };
      }
    }
  }

  // Strategy 5: context (before/after) only
  if (before || after) {
    const contextHit = locateByContext(textLower, before, after);
    if (contextHit) return contextHit;
  }

  // Strategy 6: split anchor on transition characters
  const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
  for (const sep of splitPatterns) {
    if (!anchor.includes(sep)) continue;
    const parts = anchor.split(sep).filter(p => p.length >= 4);
    for (const part of parts) {
      occurrences = findAllOccurrences(textLower, part.toLowerCase());
      // Accept only fragments that are reasonably specific (fewer than 5 hits).
      if (occurrences.length > 0 && occurrences.length < 5) {
        return { occurrences, matchedAnchor: part, strategy: 'split-match' };
      }
    }
  }

  return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
}

/**
 * Classify a strategy as a clean hit, a fuzzy/drifted hit, or no hit.
 * Used by `verify-anchors` to summarize per-comment match quality.
 */
export type AnchorMatchQuality = 'clean' | 'drift' | 'context-only' | 'unmatched';

/**
 * Map a search outcome to a coarse quality bucket.
 *
 * @param strategy - Strategy reported by `findAnchorInText`.
 * @param occurrences - Number of candidate positions found.
 * @returns `'unmatched'` whenever `occurrences` is 0, otherwise the
 *   bucket associated with `strategy`.
 */
export function classifyStrategy(strategy: AnchorStrategy, occurrences: number): AnchorMatchQuality {
  if (occurrences === 0) return 'unmatched';
  switch (strategy) {
    case 'direct':
    case 'normalized':
      return 'clean';
    case 'stripped':
    case 'partial-start':
    case 'partial-start-stripped':
    case 'split-match':
      return 'drift';
    case 'context-both':
    case 'context-before':
    case 'context-after':
      return 'context-only';
    case 'empty-anchor':
    case 'failed':
    default:
      return 'unmatched';
  }
}