/**
 * Diff engine - diffing and annotation processing for Word→Markdown import.
 *
 * Annotations are emitted in CriticMarkup form: `{++inserted++}`,
 * `{--deleted--}` and `{~~old~>new~~}`.
 */

import { diffWords, Change } from 'diff';
import {
  extractMarkdownPrefix,
  protectAnchors,
  restoreAnchors,
  protectCrossrefs,
  restoreCrossrefs,
  protectMath,
  restoreMath,
  replaceRenderedMath,
  protectCitations,
  restoreCitations,
  replaceRenderedCitations,
  protectImages,
  restoreImages,
  matchWordImagesToOriginal,
  protectTables,
  restoreTables,
} from './protect-restore.js';
import { normalizeWhitespace } from './utils.js';
import type { WordTable } from './word-extraction.js';

// ============================================
// Type Definitions
// ============================================

/** Optional inputs accepted by {@link generateSmartDiff}. */
export interface GenerateSmartDiffOptions {
  /** Tables extracted from the Word document; each one's `markdown` is injected into the Word text. */
  wordTables?: WordTable[];
  /**
   * Passed through unchanged to `protectImages` and `matchWordImagesToOriginal`.
   * NOTE(review): the expected shape is defined by those helpers — confirm and tighten the type.
   */
  imageRegistry?: any;
}

// ============================================
// Functions
// ============================================

/**
 * Render a word-level change list as text with CriticMarkup markers.
 *
 * Added parts are wrapped in `{++…++}`, removed parts in `{--…--}`, and
 * unchanged parts are copied through verbatim.
 */
function renderChanges(changes: readonly Change[]): string {
  let out = '';
  for (const part of changes) {
    if (part.added) {
      out += `{++${part.value}++}`;
    } else if (part.removed) {
      out += `{--${part.value}--}`;
    } else {
      out += part.value;
    }
  }
  return out;
}

/**
 * Fix citation and math annotations by preserving original markdown syntax.
 *
 * Runs an ordered series of regex rewrites over annotated text so that
 * markdown math (`$…$`, `$$…$$`) and citations (`[@key]`) that the diff marked
 * as deleted or substituted are kept as-is, and insertions that look like
 * rendered citations (e.g. `(Smith et al. 2020)`) are dropped. The order of
 * the rewrites matters: later rules clean up fragments left by earlier ones.
 *
 * @param text - Annotated text to repair.
 * @param originalMd - Original markdown. It is no longer read (the previous
 *   implementation only used it to build a citation list that was never
 *   consulted); retained so existing callers keep working.
 * @returns The repaired text.
 */
export function fixCitationAnnotations(text: string, originalMd: string): string {
  // Fix math annotations - preserve inline and display math
  text = text.replace(/\{--(\$[^$]+\$)--\}/g, '$1');
  text = text.replace(/\{--(\$\$[^$]+\$\$)--\}/g, '$1');
  text = text.replace(/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1');
  text = text.replace(/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1');

  // Fix substitutions where left side has markdown citation
  text = text.replace(/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1');

  // Fix substitutions where left side STARTS with markdown citation
  text = text.replace(
    /\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g,
    (_match: string, cite: string, oldText: string, newText: string) => {
      const oldTrimmed = oldText.trim();
      const newTrimmed = newText.trim();
      // Nothing but the citation was involved: keep just the citation.
      if (oldTrimmed === '' && newTrimmed === '') {
        return cite;
      }
      // Keep the citation and re-express only the trailing text as a
      // substitution; if the trailing text is effectively unchanged, emit
      // the new text (untrimmed) without any annotation.
      return oldTrimmed !== newTrimmed
        ? `${cite} {~~${oldTrimmed}~>${newTrimmed}~~}`
        : `${cite} ${newText}`;
    }
  );

  // Fix deletions of markdown citations
  text = text.replace(/\{--(\[@[^\]]+\])--\}/g, '$1');

  // Fix insertions of rendered citations
  text = text.replace(/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, '');

  // Clean up broken multi-part substitutions
  text = text.replace(/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]');

  // Fix citations split across substitution boundaries
  text = text.replace(/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]');

  // Clean up any remaining partial citations
  text = text.replace(/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]');

  // Remove rendered citation insertions (with Unicode support)
  text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, '');
  text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');

  // Trailing citation fragments
  text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
  text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');

  // Just year with closing paren
  text = text.replace(/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, '');
  text = text.replace(/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, '');

  // Leading citation fragments
  text = text.replace(/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, '');

  // Semicolon-separated fragments
  text = text.replace(/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, '');

  // Year ranges with authors
  text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
  text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');

  // Clean up double spaces and orphaned punctuation
  text = text.replace(/ +/g, ' ');
  text = text.replace(/\s+\./g, '.');
  text = text.replace(/\s+,/g, ',');

  // Final cleanup - remove empty annotations
  text = text.replace(/\{~~\s*~>\s*~~\}/g, '');
  text = text.replace(/\{\+\+\s*\+\+\}/g, '');
  text = text.replace(/\{--\s*--\}/g, '');

  return text;
}

/**
 * Strip markdown syntax to get plain text.
 *
 * Applies the replacements below in order and trims the result.
 */
function stripMarkdownSyntax(md: string): string {
  return md
    // Leading `---` … `---` block (front matter)
    .replace(/^---[\s\S]*?---\n*/m, '')
    // ATX heading markers
    .replace(/^#{1,6}\s+/gm, '')
    // Bold, then italic
    .replace(/(\*\*|__)(.*?)\1/g, '$2')
    .replace(/(\*|_)(.*?)\1/g, '$2')
    // Images must be removed BEFORE links: the link rule also matches the
    // `[alt](src)` tail of `![alt](src)` and would leave `!alt` behind,
    // after which the image rule could never fire.
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
    // Links: keep the link text
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    // Inline code: keep the code text
    .replace(/`([^`]+)`/g, '$1')
    // NOTE(review): because the inline-code rule above runs first, a fence
    // such as ```…``` typically has its innermost backticks consumed and
    // this fenced-block rule does not match. Order left unchanged because
    // removing whole code blocks would change diff output — confirm intent.
    .replace(/```[\s\S]*?```/g, '')
    // Blockquote markers
    .replace(/^>\s*/gm, '')
    // Horizontal rules
    .replace(/^[-*_]{3,}\s*$/gm, '')
    // Bullet and ordered list markers
    .replace(/^[\s]*[-*+]\s+/gm, '')
    .replace(/^[\s]*\d+\.\s+/gm, '')
    // Table pipes and separator rows
    .replace(/\|/g, ' ')
    .replace(/^[-:]+$/gm, '')
    // Collapse runs of blank lines
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

/**
 * Inject Word tables (extracted from XML) into pandoc text output.
 *
 * For each table, the first cell of its first row and the last cell of its
 * last row are located in the text; the blank-line-delimited region spanning
 * both is replaced by the table's markdown. Tables whose anchor cells cannot
 * be found are skipped.
 *
 * @param pandocText - Text to inject into.
 * @param wordTables - Tables to inject; an empty or missing list returns the text unchanged.
 * @returns The text with every locatable table region replaced.
 */
function injectWordTables(pandocText: string, wordTables: WordTable[]): string {
  if (!wordTables || wordTables.length === 0) {
    return pandocText;
  }

  // Split a pipe-table row into its non-empty, trimmed cells.
  const splitCells = (row: string): string[] =>
    row
      .split('|')
      .map((c) => c.trim())
      .filter((c) => c.length > 0);

  let result = pandocText;

  for (const table of wordTables) {
    // Ignore blank lines so that leading/trailing newlines in the table
    // markdown do not hide the real first/last row. (Previously a trailing
    // newline produced an empty last row and the table was silently skipped.)
    const rows = table.markdown.split('\n').filter((line) => line.trim().length > 0);
    if (rows.length === 0) continue;

    const firstCell = splitCells(rows[0])[0];
    if (firstCell === undefined) continue;

    const lastRowCells = splitCells(rows[rows.length - 1]);
    const lastCell = lastRowCells[lastRowCells.length - 1];
    if (lastCell === undefined) continue;

    // NOTE(review): the search always starts at the beginning of the text,
    // so tables sharing a first cell resolve to the same region.
    const startIdx = result.indexOf(firstCell);
    if (startIdx === -1) continue;

    const endIdx = result.indexOf(lastCell, startIdx);
    if (endIdx === -1) continue;

    // Widen the span to the enclosing blank-line-delimited region.
    const precedingBreak = result.lastIndexOf('\n\n', startIdx);
    const regionStart = precedingBreak === -1 ? 0 : precedingBreak + 2;

    const followingBreak = result.indexOf('\n\n', endIdx + lastCell.length);
    const regionEnd = followingBreak === -1 ? result.length : followingBreak;

    result = result.slice(0, regionStart) + table.markdown + '\n\n' + result.slice(regionEnd);
  }

  return result;
}

/**
 * Generate annotated markdown by diffing original MD against Word text.
 *
 * Both inputs are passed through `normalizeWhitespace` and compared with a
 * word-level diff; the result carries `{++…++}` / `{--…--}` markers.
 *
 * @param originalMd - Original markdown.
 * @param wordText - Text extracted from the Word document.
 * @param author - Currently unused by this function.
 * @returns The annotated text.
 */
export function generateAnnotatedDiff(originalMd: string, wordText: string, author: string = 'Reviewer'): string {
  const normalizedOriginal = normalizeWhitespace(originalMd);
  const normalizedWord = normalizeWhitespace(wordText);
  return renderChanges(diffWords(normalizedOriginal, normalizedWord));
}

/**
 * Smart paragraph-level diff that preserves markdown structure.
 *
 * Steps, in order:
 *  1. Inject Word tables into the Word text.
 *  2. Run the protect helpers over tables and images (both texts) and over
 *     anchors, cross-references, math and citations (original only), and map
 *     Word image placeholders onto the original's.
 *  3. Replace rendered math/citations in the Word text.
 *  4. Split both texts on blank lines and pair each original paragraph with
 *     the best of the next three Word paragraphs by word overlap.
 *  5. Word-diff each pair; mark unmatched original paragraphs as deleted
 *     (headings are kept as-is) and trailing Word paragraphs as inserted.
 *  6. Run the restore helpers over the joined result.
 *
 * @param originalMd - Original markdown.
 * @param wordText - Text extracted from the Word document.
 * @param author - Currently unused by this function.
 * @param options - Optional Word tables and image registry.
 * @returns The annotated markdown.
 */
export function generateSmartDiff(
  originalMd: string,
  wordText: string,
  author: string = 'Reviewer',
  options: GenerateSmartDiffOptions = {}
): string {
  const { wordTables = [], imageRegistry = null } = options;

  // Inject Word tables into pandoc output
  const wordTextWithTables = injectWordTables(wordText, wordTables);

  // Protect markdown tables
  const { text: mdWithTablesProtected, tables } = protectTables(originalMd);

  // Also protect tables in Word text
  const { text: wordWithTablesProtected, tables: wordTableBlocks } = protectTables(wordTextWithTables);

  // Protect images
  const { text: mdWithImagesProtected, images: origImages } = protectImages(mdWithTablesProtected, imageRegistry);
  const { text: wordWithImagesProtected, images: wordImages } = protectImages(wordWithTablesProtected, imageRegistry);

  // Match Word images to original images
  const imageMapping = matchWordImagesToOriginal(origImages, wordImages, imageRegistry);

  // Replace Word image placeholders with matching original placeholders
  let wordWithMappedImages = wordWithImagesProtected;
  for (const [wordPlaceholder, origPlaceholder] of imageMapping) {
    wordWithMappedImages = wordWithMappedImages.split(wordPlaceholder).join(origPlaceholder);
  }

  // Protect figure/table anchors
  const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(mdWithImagesProtected);

  // Protect cross-references
  const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);

  // Protect math
  const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);

  // Protect citations
  const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);

  // Replace rendered elements in Word text
  let wordProtected = wordWithMappedImages;
  wordProtected = replaceRenderedMath(wordProtected, mathBlocks);
  wordProtected = replaceRenderedCitations(wordProtected, citations.length);

  // Split into paragraphs
  const originalParas = mdProtected.split(/\n\n+/);
  const wordParas = wordProtected.split(/\n\n+/);

  const result: string[] = [];

  // Try to match paragraphs intelligently
  let wordIdx = 0;

  for (let i = 0; i < originalParas.length; i++) {
    const orig = originalParas[i] || '';
    // Only the first line of the paragraph is used for prefix detection and scoring.
    const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);

    // The original paragraph's word set is the same for every candidate, so
    // build it once instead of once per candidate.
    const origWords = new Set(origContent.toLowerCase().split(/\s+/));

    // Find best matching word paragraph within a window of three candidates
    let bestMatch = -1;
    let bestScore = 0;
    const windowEnd = Math.min(wordIdx + 3, wordParas.length);

    for (let j = wordIdx; j < windowEnd; j++) {
      const wordPara = wordParas[j] || '';
      const wordWords = wordPara.toLowerCase().split(/\s+/);
      const common = wordWords.filter((w) => origWords.has(w)).length;
      const score = common / Math.max(origWords.size, wordWords.length);

      if (score > bestScore && score > 0.3) {
        bestScore = score;
        bestMatch = j;
      }
    }

    // Fallback for prefixed paragraphs: accept the next Word paragraph if it
    // contains the first 20 characters of the original content.
    if (bestMatch === -1 && mdPrefix && wordIdx < wordParas.length) {
      const wordPara = wordParas[wordIdx];
      if (wordPara.toLowerCase().includes(origContent.toLowerCase().slice(0, 20))) {
        bestMatch = wordIdx;
      }
    }

    if (bestMatch >= 0) {
      const word = wordParas[bestMatch];
      const origStripped = stripMarkdownSyntax(orig);
      const wordNormalized = normalizeWhitespace(word);

      if (origStripped === wordNormalized) {
        // No textual change: keep the original markdown untouched.
        result.push(orig);
      } else {
        // The prefix was stripped for the diff, so put it back in front.
        result.push(mdPrefix + renderChanges(diffWords(origStripped, wordNormalized)));
      }

      // NOTE(review): Word paragraphs between the old wordIdx and bestMatch
      // are skipped here and never emitted as insertions — confirm intended.
      wordIdx = bestMatch + 1;
    } else {
      // Paragraph deleted entirely
      if (mdPrefix && mdPrefix.match(/^#{1,6}\s+/)) {
        // Unmatched headings are kept rather than marked deleted.
        result.push(orig);
      } else {
        result.push(`{--${orig}--}`);
      }
    }
  }

  // Any remaining word paragraphs are additions
  for (let j = wordIdx; j < wordParas.length; j++) {
    const word = wordParas[j];
    if (word.trim()) {
      result.push(`{++${word}++}`);
    }
  }

  // Restore protected content
  let finalResult = result.join('\n\n');
  finalResult = restoreCitations(finalResult, citations);
  finalResult = restoreMath(finalResult, mathBlocks);
  finalResult = restoreCrossrefs(finalResult, crossrefs);
  finalResult = restoreAnchors(finalResult, figAnchors);
  finalResult = restoreImages(finalResult, origImages);
  finalResult = restoreImages(finalResult, wordImages);
  finalResult = restoreTables(finalResult, tables);
  finalResult = restoreTables(finalResult, wordTableBlocks);

  return finalResult;
}

/**
 * Clean up redundant adjacent annotations.
 *
 * Merges adjacent delete/insert pairs into `{~~old~>new~~}` substitutions,
 * repairs a few malformed marker combinations, drops empty annotations, and
 * collapses runs of spaces in prose lines while leaving lines that look like
 * part of a table untouched.
 *
 * @param text - Annotated text.
 * @returns The cleaned-up text.
 */
export function cleanupAnnotations(text: string): string {
  // Convert adjacent delete+insert to substitution
  text = text.replace(/\{--(.+?)--\}\s*\{\+\+(.+?)\+\+\}/g, '{~~$1~>$2~~}');

  // Also handle insert+delete
  text = text.replace(/\{\+\+(.+?)\+\+\}\s*\{--(.+?)--\}/g, '{~~$2~>$1~~}');

  // Fix malformed patterns
  text = text.replace(/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}');

  // Fix malformed substitutions that got split
  text = text.replace(/\{~~([^~]+)\s*--\}/g, '{--$1--}');
  text = text.replace(/\{\+\+([^+]+)~~\}/g, '{++$1++}');

  // Clean up empty annotations
  text = text.replace(/\{--\s*--\}/g, '');
  text = text.replace(/\{\+\+\s*\+\+\}/g, '');

  // Clean up double spaces in prose, but preserve table formatting
  const separatorRe = /^[-]+(\s+[-]+)+\s*$/; // dash groups separated by whitespace
  const multiColumnRe = /\S+\s{2,}\S+/; // two tokens separated by 2+ whitespace chars
  const captionRe = /^\*[^*]+\*\s*$/; // a line consisting of one *emphasised* span

  const lines = text.split('\n');

  // Decide whether the table continues past the blank line at `blankIdx`,
  // scanning at most the 19 lines that follow it.
  const tableContinuesAfter = (blankIdx: number): boolean => {
    const limit = Math.min(lines.length, blankIdx + 20);
    for (let k = blankIdx + 1; k < limit; k++) {
      const raw = lines[k];
      const trimmed = raw.trim();
      if (trimmed === '') continue;
      if (separatorRe.test(trimmed)) return true;
      if (multiColumnRe.test(trimmed)) return true;
      if (captionRe.test(trimmed)) return true;
      // Indented lines are skipped over; anything else ends the scan.
      if (raw.startsWith(' ')) continue;
      return false;
    }
    return false;
  };

  let inTable = false;

  const processedLines = lines.map((line, idx) => {
    // A separator row starts (or continues) a table and is kept verbatim.
    if (separatorRe.test(line.trim())) {
      inTable = true;
      return line;
    }

    if (inTable) {
      // Every line inside a table is kept verbatim; a blank line ends the
      // table unless more table content follows.
      if (line.trim() === '' && !tableContinuesAfter(idx)) {
        inTable = false;
      }
      return line;
    }

    // A multi-column row directly above a separator (ignoring blank lines)
    // is a table header: keep its spacing.
    if (multiColumnRe.test(line)) {
      let nextIdx = idx + 1;
      while (nextIdx < lines.length && lines[nextIdx].trim() === '') {
        nextIdx++;
      }
      if (nextIdx < lines.length && separatorRe.test(lines[nextIdx].trim())) {
        return line;
      }
    }

    // Pipe-table rows are kept verbatim.
    if (line.trim().startsWith('|')) {
      return line;
    }

    return line.replace(/ +/g, ' ');
  });

  return processedLines.join('\n');
}