/** * Realign comments from a reference DOCX to markdown * Uses paragraph-level matching with exact positions */ import * as fs from 'fs'; import AdmZip from 'adm-zip'; import { parseStringPromise } from 'xml2js'; interface CommentData { author: string; text: string; } interface CommentWithPosition { id: string; position: number; author: string; text: string; } interface Paragraph { text: string; comments: CommentWithPosition[]; } interface MdParagraph { text: string; start: number; end: number; } interface ParagraphMatch { index: number; score: number; paragraph: MdParagraph; } interface WordContext { before: string[]; after: string[]; } interface CommentInsertion { position: number; text: string; commentText: string; hasReplies: boolean; debug: string; } interface RealignOptions { dryRun?: boolean; author?: string; replyAuthor?: string; } interface RealignResult { success: boolean; dryRun?: boolean; insertions: number; matched?: number; unmatched?: number; } interface RealignMarkdownOptions { author?: string; replyAuthor?: string; } interface RealignMarkdownResult { success: boolean; markdown: string; insertions: number; error?: string; } /** * Extract paragraphs with their full text and comment positions from DOCX */ export async function extractParagraphsWithComments(docxPath: string): Promise { const zip = new AdmZip(docxPath); const doc = zip.readAsText('word/document.xml'); const commentsXml = zip.readAsText('word/comments.xml'); // Parse comments to get authors and texts const parsed = await parseStringPromise(commentsXml, { explicitArray: false }); const commentNodes = parsed['w:comments']?.['w:comment']; if (!commentNodes) return []; const nodes = Array.isArray(commentNodes) ? commentNodes : [commentNodes]; const commentData: Record = {}; for (const c of nodes) { const id = c.$?.['w:id'] ?? ''; const author = c.$?.['w:author'] ?? 'Unknown'; let text = ''; const extractT = (n: any): void => { if (!n) return; if (n['w:t']) { const t = n['w:t']; text += typeof t === 'string' ? t : (t._ || t); } if (n['w:r']) { (Array.isArray(n['w:r']) ? n['w:r'] : [n['w:r']]).forEach(extractT); } if (n['w:p']) { (Array.isArray(n['w:p']) ? n['w:p'] : [n['w:p']]).forEach(extractT); } }; extractT(c); commentData[id] = { author, text: text.trim() }; } // Extract paragraphs with comments const paragraphs: Paragraph[] = []; const paraPattern = /]*>([\s\S]*?)<\/w:p>/g; let match; while ((match = paraPattern.exec(doc)) !== null) { const paraContent = match[1]; const hasComments = /commentRangeStart/.test(paraContent); // Build paragraph text and track comment positions let text = ''; const comments: CommentWithPosition[] = []; const tokenPattern = /]*>([^<]*)<\/w:t>|]*w:id="(\d+)"[^>]*\/?>/g; let tokenMatch; while ((tokenMatch = tokenPattern.exec(paraContent)) !== null) { if (tokenMatch[1] !== undefined) { text += tokenMatch[1]; } else if (tokenMatch[2] !== undefined) { const cid = tokenMatch[2]; const data = commentData[cid]; if (data) { comments.push({ id: cid, position: text.length, author: data.author, text: data.text, }); } } } if (text.trim() || hasComments) { paragraphs.push({ text: text.trim(), comments }); } } return paragraphs; } /** * Find best matching paragraph in markdown for a reference paragraph */ function findMatchingParagraph(refText: string, mdParagraphs: MdParagraph[]): ParagraphMatch | null { // Normalize for comparison const normalize = (s: string): string => s.toLowerCase().replace(/\s+/g, ' ').trim(); const refNorm = normalize(refText); if (refNorm.length < 20) return null; let bestMatch: ParagraphMatch | null = null; let bestScore = 0; for (let i = 0; i < mdParagraphs.length; i++) { const mdNorm = normalize(mdParagraphs[i].text); // Calculate word overlap const refWords = new Set(refNorm.split(' ').filter((w) => w.length > 3)); const mdWords = mdNorm.split(' ').filter((w) => w.length > 3); const overlap = mdWords.filter((w) => refWords.has(w)).length; const score = overlap / Math.max(refWords.size, 1); // Also check for substring containment (for section headers) const containsStart = mdNorm.includes(refNorm.slice(0, 50)); if (score > bestScore || (containsStart && score > 0.3)) { bestScore = Math.max(score, containsStart ? 0.8 : score); bestMatch = { index: i, score: bestScore, paragraph: mdParagraphs[i] }; } } return bestScore > 0.4 ? bestMatch : null; } /** * Extract paragraphs from markdown (split by blank lines) */ function parseMdParagraphs(markdown: string): MdParagraph[] { const paragraphs: MdParagraph[] = []; const parts = markdown.split(/\n\n+/); let pos = 0; for (const part of parts) { const trimmed = part.trim(); if (trimmed) { const partStart = markdown.indexOf(part, pos); if (partStart !== -1) { paragraphs.push({ text: trimmed, start: partStart, end: partStart + part.length, }); pos = partStart + part.length; } } } return paragraphs; } /** * Normalize text for matching (remove citations, extra whitespace) */ function normalizeForMatching(text: string): string { return text // Remove Word citation placeholders .replace(/\(\s*\$+\s*\)/g, '') .replace(/\$+/g, '') // Remove markdown citations .replace(/\[@[^\]]+\]/g, '') .replace(/@[A-Z][a-z]+\d{4}/g, '') // Remove rendered citations like "(Author et al. 2021)" .replace(/\([A-Z][a-z]+(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*[A-Z][a-z]+(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)/g, '') // Remove figure references like "Fig. 1" or "(Fig. 1)" .replace(/\(?Fig\.?\s*\d+[a-z]?\)?/gi, '') // Normalize whitespace .replace(/\s+/g, ' ') .trim() .toLowerCase(); } /** * Find the word at or near a position in text */ function getWordAtPosition(text: string, pos: number): WordContext { const before = text.slice(Math.max(0, pos - 30), pos); const after = text.slice(pos, pos + 30); // Get the last complete word before position const beforeWords = before.split(/\s+/).filter(w => w.length > 2); const afterWords = after.split(/\s+/).filter(w => w.length > 2); return { before: beforeWords.slice(-3), after: afterWords.slice(0, 3) }; } /** * Find position in markdown paragraph matching reference position * Uses the anchor word (word immediately before the comment) for precise matching */ function findMdPosition(refText: string, refPos: number, mdText: string): number { // Get the word(s) immediately before the comment position in reference const refWords = getWordAtPosition(refText, refPos); const normalizedMd = normalizeForMatching(mdText); // The "anchor word" is the last word before the comment const anchorWords = refWords.before; if (anchorWords.length === 0) { const ratio = refPos / Math.max(refText.length, 1); return Math.round(ratio * mdText.length); } // Try to find the anchor word(s) in markdown // Start with the most specific (all words), fall back to fewer for (let numWords = anchorWords.length; numWords >= 1; numWords--) { const searchWords = anchorWords.slice(-numWords); const pattern = searchWords.map(w => w.toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, '\\$&') ).join('\\s+'); const regex = new RegExp(pattern, 'g'); const matches = [...normalizedMd.matchAll(regex)]; if (matches.length === 1) { // Unique match - use this position const matchEnd = matches[0].index! + matches[0][0].length; // Map back to original markdown position const ratio = matchEnd / Math.max(normalizedMd.length, 1); return Math.round(ratio * mdText.length); } else if (matches.length > 1) { // Multiple matches - use context after to disambiguate const afterWords = refWords.after; if (afterWords.length > 0) { const afterPattern = afterWords[0].toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); for (const match of matches) { const matchEnd = match.index! + match[0].length; const afterContext = normalizedMd.slice(matchEnd, matchEnd + 50); if (afterContext.includes(afterPattern)) { const ratio = matchEnd / Math.max(normalizedMd.length, 1); return Math.round(ratio * mdText.length); } } } // Fall back to first match const matchEnd = matches[0].index! + matches[0][0].length; const ratio = matchEnd / Math.max(normalizedMd.length, 1); return Math.round(ratio * mdText.length); } } // Fallback: proportional position const ratio = refPos / Math.max(refText.length, 1); return Math.round(ratio * mdText.length); } /** * Extract reply comments that follow a parent comment * Returns map of parent comment text -> array of reply texts */ function extractReplies(markdown: string, parentAuthor: string, replyAuthor: string): Map { const replies = new Map(); const pattern = new RegExp( `\\{>>${parentAuthor}:\\s*([^<]+)<<\\}((?:\\s*\\{>>${replyAuthor}:[^<]+<<\\})*)`, 'g' ); let match; while ((match = pattern.exec(markdown)) !== null) { const parentText = match[1].trim(); const replyBlock = match[2]; if (replyBlock) { const replyPattern = new RegExp(`\\{>>${replyAuthor}:\\s*([^<]+)<<\\}`, 'g'); const replyTexts: string[] = []; let replyMatch; while ((replyMatch = replyPattern.exec(replyBlock)) !== null) { replyTexts.push(replyMatch[1].trim()); } if (replyTexts.length > 0) { replies.set(parentText.slice(0, 50), replyTexts); // Use first 50 chars as key } } } return replies; } /** * Realign comments from reference DOCX to markdown */ export async function realignComments( docxPath: string, markdownPath: string, options: RealignOptions = {} ): Promise { const { dryRun = false, author = 'Guy Colling', replyAuthor = 'Gilles Colling' } = options; // Read original markdown to extract replies before stripping const originalMarkdown = fs.readFileSync(markdownPath, 'utf-8'); // Extract reply relationships const replies = extractReplies(originalMarkdown, author, replyAuthor); console.log(`Found ${replies.size} ${author} comments with ${replyAuthor} replies`); // Extract reference paragraphs with comments const refParagraphs = await extractParagraphsWithComments(docxPath); const refWithComments = refParagraphs.filter( (p) => p.comments.length > 0 && p.comments.some((c) => c.author === author) ); console.log(`Found ${refWithComments.length} paragraphs with ${author} comments in reference`); // Strip ALL comments (both authors) from markdown to start fresh let markdown = originalMarkdown; markdown = markdown.replace(/\s*\{>>[^<]+<<\}/g, ''); console.log(`Stripped all comments from markdown`); // Parse markdown paragraphs const mdParagraphs = parseMdParagraphs(markdown); // Track insertions (position, text) - will insert from end to start const insertions: CommentInsertion[] = []; let matched = 0; let unmatched = 0; for (const refPara of refWithComments) { const match = findMatchingParagraph(refPara.text, mdParagraphs); if (!match) { console.log(` No match for: "${refPara.text.slice(0, 60)}..."`); unmatched++; continue; } matched++; const mdPara = match.paragraph; // Get author's comments in this paragraph const authorComments = refPara.comments.filter((c) => c.author === author); for (const comment of authorComments) { // Find corresponding position in markdown paragraph const mdPos = findMdPosition(refPara.text, comment.position, mdPara?.text ?? ''); const absolutePos = (mdPara?.start ?? 0) + mdPos; // Build comment mark with any replies let commentMark = ` {>>${comment.author}: ${comment.text}<<}`; // Check for replies const replyKey = comment.text.trim().slice(0, 50); const replyTexts = replies.get(replyKey); if (replyTexts) { for (const replyText of replyTexts) { commentMark += ` {>>${replyAuthor}: ${replyText}<<}`; } } insertions.push({ position: absolutePos, text: commentMark, commentText: comment.text.slice(0, 30), hasReplies: !!replyTexts, debug: `"${(mdPara?.text ?? '').slice(Math.max(0, mdPos - 20), mdPos)}|HERE|${(mdPara?.text ?? '').slice(mdPos, mdPos + 20)}"`, }); } } console.log(`Matched ${matched} paragraphs, ${unmatched} unmatched`); console.log(`Inserting ${insertions.length} comments (${insertions.filter((i) => i.hasReplies).length} with replies)`); if (dryRun) { console.log('\nDry run - would insert:'); for (const ins of insertions.slice(0, 10)) { console.log(` At ${ins.position}: ${ins.debug}`); console.log(` Comment: "${ins.commentText}..."${ins.hasReplies ? ' (+ replies)' : ''}`); } return { success: true, dryRun: true, insertions: insertions.length }; } // Sort by position descending and insert insertions.sort((a, b) => b.position - a.position); for (const ins of insertions) { markdown = markdown.slice(0, ins.position) + ins.text + markdown.slice(ins.position); } // Write result fs.writeFileSync(markdownPath, markdown); return { success: true, insertions: insertions.length, matched, unmatched }; } /** * Realign comments in markdown string (in-memory, doesn't write to file) */ export async function realignMarkdown( docxPath: string, markdown: string, options: RealignMarkdownOptions = {} ): Promise { const { author = 'Guy Colling', replyAuthor = 'Gilles Colling' } = options; try { // Extract reply relationships from original markdown const replies = extractReplies(markdown, author, replyAuthor); // Extract reference paragraphs with comments const refParagraphs = await extractParagraphsWithComments(docxPath); const refWithComments = refParagraphs.filter( (p) => p.comments.length > 0 && p.comments.some((c) => c.author === author) ); // Strip ALL comments from markdown let result = markdown.replace(/\s*\{>>[^<]+<<\}/g, ''); // Parse markdown paragraphs const mdParagraphs = parseMdParagraphs(result); // Track insertions const insertions: Array<{ position: number; text: string }> = []; for (const refPara of refWithComments) { const match = findMatchingParagraph(refPara.text, mdParagraphs); if (!match) continue; const mdPara = match.paragraph; const authorComments = refPara.comments.filter((c) => c.author === author); for (const comment of authorComments) { const mdPos = findMdPosition(refPara.text, comment.position, mdPara?.text ?? ''); const absolutePos = (mdPara?.start ?? 0) + mdPos; let commentMark = ` {>>${comment.author}: ${comment.text}<<}`; // Check for replies const replyKey = comment.text.trim().slice(0, 50); const replyTexts = replies.get(replyKey); if (replyTexts) { for (const replyText of replyTexts) { commentMark += ` {>>${replyAuthor}: ${replyText}<<}`; } } insertions.push({ position: absolutePos, text: commentMark }); } } // Sort by position descending and insert insertions.sort((a, b) => b.position - a.position); for (const ins of insertions) { result = result.slice(0, ins.position) + ins.text + result.slice(ins.position); } return { success: true, markdown: result, insertions: insertions.length }; } catch (err: any) { return { success: false, markdown, insertions: 0, error: err.message }; } }