/** @file Serialize markdown module. */
import TurndownService from "turndown";
import { gfm } from "turndown-plugin-gfm";
import { normalizeWhitespace } from "./text.ts";

/** Options accepted by {@link htmlToMarkdown}. */
export interface MarkdownOptions {
  /**
   * Images are dropped from the output unless this is explicitly `false`;
   * leaving it unset behaves the same as `true`.
   */
  removeImages?: boolean;
}

/** HTML shorter than this many characters is never pre-stripped. */
const STRIP_MIN_HTML_LENGTH = 40_000;

/** Document-wide `<tr>` count at which all tables are removed before conversion. */
const STRIP_TABLE_ROW_THRESHOLD = 20;

/** Document-wide `<li>` count at which all lists are replaced by a placeholder. */
const STRIP_LIST_ITEM_THRESHOLD = 100;

/**
 * Converts cleaned HTML to stable Markdown for model-facing output.
 *
 * @remarks
 * Turndown service construction registers rule objects and plugins. Keeping one configured
 * service per image policy avoids repeated setup in hot scrape paths while preserving
 * deterministic output rules.
 *
 * @param html - HTML to convert. Very large inputs are pre-stripped of tables/lists first.
 * @param options - Conversion options; images are removed unless `removeImages` is `false`.
 * @returns The Turndown output passed through `normalizeWhitespace`.
 */
export function htmlToMarkdown(html: string, options: MarkdownOptions = {}): string {
  // Only an explicit `false` keeps images; `undefined` and `true` both remove them.
  const service = options.removeImages === false ? keepImagesService : removeImagesService;
  return normalizeWhitespace(service.turndown(stripLargeElements(html)));
}

/**
 * Strip tables and lists before Turndown to avoid expensive conversion on element-heavy pages
 * where the output is likely to be truncated anyway.
 *
 * Only applies when the HTML is at least 40,000 characters long. The thresholds are
 * document-wide totals, not per-element sizes:
 * - 20 or more `<tr>` tags anywhere: every table is removed.
 * - 100 or more `<li>` tags anywhere: every `<ul>`/`<ol>` is replaced with `[Long list]`.
 *
 * The removal patterns are lazy, so each match ends at the first closing tag it meets; with
 * nested tables or lists the tail of the outer element is left in place.
 *
 * @param html - Raw HTML string.
 * @returns `html` unchanged when below the thresholds, otherwise a stripped copy.
 */
function stripLargeElements(html: string): string {
  if (html.length < STRIP_MIN_HTML_LENGTH) return html;

  // Quick check: does HTML contain tables or lists at all?
  // NOTE(review): the tag literals in this function were lost from the reviewed copy of the
  // file and have been restored from context (`"<table"`, `"<ul"`/`"<ol"`, `<tr`, `<li`,
  // `<table>…</table>`) — confirm against version control.
  const hasTable = html.includes("<table");
  const hasList = html.includes("<ul") || html.includes("<ol");
  if (!hasTable && !hasList) return html;

  let trCount = 0;
  let liCount = 0;
  if (hasTable) {
    trCount = html.match(/<tr[\s>]/giu)?.length ?? 0;
  }
  if (hasList) {
    liCount = html.match(/<li[\s>]/giu)?.length ?? 0;
  }

  if (trCount < STRIP_TABLE_ROW_THRESHOLD && liCount < STRIP_LIST_ITEM_THRESHOLD) return html;

  // Strip tables and/or lists if thresholds exceeded
  let result = html;
  if (trCount >= STRIP_TABLE_ROW_THRESHOLD) {
    result = result.replaceAll(/<table[\s\S]*?<\/table>/giu, "\n\n");
  }
  if (liCount >= STRIP_LIST_ITEM_THRESHOLD) {
    result = result.replaceAll(/<(ul|ol)[\s\S]*?<\/(ul|ol)>/giu, "\n\n[Long list]\n\n");
  }
  return result;
}

/**
 * Builds a Turndown service with the module's fixed Markdown style: fenced code blocks, ATX
 * headings, `-` bullets, `_` emphasis, `**` strong, the GFM plugin, and a custom link rule.
 *
 * @param removeImages - When `true`, registers a rule that renders every `<img>` as "".
 * @returns The configured service.
 */
function createMarkdownService(removeImages: boolean): TurndownService {
  const turndown = new TurndownService({
    codeBlockStyle: "fenced",
    headingStyle: "atx",
    bulletListMarker: "-",
    emDelimiter: "_",
    strongDelimiter: "**",
  });
  turndown.use(gfm);
  turndown.remove(["script", "style", "noscript", "template"]);

  if (removeImages) {
    turndown.addRule("removeImages", { filter: "img", replacement: () => "" });
  }

  turndown.addRule("stableLinks", {
    filter: "a",
    replacement: (content, node) => {
      const href = node.getAttribute("href");
      // Fast path: no href, return content as-is
      if (!href) return content;
      // Simple trim instead of full normalizeWhitespace for link labels
      const label = content.trim().replaceAll(/\s+/gu, " ");
      // A link whose label is empty after trimming is emitted as the bare URL.
      return label ? `[${label}](${href})` : href;
    },
  });

  return turndown;
}

// One shared service per image policy, built once at module load.
const removeImagesService = createMarkdownService(true);
const keepImagesService = createMarkdownService(false);