/** @file Serialize markdown module. */
import TurndownService from "turndown";
import { gfm } from "turndown-plugin-gfm";
import { normalizeWhitespace } from "./text.ts";
/**
 * Options accepted by `htmlToMarkdown`.
 */
export interface MarkdownOptions {
/**
 * Image policy for the conversion. Images are kept only when this is explicitly `false`;
 * leaving it unset behaves the same as `true` (images are removed).
 */
removeImages?: boolean;
}
/**
 * Renders cleaned HTML as stable Markdown intended for model-facing output.
 *
 * @remarks
 * Constructing a Turndown service registers rule objects and plugins, so this module keeps
 * one preconfigured service per image policy and reuses it here. That keeps setup cost out
 * of hot scrape paths without changing the deterministic output rules.
 *
 * @param html - Cleaned HTML markup to convert.
 * @param options - Conversion options. Images are kept only when `removeImages` is
 * explicitly `false`; otherwise they are removed.
 * @returns The Markdown produced by Turndown after being passed through
 * `normalizeWhitespace`.
 */
export function htmlToMarkdown(html: string, options: MarkdownOptions = {}): string {
  const keepImages = options.removeImages === false;
  const converter = keepImages ? keepImagesService : removeImagesService;
  // Pre-process the markup with stripLargeElements before handing it to Turndown.
  const prepared = stripLargeElements(html);
  const markdown = converter.turndown(prepared);
  return normalizeWhitespace(markdown);
}
/**
 * Strips tables and lists from large HTML before Turndown runs, to avoid expensive conversion
 * on element-heavy pages where the output is likely to be truncated anyway.
 *
 * Thresholds are evaluated across the whole document, not per element:
 * - input shorter than 40,000 characters is returned untouched;
 * - 20 or more `<tr>` tags: every `<table>…</table>` span is replaced with a blank line;
 * - 100 or more `<li>` tags: every `<ul>`/`<ol>` span is replaced with `[Long list]`.
 *
 * @param html - HTML markup to pre-process.
 * @returns `html` itself when no threshold is met, otherwise a copy with the matching
 * table and/or list spans replaced.
 */
function stripLargeElements(html: string): string {
  if (html.length < 40_000) return html;

  // Quick check: does HTML contain tables or lists at all? Case-insensitive so it agrees
  // with the `i` flag used by the counting and stripping patterns below.
  const hasTable = /<table[\s>]/iu.test(html);
  const hasList = /<[uo]l[\s>]/iu.test(html);
  if (!hasTable && !hasList) return html;

  let trCount = 0;
  let liCount = 0;
  if (hasTable) {
    const trMatches = html.match(/<tr[\s>]/giu);
    trCount = trMatches ? trMatches.length : 0;
  }
  if (hasList) {
    const liMatches = html.match(/<li[\s>]/giu);
    liCount = liMatches ? liMatches.length : 0;
  }
  if (trCount < 20 && liCount < 100) return html;

  // Strip tables and/or lists if thresholds exceeded. The lazy quantifier stops at the first
  // closing tag, so for nested tables or lists only the span up to the innermost close is
  // removed and the remainder of the outer element is left in place.
  let result = html;
  if (trCount >= 20) {
    result = result.replace(/<table[\s\S]*?<\/table>/giu, "\n\n");
  }
  if (liCount >= 100) {
    result = result.replace(/<(ul|ol)[\s\S]*?<\/(ul|ol)>/giu, "\n\n[Long list]\n\n");
  }
  return result;
}
/**
 * Builds a Turndown service configured for this module's Markdown output.
 *
 * The service uses fenced code blocks, ATX headings, `-` bullets, `_` emphasis and `**`
 * strong delimiters, applies the GFM plugin, and removes `script`, `style`, `noscript` and
 * `template` elements.
 *
 * @param removeImages - When `true`, registers a rule that replaces every `img` element
 * with an empty string.
 * @returns The configured Turndown service.
 */
function createMarkdownService(removeImages: boolean): TurndownService {
  const service = new TurndownService({
    codeBlockStyle: "fenced",
    headingStyle: "atx",
    bulletListMarker: "-",
    emDelimiter: "_",
    strongDelimiter: "**",
  });

  service.use(gfm);
  service.remove(["script", "style", "noscript", "template"]);

  if (removeImages) {
    service.addRule("removeImages", { filter: "img", replacement: () => "" });
  }

  service.addRule("stableLinks", {
    filter: "a",
    replacement: (content, node) => {
      const target = node.getAttribute("href");
      // Anchors with a missing or empty href contribute only their content.
      if (!target) {
        return content;
      }
      // Collapse whitespace runs in the label inline rather than calling normalizeWhitespace.
      const text = content.trim().replaceAll(/\s+/gu, " ");
      // An anchor with no visible text falls back to the bare URL.
      if (text === "") {
        return target;
      }
      return `[${text}](${target})`;
    },
  });

  return service;
}
// Module-level services, built once when this module is evaluated: one that drops images
// and one that keeps them. `htmlToMarkdown` picks between them per call based on
// `options.removeImages`, so no Turndown service is constructed on the conversion path.
const removeImagesService = createMarkdownService(true);
const keepImagesService = createMarkdownService(false);