export function sanitizeMarkdown(input: string): string {
let content = input;
// Remove anchor-only links
content = content.replace(/<\/a>/gi, "");
content = content.replace(/]*>(.*?)<\/a>/gis, "$1");
content = removeNamedToc(content);
content = removeAnchorLists(content);
content = stripMarkdownLinks(content);
content = removeWbrTags(content);
content = stripHtmlTags(content);
content = fixBadUnicode(content);
content = unescapeUnderscoresInCode(content);
content = decodeEscapedNewlines(content);
content = collapseBlankLines(content);
content = convertSetextHeadings(content);
return content.trim();
}
export function hasCodeSnippets(content: string | null | undefined): boolean {
if (!content?.trim()) return false;
// Fenced code blocks
if (/```[\s\S]*?```/m.test(content)) return true;
if (/~~~[\s\S]*?~~~/m.test(content)) return true;
// Inline code
return /`[^`]+`/.test(content);
}
export function bodyLength(content: string): number {
const withoutFrontmatter = content.replace(/^---[\s\S]*?---\s*/, "");
const squeezed = withoutFrontmatter.replace(/\s+/g, "");
return squeezed.length;
}
function removeNamedToc(markdown: string): string {
const lines = markdown.split(/\r?\n/);
const result: string[] = [];
let skippingToc = false;
for (const line of lines) {
const trimmed = line.trim();
if (!skippingToc && /^#{1,6}\s+(?:table of contents|contents|on this page)\s*$/i.test(trimmed)) {
skippingToc = true;
continue;
}
if (skippingToc) {
if (trimmed === "" || /^\s*[-*+]\s+\[[^\]]+]\(#.*\)\s*$/i.test(line)) {
continue;
}
if (/^#{1,6}\s+/.test(trimmed)) {
skippingToc = false;
result.push(line);
continue;
}
skippingToc = false;
}
result.push(line);
}
return result.join("\n");
}
function removeAnchorLists(markdown: string): string {
const lines = markdown.split(/\r?\n/);
return lines.filter((line) => !/^\s*[-*+]\s+\[[^\]]+]\(#.*\)\s*$/i.test(line)).join("\n");
}
function stripMarkdownLinks(markdown: string): string {
let content = markdown;
// Images
content = content.replace(/!\[([^\]]*?)\]\(([^)]+)\)/g, "$1");
// Links - keep text with URL in parens
content = content.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, text, url) => {
return `${text.trim()} (${url})`;
});
return content;
}
function decodeEscapedNewlines(markdown: string): string {
return markdown.replace(/\\r\\n|\\n/g, "\n");
}
function collapseBlankLines(markdown: string): string {
return markdown.replace(/\n{3,}/g, "\n\n");
}
function convertSetextHeadings(markdown: string): string {
const lines = markdown.split(/\r?\n/);
const result: string[] = [];
for (let i = 0; i < lines.length; i++) {
const line = lines[i] ?? "";
const next = lines[i + 1];
const trimmedNext = next?.trim();
if (next !== undefined && /^={3,}$/.test(trimmedNext ?? "")) {
result.push(`# ${line.trim()}`);
i++;
continue;
}
if (next !== undefined && /^-{3,}$/.test(trimmedNext ?? "")) {
result.push(`## ${line.trim()}`);
i++;
continue;
}
result.push(line);
}
return result.join("\n");
}
function removeWbrTags(markdown: string): string {
return markdown.replace(/<\/?wbr\s*\/?>/gi, "");
}
function fixBadUnicode(markdown: string): string {
const replacements: Record = {
"\u00E2\u0080\u0099": "\u2019", // '
"\u00E2\u0080\u0098": "\u2018", // '
"\u00E2\u0080\u009C": "\u201C", // "
"\u00E2\u0080\u009D": "\u201D", // "
"\u00E2\u0080\u0093": "\u2013", // –
"\u00E2\u0080\u0094": "\u2014", // —
"\u00E2\u0080\u00A6": "\u2026", // …
"\u00C2\u00A0": " ",
};
let result = markdown;
for (const [bad, good] of Object.entries(replacements)) {
result = result.split(bad).join(good);
}
return result;
}
function unescapeUnderscoresInCode(markdown: string): string {
const blocks: Record = {};
let content = markdown.replace(/(```|~~~)[\s\S]*?\1/gm, (match) => {
const key = `___CODE_BLOCK_${Object.keys(blocks).length}___`;
blocks[key] = match.replace(/\\_/g, "_");
return key;
});
content = content.replace(/`[^`]+`/g, (match) => match.replace(/\\_/g, "_"));
for (const [key, block] of Object.entries(blocks)) {
content = content.replace(key, block);
}
return content;
}
function stripHtmlTags(markdown: string): string {
const placeholders: Record = {};
let content = markdown.replace(/```[\s\S]*?```/g, (match) => {
const key = `___CODE_FENCE_${Object.keys(placeholders).length}___`;
placeholders[key] = match;
return key;
});
// Strip style and script tags
content = content.replace(/