import { isVideoUrl } from "../../../util/string.js";

/**
 * Custom markdown-to-HTML converter for BlockNote.
 * Replaces the unified/remark/rehype pipeline with a direct, minimal implementation
 * that handles exactly the markdown features BlockNote needs.
 */

// ─── HTML Escaping ───────────────────────────────────────────────────────────

/**
 * Escapes the characters `&`, `<`, `>` and `"` so `str` can be emitted as
 * HTML text.
 *
 * `&` is replaced first; otherwise the ampersands of the entities produced by
 * the later replacements would themselves be escaped a second time.
 *
 * @param str Raw text.
 * @returns The text with the four characters replaced by HTML entities.
 */
function escapeHtml(str: string): string {
  return str
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}

// ─── Helpers ─────────────────────────────────────────────────────────────────

/**
 * Returns true when `char` matches `\w` (ASCII letter, digit or underscore).
 * A missing or empty character is never alphanumeric.
 */
function isAlphanumeric(char: string | undefined): boolean {
  if (!char) {
    return false;
  }
  return /\w/.test(char);
}

/**
 * Returns true when an underscore delimiter at position `i` is "intraword",
 * meaning the characters on both sides are alphanumeric (e.g. `snake_case`).
 * In that case the underscore should NOT be treated as emphasis per CommonMark.
 *
 * @param text Full inline text being parsed.
 * @param i Index of the first delimiter character.
 * @param delimLen Number of delimiter characters (1, 2 or 3).
 */
function isIntraword(text: string, i: number, delimLen: number): boolean {
  const before = i > 0 ? text[i - 1] : undefined;
  const after = i + delimLen < text.length ? text[i + delimLen] : undefined;
  return isAlphanumeric(before) && isAlphanumeric(after);
}

// ─── Inline Parser ───────────────────────────────────────────────────────────

/**
 * An inline tokenizer inspects `text` at index `i`. On a match it returns the
 * HTML to emit and the index just past the consumed input; otherwise `null`.
 */
type InlineTokenizer = (
  text: string,
  i: number
) => { html: string; end: number } | null;

/**
 * Handles a backslash at `i`: either a hard line break (backslash directly
 * before a newline) or an escaped punctuation character, which is emitted
 * literally (HTML-escaped). Any other backslash is left for plain-text
 * handling.
 */
function tryBackslashEscape(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (text[i] !== "\\" || i + 1 >= text.length) {
    return null;
  }
  const next = text[i + 1];
  // Hard line break: backslash at end of line
  if (next === "\n") {
    return { html: "<br />\n", end: i + 2 };
  }
  // Escapable characters
  if ("\\`*_{}[]()#+-.!~|>".includes(next)) {
    return { html: escapeHtml(next), end: i + 2 };
  }
  return null;
}

/** Delegates to `parseInlineCode` when a backtick starts at `i`. */
function tryInlineCode(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (text[i] !== "`") {
    return null;
  }
  return parseInlineCode(text, i);
}

/** Delegates to `parseImage` when `![` starts at `i`. */
function tryImage(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (text[i] !== "!" || text[i + 1] !== "[") {
    return null;
  }
  return parseImage(text, i);
}

/** Delegates to `parseLink` when `[` starts at `i`. */
function tryLink(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (text[i] !== "[") {
    return null;
  }
  return parseLink(text, i);
}

/** Strikethrough: `~~text~~` is wrapped in `<del>`. */
function tryStrikethrough(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (text[i] !== "~" || text[i + 1] !== "~") {
    return null;
  }
  return parseDelimited(text, i, "~~", "<del>", "</del>");
}

/**
 * Bold + italic: `***text***` or `___text___`. The underscore form is
 * rejected when the delimiter run sits inside a word.
 */
function tryBoldItalic(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (
    (text[i] === "*" && text[i + 1] === "*" && text[i + 2] === "*") ||
    (text[i] === "_" &&
      text[i + 1] === "_" &&
      text[i + 2] === "_" &&
      !isIntraword(text, i, 3))
  ) {
    const delimiter = text.substring(i, i + 3);
    return parseDelimited(
      text,
      i,
      delimiter,
      "<em><strong>",
      "</strong></em>"
    );
  }
  return null;
}

/**
 * Bold: `**text**` or `__text__`. The underscore form is rejected when the
 * delimiter run sits inside a word.
 */
function tryBold(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (
    (text[i] === "*" && text[i + 1] === "*") ||
    (text[i] === "_" && text[i + 1] === "_" && !isIntraword(text, i, 2))
  ) {
    const delimiter = text.substring(i, i + 2);
    return parseDelimited(text, i, delimiter, "<strong>", "</strong>");
  }
  return null;
}

/**
 * Italic: `*text*` or `_text_`. The underscore form is rejected when the
 * delimiter sits inside a word (e.g. `snake_case`).
 */
function tryItalic(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (text[i] === "*" || (text[i] === "_" && !isIntraword(text, i, 1))) {
    return parseDelimited(text, i, text[i], "<em>", "</em>");
  }
  return null;
}

/** A bare newline inside inline content is emitted as a line break. */
function trySoftBreak(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (text[i] === "\n") {
    return { html: "<br />\n", end: i + 1 };
  }
  return null;
}

// Inline raw HTML: pass through tags, comments, CDATA, processing
// instructions, and declarations verbatim so authors can mix HTML into
// markdown (e.g. `text <span>foo</span> more`). Anything that doesn't match
// these shapes falls through and gets HTML-escaped as plain text.
const INLINE_HTML_TAG_RE =
  /^<\/?[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))?)*\s*\/?>/;
const HTML_COMMENT_RE = /^<!--[\s\S]*?-->/;
const HTML_CDATA_RE = /^<!\[CDATA\[[\s\S]*?\]\]>/;
const HTML_PI_RE = /^<\?[\s\S]*?\?>/;
const HTML_DECL_RE = /^<![a-zA-Z][^>]*>/;

/**
 * Passes a raw HTML construct starting at `i` through unchanged. The patterns
 * are anchored to the start of the remaining text and tried from the most
 * specific (`<!--`, `<![CDATA[`, `<?`, `<!`) to ordinary open/close tags.
 */
function tryInlineHtml(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (text[i] !== "<") {
    return null;
  }
  const rest = text.substring(i);
  for (const re of [
    HTML_COMMENT_RE,
    HTML_CDATA_RE,
    HTML_PI_RE,
    HTML_DECL_RE,
    INLINE_HTML_TAG_RE,
  ]) {
    const m = rest.match(re);
    if (m) {
      return { html: m[0], end: i + m[0].length };
    }
  }
  return null;
}

/** Characters that can start an inline syntax token. */
const SPECIAL_CHARS = new Set("\\`![~*_\n<");

/**
 * Ordered array of inline tokenizers, tried in priority order.
 * The first match wins.
 */
const inlineTokenizers: InlineTokenizer[] = [
  tryBackslashEscape,
  tryInlineCode,
  tryImage,
  tryLink,
  tryStrikethrough,
  tryBoldItalic, // *** / ___
  tryBold, // ** / __
  tryItalic, // * / _
  tryInlineHtml,
  trySoftBreak,
];

/**
 * Parse inline markdown syntax and return HTML.
 * Handles: bold, italic, bold+italic, strikethrough, inline code,
 * links, images (with video detection), inline raw HTML, hard line breaks,
 * backslash escapes.
 *
 * @param text Inline markdown source (may contain newlines).
 * @returns The generated HTML; text that matches no syntax is HTML-escaped.
 */
function parseInline(text: string): string {
  let result = "";
  let i = 0;

  while (i < text.length) {
    // Hard line break: 2+ trailing spaces immediately before a newline.
    // (The other hard-break form, backslash + newline, is handled by
    // tryBackslashEscape.) Strip the trailing spaces from the accumulated
    // result before emitting the <br />.
    if (
      text[i] === "\n" &&
      i >= 2 &&
      text[i - 1] === " " &&
      text[i - 2] === " "
    ) {
      result = result.replace(/ +$/, "");
      result += "<br />\n";
      i++;
      continue;
    }

    // Try each tokenizer in priority order
    let matched = false;
    if (SPECIAL_CHARS.has(text[i])) {
      for (const tokenizer of inlineTokenizers) {
        const r = tokenizer(text, i);
        // Only accept matches that consume input: a zero-width match would
        // leave `i` unchanged and spin this loop forever.
        if (r && r.end > i) {
          result += r.html;
          i = r.end;
          matched = true;
          break;
        }
      }
    }

    if (!matched) {
      // Batch consecutive plain-text characters and escape once. The first
      // character is always consumed, even if it is a special character that
      // no tokenizer accepted, so the loop is guaranteed to advance.
      const runStart = i;
      i++;
      while (i < text.length && !SPECIAL_CHARS.has(text[i])) {
        i++;
      }
      result += escapeHtml(text.substring(runStart, i));
    }
  }

  return result;
}

/**
 * Parses a code span whose opening backtick run starts at `start`.
 *
 * The span is closed by the first later backtick run of exactly the same
 * length; runs of any other length are part of the content.
 *
 * @param text Full inline text being parsed.
 * @param start Index of the first opening backtick.
 * @returns The `<code>` HTML and the index just past the closing run, or
 *   `null` when no matching closing run exists.
 */
function parseInlineCode(
  text: string,
  start: number
): { html: string; end: number } | null {
  // Count opening backticks
  let openCount = 0;
  let i = start;
  while (i < text.length && text[i] === "`") {
    openCount++;
    i++;
  }

  // Find matching closing backticks
  let j = i;
  while (j < text.length) {
    if (text[j] === "`") {
      let closeCount = 0;
      const closeStart = j;
      while (j < text.length && text[j] === "`") {
        closeCount++;
        j++;
      }
      if (closeCount === openCount) {
        let code = text.substring(i, closeStart);
        // Per CommonMark: line endings inside a code span are converted to
        // single spaces, then if the result starts AND ends with a space and
        // is not all-spaces, one leading + trailing space is stripped (so a
        // span containing " foo " renders as <code>foo</code>).
        code = code.replace(/\n/g, " ");
        if (
          code.length >= 2 &&
          code[0] === " " &&
          code[code.length - 1] === " " &&
          /[^ ]/.test(code)
        ) {
          code = code.substring(1, code.length - 1);
        }
        return {
          html: `<code>${escapeHtml(code)}</code>`,
          end: j,
        };
      }
    } else {
      j++;
    }
  }

  return null;
}

function parseImage( text: string, start: number ): { html: string; end: number } | null { // ![alt](url) or ![alt](url "title") // Use balanced bracket matching to handle nested/escaped brackets in alt text const altEnd = findClosingBracket(text, start + 1); if (altEnd === -1) {return null;} const altStart = start + 2; // after ![ if (text[altEnd + 1] !== "(") {return null;} const urlStart = altEnd + 2; const parenEnd = findClosingParen(text, urlStart - 1); if (parenEnd === -1) {return null;} const alt = text.substring(altStart, altEnd); const { url, title } = parseDestinationAndTitle( text.substring(urlStart, parenEnd), ); if (isVideoUrl(url)) { // Use the alt text as the video's display name (falling back to the // title) so a video link written with the standard `![name](url)` form // round-trips into BlockNote's video block. Captioned videos go through // raw `