/**
 * Checks if the given HTML element contains markers indicating it was
 * generated by Notion. Notion uses `\n` in text nodes to represent hard
 * breaks, which is non-standard but intentional.
 *
 * Detected by the `<!-- notionvc: ... -->` comment that Notion places
 * on the clipboard.
 *
 * @param element - Root of the HTML fragment to inspect.
 * @returns `true` if any comment node under `element` has text starting with
 *   `notionvc:` (leading whitespace ignored), `false` otherwise.
 */
function isNotionHTML(element: HTMLElement): boolean {
  const walker = element.ownerDocument.createTreeWalker(
    element,
    // NodeFilter.SHOW_COMMENT
    128,
  );

  let node: Node | null;
  while ((node = walker.nextNode())) {
    // A null nodeValue is treated as empty text, which never matches.
    if (/^\s*notionvc:/.test(node.nodeValue || "")) {
      return true;
    }
  }
  return false;
}

/**
 * Normalizes whitespace in text nodes by collapsing runs of whitespace
 * (including newlines) to single spaces, matching CSS white-space:normal
 * behavior.
 *
 * This is needed because ProseMirror's DOMParser, when `linebreakReplacement`
 * is set in the schema (as BlockNote does for hard breaks), converts `\n`
 * characters in text nodes to hard break nodes instead of collapsing them.
 * This causes HTML source line wrapping (e.g. from MS Word) to create
 * visible line breaks in the editor.
 *
 * Skipped for sources like Notion that intentionally use `\n` in text nodes
 * to represent hard breaks instead of `<br>` tags.
 *
 * Skips `<pre>` and `<code>` elements where whitespace should be preserved.
 */
function normalizeTextNodeWhitespace(element: HTMLElement) {
  // Tags whose text content is left untouched.
  const verbatimTags = new Set(["PRE", "CODE"]);

  // Reports whether any ancestor of `textNode` strictly below `element` is
  // one of the verbatim tags. The climb stops at `element`, so the root's
  // own tag name is never inspected.
  const hasVerbatimAncestor = (textNode: Node): boolean => {
    for (
      let ancestor = textNode.parentElement;
      ancestor && ancestor !== element;
      ancestor = ancestor.parentElement
    ) {
      if (verbatimTags.has(ancestor.tagName)) {
        return true;
      }
    }
    return false;
  };

  const walker = element.ownerDocument.createTreeWalker(
    element,
    // NodeFilter.SHOW_TEXT
    4,
    {
      // 2 is NodeFilter.FILTER_REJECT, 1 is NodeFilter.FILTER_ACCEPT.
      acceptNode: (candidate: Node) => (hasVerbatimAncestor(candidate) ? 2 : 1),
    },
  );

  // Gather every accepted text node first; edits happen in a second pass.
  const pending: Text[] = [];
  for (let current = walker.nextNode(); current; current = walker.nextNode()) {
    pending.push(current as Text);
  }

  for (const text of pending) {
    const value = text.nodeValue;
    // Text without a CR or LF is left exactly as it is, even if it contains
    // repeated spaces or tabs.
    if (value && /[\r\n]/.test(value)) {
      text.nodeValue = value.replace(/[ \t\r\n\f]+/g, " ");
    }
  }
}

/**
 * Entry point for whitespace preprocessing of an HTML fragment.
 *
 * Collapses whitespace in the fragment's text nodes (via
 * `normalizeTextNodeWhitespace`) so it matches standard CSS
 * white-space:normal behavior. Fragments identified as Notion HTML by
 * `isNotionHTML` are left untouched, because Notion intentionally uses `\n`
 * for hard breaks.
 *
 * @param element - Root of the HTML fragment; its text nodes may be modified
 *   in place.
 */
export function preprocessHTMLWhitespace(element: HTMLElement) {
  if (isNotionHTML(element)) {
    // Notion content: newlines are meaningful, so nothing is normalized.
    return;
  }
  normalizeTextNodeWhitespace(element);
}