/**
 * Reports whether the subtree rooted at `element` carries the marker that
 * Notion places on the clipboard: an HTML comment whose text starts with
 * `notionvc:` (optionally preceded by whitespace), i.e. a
 * `<!-- notionvc: ... -->` comment.
 *
 * Notion uses `\n` in text nodes to represent hard breaks, which is
 * non-standard but intentional, so such content must be recognisable.
 *
 * @param element - Root of the subtree whose comment nodes are inspected.
 * @returns `true` if a matching comment is found, `false` otherwise.
 */
function isNotionHTML(element: HTMLElement): boolean {
  const notionMarker = /^\s*notionvc:/;

  // 128 is the numeric value of NodeFilter.SHOW_COMMENT, so only comment
  // nodes are visited.
  const comments = element.ownerDocument.createTreeWalker(element, 128);

  for (
    let current = comments.nextNode();
    current;
    current = comments.nextNode()
  ) {
    // A missing nodeValue is treated as an empty string, which never matches.
    const text = current.nodeValue || "";
    if (notionMarker.test(text)) {
      return true;
    }
  }

  return false;
}
/**
 * Normalizes whitespace in text nodes by collapsing runs of whitespace
 * (spaces, tabs, carriage returns, newlines, form feeds) to single spaces,
 * matching CSS `white-space: normal` behavior. Only text nodes that contain
 * at least one `\r` or `\n` are rewritten; all others are left untouched.
 *
 * This is needed because ProseMirror's DOMParser, when `linebreakReplacement`
 * is set in the schema (as BlockNote does for hard breaks), converts `\n`
 * characters in text nodes to hard break nodes instead of collapsing them.
 * This causes HTML source line wrapping (e.g. from MS Word) to create
 * visible line breaks in the editor.
 *
 * Callers skip this for sources like Notion that intentionally use `\n` in
 * text nodes to represent hard breaks instead of `<br>` tags.
 *
 * Text inside `<pre>` and `<code>` elements is skipped, since whitespace
 * should be preserved there. This includes the case where `element` itself
 * is a `<pre>` or `<code>`.
 *
 * @param element - Root of the subtree whose text nodes are normalized in
 *   place.
 */
function normalizeTextNodeWhitespace(element: HTMLElement): void {
  const preserveWSTags = new Set(["PRE", "CODE"]);

  const walker = element.ownerDocument.createTreeWalker(
    element,
    // NodeFilter.SHOW_TEXT
    4,
    {
      acceptNode(node) {
        // Walk up from the text node to the root (inclusive). The root is
        // checked too, so a <pre>/<code> passed in directly keeps its
        // whitespace.
        for (
          let ancestor = node.parentElement;
          ancestor;
          ancestor = ancestor.parentElement
        ) {
          if (preserveWSTags.has(ancestor.tagName)) {
            // NodeFilter.FILTER_REJECT
            return 2;
          }
          if (ancestor === element) {
            break;
          }
        }
        // NodeFilter.FILTER_ACCEPT
        return 1;
      },
    },
  );

  // Collect first, then mutate, so the traversal never observes our edits.
  const textNodes: Text[] = [];
  for (let current = walker.nextNode(); current; current = walker.nextNode()) {
    textNodes.push(current as Text);
  }

  for (const textNode of textNodes) {
    const value = textNode.nodeValue;
    // Only touch nodes that contain a line break; plain runs of spaces are
    // left as they are.
    if (value && /[\r\n]/.test(value)) {
      textNode.nodeValue = value.replace(/[ \t\r\n\f]+/g, " ");
    }
  }
}
/**
 * Entry point for whitespace preprocessing of parsed HTML.
 *
 * Runs `normalizeTextNodeWhitespace` on `element` so that whitespace in its
 * text nodes matches standard CSS `white-space: normal` behavior, unless
 * `isNotionHTML` identifies the content as coming from Notion, which
 * intentionally uses `\n` for hard breaks. In that case the element is left
 * untouched.
 *
 * @param element - Root of the HTML subtree to preprocess in place.
 */
export function preprocessHTMLWhitespace(element: HTMLElement) {
  // Notion content must keep its newlines, so bail out before normalizing.
  if (isNotionHTML(element)) {
    return;
  }

  normalizeTextNodeWhitespace(element);
}