import { Arr, Fun, Obj, Strings, Type } from '@ephox/katamari';
import { Attribute, NodeTypes, Remove, Replication, SugarElement } from '@ephox/sugar';
import createDompurify, { Config, DOMPurifyI } from 'dompurify';
import * as TransparentElements from '../../content/TransparentElements';
import * as NodeType from '../../dom/NodeType';
import * as FilterNode from '../../html/FilterNode';
import * as FilterRegistry from '../../html/FilterRegistry';
import * as InvalidNodes from '../../html/InvalidNodes';
import * as LegacyFilter from '../../html/LegacyFilter';
import * as ParserFilters from '../../html/ParserFilters';
import { isEmpty, isLineBreakNode, isPaddedWithNbsp, paddEmptyNode } from '../../html/ParserUtils';
import { BlobCache } from '../file/BlobCache';
import Tools from '../util/Tools';
import * as URI from '../util/URI';
import AstNode from './Node';
import Schema, { getTextRootBlockElements, SchemaMap, SchemaRegExpMap } from './Schema';
/**
* @summary
* This class parses HTML code into a DOM like structure of nodes it will remove redundant whitespace and make
* sure that the node tree is valid according to the specified schema.
* So for example: `
a
b
c` will become `a
b
c
`.
*
* @example
* const parser = tinymce.html.DomParser({ validate: true }, schema);
* const rootNode = parser.parse('content
');
*
* @class tinymce.html.DomParser
* @version 3.4
*/
const makeMap = Tools.makeMap, extend = Tools.extend;
export interface ParserArgs {
getInner?: boolean | number;
forced_root_block?: boolean | string;
context?: string;
isRootContent?: boolean;
format?: string;
invalid?: boolean;
no_events?: boolean;
// TODO finish typing the parser args
[key: string]: any;
}
export type ParserFilterCallback = (nodes: AstNode[], name: string, args: ParserArgs) => void;
export interface ParserFilter extends FilterRegistry.Filter {}
export interface DomParserSettings {
allow_html_data_urls?: boolean;
allow_svg_data_urls?: boolean;
allow_conditional_comments?: boolean;
allow_html_in_named_anchor?: boolean;
allow_script_urls?: boolean;
allow_unsafe_link_target?: boolean;
convert_fonts_to_spans?: boolean;
fix_list_elements?: boolean;
font_size_legacy_values?: string;
forced_root_block?: boolean | string;
forced_root_block_attrs?: Record;
preserve_cdata?: boolean;
remove_trailing_brs?: boolean;
root_name?: string;
validate?: boolean;
inline_styles?: boolean;
blob_cache?: BlobCache;
document?: Document;
}
interface DomParser {
schema: Schema;
addAttributeFilter: (name: string, callback: ParserFilterCallback) => void;
getAttributeFilters: () => ParserFilter[];
removeAttributeFilter: (name: string, callback?: ParserFilterCallback) => void;
addNodeFilter: (name: string, callback: ParserFilterCallback) => void;
getNodeFilters: () => ParserFilter[];
removeNodeFilter: (name: string, callback?: ParserFilterCallback) => void;
parse: (html: string, args?: ParserArgs) => AstNode;
}
type WalkerCallback = (node: AstNode) => void;
const basePurifyConfig: Config = {
IN_PLACE: true,
ALLOW_UNKNOWN_PROTOCOLS: true,
// Deliberately ban all tags and attributes by default, and then un-ban them on demand in hooks
// #comment and #cdata-section are always allowed as they aren't controlled via the schema
// body is also allowed due to the DOMPurify checking the root node before sanitizing
ALLOWED_TAGS: [ '#comment', '#cdata-section', 'body' ],
ALLOWED_ATTR: []
};
// A list of attributes that should be filtered further based on the parser settings
const filteredUrlAttrs = Tools.makeMap('src,href,data,background,action,formaction,poster,xlink:href');
const internalElementAttr = 'data-mce-type';
const getPurifyConfig = (settings: DomParserSettings, mimeType: string): Config => {
const config = { ...basePurifyConfig };
// Set the relevant parser mimetype
config.PARSER_MEDIA_TYPE = mimeType;
// Allow any URI when allowing script urls
if (settings.allow_script_urls) {
config.ALLOWED_URI_REGEXP = /.*/;
// Allow anything except javascript (or similar) URIs if all html data urls are allowed
} else if (settings.allow_html_data_urls) {
config.ALLOWED_URI_REGEXP = /^(?!(\w+script|mhtml):)/i;
}
return config;
};
const setupPurify = (settings: DomParserSettings, schema: Schema): DOMPurifyI => {
const purify = createDompurify();
const specialElements = schema.getSpecialElements();
const validate = settings.validate;
let uid = 0;
// We use this to add new tags to the allow-list as we parse, if we notice that a tag has been banned but it's still in the schema
purify.addHook('uponSanitizeElement', (ele, evt) => {
// Pad conditional comments if they aren't allowed
if (ele.nodeType === NodeTypes.COMMENT && !settings.allow_conditional_comments && /^\[if/i.test(ele.nodeValue ?? '')) {
ele.nodeValue = ' ' + ele.nodeValue;
}
// Just leave non-elements such as text and comments up to dompurify
const tagName = evt.tagName;
if (ele.nodeType !== NodeTypes.ELEMENT || tagName === 'body') {
return;
}
// Construct the sugar element wrapper
const element = SugarElement.fromDom(ele);
const lcTagName = tagName.toLowerCase();
// Determine if we're dealing with an internal attribute
const isInternalElement = Attribute.has(element, internalElementAttr);
// Cleanup bogus elements
const bogus = Attribute.get(element, 'data-mce-bogus');
if (!isInternalElement && Type.isString(bogus)) {
if (bogus === 'all') {
Remove.remove(element);
} else {
Remove.unwrap(element);
}
return;
}
// Determine if the schema allows the element and either add it or remove it
const rule = schema.getElementRule(lcTagName);
if (validate && !rule) {
// If a special element is invalid, then remove the entire element instead of unwrapping
if (Obj.has(specialElements, lcTagName)) {
Remove.remove(element);
} else {
Remove.unwrap(element);
}
return;
} else {
evt.allowedTags[tagName] = true;
}
// Validate the element using the attribute rules
if (validate && rule && !isInternalElement) {
// Fix the attributes for the element, unwrapping it if we have to
Arr.each(rule.attributesForced ?? [], (attr) => {
Attribute.set(element, attr.name, attr.value === '{$uid}' ? `mce_${uid++}` : attr.value);
});
Arr.each(rule.attributesDefault ?? [], (attr) => {
if (!Attribute.has(element, attr.name)) {
Attribute.set(element, attr.name, attr.value === '{$uid}' ? `mce_${uid++}` : attr.value);
}
});
// If none of the required attributes were found then remove
if (rule.attributesRequired && !Arr.exists(rule.attributesRequired, (attr) => Attribute.has(element, attr))) {
Remove.unwrap(element);
return;
}
// If there are no attributes then remove
if (rule.removeEmptyAttrs && Attribute.hasNone(element)) {
Remove.unwrap(element);
return;
}
// Change the node name if the schema says to
if (rule.outputName && rule.outputName !== lcTagName) {
Replication.mutate(element, rule.outputName as keyof HTMLElementTagNameMap);
}
}
});
// Let's do the same thing for attributes
purify.addHook('uponSanitizeAttribute', (ele, evt) => {
const tagName = ele.tagName.toLowerCase();
const { attrName, attrValue } = evt;
evt.keepAttr = !validate || schema.isValid(tagName, attrName) || Strings.startsWith(attrName, 'data-') || Strings.startsWith(attrName, 'aria-');
if (attrName in filteredUrlAttrs && URI.isInvalidUri(settings, attrValue, tagName)) {
evt.keepAttr = false;
}
if (evt.keepAttr) {
evt.allowedAttributes[attrName] = true;
if (attrName in schema.getBoolAttrs()) {
evt.attrValue = attrName;
}
// We need to tell DOMPurify to forcibly keep the attribute if it's an SVG data URI and svg data URIs are allowed
if (settings.allow_svg_data_urls && Strings.startsWith(attrValue, 'data:image/svg+xml')) {
evt.forceKeepAttr = true;
}
// For internal elements always keep the attribute if the attribute name is id, class or style
} else if (ele.hasAttribute(internalElementAttr) && (attrName === 'id' || attrName === 'class' || attrName === 'style')) {
evt.forceKeepAttr = true;
}
});
return purify;
};
const transferChildren = (parent: AstNode, nativeParent: Node, specialElements: SchemaRegExpMap) => {
const parentName = parent.name;
// Exclude the special elements where the content is RCDATA as their content needs to be parsed instead of being left as plain text
// See: https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
const isSpecial = parentName in specialElements && parentName !== 'title' && parentName !== 'textarea';
const childNodes = nativeParent.childNodes;
for (let ni = 0, nl = childNodes.length; ni < nl; ni++) {
const nativeChild = childNodes[ni];
const child = new AstNode(nativeChild.nodeName.toLowerCase(), nativeChild.nodeType);
if (NodeType.isElement(nativeChild)) {
const attributes = nativeChild.attributes;
for (let ai = 0, al = attributes.length; ai < al; ai++) {
const attr = attributes[ai];
child.attr(attr.name, attr.value);
}
} else if (NodeType.isText(nativeChild)) {
child.value = nativeChild.data;
if (isSpecial) {
child.raw = true;
}
} else if (NodeType.isComment(nativeChild) || NodeType.isCData(nativeChild) || NodeType.isPi(nativeChild)) {
child.value = nativeChild.data;
}
transferChildren(child, nativeChild, specialElements);
parent.append(child);
}
};
const walkTree = (root: AstNode, preprocessors: WalkerCallback[], postprocessors: WalkerCallback[]) => {
const traverseOrder: AstNode[] = [];
for (let node: AstNode | null | undefined = root, lastNode = node; node; lastNode = node, node = node.walk()) {
const tempNode = node;
Arr.each(preprocessors, (preprocess) => preprocess(tempNode));
if (Type.isNullable(tempNode.parent) && tempNode !== root) {
// The node has been detached, so rewind a little and don't add it to our traversal
node = lastNode;
} else {
traverseOrder.push(tempNode);
}
}
for (let i = traverseOrder.length - 1; i >= 0; i--) {
const node = traverseOrder[i];
Arr.each(postprocessors, (postprocess) => postprocess(node));
}
};
// All the dom operations we want to perform, regardless of whether we're trying to properly validate things
// e.g. removing excess whitespace
// e.g. removing empty nodes (or padding them with
)
//
// Returns [ preprocess, postprocess ]
const whitespaceCleaner = (root: AstNode, schema: Schema, settings: DomParserSettings, args: ParserArgs): [WalkerCallback, WalkerCallback] => {
const validate = settings.validate;
const nonEmptyElements = schema.getNonEmptyElements();
const whitespaceElements = schema.getWhitespaceElements();
const blockElements: Record = extend(makeMap('script,style,head,html,body,title,meta,param'), schema.getBlockElements());
const textRootBlockElements = getTextRootBlockElements(schema);
const allWhiteSpaceRegExp = /[ \t\r\n]+/g;
const startWhiteSpaceRegExp = /^[ \t\r\n]+/;
const endWhiteSpaceRegExp = /[ \t\r\n]+$/;
const hasWhitespaceParent = (node: AstNode) => {
let tempNode = node.parent;
while (Type.isNonNullable(tempNode)) {
if (tempNode.name in whitespaceElements) {
return true;
} else {
tempNode = tempNode.parent;
}
}
return false;
};
const isTextRootBlockEmpty = (node: AstNode) => {
let tempNode: AstNode | null | undefined = node;
while (Type.isNonNullable(tempNode)) {
if (tempNode.name in textRootBlockElements) {
return isEmpty(schema, nonEmptyElements, whitespaceElements, tempNode);
} else {
tempNode = tempNode.parent;
}
}
return false;
};
const isBlock = (node: AstNode) => node.name in blockElements && !TransparentElements.isTransparentAstInline(schema, node);
const isAtEdgeOfBlock = (node: AstNode, start: boolean): boolean => {
const neighbour = start ? node.prev : node.next;
if (Type.isNonNullable(neighbour) || Type.isNullable(node.parent)) {
return false;
}
// Make sure our parent is actually a block, and also make sure it isn't a temporary "context" element
// that we're probably going to unwrap as soon as we insert this content into the editor
return isBlock(node.parent) && (node.parent !== root || args.isRootContent === true);
};
const preprocess = (node: AstNode) => {
if (node.type === 3) {
// Remove leading whitespace here, so that all whitespace in nodes to the left of us has already been fixed
if (!hasWhitespaceParent(node)) {
let text = node.value ?? '';
text = text.replace(allWhiteSpaceRegExp, ' ');
if (isLineBreakNode(node.prev, isBlock) || isAtEdgeOfBlock(node, true)) {
text = text.replace(startWhiteSpaceRegExp, '');
}
if (text.length === 0) {
node.remove();
} else {
node.value = text;
}
}
}
};
const postprocess = (node: AstNode) => {
if (node.type === 1) {
// Check for empty nodes here, because children will have been processed and (if necessary) emptied / removed already
const elementRule = schema.getElementRule(node.name);
if (validate && elementRule) {
const isNodeEmpty = isEmpty(schema, nonEmptyElements, whitespaceElements, node);
if (elementRule.paddInEmptyBlock && isNodeEmpty && isTextRootBlockEmpty(node)) {
paddEmptyNode(args, isBlock, node);
} else if (elementRule.removeEmpty && isNodeEmpty) {
if (isBlock(node)) {
node.remove();
} else {
node.unwrap();
}
} else if (elementRule.paddEmpty && (isNodeEmpty || isPaddedWithNbsp(node))) {
paddEmptyNode(args, isBlock, node);
}
}
} else if (node.type === 3) {
// Removing trailing whitespace here, so that all whitespace in nodes to the right of us has already been fixed
if (!hasWhitespaceParent(node)) {
let text = node.value ?? '';
if (node.next && isBlock(node.next) || isAtEdgeOfBlock(node, false)) {
text = text.replace(endWhiteSpaceRegExp, '');
}
if (text.length === 0) {
node.remove();
} else {
node.value = text;
}
}
}
};
return [ preprocess, postprocess ];
};
const getRootBlockName = (settings: DomParserSettings, args: ParserArgs) => {
const name = args.forced_root_block ?? settings.forced_root_block;
if (name === false) {
return '';
} else if (name === true) {
return 'p';
} else {
return name;
}
};
const DomParser = (settings: DomParserSettings = {}, schema = Schema()): DomParser => {
const nodeFilterRegistry = FilterRegistry.create();
const attributeFilterRegistry = FilterRegistry.create();
// Apply setting defaults
const defaultedSettings = {
validate: true,
root_name: 'body',
...settings
};
const parser = new DOMParser();
const purify = setupPurify(defaultedSettings, schema);
const parseAndSanitizeWithContext = (html: string, rootName: string, format: string = 'html'): Element => {
const mimeType = format === 'xhtml' ? 'application/xhtml+xml' : 'text/html';
// Determine the root element to wrap the HTML in when parsing. If we're dealing with a
// special element then we need to wrap it so the internal content is handled appropriately.
const isSpecialRoot = Obj.has(schema.getSpecialElements(), rootName.toLowerCase());
const content = isSpecialRoot ? `<${rootName}>${html}${rootName}>` : html;
// If parsing XHTML then the content must contain the xmlns declaration, see https://www.w3.org/TR/xhtml1/normative.html#strict
const wrappedHtml = format === 'xhtml' ? `${content}` : `${content}`;
const body = parser.parseFromString(wrappedHtml, mimeType).body;
// Sanitize the content
purify.sanitize(body, getPurifyConfig(defaultedSettings, mimeType));
purify.removed = [];
return isSpecialRoot ? body.firstChild as Element : body;
};
/**
* Adds a node filter function to the parser, the parser will collect the specified nodes by name
* and then execute the callback once it has finished parsing the document.
*
* @method addNodeFilter
* @param {String} name Comma separated list of nodes to collect.
* @param {Function} callback Callback function to execute once it has collected nodes.
* @example
* parser.addNodeFilter('p,h1', (nodes, name) => {
* for (var i = 0; i < nodes.length; i++) {
* console.log(nodes[i].name);
* }
* });
*/
const addNodeFilter = nodeFilterRegistry.addFilter;
const getNodeFilters = nodeFilterRegistry.getFilters;
/**
* Removes a node filter function or removes all filter functions from the parser for the node names provided.
*
* @method removeNodeFilter
* @param {String} name Comma separated list of node names to remove filters for.
* @param {Function} callback Optional callback function to only remove a specific callback.
* @example
* // Remove a single filter
* parser.removeNodeFilter('p,h1', someCallback);
*
* // Remove all filters
* parser.removeNodeFilter('p,h1');
*/
const removeNodeFilter = nodeFilterRegistry.removeFilter;
/**
* Adds an attribute filter function to the parser, the parser will collect nodes that has the specified attributes
* and then execute the callback once it has finished parsing the document.
*
* @method addAttributeFilter
* @param {String} name Comma separated list of attributes to collect.
* @param {Function} callback Callback function to execute once it has collected nodes.
* @example
* parser.addAttributeFilter('src,href', (nodes, name) => {
* for (let i = 0; i < nodes.length; i++) {
* console.log(nodes[i].name);
* }
* });
*/
const addAttributeFilter = attributeFilterRegistry.addFilter;
const getAttributeFilters = attributeFilterRegistry.getFilters;
/**
* Removes an attribute filter function or removes all filter functions from the parser for the attribute names provided.
*
* @method removeAttributeFilter
* @param {String} name Comma separated list of attribute names to remove filters for.
* @param {Function} callback Optional callback function to only remove a specific callback.
* @example
* // Remove a single filter
* parser.removeAttributeFilter('src,href', someCallback);
*
* // Remove all filters
* parser.removeAttributeFilter('src,href');
*/
const removeAttributeFilter = attributeFilterRegistry.removeFilter;
const findInvalidChildren = (node: AstNode, invalidChildren: AstNode[]): void => {
if (InvalidNodes.isInvalid(schema, node)) {
invalidChildren.push(node);
}
};
const isWrappableNode = (blockElements: SchemaMap, node: AstNode) => {
const isInternalElement = Type.isString(node.attr(internalElementAttr));
const isInlineElement = node.type === 1 && (!Obj.has(blockElements, node.name) && !TransparentElements.isTransparentAstBlock(schema, node));
return node.type === 3 || (isInlineElement && !isInternalElement);
};
const addRootBlocks = (rootNode: AstNode, rootBlockName: string): void => {
const blockElements = extend(makeMap('script,style,head,html,body,title,meta,param'), schema.getBlockElements());
const startWhiteSpaceRegExp = /^[ \t\r\n]+/;
const endWhiteSpaceRegExp = /[ \t\r\n]+$/;
let node = rootNode.firstChild, rootBlockNode: AstNode | null = null;
// Removes whitespace at beginning and end of block so:
// x
-> x
const trim = (rootBlock: AstNode | null) => {
if (rootBlock) {
node = rootBlock.firstChild;
if (node && node.type === 3) {
node.value = node.value?.replace(startWhiteSpaceRegExp, '');
}
node = rootBlock.lastChild;
if (node && node.type === 3) {
node.value = node.value?.replace(endWhiteSpaceRegExp, '');
}
}
};
// Check if rootBlock is valid within rootNode for example if P is valid in H1 if H1 is the contentEditable root
if (!schema.isValidChild(rootNode.name, rootBlockName.toLowerCase())) {
return;
}
while (node) {
const next = node.next;
if (isWrappableNode(blockElements, node)) {
if (!rootBlockNode) {
// Create a new root block element
rootBlockNode = new AstNode(rootBlockName, 1);
rootBlockNode.attr(defaultedSettings.forced_root_block_attrs);
rootNode.insert(rootBlockNode, node);
rootBlockNode.append(node);
} else {
rootBlockNode.append(node);
}
} else {
trim(rootBlockNode);
rootBlockNode = null;
}
node = next;
}
trim(rootBlockNode);
};
/**
* Parses the specified HTML string into a DOM like node tree and returns the result.
*
* @method parse
* @param {String} html Html string to sax parse.
* @param {Object} args Optional args object that gets passed to all filter functions.
* @return {tinymce.html.Node} Root node containing the tree.
* @example
* const rootNode = tinymce.html.DomParser({...}).parse('text');
*/
const parse = (html: string, args: ParserArgs = {}): AstNode => {
const validate = defaultedSettings.validate;
const rootName = args.context ?? defaultedSettings.root_name;
// Parse and sanitize the content
const element = parseAndSanitizeWithContext(html, rootName, args.format);
TransparentElements.updateChildren(schema, element);
// Create the AST representation
const rootNode = new AstNode(rootName, 11);
transferChildren(rootNode, element, schema.getSpecialElements());
// This next line is needed to fix a memory leak in chrome and firefox.
// For more information see TINY-9186
element.innerHTML = '';
// Set up whitespace fixes
const [ whitespacePre, whitespacePost ] = whitespaceCleaner(rootNode, schema, defaultedSettings, args);
// Find the invalid children in the tree
const invalidChildren: AstNode[] = [];
const invalidFinder = validate ? (node: AstNode) => findInvalidChildren(node, invalidChildren) : Fun.noop;
// Set up attribute and node matching
const matches: FilterNode.FilterMatches = { nodes: {}, attributes: {}};
const matchFinder = (node: AstNode) => FilterNode.matchNode(getNodeFilters(), getAttributeFilters(), node, matches);
// Walk the dom, apply all of the above things
walkTree(rootNode, [ whitespacePre, matchFinder ], [ whitespacePost, invalidFinder ]);
// Because we collected invalid children while walking backwards, we need to reverse the list before operating on them
invalidChildren.reverse();
// Fix invalid children or report invalid children in a contextual parsing
if (validate && invalidChildren.length > 0) {
if (args.context) {
const { pass: topLevelChildren, fail: otherChildren } = Arr.partition(invalidChildren, (child) => child.parent === rootNode);
InvalidNodes.cleanInvalidNodes(otherChildren, schema, matchFinder);
args.invalid = topLevelChildren.length > 0;
} else {
InvalidNodes.cleanInvalidNodes(invalidChildren, schema, matchFinder);
}
}
// Wrap nodes in the root into block elements if the root is body
const rootBlockName = getRootBlockName(defaultedSettings, args);
if (rootBlockName && (rootNode.name === 'body' || args.isRootContent)) {
addRootBlocks(rootNode, rootBlockName);
}
// Run filters only when the contents is valid
if (!args.invalid) {
FilterNode.runFilters(matches, args);
}
return rootNode;
};
const exports = {
schema,
addAttributeFilter,
getAttributeFilters,
removeAttributeFilter,
addNodeFilter,
getNodeFilters,
removeNodeFilter,
parse
};
ParserFilters.register(exports, defaultedSettings);
LegacyFilter.register(exports, defaultedSettings, schema);
return exports;
};
export default DomParser;