/*************************************************************
 *
 *  Copyright (c) 2018-2025 The MathJax Consortium
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

/**
 * @file  Implements a lightweight DOM adaptor
 *
 * @author dpvc@mathjax.org (Davide Cervone)
 */

import { AttributeData } from '../../core/DOMAdaptor.js';
import { MinDOMParser } from '../HTMLAdaptor.js';
import * as Entities from '../../util/Entities.js';
import { LiteDocument } from './Document.js';
import { LiteElement } from './Element.js';
import { LiteText, LiteComment } from './Text.js';
import { LiteAdaptor } from '../liteAdaptor.js';

/**
 * Patterns used in parsing serialized HTML
 *
 * These are regular-expression source fragments that are combined below
 * into the three compiled expressions exported as PATTERNS.
 */

// One or more (SPACE) or zero or more (OPTIONALSPACE) blanks or newlines
const SPACE = '[ \\n]+';
const OPTIONALSPACE = '[ \\n]*';

// Tag names start with an ASCII letter; tag and attribute names stop at
// control characters, blanks, quotes, '>', '/', and '='
const TAGNAME = `[A-Za-z][^\u0000-\u001F "'>/=\u007F-\u009F]*`;
const ATTNAME = `[^\u0000-\u001F "'>/=\u007F-\u009F]+`;

// An unquoted attribute value: a run of characters that contains no blank,
// newline, quote, '=', '<', '>', or backquote (so it can never run past the
// closing '>' of its tag)
const UNQUOTED = '[^ \\n"\'=<>`]+';

// An attribute value: single-quoted, double-quoted, or unquoted.  The final
// whitespace-only alternative is retained so that input that matched before
// unquoted values were recognized (an '=' followed only by blanks) still
// matches.  VALUESPLIT is the same pattern with one capture group per form;
// the number of capture groups must stay at three, since code that splits
// on PATTERNS.attrsplit relies on the resulting array layout.
const VALUE = `(?:'[^']*'|"[^"]*"|${UNQUOTED}|${SPACE})`;
const VALUESPLIT = `(?:'([^']*)'|"([^"]*)"|(${UNQUOTED}|${SPACE}))`;

// An attribute name with an optional "= value" part (ATTRIBUTESPLIT also
// captures the name, followed by the three VALUESPLIT groups)
const ATTRIBUTE = `${ATTNAME}(?:${OPTIONALSPACE}=${OPTIONALSPACE}${VALUE})?`;
const ATTRIBUTESPLIT = `(${ATTNAME})(?:${OPTIONALSPACE}=${OPTIONALSPACE}${VALUESPLIT})?`;

// A complete tag, captured as a single group so that String.split() keeps it:
// an open tag with attributes (optionally self-closed), a close tag, a
// comment, or any other "<!...>" construct, ending with '>' or the end of
// the input
const TAG =
  `(<(?:${TAGNAME}(?:${SPACE}${ATTRIBUTE})*` +
  `${OPTIONALSPACE}/?|/${TAGNAME}|!--[^]*?--|![^]*?)(?:>|$))`;

/**
 * The compiled patterns:
 *   tag        matches (and captures) one complete tag
 *   attr       matches one attribute
 *   attrsplit  matches one attribute, capturing its name and its
 *              single-quoted, double-quoted, or unquoted value
 */
export const PATTERNS = {
  tag: new RegExp(TAG, 'u'),
  attr: new RegExp(ATTRIBUTE, 'u'),
  attrsplit: new RegExp(ATTRIBUTESPLIT, 'u'),
};

/************************************************************/
/**
 * Implements a lightweight DOMParser replacement
 * (Not perfect, but
handles most well-formed HTML)
 */
export class LiteParser implements MinDOMParser {
  /**
   * The list of self-closing tags
   */
  public static SELF_CLOSING: { [name: string]: boolean } = {
    area: true,
    base: true,
    br: true,
    col: true,
    command: true,
    embed: true,
    hr: true,
    img: true,
    input: true,
    keygen: true,
    link: true,
    menuitem: true,
    meta: true,
    param: true,
    source: true,
    track: true,
    wbr: true,
  };

  /**
   * The list of tags whose content is not parsed (PCDATA)
   */
  public static PCDATA: { [name: string]: boolean } = {
    option: true,
    textarea: true,
    fieldset: true,
    title: true,
    style: true,
    script: true,
  };

  /**
   * Namespace URIs, keyed by `svg`, `math`, and `html`
   */
  public static XMLNS: { [name: string]: string } = {
    svg: 'http://www.w3.org/2000/svg',
    math: 'http://www.w3.org/1998/Math/MathML',
    html: 'http://www.w3.org/1999/xhtml',
  };

  /**
   * Parse serialized HTML into a document obtained from the adaptor.
   *
   * @param {string} text          The serialized HTML to parse
   * @param {string} _format       The format of the text (not used here)
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes (it is used
   *                                 unconditionally, so the null default is
   *                                 not a usable value)
   * @returns                      The document created by the adaptor, after
   *                                 it has been passed to checkDocument()
   *
   * @override
   */
  public parseFromString(
    text: string,
    _format: string = 'text/html',
    adaptor: LiteAdaptor = null
  ) {
    const root = adaptor.createDocument();
    let node = adaptor.body(root);
    //
    // Remove any <?...?> processing instructions, then
    // split the HTML into an array of text, tag, text, tag, ...
    // Then loop through them and add text nodes and process tags.
    //
    const parts = text.replace(/<\?.*?\?>/g, '').split(PATTERNS.tag);
    while (parts.length) {
      const text = parts.shift();
      const tag = parts.shift();
      if (text) {
        this.addText(adaptor, node, text);
      }
      //
      // A tag that does not end with '>' was cut off by the end of the
      // input, and is ignored
      //
      if (tag && tag.charAt(tag.length - 1) === '>') {
        if (tag.charAt(1) === '!') {
          this.addComment(adaptor, node, tag);
        } else if (tag.charAt(1) === '/') {
          node = this.closeTag(adaptor, node, tag);
        } else {
          node = this.openTag(adaptor, node, tag, parts);
        }
      }
    }
    this.checkDocument(adaptor, root);
    return root;
  }

  /**
   * Add a text node (with its entities translated) to the given node
   *
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} node     The node to add a text element to
   * @param {string} text          The text for the text node
   * @returns {LiteText}           The text element
   */
  protected addText(
    adaptor: LiteAdaptor,
    node: LiteElement,
    text: string
  ): LiteText {
    text = Entities.translate(text);
    return adaptor.append(node, adaptor.text(text)) as LiteText;
  }

  /**
   * Add a comment node to the given node (called for every tag that
   * begins with "<!"; the complete tag text is stored in the comment)
   *
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} node     The node to add a comment to
   * @param {string} comment       The text for the comment node
   * @returns {LiteComment}        The comment element
   */
  protected addComment(
    adaptor: LiteAdaptor,
    node: LiteElement,
    comment: string
  ): LiteComment {
    return adaptor.append(node, new LiteComment(comment)) as LiteComment;
  }

  /**
   * Close the nearest open element (the current node or one of its
   * ancestors) whose kind matches the close tag.
   *
   * A close tag that matches no open element is ignored: the current node
   * is returned unchanged, so parsing continues where it was rather than
   * continuing with a null node.
   *
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} node     The node to close
   * @param {string} tag           The close tag being processed
   * @returns {LiteElement}        The parent of the element that was closed,
   *                                 or the original node if the close tag
   *                                 matched nothing that can be closed
   */
  protected closeTag(
    adaptor: LiteAdaptor,
    node: LiteElement,
    tag: string
  ): LiteElement {
    //
    // Strip the leading "</" and the trailing ">"
    //
    const kind = tag.slice(2, tag.length - 1).toLowerCase();
    //
    // Walk up from the current node until an element of the closed kind
    // is found, or the top-most element is reached
    //
    let open = node;
    while (adaptor.parent(open) && adaptor.kind(open) !== kind) {
      open = adaptor.parent(open);
    }
    //
    // Only move up the tree when a matching element with a parent was found;
    // otherwise the close tag is a stray one and the position is unchanged
    //
    const parent = adaptor.kind(open) === kind ? adaptor.parent(open) : null;
    return parent || node;
  }

  /**
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} node     The parent node for the tag
   * @param {string} tag           The tag being processed
   * @param {string[]} parts       The rest of
the text/tag array * @returns {LiteElement} The node to which the next tag will be added */ protected openTag( adaptor: LiteAdaptor, node: LiteElement, tag: string, parts: string[] ): LiteElement { const PCDATA = (this.constructor as typeof LiteParser).PCDATA; const SELF_CLOSING = (this.constructor as typeof LiteParser).SELF_CLOSING; // // Get the child to be added to the node // const kind = tag.match(/<(.*?)[\s\n>/]/)[1].toLowerCase(); const child = adaptor.node(kind) as LiteElement; // // Split out the tag attributes as an array of space, name, value1, value3, value3, // where value1, value2, and value3 are the value of the node (only one is defined) // that come from matching quoted strings with ' (value1), " (value2) or no quotes (value3). // const attributes = tag .replace(/^<.*?[\s\n>]/, '') .split(PATTERNS.attrsplit); // // If the tag was complete (it ends with > or has no attributes) // if (attributes.pop().match(/>$/) || attributes.length < 5) { this.addAttributes(adaptor, child, attributes); adaptor.append(node, child); // // For non-self-closing tags, // For tags whose contents is PCDATA (like