/* * Three behaviours of htmlparser2 are patched here, the first 2 being 'convenience' patches * to avoid repeated passing of lowerCaseAttributeNames and recognizeSelfClosing options. * 1. Defaulting to self closing tag recognition * 2. Disabling automatic attribute name conversion to lower case * 3. Ability to inject/whitelist certain tags to be parsed like script/style tags do. ('special' tags) */ import { Tokenizer, Parser } from 'htmlparser2'; /* Enable any self closing tags '' to be parsed. This allows MarkBind's own components to be recognised as such (e.g. ''). This is equivalent to the { recognizeSelfClosing: true } option of htmlparser2; We modify the relevant code to avoid passing it in repeatedly. */ Parser.prototype.onselfclosingtag = function () { this._closeCurrentTag(); }; /* Disable automatic lower case conversion. This is equivalent to the { lowerCaseAttributeNames: false } option of htmlparser2; We modify the relevant code to avoid passing it in repeatedly as well. */ Parser.prototype.onattribname = function (name: string) { this._attribname = name; }; /* eslint-disable */ var i = 0, TEXT = i++, BEFORE_TAG_NAME = i++, //after < IN_TAG_NAME = i++, IN_SELF_CLOSING_TAG = i++, BEFORE_CLOSING_TAG_NAME = i++, IN_CLOSING_TAG_NAME = i++, AFTER_CLOSING_TAG_NAME = i++, //attributes BEFORE_ATTRIBUTE_NAME = i++, IN_ATTRIBUTE_NAME = i++, AFTER_ATTRIBUTE_NAME = i++, BEFORE_ATTRIBUTE_VALUE = i++, IN_ATTRIBUTE_VALUE_DQ = i++, // " IN_ATTRIBUTE_VALUE_SQ = i++, // ' IN_ATTRIBUTE_VALUE_NQ = i++, //declarations BEFORE_DECLARATION = i++, // ! IN_DECLARATION = i++, //processing instructions IN_PROCESSING_INSTRUCTION = i++, // ? //comments BEFORE_COMMENT = i++, IN_COMMENT = i++, AFTER_COMMENT_1 = i++, AFTER_COMMENT_2 = i++, //cdata BEFORE_CDATA_1 = i++, // [ BEFORE_CDATA_2 = i++, // C BEFORE_CDATA_3 = i++, // D BEFORE_CDATA_4 = i++, // A BEFORE_CDATA_5 = i++, // T BEFORE_CDATA_6 = i++, // A IN_CDATA = i++, // [ AFTER_CDATA_1 = i++, // ] AFTER_CDATA_2 = i++, // ] //special tags BEFORE_SPECIAL = i++, //S BEFORE_SPECIAL_END = i++, //S /* SCRIPT and STYLE states are preserved to maintain consistency of un-patched methods using state numbers greater than them in htmlparser2. */ BEFORE_SCRIPT_1 = i++, //C BEFORE_SCRIPT_2 = i++, //R BEFORE_SCRIPT_3 = i++, //I BEFORE_SCRIPT_4 = i++, //P BEFORE_SCRIPT_5 = i++, //T AFTER_SCRIPT_1 = i++, //C AFTER_SCRIPT_2 = i++, //R AFTER_SCRIPT_3 = i++, //I AFTER_SCRIPT_4 = i++, //P AFTER_SCRIPT_5 = i++, //T BEFORE_STYLE_1 = i++, //T BEFORE_STYLE_2 = i++, //Y BEFORE_STYLE_3 = i++, //L BEFORE_STYLE_4 = i++, //E AFTER_STYLE_1 = i++, //T AFTER_STYLE_2 = i++, //Y AFTER_STYLE_3 = i++, //L AFTER_STYLE_4 = i++, //E BEFORE_ENTITY = i++, //& BEFORE_NUMERIC_ENTITY = i++, //# IN_NAMED_ENTITY = i++, IN_NUMERIC_ENTITY = i++, IN_HEX_ENTITY = i++, //X SPECIAL_NONE = 0, /* Non parser-state constants */ // _matchSpecialTagsNextCharacters, _matchNextSpecialTagClosingCharacter HAS_MATCHING = -1, NO_MATCH = -2, HAS_MATCHED = -3; const DEFAULT_SPECIAL_TAGS = [ 'dark-mode-toggle', 'script', 'site-nav', 'style', 'variable', 'markdown', 'md', ]; function whitespace(c: string) { return c === " " || c === "\n" || c === "\t" || c === "\f" || c === "\r"; } Tokenizer.prototype.specialTagNames = [...DEFAULT_SPECIAL_TAGS]; /** * Checks whether the token matches one of the first characters of the special tags, * and initialises the _matchingSpecialTagIndexes array with the matches' indexes if so. */ Tokenizer.prototype._matchSpecialTagsFirstCharacters = function(c: string) { this._matchingSpecialTagIndexes = []; const numSpecialTags = this.specialTagNames.length; const lowerCaseChar = c.toLowerCase(); for (let j = 0; j < numSpecialTags; j += 1) { if (lowerCaseChar === this.specialTagNames[j][0]) { this._matchingSpecialTagIndexes.push(j) } } if (this._matchingSpecialTagIndexes.length > 0) { this._nextSpecialTagMatchIndex = 1; return true; } return false; }; /** * Processes the _matchingSpecialTagIndexes array, filtering out previous match indexes * that do not match the current token. * If one of the previous matches successfully matched, the match index is returned. */ Tokenizer.prototype._matchSpecialTagsNextCharacters = function(c: string) { const matchingSpecialTags = []; const numMatchingTags = this._matchingSpecialTagIndexes.length; const lowerCaseChar = c.toLowerCase(); for (let j = 0; j < numMatchingTags; j += 1) { const tagIndex = this._matchingSpecialTagIndexes[j]; const testChar = this.specialTagNames[tagIndex][this._nextSpecialTagMatchIndex]; if (testChar === undefined) { if (c === "/" || c === ">" || whitespace(c)) { return tagIndex; } } else if (testChar === lowerCaseChar) { matchingSpecialTags.push(tagIndex); } } this._matchingSpecialTagIndexes = matchingSpecialTags; return this._matchingSpecialTagIndexes.length > 0 ? HAS_MATCHING : NO_MATCH; }; /** * Changes the Tokenizer state to BEFORE_SPECIAL if the token matches one of * the first characters of _specialTagNames. */ Tokenizer.prototype._stateBeforeTagName = function(c: string) { if (c === "/") { this._state = BEFORE_CLOSING_TAG_NAME; } else if (c === "<") { this._cbs.ontext(this._getSection()); this._sectionStart = this._index; } else if (c === ">" || this._special !== SPECIAL_NONE || whitespace(c)) { this._state = TEXT; } else if (c === "!") { this._state = BEFORE_DECLARATION; this._sectionStart = this._index + 1; } else if (c === "?") { this._state = IN_PROCESSING_INSTRUCTION; this._sectionStart = this._index + 1; } else { // We want special tag treatment to be xmlMode independent for MarkBind. this._state = this._matchSpecialTagsFirstCharacters(c) ? BEFORE_SPECIAL : IN_TAG_NAME; this._sectionStart = this._index; } }; /** * Changes the Tokenizer state to IN_TAG_NAME or BEFORE_SPECIAL state again depending * on whether there are still matches in _matchingSpecialTagIndexes. */ Tokenizer.prototype._stateBeforeSpecial = function(c: string) { const result = this._matchSpecialTagsNextCharacters(c); if (result === HAS_MATCHING) { this._nextSpecialTagMatchIndex += 1; return; } // Reset for _matchNextSpecialTagClosingCharacter later this._nextSpecialTagMatchIndex = 0; this._state = IN_TAG_NAME; this._index--; //consume the token again if (result === NO_MATCH) { return; } /* Assign the one-based index of the matching tag in the special tags array. This is due to htmlparser2 using 0 for SPECIAL_NONE, which would conflict with a zero-based index. */ this._special = result + 1; }; /** * Patched self closing tag state handler that removes the special state * if the special tag was self-closed. */ Tokenizer.prototype._stateInSelfClosingTag = function(c: string) { if (c === ">") { this._cbs.onselfclosingtag(); this._state = TEXT; this._sectionStart = this._index + 1; /* Allow all special tags to be self-closed. Script and style tags are also allowed to be self-closed, which breaks from the default html spec-compliant of behaviour of htmlparser2. We allow this as such tags would be expanded upon re-rendering the html anyway. ie. '' */ this._special = SPECIAL_NONE; } else if (!whitespace(c)) { this._state = BEFORE_ATTRIBUTE_NAME; this._index--; } }; /** * Processes the _special flag and _nextSpecialTagMatchIndex state variable, * returning a flag indicating whether the current special tag has finished matching or not. */ Tokenizer.prototype._matchNextSpecialTagClosingCharacter = function(c: string) { const nextTestChar = this.specialTagNames[this._special - 1][this._nextSpecialTagMatchIndex]; if (nextTestChar === undefined) { this._nextSpecialTagMatchIndex = 0; return (c === ">" || whitespace(c)) ? HAS_MATCHED : NO_MATCH; } else if (nextTestChar === c.toLowerCase()) { this._nextSpecialTagMatchIndex += 1; return HAS_MATCHING; } this._nextSpecialTagMatchIndex = 0; return NO_MATCH; }; /** * Changes the Tokenizer state to BEFORE_SPECIAL_END if the token matches one of * the first character of the currently matched special tag. */ Tokenizer.prototype._stateBeforeCloseingTagName = function(c: string) { if (whitespace(c)) { // do nothing, consume whitespace } else if (c === ">") { this._state = TEXT; } else if (this._special !== SPECIAL_NONE) { if (this._matchNextSpecialTagClosingCharacter(c) !== NO_MATCH) { this._state = BEFORE_SPECIAL_END; } else { this._state = TEXT; this._index--; } } else { this._state = IN_CLOSING_TAG_NAME; this._sectionStart = this._index; } }; /** * Changes the Tokenizer state back to TEXT, IN_TAG_NAME depending * on whether the token has finished or is still matching * the currently matched special tag. */ Tokenizer.prototype._stateBeforeSpecialEnd = function(c: string) { const result = this._matchNextSpecialTagClosingCharacter(c); if (result === HAS_MATCHING) { return; } if (result === HAS_MATCHED) { this._sectionStart = this._index - this.specialTagNames[this._special - 1].length; this._special = SPECIAL_NONE; this._state = IN_CLOSING_TAG_NAME; this._index--; //reconsume the token return; } this._index--; this._state = TEXT; }; /** * Altered _parse handler that handles the extra MARKDOWN state. * BEFORE/AFTER SCRIPT and STYLE state handlers are made redundant * by the above patches and removed as well. */ Tokenizer.prototype._parse = function(){ while(this._index < this._buffer.length && this._running){ var c = this._buffer.charAt(this._index); if(this._state === TEXT){ this._stateText(c); } else if(this._state === BEFORE_TAG_NAME){ this._stateBeforeTagName(c); } else if(this._state === IN_TAG_NAME){ this._stateInTagName(c); } else if(this._state === BEFORE_CLOSING_TAG_NAME){ this._stateBeforeCloseingTagName(c); } else if(this._state === IN_CLOSING_TAG_NAME){ this._stateInCloseingTagName(c); } else if(this._state === AFTER_CLOSING_TAG_NAME){ this._stateAfterCloseingTagName(c); } else if(this._state === IN_SELF_CLOSING_TAG){ this._stateInSelfClosingTag(c); } /* * attributes */ else if(this._state === BEFORE_ATTRIBUTE_NAME){ this._stateBeforeAttributeName(c); } else if(this._state === IN_ATTRIBUTE_NAME){ this._stateInAttributeName(c); } else if(this._state === AFTER_ATTRIBUTE_NAME){ this._stateAfterAttributeName(c); } else if(this._state === BEFORE_ATTRIBUTE_VALUE){ this._stateBeforeAttributeValue(c); } else if(this._state === IN_ATTRIBUTE_VALUE_DQ){ this._stateInAttributeValueDoubleQuotes(c); } else if(this._state === IN_ATTRIBUTE_VALUE_SQ){ this._stateInAttributeValueSingleQuotes(c); } else if(this._state === IN_ATTRIBUTE_VALUE_NQ){ this._stateInAttributeValueNoQuotes(c); } /* * declarations */ else if(this._state === BEFORE_DECLARATION){ this._stateBeforeDeclaration(c); } else if(this._state === IN_DECLARATION){ this._stateInDeclaration(c); } /* * processing instructions */ else if(this._state === IN_PROCESSING_INSTRUCTION){ this._stateInProcessingInstruction(c); } /* * comments */ else if(this._state === BEFORE_COMMENT){ this._stateBeforeComment(c); } else if(this._state === IN_COMMENT){ this._stateInComment(c); } else if(this._state === AFTER_COMMENT_1){ this._stateAfterComment1(c); } else if(this._state === AFTER_COMMENT_2){ this._stateAfterComment2(c); } /* * cdata */ else if(this._state === BEFORE_CDATA_1){ this._stateBeforeCdata1(c); } else if(this._state === BEFORE_CDATA_2){ this._stateBeforeCdata2(c); } else if(this._state === BEFORE_CDATA_3){ this._stateBeforeCdata3(c); } else if(this._state === BEFORE_CDATA_4){ this._stateBeforeCdata4(c); } else if(this._state === BEFORE_CDATA_5){ this._stateBeforeCdata5(c); } else if(this._state === BEFORE_CDATA_6){ this._stateBeforeCdata6(c); } else if(this._state === IN_CDATA){ this._stateInCdata(c); } else if(this._state === AFTER_CDATA_1){ this._stateAfterCdata1(c); } else if(this._state === AFTER_CDATA_2){ this._stateAfterCdata2(c); } /* * special tags */ else if(this._state === BEFORE_SPECIAL){ this._stateBeforeSpecial(c); } else if(this._state === BEFORE_SPECIAL_END){ this._stateBeforeSpecialEnd(c); } /* * entities */ else if(this._state === BEFORE_ENTITY){ this._stateBeforeEntity(c); } else if(this._state === BEFORE_NUMERIC_ENTITY){ this._stateBeforeNumericEntity(c); } else if(this._state === IN_NAMED_ENTITY){ this._stateInNamedEntity(c); } else if(this._state === IN_NUMERIC_ENTITY){ this._stateInNumericEntity(c); } else if(this._state === IN_HEX_ENTITY){ this._stateInHexEntity(c); } else { this._cbs.onerror(Error("unknown _state"), this._state); } this._index++; } this._cleanup(); }; /** * Injects the tagsToIgnore into the Tokenizer's specialTagNames. */ export function injectIgnoreTags(tagsToIgnore: Iterable): void { Tokenizer.prototype.specialTagNames = [...DEFAULT_SPECIAL_TAGS, ...tagsToIgnore]; }