import { Obj, Strings, Type } from '@ephox/katamari';

import DomSerializer from '../api/dom/Serializer';
import Schema from '../api/html/Schema';
import * as Zwsp from '../text/Zwsp';

/**
 * Checks whether the text starting at `startIndex` looks like the opening of a conditional
 * comment, i.e. optional whitespace followed by `[if ...]>`.
 */
const isConditionalComment = (html: string, startIndex: number): boolean =>
  /^\s*\[if [\w\W]+\]>.*/.test(html.slice(startIndex));

/**
 * Finds where a comment that was opened before `startIndex` ends.
 *
 * - Conditional comment: the index of the first ">" at or after the "[endif]" marker.
 * - Bogus comment (`isBogus`): the index of the first ">" at or after `startIndex`.
 * - Regular comment: the index just past the first "-->" or "--!>" terminator.
 *
 * In every case the length of the HTML is returned when no terminator can be found, so the
 * result is never negative and never points before `startIndex`.
 */
const findCommentEndIndex = (html: string, isBogus: boolean, startIndex: number = 0): number => {
  const lcHtml = html.toLowerCase();

  if (lcHtml.indexOf('[if ', startIndex) !== -1 && isConditionalComment(lcHtml, startIndex)) {
    const endIfIndex = lcHtml.indexOf('[endif]', startIndex);
    // String#indexOf treats a negative start position as 0, so a missing "[endif]" must not be
    // forwarded as the search position: that would scan from the very beginning of the HTML.
    const endIndex = endIfIndex !== -1 ? lcHtml.indexOf('>', endIfIndex) : -1;
    return endIndex !== -1 ? endIndex : lcHtml.length;
  } else if (isBogus) {
    const endIndex = lcHtml.indexOf('>', startIndex);
    return endIndex !== -1 ? endIndex : lcHtml.length;
  } else {
    const endCommentRegexp = /--!?>/g;
    endCommentRegexp.lastIndex = startIndex;
    const match = endCommentRegexp.exec(html);
    return match ? match.index + match[0].length : lcHtml.length;
  }
};

/*
 * Returns the index of the matching end tag for a specific start tag. This can
 * be used to skip all children of a parent element from being processed.
 *
 * `startIndex` is expected to point just past the start tag. The returned index points just
 * past the ">" of the matching end tag, or to wherever scanning had reached when the HTML ran
 * out of tags.
 */
const findMatchingEndTagIndex = (schema: Schema, html: string, startIndex: number): number => {
  // TODO: TINY-7658: this regex does not support CDATA
  const startTagRegExp = /<([!?\/])?([A-Za-z0-9\-_:.]+)/g;
  const endTagRegExp = /(?:\s(?:[^'">]+(?:"[^"]*"|'[^']*'))*[^"'>]*(?:"[^">]*|'[^'>]*)?|\s*|\/)>/g;
  const voidElements = schema.getVoidElements();
  // count is the number of elements that are still open; it starts at 1 for the start tag
  // that precedes startIndex.
  let count = 1, index = startIndex;

  // keep finding HTML tags (opening, closing, or neither - such as comments or void elements like br)
  while (count !== 0) {
    startTagRegExp.lastIndex = index;
    // ideally, we only want to run through this the once - but sometimes the startTagRegExp will
    // give us false positives (things that begin like tags, but don't end like them) and so we
    // might need to let it carry on from its own lastIndex and try again.
    while (true) {
      const startMatch = startTagRegExp.exec(html);
      if (startMatch === null) {
        // doesn't matter what count is, we've run out of HTML tags
        return index;
      } else if (startMatch[1] === '!') {
        // TODO: TINY-7658 add CDATA support here
        if (Strings.startsWith(startMatch[2], '--')) {
          // NOTE(review): startMatch.index is the position of "<", so this offset lands on the
          // last "-" of the comment opener rather than just after it - confirm this is intended.
          index = findCommentEndIndex(html, false, startMatch.index + '!--'.length);
        } else {
          index = findCommentEndIndex(html, true, startMatch.index + 1);
        }
        break;
      } else {
        // it's an element
        endTagRegExp.lastIndex = startTagRegExp.lastIndex;
        const endMatch = endTagRegExp.exec(html);
        // TODO: once we don't need IE, make the regex sticky (will be faster than looking at .index afterwards and throwing out bad matches)
        if (Type.isNull(endMatch) || endMatch.index !== startTagRegExp.lastIndex) {
          // False positive: the rest of the tag does not follow directly after the tag name.
          // We can resume from the end of startMatch only because there's no way a "<" could
          // appear halfway through the tag name that startTagRegExp just matched.
          continue;
        }

        if (startMatch[1] === '/') {
          // closing tag: one fewer element left open
          count -= 1;
        } else if (!Obj.has(voidElements, startMatch[2])) {
          // opening tag of a non-void element: one more element that has to be closed
          count += 1;
        }

        index = startTagRegExp.lastIndex + endMatch[0].length;
        break;
      }
    }
  }

  return index;
};

/**
 * Strips the given temporary attributes (name, value and one optional leading whitespace
 * character) from the HTML string. Only double-quoted, non-empty values are matched.
 */
const trimHtml = (tempAttrs: string[], html: string): string => {
  // With no attribute names the pattern below would degrade to `\s?()="[^"]+"` and strip the
  // value off every attribute in the document, so there is nothing to trim in that case.
  if (tempAttrs.length === 0) {
    return html;
  }

  // Trim temporary data-mce prefixed attributes like data-mce-selected
  const trimContentRegExp = new RegExp('\\s?(' + tempAttrs.join('|') + ')="[^"]+"', 'gi');
  return html.replace(trimContentRegExp, '');
};

/**
 * Removes editor-internal markup from serialized HTML: the serializer's temporary attributes,
 * every element flagged with data-mce-bogus="all" (together with its contents), and finally
 * whatever `Zwsp.trim` removes (presumably zero-width space characters - see the Zwsp module).
 */
const trimInternal = (serializer: DomSerializer, html: string): string => {
  const bogusAllRegExp = /<(\w+) [^>]*data-mce-bogus="all"[^>]*>/g;
  const schema = serializer.schema;

  let content = trimHtml(serializer.getTempAttrs(), html);
  const voidElements = schema.getVoidElements();

  // Remove all bogus elements marked with "all"
  let matches: RegExpExecArray | null;
  while ((matches = bogusAllRegExp.exec(content))) {
    // index is the position just past the matched start tag
    const index = bogusAllRegExp.lastIndex;
    const matchLength = matches[0].length;

    let endTagIndex: number;
    if (voidElements[matches[1]]) {
      // void elements have no children or end tag, so only the start tag is removed
      endTagIndex = index;
    } else {
      endTagIndex = findMatchingEndTagIndex(schema, content, index);
    }

    content = content.substring(0, index - matchLength) + content.substring(endTagIndex);
    // resume scanning from where the removed element used to start
    bogusAllRegExp.lastIndex = index - matchLength;
  }

  return Zwsp.trim(content);
};

// We might need external/internal trimming in the future so lets keep the separation
const trimExternal = trimInternal;

export {
  trimExternal,
  trimInternal,

  // Exported for testing purposes only
  findMatchingEndTagIndex
};