import { Obj, Strings, Type } from '@ephox/katamari';
import DomSerializer from '../api/dom/Serializer';
import Schema from '../api/html/Schema';
import * as Zwsp from '../text/Zwsp';
/*
 * Tests whether the text of `html` beginning at `startIndex` opens a conditional comment,
 * i.e. optional whitespace, "[if ...]>", and later a closing "<![endif]>", "<![endif]-->" or "<![endif]--!>".
 *
 * The closing "<![endif]" is required: callers go on to search for "[endif]" and would otherwise
 * compute a bogus end position for an unterminated "[if ...]>".
 *
 * The pattern is case-sensitive, so callers are expected to pass lower-cased html.
 */
const isConditionalComment = (html: string, startIndex: number): boolean =>
  /^\s*\[if [\w\W]+\]>.*<!\[endif\](--!?)?>/.test(html.slice(startIndex));
/*
 * Finds where the comment-like construct that contains `startIndex` ends.
 *
 * - Conditional comments ("[if ...]> ... [endif]...>"): returns the index of the ">" following "[endif]".
 * - Bogus comments (`isBogus`): returns the index of the next ">".
 * - Regular comments: returns the index just past the next "-->" or "--!>".
 *
 * Whenever no terminator can be found the length of the html is returned, so the result is never
 * negative and never moves a scanning caller backwards.
 */
const findCommentEndIndex = (html: string, isBogus: boolean, startIndex: number = 0): number => {
  const lcHtml = html.toLowerCase();

  if (lcHtml.indexOf('[if ', startIndex) !== -1 && isConditionalComment(lcHtml, startIndex)) {
    const endIfIndex = lcHtml.indexOf('[endif]', startIndex);
    // Guard against a missing "[endif]": indexOf('>', -1) would search from the very start of the
    // html and could hand back a position located before startIndex.
    if (endIfIndex !== -1) {
      const closeIndex = lcHtml.indexOf('>', endIfIndex);
      return closeIndex !== -1 ? closeIndex : lcHtml.length;
    }
    // No usable terminator: treat it like any other comment below
  }

  if (isBogus) {
    const endIndex = lcHtml.indexOf('>', startIndex);
    return endIndex !== -1 ? endIndex : lcHtml.length;
  }

  const endCommentRegexp = /--!?>/g;
  endCommentRegexp.lastIndex = startIndex;
  const match = endCommentRegexp.exec(html);
  return match ? match.index + match[0].length : lcHtml.length;
};
/*
 * Returns the index of the matching end tag for a specific start tag. This can
 * be used to skip all children of a parent element from being processed.
 *
 * `startIndex` is the position from which to start scanning; the start tag itself is treated as
 * already consumed (the nesting depth starts at 1). The returned index is the position just past
 * the ">" of the end tag that brings the depth back to 0, or the last position reached if the
 * html runs out of tags first.
 */
const findMatchingEndTagIndex = (schema: Schema, html: string, startIndex: number): number => {
  // TODO: TINY-7658: this regex does not support CDATA
  const startTagRegExp = /<([!?\/])?([A-Za-z0-9\-_:.]+)/g;
  const endTagRegExp = /(?:\s(?:[^'">]+(?:"[^"]*"|'[^']*'))*[^"'>]*(?:"[^">]*|'[^'>]*)?|\s*|\/)>/g;
  const voidElements = schema.getVoidElements();
  let count = 1, index = startIndex;

  // keep finding HTML tags (opening, closing, or neither like comments or <br>s)
  while (count !== 0) {
    startTagRegExp.lastIndex = index;
    // ideally, we only want to run through this the once - but sometimes the startTagRegExp will give us false positives (things that begin
    // like tags, but don't end like them) and so we might need to bump up its lastIndex and try again.
    while (true) {
      const startMatch = startTagRegExp.exec(html);

      if (startMatch === null) {
        // doesn't matter what count is, we've run out of HTML tags
        return index;
      } else if (startMatch[1] === '!') {
        // TODO: TINY-7658 add CDATA support here
        if (Strings.startsWith(startMatch[2], '--')) {
          // NOTE(review): startMatch.index points at "<", so this offset lands on the last "-" of "<!--"
          // rather than just past it - confirm that is intended.
          index = findCommentEndIndex(html, false, startMatch.index + '!--'.length);
        } else {
          index = findCommentEndIndex(html, true, startMatch.index + 1);
        }
        break;
      } else { // it's an element
        endTagRegExp.lastIndex = startTagRegExp.lastIndex;
        const endMatch = endTagRegExp.exec(html);
        // TODO: once we don't need IE, make the regex sticky (will be faster than looking at .index afterwards and throwing out bad matches)
        if (Type.isNull(endMatch) || endMatch.index !== startTagRegExp.lastIndex) {
          // False positive: the text looked like the start of a tag but is not closed like one.
          // We can skip through to the end of startMatch only because there's no way a "<" could
          // appear halfway through "<name", so just try the next candidate.
          continue;
        }

        if (startMatch[1] === '/') { // end of element
          count -= 1;
        } else if (!Obj.has(voidElements, startMatch[2])) { // unless it's a void element, it's a start of element
          count += 1;
        }

        index = startTagRegExp.lastIndex + endMatch[0].length;
        break;
      }
    }
  }

  return index;
};
/*
 * Removes temporary attributes (such as data-mce-selected) from an html string.
 *
 * Every occurrence of `name="value"` (double quoted, non-empty value, case-insensitive name), along
 * with one optional preceding whitespace character, is stripped for each name in `tempAttrs`.
 * When no usable names are supplied the html is returned untouched.
 */
const trimHtml = (tempAttrs: string[], html: string): string => {
  // Attribute names are literal text, so neutralise any regex metacharacters they contain
  const escapedNames = tempAttrs
    .filter((name) => name.length > 0)
    .map((name) => name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));

  // An empty alternation would turn the pattern into `\s?()="[^"]+"`, which strips the value of *every* attribute
  if (escapedNames.length === 0) {
    return html;
  }

  const trimContentRegExp = new RegExp('\\s?(' + escapedNames.join('|') + ')="[^"]+"', 'gi');
  return html.replace(trimContentRegExp, '');
};
/*
 * Cleans serialized html: strips the serializer's temporary attributes, cuts out every element
 * flagged with data-mce-bogus="all" (including everything up to its matching end tag) and finally
 * hands the result to Zwsp.trim.
 */
const trimInternal = (serializer: DomSerializer, html: string): string => {
  const bogusAllRegExp = /<(\w+) [^>]*data-mce-bogus="all"[^>]*>/g;
  const schema = serializer.schema;
  let content = trimHtml(serializer.getTempAttrs(), html);
  const voidElements = schema.getVoidElements();

  // Remove all bogus elements marked with "all"
  for (let found = bogusAllRegExp.exec(content); found; found = bogusAllRegExp.exec(content)) {
    const startTagEnd = bogusAllRegExp.lastIndex;
    const startTagBegin = startTagEnd - found[0].length;

    // Void elements have no children or end tag, so only the start tag itself is removed
    const removeUntil = voidElements[found[1]]
      ? startTagEnd
      : findMatchingEndTagIndex(schema, content, startTagEnd);

    content = content.substring(0, startTagBegin) + content.substring(removeUntil);

    // The text after the cut has shifted left, so resume scanning from where the element began
    bogusAllRegExp.lastIndex = startTagBegin;
  }

  return Zwsp.trim(content);
};
// We might need separate external/internal trimming in the future, so let's keep the two names.
// For now trimExternal is simply an alias of trimInternal (the very same function).
const trimExternal = trimInternal;
export {
trimExternal,
trimInternal,
// Exported for testing purposes only
findMatchingEndTagIndex
};