import { RewritingStream } from "parse5-html-rewriting-stream";
import {
startsWithAny,
decodeLatin1,
encodeLatin1,
MAX_STREAM_CHUNK_SIZE,
REPLAY_TOP_FRAME_NAME,
} from "../utils";
import {
type ProxyRewriter,
type ArchiveResponse,
type Rewriter,
} from "./index.js";
import { type StartTag } from "parse5-sax-parser";
import { type Token } from "parse5";
import { type RWOpts } from "../types";
// Shared UTF-8 codec instances used when converting between bytes and text.
const encoder = new TextEncoder();
const decoder = new TextDecoder();
// ===========================================================================
// Matches the value of a `<meta http-equiv="refresh">` content attribute:
//   group 1: delay, separator and optional `url=` prefix
//   group 2: the target URL, without surrounding quotes
//   group 3: trailing whitespace
// The URL group is lazy and the pattern is anchored to the end of the line so
// that an optional closing quote and trailing whitespace are NOT captured as
// part of the URL (a greedy `.+` would swallow both).
const META_REFRESH_REGEX =
  /([\d.]+\s*;\s*(?:url\s*=)?\s*)['"]?(.+?)['"]?(\s*)$/im;
// Attribute values starting with one of these prefixes are treated as URLs.
const DATA_RW_PROTOCOLS = ["http://", "https://", "//"];
// Default rewrite modifier.
const defmod = "mp_";
// Bodies larger than this many bytes are passed through without rewriting.
const MAX_HTML_REWRITE_SIZE = 50000000;
// Lower-cased charset labels recognised as UTF-8.
const UTF_MATCH = ["utf-8", "utf8"];
// Per-tag table of URL-bearing attributes, keyed by tag name and then by
// attribute name. Each value is the rewrite modifier passed along when that
// attribute is rewritten.
const rewriteTags: Record<string, Record<string, string>> = {
  a: { href: "ln_" },
  applet: {
    codebase: "oe_",
    archive: "oe_",
  },
  area: { href: defmod },
  audio: { src: "oe_" },
  base: { href: defmod },
  blockquote: { cite: defmod },
  body: { background: "im_" },
  button: { formaction: defmod },
  command: { icon: "im_" },
  del: { cite: defmod },
  embed: { src: "oe_" },
  iframe: { src: "if_" },
  image: { src: "im_", "xlink:href": "im_", href: "im_" },
  img: {
    src: "im_",
    srcset: "im_",
  },
  ins: { cite: defmod },
  input: {
    src: "im_",
    formaction: defmod,
  },
  form: { action: defmod },
  frame: { src: "fr_" },
  link: { href: "oe_" },
  meta: { content: "mt_" },
  object: {
    codebase: "oe_",
    data: "oe_",
  },
  param: { value: "oe_" },
  q: { cite: defmod },
  ref: { href: "oe_" },
  script: { src: "js_", "xlink:href": "js_" },
  source: { src: "oe_", srcset: "oe_" },
  video: {
    src: "oe_",
    poster: "im_",
  },
};
// Rewrites applied to the `data` URL of Flash `<object>` tags. When a rule
// changes the URL, the object is converted into an iframe pointing at the
// rewritten location.
const OBJECT_FLASH_DATA_RX = [
  {
    // The dot is escaped so only the literal host "youtube.com" matches.
    match: /youtube\.com\/v\/([^&]+)[&]/,
    replace: "youtube.com/embed/$1?",
  },
];
/**
 * A text-node substitution that is only active for pages whose URL matches
 * `urlMatch`.
 */
interface TextNodeRewriteRule {
  /** Tested against the page URL; its first capture replaces `$U1` in `replace`. */
  urlMatch: RegExp;
  /** Pattern searched for in HTML text nodes. */
  match: RegExp;
  /** Replacement template; may contain the `$U1` placeholder. */
  replace: string;
}

/** Rules are tried in order against the page URL. */
const TEXT_NODE_REWRITE_RULES: Array<TextNodeRewriteRule> = [
  {
    urlMatch: /[?&]:loadOrderID=([\d]+)/,
    match: /(loadOrderID&(quot;&)?#x[^;]+?;)([\d]+)/gi,
    replace: "$1$U1",
  },
];
// ===========================================================================
/**
 * Rewrites an HTML response by streaming it through a parse5
 * `RewritingStream` and rewriting URLs, inline JS/CSS/JSON and selected
 * attributes via the supplied `Rewriter`.
 */
export class HTMLRewriter {
  rewriter: Rewriter;
  /** Text-node rule whose `urlMatch` matched the page URL, if any. */
  rule: TextNodeRewriteRule | null = null;
  /** Match result of the page URL against `rule.urlMatch`. */
  ruleMatch: RegExpMatchArray | null = null;
  /** Whether the document is handled as UTF-8 (otherwise as Latin-1). */
  isCharsetUTF8: boolean;
  /** Charset found by `detectMetaCharset()` during `rewrite()`; "" if none. */
  detectedCharset = "";
  /** Base options forwarded to every JS/JSON rewrite call. */
  jsOpts: RWOpts;

  /**
   * @param rewriter Rewriter providing the page URL, prefix, charset flags
   *   and the URL/JS/CSS rewrite functions.
   */
  constructor(rewriter: Rewriter) {
    this.rewriter = rewriter;
    this.rule = null;
    this.jsOpts = { baseUrl: rewriter.baseUrl, prefix: rewriter.prefix };

    // select the first text-node rule that applies to this page URL
    for (const rule of TEXT_NODE_REWRITE_RULES) {
      const m = this.rewriter.url.match(rule.urlMatch);
      if (m) {
        this.ruleMatch = m;
        this.rule = rule;
        break;
      }
    }

    this.isCharsetUTF8 = rewriter.isCharsetUTF8;
  }

  /**
   * Looks for a charset declared in a `<meta>` tag near the start of the
   * document.
   *
   * @param buffer Raw document bytes; only the first 1024 are inspected.
   * @returns The lower-cased charset label, or "" when none is found.
   */
  detectMetaCharset(buffer: Uint8Array): string {
    // only look at first bytes
    const CHARSET_PEEK_SIZE = 1024;

    // both patterns are anchored on `<meta` so that a `charset` attribute on
    // another tag (e.g. `<script charset=...>`) is not mistaken for the
    // document charset
    const META_CHARSET_REGEX =
      /<meta[^>]*charset\s*=\s*["']?([^"'\s>/]+)["']?[^>]*>/i;

    const META_CHARSET_HTTP_EQUIV =
      /<meta[^>]*http-equiv\s*=\s*["']?content-type["']?[^>]*content\s*=\s*["']?[^"']*charset\s*=\s*([^"'\s>/;]+)[^>]*>/i;

    if (buffer.length > CHARSET_PEEK_SIZE) {
      buffer = buffer.slice(0, CHARSET_PEEK_SIZE);
    }

    // decode as latin1 so every byte maps to exactly one character
    const text = new TextDecoder("latin1").decode(buffer);

    // Look for meta charset attribute
    const charsetMatch = text.match(META_CHARSET_REGEX);
    if (charsetMatch?.[1]) {
      return charsetMatch[1].toLowerCase();
    }

    // Look for meta http-equiv with charset in content
    const httpEquivMatch = text.match(META_CHARSET_HTTP_EQUIV);
    if (httpEquivMatch?.[1]) {
      return httpEquivMatch[1].toLowerCase();
    }

    return "";
  }

  /**
   * Computes the new value for a `<meta content="...">` attribute.
   *
   * - `http-equiv="content-security-policy"`: the attribute is renamed with
   *   a leading `_` and its value is returned unchanged.
   * - `http-equiv="refresh"`: the target URL inside the value is rewritten.
   * - `name="referrer"`: replaced with `no-referrer-when-downgrade`.
   * - value that starts with a URL prefix: rewritten as a URL.
   *
   * @param attrs All attributes of the meta tag.
   * @param attr The `content` attribute (may be renamed in place).
   * @param rewriter Rewriter used for URL rewriting.
   * @returns The value to store in the attribute.
   */
  rewriteMetaContent(
    attrs: Token.Attribute[],
    attr: Token.Attribute,
    rewriter: Rewriter,
  ) {
    let equiv = this.getAttr(attrs, "http-equiv");
    if (equiv) {
      equiv = equiv.toLowerCase();
    }

    if (equiv === "content-security-policy") {
      attr.name = "_" + attr.name;
    } else if (equiv === "refresh") {
      return attr.value.replace(
        META_REFRESH_REGEX,
        (m, p1, p2: string, p3) =>
          p1 + this.rewriteUrl(rewriter, p2, false, "mt_") + p3,
      );
    } else if (this.getAttr(attrs, "name") === "referrer") {
      return "no-referrer-when-downgrade";
    } else if (startsWithAny(attr.value, DATA_RW_PROTOCOLS)) {
      return this.rewriteUrl(rewriter, attr.value);
    }

    return attr.value;
  }

  /**
   * Rewrites every URL in a `srcset`-style attribute value.
   *
   * @param value The raw attribute value.
   * @param rewriter Rewriter used for URL rewriting.
   * @returns The candidates joined with ", ", each with its URL rewritten.
   */
  rewriteSrcSet(value: string, rewriter: Rewriter) {
    const SRCSET_REGEX = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;

    const rv: string[] = [];

    for (const v of value.split(SRCSET_REGEX)) {
      if (v) {
        // split() always yields at least one element; the default value only
        // exists to satisfy strict index checking
        const [url = "", ...descriptors] = v.trim().split(" ");
        rv.push([this.rewriteUrl(rewriter, url), ...descriptors].join(" "));
      }
    }

    return rv.join(", ");
  }

  /**
   * Rewrites the attributes of a start tag in place and, for some
   * `<object>`/`<embed>` tags, replaces the tag itself with `iframe`/`img`.
   *
   * @param tag Start tag token; mutated in place.
   * @param attrRules Attribute name to rewrite modifier for this tag.
   * @param rewriter Rewriter used for URL/JS/CSS rewriting.
   * @returns Name of a closing tag the caller must emit right after the
   *   start tag (when a self-closing tag became an iframe), otherwise "".
   */
  rewriteTagAndAttrs(
    tag: StartTag,
    attrRules: Record<string, string>,
    rewriter: Rewriter,
  ): string {
    const isUrl = (val: string) => {
      return startsWithAny(val, DATA_RW_PROTOCOLS);
    };

    const tagName = tag.tagName;

    // no attribute rewriting for web-component tags, which must contain a '-'
    if (tagName.indexOf("-") > 0) {
      return "";
    }

    let addCloseTag = "";

    const replaceTag = (newTag: string) => {
      if (newTag === "iframe") {
        // an iframe can not be self-closing: request an explicit close tag
        if (tag.selfClosing) {
          tag.selfClosing = false;
          addCloseTag = newTag;
        }
        tag.attrs.push({ name: "style", value: "border: none" });
      }
      tag.tagName = newTag;
    };

    for (const attr of tag.attrs) {
      const name = attr.name || "";
      const value = attr.value || "";

      // js attrs with javascript:
      if (value.startsWith("javascript:")) {
        attr.value =
          "javascript:" +
          rewriter.rewriteJS(value.slice("javascript:".length), {
            inline: true,
            ...this.jsOpts,
          });
      } else if (name.startsWith("on") && name.slice(2, 3) !== "-") {
        // js attrs
        attr.value = rewriter.rewriteJS(value, {
          inline: true,
          ...this.jsOpts,
        });
        // css attrs
      } else if (name === "style") {
        attr.value = rewriter.rewriteCSS(attr.value);
      }

      // background attr
      else if (name === "background") {
        attr.value = this.rewriteUrl(rewriter, value);
      } else if (
        name === "srcset" ||
        (name === "imagesrcset" && tagName === "link")
      ) {
        attr.value = this.rewriteSrcSet(value, rewriter);
      }

      // for now, download attribute doesn't work in Chrome
      // but disabling triggers default behavior which often does
      else if (
        name === "crossorigin" ||
        name === "integrity" ||
        name === "download"
      ) {
        attr.name = "_" + attr.name;
      } else if (tagName === "meta" && name === "content") {
        attr.value = this.rewriteMetaContent(tag.attrs, attr, rewriter);
      } else if (tagName === "meta" && name === "charset") {
        if (value && UTF_MATCH.includes(value.toLowerCase())) {
          this.isCharsetUTF8 = true;
        }
      } else if (tagName === "param" && isUrl(value)) {
        attr.value = this.rewriteUrl(rewriter, attr.value);
      } else if (name.startsWith("data-") && isUrl(value)) {
        attr.value = this.rewriteUrl(rewriter, attr.value);
      } else if (tagName === "base" && name === "href") {
        try {
          // rewrite url, keeping relativeness intact
          attr.value = this.rewriter.updateBaseUrl(attr.value);
          // [TODO]
          // eslint-disable-next-line @typescript-eslint/no-unused-vars
        } catch (e) {
          console.warn("Invalid <base>: " + attr.value);
        }
      } else if (tagName === "script" && name === "src") {
        const rwType = this.getScriptRWType(tag);
        const mod = rwType === "module" ? "esm_" : "";
        const newValue = this.rewriteUrl(rewriter, attr.value, false, mod);
        if (newValue === attr.value) {
          // the plain rewrite left the value untouched: keep the original
          // around and retry as inline base64 JS or as a forced-absolute URL
          tag.attrs.push({ name: "__wb_orig_src", value: attr.value });
          if (
            // [TODO]
            // eslint-disable-next-line @typescript-eslint/prefer-optional-chain
            attr.value &&
            attr.value.startsWith("data:text/javascript;base64")
          ) {
            attr.value = this.rewriteJSBase64(attr.value, rewriter);
          } else {
            attr.value = this.rewriteUrl(rewriter, attr.value, true, mod);
          }
        } else {
          attr.value = newValue;
        }
      } else if (tagName === "object" && name === "data") {
        const type = this.getAttr(tag.attrs, "type");

        // convert object tag to iframe or img
        if (type === "application/pdf") {
          attr.name = "src";
          attr.value = this.rewriteUrl(rewriter, attr.value, false, "if_");
          replaceTag("iframe");
        } else if (type === "image/svg+xml") {
          attr.name = "src";
          attr.value = this.rewriteUrl(rewriter, attr.value);
          replaceTag("img");
        } else if (type === "application/x-shockwave-flash") {
          for (const rule of OBJECT_FLASH_DATA_RX) {
            const value = attr.value.replace(rule.match, rule.replace);

            if (value !== attr.value) {
              attr.name = "src";
              attr.value = this.rewriteUrl(rewriter, value, false, "if_");
              replaceTag("iframe");
              break;
            }
          }
        }
      } else if (tagName === "embed" && name === "src") {
        const type = this.getAttr(tag.attrs, "type") || "";

        if (type.startsWith("image/")) {
          replaceTag("img");
          attr.value = this.rewriteUrl(rewriter, value, false, "mp_");
        } else if (
          type === "application/pdf" ||
          !type.startsWith("application/")
        ) {
          replaceTag("iframe");
          attr.value = this.rewriteUrl(rewriter, value, false, "if_");
          if (type !== "application/pdf") {
            // add sandbox to prevent downloads from unknown types
            tag.attrs.push({
              name: "sandbox",
              value: "allow-same-origin allow-scripts",
            });
          }
        }
      } else if (name === "target") {
        const target = attr.value;

        if (
          target === "_blank" ||
          target === "_parent" ||
          target === "_top" ||
          target === "new"
        ) {
          attr.value = REPLAY_TOP_FRAME_NAME;
        }
      } else if (attrRules[name] || name === "href" || name === "src") {
        attr.value = this.rewriteUrl(
          rewriter,
          attr.value,
          false,
          attrRules[name],
        );
      }
    }

    return addCloseTag;
  }

  /**
   * @returns The value of the first attribute called `name`, or null when
   *   there is no such attribute.
   */
  getAttr(attrs: Token.Attribute[], name: string) {
    for (const attr of attrs) {
      if (attr.name === name) {
        return attr.value;
      }
    }

    return null;
  }

  /**
   * Classifies a `<script>` tag by its `type` attribute.
   *
   * @returns "module", "json", "js" (no type, or a javascript/ecmascript
   *   type), "text" (other `text/*` type), "importmap", or "" otherwise.
   */
  getScriptRWType(tag: StartTag) {
    const scriptType = this.getAttr(tag.attrs, "type");

    if (scriptType === "module") {
      return "module";
    } else if (scriptType === "application/json") {
      return "json";
    } else if (
      !scriptType ||
      scriptType.indexOf("javascript") >= 0 ||
      scriptType.indexOf("ecmascript") >= 0
    ) {
      return "js";
    } else if (scriptType.startsWith("text/")) {
      return "text";
    } else if (scriptType === "importmap") {
      return "importmap";
    } else {
      return "";
    }
  }

  /**
   * Rewrites the HTML body of `response` and installs a reader on it that
   * produces the rewritten bytes.
   *
   * The response is returned untouched when it has no buffer or when the
   * buffer exceeds `MAX_HTML_REWRITE_SIZE`.
   *
   * @param response Response whose body should be rewritten.
   * @returns The same response object.
   */
  async rewrite(response: ArchiveResponse) {
    let buffer = await response.getBuffer();
    if (!buffer) {
      return response;
    }
    if (buffer.length > MAX_HTML_REWRITE_SIZE) {
      console.warn("Skipping rewriting, HTML file too big: " + buffer.length);
      return response;
    }

    const { bomFound, text } = await response.getText(this.isCharsetUTF8, true);
    if (bomFound) {
      // make new buffer in UTF-8 with no BOM
      buffer = new TextEncoder().encode(text);
      this.isCharsetUTF8 = true;
    }

    // don't try to detect if already detected as UTF-8 or detected a charset via headers
    if (!this.isCharsetUTF8 && !this.rewriter.isCharsetDetected) {
      this.detectedCharset = this.detectMetaCharset(buffer);
      if (UTF_MATCH.includes(this.detectedCharset)) {
        this.isCharsetUTF8 = true;
      }
    }

    const rewriter = this.rewriter;

    const rwStream = new RewritingStream();
    // [TODO]
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    (rwStream as any).tokenizer.preprocessor.bufferWaterline = Infinity;

    // parser state shared between the token handlers below
    let insertAdded = false; // head insert already emitted
    let headDone = false; // </head> or <body> has been seen
    let isTextEmpty = true; // current script has had only whitespace text
    let context = ""; // tag whose end tag is currently awaited
    let scriptRw = ""; // rewrite type of the current script
    let replaceTag = ""; // new name for the awaited end tag, if replaced

    const addInsert = () => {
      if (!insertAdded && rewriter.headInsertFunc) {
        // emit charset before rest of head insert to ensure its in the first 1024 bytes
        // (detectedCharset comes from detectMetaCharset(), whose capture
        // groups exclude quotes, whitespace and '>')
        if (this.detectedCharset) {
          rwStream.emitRaw(`<meta charset="${this.detectedCharset}">`);
        }
        const headInsert = rewriter.headInsertFunc(rewriter.url);
        if (headInsert) {
          rwStream.emitRaw(headInsert);
        }
        insertAdded = true;
      }
    };

    rwStream.on("startTag", (startTag) => {
      const tagRules = rewriteTags[startTag.tagName];

      const original = startTag.tagName;

      const addCloseTag = this.rewriteTagAndAttrs(
        startTag,
        tagRules || {},
        rewriter,
      );

      // any tag other than <html>/<head> means there is no explicit head to
      // wait for: emit the insert before this tag
      if (!insertAdded && !["head", "html"].includes(startTag.tagName)) {
        addInsert();
      }

      rwStream.emitStartTag(startTag);

      // if tag is changed, eg. to iframe, need to add closing tag
      if (addCloseTag) {
        rwStream.emitEndTag({ tagName: addCloseTag });
      }

      switch (startTag.tagName) {
        case "script": {
          if (startTag.selfClosing) {
            break;
          }

          context = startTag.tagName;
          isTextEmpty = true;
          scriptRw = this.getScriptRWType(startTag);
          break;
        }

        case "style":
          if (!startTag.selfClosing) {
            context = startTag.tagName;
          }
          break;

        case "head":
          addInsert();
          break;

        case "body":
          headDone = true;
          break;
      }

      // tag was replaced: wait for the ORIGINAL end tag and rename it too
      if (startTag.tagName !== original) {
        context = original;
        replaceTag = startTag.tagName;
      }
    });

    rwStream.on("endTag", (endTag) => {
      if (endTag.tagName === context) {
        if (replaceTag) {
          endTag.tagName = replaceTag;
          replaceTag = "";
        }
        switch (context) {
          case "head":
            headDone = true;
            break;

          case "script":
            if (headDone && !isTextEmpty && scriptRw === "js") {
              rwStream.emitRaw(";document.close();");
            }
            break;
        }
        context = "";
      }
      rwStream.emitEndTag(endTag);
    });

    rwStream.on("text", (textToken, raw) => {
      const text = (() => {
        if (context === "script") {
          const isModule = scriptRw === "module";
          isTextEmpty = isTextEmpty && textToken.text.trim().length === 0;
          if (scriptRw === "js" || isModule) {
            return rewriter.rewriteJS(textToken.text, {
              isModule,
              ...this.jsOpts,
            });
          } else if (scriptRw === "json") {
            return rewriter.rewriteJSON(textToken.text, {
              isModule,
              ...this.jsOpts,
            });
          } else if (scriptRw === "importmap") {
            return rewriter.rewriteImportmap(textToken.text);
          } else {
            return textToken.text;
          }
        } else if (context === "style") {
          return rewriter.rewriteCSS(textToken.text);
        } else {
          return this.rewriteHTMLText(raw);
        }
      })();

      // emit in bounded chunks rather than as one large string
      for (let i = 0; i < text.length; i += MAX_STREAM_CHUNK_SIZE) {
        rwStream.emitRaw(text.slice(i, i + MAX_STREAM_CHUNK_SIZE));
      }
    });

    // eslint-disable-next-line @typescript-eslint/no-this-alias
    const htmlrewriter = this;

    response.setReader(
      new ReadableStream({
        async start(controller) {
          rwStream.on("data", (text) => {
            controller.enqueue(
              // [TODO]
              htmlrewriter.isCharsetUTF8
                ? // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
                  encoder.encode(text)
                : // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
                  encodeLatin1(text),
            );
          });

          rwStream.on("end", () => {
            controller.close();
          });

          if (htmlrewriter.isCharsetUTF8) {
            rwStream.write(decoder.decode(buffer), "utf8");
          } else {
            rwStream.write(decodeLatin1(buffer), "latin1");
          }

          // no-op if the insert was already emitted while parsing
          addInsert();

          rwStream.end();
        },
      }),
    );

    return response;
  }

  /**
   * Rewrites a single URL.
   *
   * @param rewriter Rewriter whose `rewriteUrl` performs the rewrite.
   * @param text URL as it appears in the document.
   * @param forceAbs Forwarded to `rewriter.rewriteUrl`.
   * @param mod Rewrite modifier. Unless empty, the default modifier, "ln_"
   *   or "mt_", the first `<default modifier>/` in the result is replaced
   *   with `<mod>/`.
   * @returns The rewritten URL.
   */
  rewriteUrl(rewriter: Rewriter, text: string, forceAbs = false, mod = "") {
    // if html charset not utf-8, just convert the url to utf-8 for rewriting
    if (!this.isCharsetUTF8) {
      text = decoder.decode(encodeLatin1(text));
    }
    const res = rewriter.rewriteUrl(text, forceAbs);
    return mod && mod !== defmod && mod !== "ln_" && mod !== "mt_"
      ? res.replace(defmod + "/", mod + "/")
      : res;
  }

  /**
   * Applies the URL-selected text-node rule, if any, to raw HTML text.
   *
   * @param text Raw text of a text node.
   * @returns The text with the rule applied, or the input unchanged.
   */
  rewriteHTMLText(text: string) {
    if (this.rule && this.ruleMatch) {
      // todo: make more general if additional rules needed
      // for now, just substitute the first url capture for $U1
      const replacer = this.rule.replace.replace(
        "$U1",
        this.ruleMatch[1] ?? "",
      );
      return text.replace(this.rule.match, replacer);
    }
    return text;
  }

  /**
   * Rewrites the JavaScript carried in a base64 `data:` URL.
   *
   * @param text The data URL; the payload is whatever follows the first ",".
   * @param rewriter Rewriter whose `rewriteJS` rewrites the decoded script.
   * @returns The data URL with its payload rewritten and re-encoded, or the
   *   input unchanged when it has no payload or the payload is not valid
   *   base64.
   */
  rewriteJSBase64(text: string, rewriter: Rewriter) {
    const parts = text.split(",");
    const encoded = parts[1];
    if (encoded === undefined) {
      // no "," in the url, so there is nothing to decode
      return text;
    }

    let source: string;
    try {
      source = atob(encoded);
    } catch {
      // not valid base64: leave the attribute as it was
      return text;
    }

    const content = rewriter.rewriteJS(source, {
      isModule: false,
      ...this.jsOpts,
    });
    parts[1] = btoa(content);
    return parts.join(",");
  }
}
// ===========================================================================
/**
 * HTMLRewriter variant for use with a `ProxyRewriter`: most URLs are left
 * as they are, and only selected modifiers (or all URLs, when `httpToHttps`
 * is set) are rewritten.
 */
export class ProxyHTMLRewriter extends HTMLRewriter {
  /**
   * Rewrites a URL according to its modifier.
   *
   * - "if_", "fr_", "mt_": rewritten with `fullRewriteUrl`.
   * - "ln_": rewritten with `rewriteUrl`, after the same non-UTF-8
   *   conversion the base class applies.
   * - anything else: rewritten with `rewriteUrl` only when `httpToHttps` is
   *   set, otherwise returned unchanged.
   *
   * @param rewriter The rewriter; used as a `ProxyRewriter`.
   * @param text URL as it appears in the document.
   * @param forceAbs Forwarded to the underlying rewrite call.
   * @param mod Rewrite modifier selecting one of the cases above.
   */
  override rewriteUrl(
    rewriter: Rewriter,
    text: string,
    forceAbs = false,
    mod = "",
  ) {
    if (mod === "if_" || mod === "fr_" || mod === "mt_") {
      return (rewriter as ProxyRewriter).fullRewriteUrl(text, forceAbs);
    }
    if (mod === "ln_") {
      // if html charset not utf-8, just convert the url to utf-8 for rewriting
      if (!this.isCharsetUTF8) {
        text = decoder.decode(encodeLatin1(text));
      }
      return rewriter.rewriteUrl(text, forceAbs);
    }
    // always rewrite if http->https conversion is needed
    if ((rewriter as ProxyRewriter).httpToHttps) {
      return rewriter.rewriteUrl(text, forceAbs);
    }
    return text;
  }

  /**
   * Rewrites the attributes of a start tag in place.
   *
   * Defers to the base implementation when `httpToHttps` is set and for
   * `<embed>`/`<object>` tags. Otherwise only `<meta content>`, attributes
   * listed in `attrRules`, `href` and `src` are processed, plus
   * `<link rel="canonical" href>` when `rewriteRelCanonical` is set.
   *
   * @param tag Start tag token; mutated in place.
   * @param attrRules Attribute name to rewrite modifier for this tag.
   * @param rewriter The rewriter; used as a `ProxyRewriter`.
   * @returns The base implementation's result when deferring to it,
   *   otherwise "".
   */
  override rewriteTagAndAttrs(
    tag: StartTag,
    attrRules: Record<string, string>,
    rewriter: Rewriter,
  ): string {
    const proxyRewriter = rewriter as ProxyRewriter;

    if (proxyRewriter.httpToHttps) {
      return super.rewriteTagAndAttrs(tag, attrRules, rewriter);
    }

    const tagName = tag.tagName;

    if (tagName === "embed" || tagName === "object") {
      return super.rewriteTagAndAttrs(tag, attrRules, rewriter);
    }

    // no attribute rewriting for web-component tags, which must contain a '-'
    if (tagName.indexOf("-") > 0) {
      return "";
    }

    for (const attr of tag.attrs) {
      const name = attr.name || "";
      const value = attr.value || "";

      if (tagName === "meta" && name === "content") {
        attr.value = this.rewriteMetaContent(tag.attrs, attr, rewriter);
      } else if (attrRules[name] || name === "href" || name === "src") {
        attr.value = this.rewriteUrl(rewriter, value, false, attrRules[name]);
      }

      // canonical links are rewritten on top of whatever was assigned above
      if (
        proxyRewriter.rewriteRelCanonical &&
        tagName === "link" &&
        name === "href" &&
        value
      ) {
        if (this.getAttr(tag.attrs, "rel") === "canonical") {
          attr.value = proxyRewriter.rewriteUrl(attr.value);
        }
      }
    }

    return "";
  }
}