// ─── HTML attribute stripping for cleaner extraction ─────────────────
// Inspired by Retio-pagemap's compress_html() — removes non-semantic
// attributes (class, id, data-*, style, event handlers, etc.) before
// feeding HTML to Readability / Defuddle. This reduces token bloat
// from HTML cruft while preserving semantic attributes.
/** Attributes to KEEP during compression (semantic / machine-readable). */
const KEEP_ATTRS = new Set([
  "itemprop",
  "itemtype",
  "itemscope",
  "role",
  "aria-label",
  "aria-labelledby",
  "href",
  "src",
  "alt",
  "title",
  "datetime",
  "content",
  "property",
  "type",
  "name",
  "value",
]);

/**
 * One opening (or self-closing) tag. Quoted attribute values are consumed
 * as units so a `>` inside a value does not end the tag early.
 */
const OPEN_TAG_RE = /<[a-zA-Z](?:"[^"]*"|'[^']*'|[^"'>])*>/g;

/**
 * Tokens inside a tag: either a bare quoted string (consumed and left
 * untouched, so its contents are never rescanned) or a whitespace-prefixed
 * `name=value` attribute, whose name is captured in group 1.
 */
const ATTR_TOKEN_RE =
  /"[^"]*"|'[^']*'|\s+([^\s"'<>\/=]+)\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)/g;

/** Attribute names eligible for removal: a letter followed by word chars or hyphens. */
const PLAIN_ATTR_NAME_RE = /^[a-zA-Z][\w-]*$/;

/**
 * Strip noise attributes from an HTML string using regex.
 * Removes: class, id, data-*, style, event handlers, and many ARIA
 * attributes while preserving semantic attributes like href, src, alt,
 * itemprop, role, etc. (the names listed in KEEP_ATTRS, compared
 * case-insensitively and by exact name).
 *
 * Only text inside opening tags is rewritten; text nodes such as
 * `let x = 1` are left alone, and quoted attribute values are treated as
 * opaque. Attributes without a value (e.g. `disabled`) and attributes whose
 * names contain characters outside `[\w-]` (e.g. `xlink:href`, `@click`)
 * are always retained.
 *
 * This is regex-based, not a real HTML parser: a tag with an unbalanced
 * quote is not recognised as a tag and is returned unchanged.
 *
 * @param html - HTML markup to clean.
 * @returns The markup with non-kept `name=value` attributes removed.
 */
export function stripNoiseAttributes(html: string): string {
  const filterToken = (token: string, attrName: string | undefined): string => {
    // Bare quoted string (no captured name): pass through untouched.
    if (attrName === undefined) return token;
    // Names outside the simple letter/word/hyphen shape are never removed.
    if (!PLAIN_ATTR_NAME_RE.test(attrName)) return token;
    // Exact, case-insensitive keep-list check (no prefix matching).
    return KEEP_ATTRS.has(attrName.toLowerCase()) ? token : "";
  };

  return html.replace(OPEN_TAG_RE, (tag: string): string =>
    tag.replace(ATTR_TOKEN_RE, filterToken),
  );
}
/**
 * Delete elements that contain nothing but optional whitespace.
 *
 * Only a fixed list of container and inline tags is considered, and a
 * matching element is dropped regardless of which attributes its opening
 * tag carries. Each sweep removes the innermost empties, which can expose
 * newly empty parents, so the sweep is repeated.
 *
 * @param html - HTML markup to prune.
 * @param passes - Maximum number of sweeps (default 3). Sweeping stops
 *   early as soon as one sweep changes nothing.
 * @returns The markup after pruning.
 */
export function removeEmptyElements(html: string, passes = 3): string {
  const emptyTagRe =
    /<(div|span|p|section|article|aside|figure|figcaption|details|summary|b|i|em|strong|small|sup|sub|a|abbr|cite|code|mark|u|s)\b[^>]*>\s*<\/\1>/gi;

  let current = html;
  let sweepsLeft = passes;
  while (sweepsLeft-- > 0) {
    const pruned = current.replace(emptyTagRe, "");
    // Fixed point reached: further sweeps would be no-ops.
    if (pruned === current) {
      return current;
    }
    current = pruned;
  }
  return current;
}
/**
 * Full HTML compression pass, intended to run before Readability/Defuddle.
 *
 * Feeds the input through stripNoiseAttributes and then hands the result
 * to removeEmptyElements with its default pass count. Note that the second
 * step deletes whole elements (those with no text between their tags),
 * not just attributes.
 *
 * @param html - HTML markup to compress.
 * @returns The compressed markup.
 */
export function compressHtml(html: string): string {
  return removeEmptyElements(stripNoiseAttributes(html));
}