/**
* Core HTML sanitization logic.
*
* @module bquery/security
* @internal
*/
import {
DANGEROUS_ATTR_PREFIXES,
DANGEROUS_PROTOCOLS,
DANGEROUS_TAGS,
DEFAULT_ALLOWED_ATTRIBUTES,
DEFAULT_ALLOWED_TAGS,
RESERVED_IDS,
} from './constants';
import type { SanitizeOptions } from './types';
/**
 * Decide whether an attribute may remain on a sanitized element.
 *
 * Evaluation order (first match wins):
 * 1. Names starting with any entry of `DANGEROUS_ATTR_PREFIXES` are rejected,
 *    even if they also appear in the allow list.
 * 2. `data-*` names are accepted when `allowDataAttrs` is true.
 * 3. `aria-*` names are always accepted.
 * 4. Anything else must be present in `allowedSet`.
 *
 * @param name - Attribute name; compared case-insensitively.
 * @param allowedSet - Allow list of attribute names; entries are expected in lower case
 *   because the lookup uses the lower-cased name.
 * @param allowDataAttrs - Whether `data-*` attributes are permitted.
 * @returns `true` if the attribute may be kept.
 * @internal
 */
const isAllowedAttribute = (
  name: string,
  allowedSet: ReadonlySet<string>,
  allowDataAttrs: boolean
): boolean => {
  const lowerName = name.toLowerCase();

  // The deny list is checked first so it cannot be overridden by the allow list.
  for (const prefix of DANGEROUS_ATTR_PREFIXES) {
    if (lowerName.startsWith(prefix)) return false;
  }

  // Data attributes are opt-out via the caller's flag.
  if (allowDataAttrs && lowerName.startsWith('data-')) return true;

  // ARIA attributes are allowed by default.
  if (lowerName.startsWith('aria-')) return true;

  // Everything else needs an explicit allow-list entry.
  return allowedSet.has(lowerName);
};
/**
 * Tell whether an `id`/`name` value is free of DOM-clobbering risk.
 *
 * The value is lower-cased and trimmed, then looked up in `RESERVED_IDS`;
 * a hit means the value is considered unsafe.
 *
 * @param value - Raw attribute value.
 * @returns `false` when the normalized value is a reserved identifier, otherwise `true`.
 * @internal
 */
const isSafeIdOrName = (value: string): boolean => {
  const candidate = value.toLowerCase().trim();
  if (RESERVED_IDS.has(candidate)) {
    return false;
  }
  return true;
};
/**
 * Produce a canonical form of a URL for protocol checks.
 *
 * The following are deleted, in this order, before the result is lower-cased:
 * control characters, zero-width/line-separator characters, literal `\uXXXX`
 * escape sequences, and all remaining whitespace.
 *
 * @param value - Raw URL attribute value.
 * @returns The stripped, lower-cased string.
 * @internal
 */
const normalizeUrl = (value: string): string => {
  // Applied sequentially: an earlier removal can expose a match for a later pattern.
  const removals: readonly RegExp[] = [
    // Null bytes and control characters
    /[\u0000-\u001F\u007F]+/g,
    // Zero-width characters that could hide malicious content
    /[\u200B-\u200D\uFEFF\u2028\u2029]+/g,
    // Escaped Unicode sequences
    /\\u[\da-fA-F]{4}/g,
    // Whitespace
    /\s+/g,
  ];

  let cleaned = value;
  for (const pattern of removals) {
    cleaned = cleaned.replace(pattern, '');
  }
  return cleaned.toLowerCase();
};
/**
 * Tell whether a URL attribute value is acceptable.
 *
 * The value is normalized with `normalizeUrl` and rejected when the result
 * begins with any entry of `DANGEROUS_PROTOCOLS`.
 *
 * @param value - Raw URL attribute value.
 * @returns `true` when no dangerous protocol prefix matches.
 * @internal
 */
const isSafeUrl = (value: string): boolean => {
  const candidate = normalizeUrl(value);
  const startsWithDangerousProtocol = Array.from(DANGEROUS_PROTOCOLS).some((protocol) =>
    candidate.startsWith(protocol)
  );
  return !startsWithDangerousProtocol;
};
/**
 * Tell whether a `srcset` attribute value is acceptable.
 *
 * The value is split on commas; for each entry the first whitespace-delimited
 * token is taken as the URL and validated with `isSafeUrl`. Entries whose URL
 * token is empty are skipped.
 *
 * @param value - Raw `srcset` attribute value.
 * @returns `true` only if every non-empty URL token passes `isSafeUrl`.
 * @internal
 */
const isSafeSrcset = (value: string): boolean =>
  value.split(',').every((candidate) => {
    const [url] = candidate.trim().split(/\s+/);
    return !url || isSafeUrl(url);
  });
/**
 * Check if a URL is external (different origin).
 *
 * Before classification the input is normalized the way browsers pre-process
 * URLs: leading/trailing whitespace and C0 control characters are dropped,
 * ASCII tab/newline characters are removed everywhere, and backslashes are
 * treated as forward slashes. Without this, values such as `/\evil.com`,
 * `\\evil.com` or `/<TAB>/evil.com` would be classified as relative even though
 * a browser on an http(s) page resolves them to another host.
 *
 * Classification:
 * - protocol-relative (`//host`)             -> external
 * - any scheme other than http/https         -> external (treated so for consistency)
 * - no scheme                                -> not external (relative)
 * - http/https without `window.location`     -> external
 * - http/https in a browser                  -> external when the origin differs
 *
 * @param url - Raw `href` value.
 * @returns `true` when the URL is considered external; also `true` if parsing throws.
 * @internal
 */
const isExternalUrl = (url: string): boolean => {
  try {
    const candidate = url
      // Same characters String.prototype.trim removes, plus C0 controls.
      .replace(/^[\s\u0000-\u0020]+|[\s\u0000-\u0020]+$/g, '')
      // Browsers remove ASCII tab and newline from anywhere in a URL.
      .replace(/[\t\n\r]+/g, '')
      // Browsers treat "\" like "/" for http(s)-based resolution.
      .replace(/\\/g, '/');

    // Protocol-relative URLs are always external. This must run before the
    // relative-URL branch below, which would otherwise report them as same-origin.
    if (candidate.startsWith('//')) {
      return true;
    }

    const lowerUrl = candidate.toLowerCase();
    const isHttp = lowerUrl.startsWith('http://') || lowerUrl.startsWith('https://');

    if (!isHttp) {
      // Any other explicit scheme (mailto:, tel:, ftp:, ...) counts as external;
      // a value without a scheme is a relative URL and therefore same-origin.
      return /^[a-z][a-z0-9+.-]*:/i.test(candidate);
    }

    // In non-browser environments (e.g., Node.js), treat all absolute URLs as external.
    if (typeof window === 'undefined' || !window.location) {
      return true;
    }

    return new URL(candidate, window.location.href).origin !== window.location.origin;
  } catch {
    // If URL parsing fails, treat as potentially external for safety.
    return true;
  }
};
/**
 * Turn an HTML string into a `Document` via `DOMParser` using the `text/html` type.
 *
 * Kept as its own helper so the point where a string enters the HTML parser is
 * explicit for static analysis tools. Callers are expected to invoke it only for
 * input that contains HTML syntax (angle brackets).
 *
 * A document produced by `DOMParser` is inert: scripts in it do not execute, which
 * makes it suitable for parsing untrusted HTML that is sanitized afterwards.
 *
 * @param htmlContent - A string that is known to contain HTML markup (has < or >)
 * @returns The parsed Document
 * @internal
 */
const parseHtmlDocument = (htmlContent: string): Document =>
  new DOMParser().parseFromString(htmlContent, 'text/html');
/**
 * Convert an HTML string into a `DocumentFragment` without using `innerHTML`.
 *
 * Behaviour:
 * - The input is coerced to a string and trimmed.
 * - Empty input yields an empty fragment.
 * - Input without `<` or `>` is treated as plain text and becomes a single Text
 *   node; it is never handed to an HTML parser, so entities in such input are
 *   kept as literal characters.
 * - Anything else is parsed by `parseHtmlDocument` and the children of the
 *   resulting `<body>` are moved into the fragment.
 *
 * Keeping the plain-text path separate from the parser path is deliberate: it
 * prevents "DOM text reinterpreted as HTML" findings for purely textual input.
 *
 * @param html - Markup or text to convert.
 * @returns A fragment owned by the current `document`.
 * @internal
 */
const parseHtmlSafely = (html: string): DocumentFragment => {
  // Defensive coercion: the static type says string, runtime callers may disagree.
  const source = (typeof html === 'string' ? html : String(html ?? '')).trim();
  const result = document.createDocumentFragment();

  if (source === '') {
    return result;
  }

  // No angle brackets means no HTML syntax: keep it as text, skip the parser.
  const looksLikeMarkup = source.includes('<') || source.includes('>');
  if (!looksLikeMarkup) {
    result.appendChild(document.createTextNode(source));
    return result;
  }

  // Only strings with HTML syntax reach DOMParser, via the dedicated helper.
  const { body } = parseHtmlDocument(source);
  if (body) {
    // Snapshot the child list first; appending moves each node out of the body.
    for (const child of Array.from(body.childNodes)) {
      result.appendChild(child);
    }
  }
  return result;
};
/**
 * Core sanitization logic (without Trusted Types wrapper).
 *
 * Steps:
 * 1. Parse `html` with `parseHtmlSafely` (inert, no script execution).
 * 2. Queue for removal every element whose tag is in `DANGEROUS_TAGS` or is not
 *    in the combined allow list; removing an element also drops its subtree.
 * 3. On remaining elements, strip attributes that are not allowed, `id`/`name`
 *    values that could clobber the DOM, and `href`/`src`/`action`/`srcset`
 *    values with unsafe URLs.
 * 4. Add `rel="noopener noreferrer"` to links that are external or open in a
 *    new window.
 * 5. Serialize, re-parse and re-serialize; if the two serializations differ the
 *    markup is discarded and only its text is returned.
 *
 * @param html - Untrusted markup to sanitize.
 * @param options - Extra allowed tags/attributes and behaviour switches.
 * @returns The sanitized HTML. With `stripAllTags`, the text content of the
 *   parsed input is returned instead.
 * @internal
 */
export const sanitizeHtmlCore = (html: string, options: SanitizeOptions = {}): string => {
  const {
    allowTags = [],
    allowAttributes = [],
    allowDataAttributes = true,
    stripAllTags = false,
  } = options;

  // Build combined allow sets (excluding dangerous tags even if specified).
  const allowedTags = new Set(
    [...DEFAULT_ALLOWED_TAGS, ...allowTags.map((t) => t.toLowerCase())].filter(
      (tag) => !DANGEROUS_TAGS.has(tag)
    )
  );
  const allowedAttrs = new Set([
    ...DEFAULT_ALLOWED_ATTRIBUTES,
    ...allowAttributes.map((a) => a.toLowerCase()),
  ]);

  // Entity-escape text so it is rendered literally when inserted as HTML.
  const escapeHtmlText = (text: string): string =>
    text
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

  // Serialize a fragment by cloning it into a detached container.
  const serializeFragment = (frag: DocumentFragment): string => {
    const container = document.createElement('div');
    container.appendChild(frag.cloneNode(true));
    return container.innerHTML;
  };

  // Use DOMParser for safe HTML parsing (inert context, no script execution).
  const fragment = parseHtmlSafely(html);

  if (stripAllTags) {
    // NOTE(review): textContent is entity-decoded text, not HTML, so it may contain
    // literal angle brackets (e.g. from "&lt;"). Left unchanged on the assumption that
    // callers consume it as plain text - confirm no caller injects it as markup.
    return fragment.textContent ?? '';
  }

  // Walk every element of the parsed tree.
  const walker = document.createTreeWalker(fragment, NodeFilter.SHOW_ELEMENT);
  const toRemove: Element[] = [];

  while (walker.nextNode()) {
    const el = walker.currentNode as Element;
    const tagName = el.tagName.toLowerCase();

    // Dangerous tags are rejected even if allow-listed; unknown tags are rejected too.
    if (DANGEROUS_TAGS.has(tagName) || !allowedTags.has(tagName)) {
      toRemove.push(el);
      continue;
    }

    // Collect offending attributes first, then remove them, so the attribute
    // list is not mutated while it is being inspected.
    const attrsToRemove: string[] = [];
    for (const attr of Array.from(el.attributes)) {
      const attrName = attr.name.toLowerCase();

      // Not on the allow list (or on the deny list).
      if (!isAllowedAttribute(attrName, allowedAttrs, allowDataAttributes)) {
        attrsToRemove.push(attr.name);
        continue;
      }

      // DOM clobbering via id/name.
      if ((attrName === 'id' || attrName === 'name') && !isSafeIdOrName(attr.value)) {
        attrsToRemove.push(attr.name);
        continue;
      }

      // Unsafe URL in a URL-bearing attribute.
      if (
        (attrName === 'href' || attrName === 'src' || attrName === 'action') &&
        !isSafeUrl(attr.value)
      ) {
        attrsToRemove.push(attr.name);
        continue;
      }

      // srcset carries several URLs; each is validated individually.
      if (attrName === 'srcset' && !isSafeSrcset(attr.value)) {
        attrsToRemove.push(attr.name);
      }
    }

    for (const attrName of attrsToRemove) {
      el.removeAttribute(attrName);
    }

    // Add rel="noopener noreferrer" to external links and links opening a new window.
    if (tagName === 'a') {
      const href = el.getAttribute('href');
      const target = el.getAttribute('target');
      const hasTargetBlank = target?.toLowerCase() === '_blank';
      const isExternal = href && isExternalUrl(href);

      if (hasTargetBlank || isExternal) {
        const existingRel = el.getAttribute('rel');
        // Preserve existing rel tokens; the Set de-duplicates.
        const relValues = new Set(existingRel ? existingRel.split(/\s+/).filter(Boolean) : []);
        relValues.add('noopener');
        relValues.add('noreferrer');
        el.setAttribute('rel', Array.from(relValues).join(' '));
      }
    }
  }

  // Removal is deferred until the walk has finished.
  for (const el of toRemove) {
    el.remove();
  }

  // Double-parse to detect mutation XSS (mXSS): serialization may normalize
  // markup in ways that produce different content when parsed again, so the
  // output is only trusted if it survives a second parse/serialize unchanged.
  const firstPass = serializeFragment(fragment);
  const verifyFragment = parseHtmlSafely(firstPass);
  const secondPass = serializeFragment(verifyFragment);

  if (firstPass !== secondPass) {
    // Unstable output: drop all markup and fall back to the text only.
    const text = fragment.textContent ?? '';
    // textContent is decoded, so it can contain literal "<...>" that a consumer
    // would parse as live markup; such text is entity-escaped. Text without angle
    // brackets cannot introduce elements and is returned unchanged, which keeps
    // the existing output for plain-text inputs (those reach this branch because
    // parseHtmlSafely re-reads their serialized entities as literal text).
    return /[<>]/.test(text) ? escapeHtmlText(text) : text;
  }

  return secondPass;
};