/**
* Utility functions are collected here. These are functions that are used
* by the exported functions below. Some of them are themselves exported.
*/
import sanitizeFilename from "sanitize-filename";
import * as cheerio from "cheerio";
import chalk from "chalk";
// The six numbered heading tags: "h1" through "h6" (in that order).
const H1_TO_H6_TAGS = new Set(
  Array.from({ length: 6 }, (_, index) => `h${index + 1}`)
);
// Every tag that is treated as the start of a heading-based section.
const HEADING_TAGS = new Set(H1_TO_H6_TAGS).add("hgroup");
// Every tag that receives an "id" attribute in HTMLTool.injectSectionIDs().
const INJECT_SECTION_ID_TAGS = new Set(HEADING_TAGS).add("section").add("dt");
// The kinds of code a live sample is assembled from.
const LIVE_SAMPLE_PARTS = ["html", "css", "js"];
// Characters that are stripped from text when it is turned into a section ID.
const SECTION_ID_DISALLOWED = /["#$%&+,/:;=?@[\]^`{|}~')(\\]/g;
/**
 * Error type thrown by the utilities in this module. Its "name" property
 * is set to the name of the class that was actually instantiated, so a
 * subclass reports its own name.
 */
export class KumascriptError extends Error {
  constructor(message) {
    super(message);
    // "new.target" is the constructor that "new" was invoked on, i.e. this
    // class or whichever subclass is being created.
    this.name = new.target.name;
  }
}
/**
 * Turn the text content of a header, or the value of a "name" attribute,
 * into a slug that is used as an "id" attribute as well as a filename.
 *
 * The text is processed in this order:
 *   1. surrounding whitespace is trimmed;
 *   2. every SECTION_ID_DISALLOWED character is removed (many macros use
 *      sample ID's that assume those characters are gone, so that legacy
 *      has to be maintained for now);
 *   3. each remaining run of whitespace becomes a single underscore, since
 *      an "id" must not contain whitespace but the gaps are useful for
 *      breaking up phrases;
 *   4. the result is passed through the "sanitize-filename" npm package so
 *      that it can be used as a filename on Windows, Mac OS X, and Unix.
 *
 * No extra constraints are applied for use as a CSS ID selector; the ID is
 * escaped separately when a selector is built (see findSectionStart).
 */
export function slugify(text) {
  const trimmed = text.trim();
  const withoutDisallowed = trimmed.replace(SECTION_ID_DISALLOWED, "");
  const underscored = withoutDisallowed.replace(/\s+/g, "_");
  return sanitizeFilename(underscored);
}
/**
 * Replace every literal space and every "%20" sequence in the text with
 * an underscore.
 */
export function spacesToUnderscores(text) {
  const pieces = text.split(/ |%20/);
  return pieces.join("_");
}
/**
 * URI-decode the text when possible, otherwise hand it back untouched.
 *
 * The incoming text may or may not be URI-encoded. Some localized pages
 * URI-encode the sample ID argument of their "EmbedLiveSample" macro
 * calls, and the non-URI-encoded sample ID has to go through "slugify()"
 * before it is URI-encoded for the live-sample URL. If decoding throws,
 * the text is assumed not to be URI-encoded.
 */
export function safeDecodeURIComponent(text) {
  let decoded = text;
  try {
    decoded = decodeURIComponent(text);
  } catch {
    // Not valid URI-encoding: keep the original text.
  }
  return decoded;
}
/**
 * Escape an ID just enough for the cases handled here when it is placed
 * after "#" in a selector:
 *   - every "." is backslash-escaped;
 *   - a leading "-" that is followed by "-" or a digit is backslash-escaped;
 *   - otherwise a leading digit is rewritten as "\3", the digit, and a space.
 */
const minimalIDEscape = (string) => {
  const dotted = string.replace(/\./g, "\\.");
  if (/^-[-\d]/.test(dotted)) {
    return `\\-${dotted.slice(1)}`;
  }
  const head = dotted.charAt(0);
  if (/\d/.test(head)) {
    return `\\3${head} ${dotted.slice(1)}`;
  }
  return dotted;
};
/**
 * Query the document "$" for the element whose "id" is sectionID. The ID
 * is passed through minimalIDEscape() before it is used in the selector.
 */
const findSectionStart = ($, sectionID) => {
  const escapedID = minimalIDEscape(sectionID);
  return $("#" + escapedID);
};
/**
 * Report whether the document "$" contains an element whose "id" is
 * sampleID. A falsy sampleID is never looked up and yields false.
 */
const hasHeading = ($, sampleID) => {
  if (!sampleID) {
    return false;
  }
  return findSectionStart($, sampleID).length > 0;
};
/**
 * Starting at $el, climb through the ancestors until reaching an element
 * that has at least one sibling matching ":header", or until there is no
 * parent left to climb to. The element at which the climb stops is returned.
 */
function findTopLevelParent($el) {
  let $current = $el;
  for (;;) {
    if ($current.siblings(":header").length !== 0) {
      break;
    }
    const $parent = $current.parent();
    if (!($parent.length > 0)) {
      break;
    }
    $current = $parent;
  }
  return $current;
}
/**
 * Read the numeric level out of a heading's tag name: the second character
 * of the first element's "name" is parsed as a base-10 integer (for
 * example, "h3" gives 3).
 */
const getLevel = ($header) => {
  const tagName = $header[0].name;
  return Number.parseInt(tagName.charAt(1), 10);
};
/**
 * Build a selector list covering the heading tags "h1" through "h<upTo>",
 * joined with ", " (for example, 3 gives "h1, h2, h3"). A value below 1
 * gives an empty string.
 */
const getHigherHeaderSelectors = (upTo) => {
  const selectors = [];
  for (let level = 1; level <= upTo; level++) {
    selectors.push("h" + level);
  }
  return selectors.join(", ");
};
/**
 * Walk backwards from $el through its preceding siblings, one heading level
 * at a time. On each step the nearest preceding heading that is of a higher
 * level than the previously found one is located, and a collection is
 * yielded that holds a clone of that heading plus clones of the siblings
 * following it, up to the next heading matched by the same selector.
 * The walk ends when no such heading exists or an "h1" has been yielded.
 */
function* collectLevels($el) {
  let $cursor = $el;
  // The level starts at 7 so that the first search accepts any heading,
  // including the lowest heading level (h6).
  for (let currentLevel = 7; currentLevel !== 1; ) {
    const higherSelector = getHigherHeaderSelectors(currentLevel - 1);
    const $heading = $cursor.prevAll(higherSelector).first();
    if ($heading.length == 0) {
      return;
    }
    currentLevel = getLevel($heading);
    $cursor = $heading;
    const $followers = $heading.nextUntil(higherSelector).clone();
    yield $heading.clone().add($followers);
  }
}
/**
 * Gather the code blocks closest to $start. Beginning at the element
 * returned by findTopLevelParent($start), each group produced by
 * collectLevels() is searched for every LIVE_SAMPLE_PARTS kind. The first
 * group that contains any code at all is returned as an object keyed by
 * part name (parts without code map to an empty string). Returns null
 * when no group contains code.
 */
function collectClosestCode($start) {
  const $origin = findTopLevelParent($start);

  // Join (with newlines) the text of every matching element in the group,
  // looking at both the group's descendants and its own top-level elements.
  const readPart = ($level, part) => {
    const selector = `.${part}, pre[class*="brush:${part}"], pre[class*="${part};"]`;
    const $matches = $level.find(selector).add($level.filter(selector));
    const texts = $matches
      .map((i, element) => cheerio.load(element).text())
      .get();
    return texts.join("\n");
  };

  for (const $level of collectLevels($origin)) {
    const entries = [];
    let sawCode = false;
    for (const part of LIVE_SAMPLE_PARTS) {
      const code = readPart($level, part);
      if (code) {
        sawCode = true;
      }
      entries.push([part, code]);
    }
    if (sawCode) {
      return Object.fromEntries(entries);
    }
  }
  return null;
}
export class HTMLTool {
private pathDescription: string;
private $: cheerio.CheerioAPI;
/**
 * Create a tool around a document. When "html" is a string it is parsed
 * with cheerio (entities decoded); any other value is stored as-is and is
 * used as the document handle. "pathDescription" is kept for use in
 * error and warning messages.
 */
constructor(html, pathDescription?: any) {
  if (typeof html == "string") {
    this.$ = cheerio.load(html, { decodeEntities: true });
  } else {
    this.$ = html;
  }
  this.pathDescription = pathDescription;
}
/**
 * Remove every element with the "noinclude" class from the document.
 * Returns this tool so that calls can be chained.
 */
removeNoIncludes() {
  const $excluded = this.$(".noinclude");
  $excluded.remove();
  return this;
}
/**
 * Make sure every h1-h6, hgroup, section and dt element in the document has
 * an "id" attribute, and link the leading content of each dt to its ID.
 *
 * The ID of each element is chosen by the first rule that applies:
 *   1. If it has a non-empty "name" attribute, the slugified, lowercased
 *      name is used (this overrides any existing "id").
 *   2. If it already has an "id", that is kept, lowercased unless it is
 *      "Quick_links" or the element's class is exactly "bc-data".
 *   3. If it is an h1-h6 or dt element, the ID is the slugified, lowercased
 *      text, made unique against the IDs assigned so far by appending
 *      "_2", "_3", and so on.
 *   4. Otherwise (or if rule 3 produced an empty slug) a unique "sectN" ID
 *      is generated.
 *
 * Returns this tool so that calls can be chained.
 */
injectSectionIDs() {
  let idCount = 0;
  const $ = this.$;
  const knownIDs = new Set();

  // Produce the next "sectN" ID that has not been assigned yet, and record
  // it as taken.
  function generateUniqueID() {
    let id;
    do {
      id = `sect${++idCount}`;
    } while (knownIDs.has(id));
    knownIDs.add(id);
    return id;
  }

  $([...INJECT_SECTION_ID_TAGS].join(",")).each((i, element) => {
    const $element = $(element);
    const $first = $element[0] as cheerio.Element;
    const isDt = $first.name === "dt";
    // Default is the existing one. Let's see if we need to change it.
    let id = $element.attr("id");
    if ($element.attr("name")) {
      // The "name" attribute overrides any current "id".
      id = slugify($element.attr("name").toLowerCase());
    } else if (id) {
      // If there is already an ID, use it, and lowercase it as long as
      // the value isn't "Quick_links" (which we need to keep as-is), and
      // as long as the element's class isn't "bc-data" (the ID for which
      // we need to keep as-is).
      if (id !== "Quick_links" && $first.attribs["class"] !== "bc-data") {
        id = id.toLowerCase();
      }
    } else if (H1_TO_H6_TAGS.has($first.name) || isDt) {
      // For heading elements, we start by getting the text content of
      // the entire heading element (including any children it may have).
      let text = $element.text();
      if (isDt) {
        // dt elements can, along with the actual term, contain extra
        // markup such as an "Optional" badge. If we included the text
        // from that, we would end up with generated IDs like
        // id="rtcSessionDescriptionInit_Optional". So, for dt, we take
        // just the text of the dt's first child node.
        text = $element.contents().first().text();
      }
      id = slugify(text).toLowerCase();
      if (id) {
        // Ensure that the slugified "id" has not already been
        // taken. If it has, create a unique version of it.
        let version = 2;
        const originalID = id;
        while (knownIDs.has(id)) {
          id = `${originalID}_${version++}`.toLowerCase();
        }
      }
    }
    if (!id) {
      // No need to call toLowerCase() here, because generateUniqueID()
      // makes all-lowercase IDs in the form sectN, where N is a number.
      id = generateUniqueID();
    }
    knownIDs.add(id);
    $element.attr("id", id);
    if (isDt) {
      // Link the dt's first child node to the ID, unless that node is
      // already a link or already contains one.
      const firstContent = $element.contents().first();
      if (!firstContent.is("a") && firstContent.find("a").length === 0) {
        // The wrapper must be real markup: wrapping with an empty string
        // leaves the document unchanged, so no link would be created.
        $(firstContent).wrap(`<a href="#${encodeURI(id)}"></a>`);
      }
    }
  });
  return this;
}
/**
 * Return the elements that make up the named section.
 *
 * "section" is slugified first, since many macros specify the section ID
 * with spaces and/or characters that are stripped from the actual ID's
 * (e.g., "(" and ")"). The section start is then looked up by "id" only.
 *
 * Throws a KumascriptError when no element has that "id"; when the ID
 * contains non-ASCII characters the message includes a hint suggesting
 * the ID with those characters removed.
 */
getSection(section) {
  const $ = this.$;
  const sectionID = slugify(section);
  const sectionStart = findSectionStart($, sectionID);

  if (!sectionStart.length) {
    const isAscii = (char) => char.charCodeAt(0) <= 127;
    const chars = [...sectionID];
    const messageParts = [
      `unable to find an HTML element with an "id" of "${sectionID}" within ${this.pathDescription}`,
    ];
    if (!chars.every(isAscii)) {
      const cleanedSectionID = chars.filter(isAscii).join("");
      messageParts.push(
        ' -- hint! removing all non-ASCII characters in the "id" may fix this, so try ',
        `'id="${cleanedSectionID}"' and 'EmbedLiveSample("${cleanedSectionID}", ...)'`
      );
    }
    throw new KumascriptError(messageParts.join(""));
  }

  const sectionTag = sectionStart.get(0).tagName;
  if (!HEADING_TAGS.has(sectionTag)) {
    // Non-heading-based sections comprise all of the children of
    // the starting element, but not the starting element itself.
    return sectionStart.children();
  }

  // Heading-based sections comprise the start and all siblings after the
  // start until the beginning of the next section of equal or higher
  // level. For example, a section starting with an "h3" runs until there
  // are no more siblings or an "h1", "h2", "h3" or "hgroup" is reached.
  const boundaryTags: string[] = [];
  for (const tag of HEADING_TAGS) {
    if (tag <= sectionTag || tag === "hgroup") {
      boundaryTags.push(tag);
    }
  }
  const $rest = sectionStart.nextUntil(boundaryTags.join(","));
  return sectionStart.add($rest);
}
/**
 * Serialize the named section to HTML, leaving out any of the section's
 * top-level elements that match ".noinclude".
 */
extractSection(section) {
  const $included = this.getSection(section).not(".noinclude");
  return this.$.html($included);
}
extractLiveSampleObject(iframeID) {
const sectionID = iframeID.substr("frame_".length);
let result;
if (hasHeading(this.$, sectionID)) {
result = Object.create(null);
const sample = this.getSection(sectionID);
// We have to wrap the collection of elements from the section
// we've just acquired because we're going to search among all
// descendants and we want to include the elements themselves
// as well as their descendants.
const $ = cheerio.load(`
${this.$.html(sample)}
`);
for (const part of LIVE_SAMPLE_PARTS) {
const src = $(
`.${part}, pre[class*="brush:${part}"], pre[class*="${part};"]`
)
.map((i, element) => $(element).text())
.get()
.join("\n");
// The string replacements below have been carried forward from Kuma:
// * Bugzilla 819999: gets decoded to \xa0, which trips up CSS.
// * Bugzilla 1284781: is incorrectly parsed on embed sample.
result[part] = src ? src.replace(/\u00a0/g, " ") : null;
}
if (!LIVE_SAMPLE_PARTS.some((part) => result[part])) {
throw new KumascriptError(
`unable to find any live code samples for "${sectionID}" within ${this.pathDescription}`
);
}
} else {
// We're here because we can't find the sectionID, so instead we're going
// to find the live-sample iframe by its id (iframeID, NOT sectionID), and
// then collect the closest blocks of code for the live sample.
console.warn(
chalk.yellow(
`invalid header id in live sample ${sectionID} within ${this.pathDescription}`
)
);
result = collectClosestCode(findSectionStart(this.$, iframeID));
if (!result) {
throw new KumascriptError(
`unable to find any live code samples for "${sectionID}" within ${this.pathDescription}`
);
}
}
result.hasMathML = //g, ">")
.replace(/]+)>)/gi, "");
}