/** * Utility functions are collected here. These are functions that are used * by the exported functions below. Some of them are themselves exported. */ import sanitizeFilename from "sanitize-filename"; import * as cheerio from "cheerio"; import chalk from "chalk"; const H1_TO_H6_TAGS = new Set(["h1", "h2", "h3", "h4", "h5", "h6"]); const HEADING_TAGS = new Set([...H1_TO_H6_TAGS, "hgroup"]); const INJECT_SECTION_ID_TAGS = new Set([...HEADING_TAGS, "section", "dt"]); const LIVE_SAMPLE_PARTS = ["html", "css", "js"]; const SECTION_ID_DISALLOWED = /["#$%&+,/:;=?@[\]^`{|}~')(\\]/g; export class KumascriptError extends Error { constructor(message) { super(message); this.name = this.constructor.name; } } export function slugify(text) { // Turn the text content of a header, or the value of the "name" attribute, // into a slug for use as an ID as well as a filename. Trim it, collapse // whitespace gaps into underscores, remove the same special characters that // Kuma removes (for consistency, since for example many live-samples depend // on this), and finally remove any remaining characters that would not work // within a filename on a Windows, Mac OS X, or Unix filesystem. // NOTE: These are the constraints that have to be satisfied: // 1) the result can be used as a filename on Windows, Mac OS X, and Unix // (this is why the "sanitize-filename" npm package is used) // 2) the result will be used as an "id" attribute, so in HTML5 it must // contain at least one character and must not contain any whitespace // characters (the "sanitize-filename" npm package will itself remove // spaces, but since they're useful in breaking up phrases, before we // run "sanitize-filename" we convert whitespace gaps into underscores) // 3) many macros use sample ID's that assume that "id" attributes have // had the SECTION_ID_DISALLOWED characters removed, so for now we have // to maintain that legacy // 4) there's no need to add constraints that assume the result will be // used as a CSS ID selector, since it will be properly escaped for that // use case (see the "cssesc" code within the "getSection" method of the // HTMLTool below) return sanitizeFilename( text.trim().replace(SECTION_ID_DISALLOWED, "").replace(/\s+/g, "_") ); } export function spacesToUnderscores(text) { return text.replace(/ |%20/g, "_"); } export function safeDecodeURIComponent(text) { // This function will attempt to URI-decode the incoming text, which may // or may not be URI-encoded, and if it can't, it assumes the text is not // URI-encoded and simply falls back to using the text itself. This exists // solely because some localized pages URI-encode the sample ID argument // to their "EmbedLiveSample" macro calls, and we need to run the non-URI- // encoded sample ID through "slugify()" above prior to URI-encoding it // for the live-sample URL. try { return decodeURIComponent(text); } catch (e) { return text; } } const minimalIDEscape = (string) => { string = string.replace(/\./g, "\\."); const firstChar = string.charAt(0); if (/^-[-\d]/.test(string)) { string = "\\-" + string.slice(1); } else if (/\d/.test(firstChar)) { string = "\\3" + firstChar + " " + string.slice(1); } return string; }; const findSectionStart = ($, sectionID) => { return $(`#${minimalIDEscape(sectionID)}`); }; const hasHeading = ($, sampleID) => !sampleID ? false : findSectionStart($, sampleID).length > 0; function findTopLevelParent($el) { while ($el.siblings(":header").length == 0 && $el.parent().length > 0) { $el = $el.parent(); } return $el; } const getLevel = ($header) => parseInt($header[0].name[1], 10); const getHigherHeaderSelectors = (upTo) => Array.from({ length: upTo }, (_, i) => "h" + (i + 1)).join(", "); function* collectLevels($el) { // Initialized to 7 so that we pick up the lowest heading level which is
let level = 7; let $prev = $el; while (level !== 1) { const nextHigherLevel = getHigherHeaderSelectors(level - 1); const $header = $prev.prevAll(nextHigherLevel).first(); if ($header.length == 0) { return; } level = getLevel($header); $prev = $header; yield $header.clone().add($header.nextUntil(nextHigherLevel).clone()); } } function collectClosestCode($start) { const $el = findTopLevelParent($start); for (const $level of collectLevels($el)) { const pairs = LIVE_SAMPLE_PARTS.map((part) => { const selector = `.${part}, pre[class*="brush:${part}"], pre[class*="${part};"]`; const $filtered = $level.find(selector).add($level.filter(selector)); return [ part, $filtered .map((i, element) => cheerio.load(element).text()) .get() .join("\n"), ]; }); if (pairs.some(([, code]) => !!code)) { return Object.fromEntries(pairs); } } return null; } export class HTMLTool { private pathDescription: string; private $: cheerio.CheerioAPI; constructor(html, pathDescription?: any) { this.$ = typeof html == "string" ? cheerio.load(html, { decodeEntities: true }) : html; this.pathDescription = pathDescription; } removeNoIncludes() { this.$(".noinclude").remove(); return this; } injectSectionIDs() { let idCount = 0; const $ = this.$; const knownIDs = new Set(); function generateUniqueID() { let id; do { id = `sect${++idCount}`; } while (knownIDs.has(id)); knownIDs.add(id); return id; } // Now, let's inject section ID's. // The rules are simple; for the tags we look at... // If it as a `name` attribute, use that as the ID. // If it already has an ID, leave it and use that. // If it's a H1-6 tag, generate (slugify) an ID from its text. // If all else, generate a unique one. // And we ensure all IDs that get added are completely lowercase. $([...INJECT_SECTION_ID_TAGS].join(",")).each((i, element) => { const $element = $(element); const $first = $element[0] as cheerio.Element; const isDt = $first.name === "dt"; // Default is the existing one. Let's see if we need to change it. let id = $element.attr("id"); if ($element.attr("name")) { // The "name" attribute overrides any current "id". id = slugify($element.attr("name").toLowerCase()); } else if (id) { // If there’s already has an ID, use it — and lowercase it as long // as the value isn’t "Quick_links" (which we need to keep as-is), // and as long as it’s not a class=bc-data div (the ID for which we // need to keep as-is). if (id !== "Quick_links" && $first.attribs["class"] !== "bc-data") { id = id.toLowerCase(); } } else if (H1_TO_H6_TAGS.has($first.name) || isDt) { // For heading elements, we start by getting the text content of // the entire heading element (including any children it may have). let text = $element.text(); if (isDt) { // dt elements can, along with the actual term, contain stuff // like Optional. If // we include the text from that, we end up with generated IDs // like id="rtcSessionDescriptionInit_Optional". So, for dt, we // take just the text from the first element child of the dt. text = $element.contents().first().text(); } id = slugify(text).toLowerCase(); if (id) { // Ensure that the slugified "id" has not already been // taken. If it has, create a unique version of it. let version = 2; const originalID = id; while (knownIDs.has(id)) { id = `${originalID}_${version++}`.toLowerCase(); } } } if (!id) { // No need to call toLowerCase() here, because generateUniqueID() // makes all-lowercase IDs in the form sectN, where N is a number. id = generateUniqueID(); } knownIDs.add(id); $element.attr("id", id); if (isDt) { // Link the first element child to the ID. const firstContent = $element.contents().first(); if (!firstContent.is("a") && firstContent.find("a").length === 0) { $(firstContent).wrap(``); } } }); return this; } getSection(section) { const $ = this.$; // This is important since many macros specify the section ID with spaces, // and/or characters that are stripped from the actual ID's (e.g., "(" and ")"). const sectionID = slugify(section); // Kuma looks for the first HTML tag of a limited set of section tags with ANY // attribute equal to the "sectionID", but in practice it's always an "id" attribute, // so let's simplify this as well as make it much faster. const sectionStart = findSectionStart($, sectionID); if (!sectionStart.length) { let errorMessage = `unable to find an HTML element with an "id" of "${sectionID}" within ${this.pathDescription}`; const hasMoreThanAscii = [...sectionID].some( (char) => char.charCodeAt(0) > 127 ); if (hasMoreThanAscii) { const cleanedSectionID = [...sectionID] .filter((char) => char.charCodeAt(0) <= 127) .join(""); errorMessage += ' -- hint! removing all non-ASCII characters in the "id" may fix this, so try ' + `'id="${cleanedSectionID}"' and 'EmbedLiveSample("${cleanedSectionID}", ...)'`; } throw new KumascriptError(errorMessage); } let result; const sectionTag = sectionStart.get(0).tagName; if (HEADING_TAGS.has(sectionTag)) { // Heading-based sections comprise the start and all siblings // after the start until the beginning of the next section of // equal or higher level. For example, if the section starts // with an "h3", take the "h3" and all of the following sibling // elements until either there are no more siblings or we reach // an "h1", "h2", or another "h3" element. const nextSection = [...HEADING_TAGS] .filter((tag) => tag <= sectionTag || tag === "hgroup") .join(","); result = sectionStart.add(sectionStart.nextUntil(nextSection)); } else { // Non-heading-based sections comprise all of the children of // the starting element, but not the starting element itself. result = sectionStart.children(); } return result; } extractSection(section) { const result = this.getSection(section).not(".noinclude"); return this.$.html(result); } extractLiveSampleObject(iframeID) { const sectionID = iframeID.substr("frame_".length); let result; if (hasHeading(this.$, sectionID)) { result = Object.create(null); const sample = this.getSection(sectionID); // We have to wrap the collection of elements from the section // we've just acquired because we're going to search among all // descendants and we want to include the elements themselves // as well as their descendants. const $ = cheerio.load(`
${this.$.html(sample)}
`); for (const part of LIVE_SAMPLE_PARTS) { const src = $( `.${part}, pre[class*="brush:${part}"], pre[class*="${part};"]` ) .map((i, element) => $(element).text()) .get() .join("\n"); // The string replacements below have been carried forward from Kuma: // * Bugzilla 819999:   gets decoded to \xa0, which trips up CSS. // * Bugzilla 1284781:   is incorrectly parsed on embed sample. result[part] = src ? src.replace(/\u00a0/g, " ") : null; } if (!LIVE_SAMPLE_PARTS.some((part) => result[part])) { throw new KumascriptError( `unable to find any live code samples for "${sectionID}" within ${this.pathDescription}` ); } } else { // We're here because we can't find the sectionID, so instead we're going // to find the live-sample iframe by its id (iframeID, NOT sectionID), and // then collect the closest blocks of code for the live sample. console.warn( chalk.yellow( `invalid header id in live sample ${sectionID} within ${this.pathDescription}` ) ); result = collectClosestCode(findSectionStart(this.$, iframeID)); if (!result) { throw new KumascriptError( `unable to find any live code samples for "${sectionID}" within ${this.pathDescription}` ); } } result.hasMathML = //g, ">") .replace(/]+)>)/gi, ""); }