import * as cheerio from "cheerio"; import { ProseSection, Section } from "../libs/types/document.js"; import { extractSpecifications } from "./extract-specifications.js"; type SectionsAndFlaws = [Section[], string[]]; export async function extractSections( $: cheerio.CheerioAPI ): Promise<[Section[], string[]]> { const flaws: string[] = []; const sections: Section[] = []; const section = cheerio .load("
", { // decodeEntities: false })("div") .eq(0); const body = $("body")[0] as cheerio.ParentNode; const iterable = [...(body.childNodes as cheerio.Element[])]; let c = 0; for (const child of iterable) { if ( (child as cheerio.Element).tagName === "h2" || (child as cheerio.Element).tagName === "h3" ) { if (c) { const [subSections, subFlaws] = await addSections(section.clone()); sections.push(...subSections); flaws.push(...subFlaws); section.empty(); } c = 0; } // We *could* wrap this in something like `if (child.tagName) {` // which would exclude any node that isn't a tag, such as comments. // That might make the DOM nodes more compact and memory efficient. c++; section.append(child); } if (c) { // last straggler const [subSections, subFlaws] = await addSections(section); sections.push(...subSections); flaws.push(...subFlaws); } // Check for and mutate possible duplicated IDs. // If a HTML document has...: // //

Check these examples

// ... //

Examples

// // then this can cause various problems. For example, the anchor links // won't work. The Table of Contents won't be able to do a loop with unique // `key={section.id}` values. // The reason we need to loop through to get a list of all existing IDs // first is because we might have this: // //

Foo X

//

Foo Y

//

Foo Z

// // So when you encounter `

Foo Y

` you'll know that you // can't suggest it to be `

Foo Y

` because that ID // is taken by another one, later. const allIDs = new Set( sections .map((section) => section.value.id) .filter(Boolean) .map((id) => id.toLowerCase()) ); const seenIDs = new Set(); for (const section of sections) { const originalID = section.value.id; if (!originalID) { // Not all sections have an ID. For example, prose sections that don't // start with a

. // Since we're primarily concerned about *uniqueness* here, let's just // skip worrying about these. continue; } // We normalize all IDs to lowercase so that `id="Foo"` === `id="foo"`. const id = originalID.toLowerCase(); if (seenIDs.has(id)) { // That's bad! We have to come up with a new ID but it can't be one // that's used by another other section. let increment = 2; let newID = `${originalID}_${increment}`; while ( seenIDs.has(newID.toLowerCase()) || allIDs.has(newID.toLowerCase()) ) { increment++; newID = `${originalID}_${increment}`; } section.value.id = newID; seenIDs.add(newID.toLowerCase()); flaws.push( `'${originalID}' is not a unique ID in this HTML (temporarily changed to ${section.value.id})` ); } else { seenIDs.add(id); } } return [sections, flaws]; } /** Return an array of new sections to be added to the complete document. * * Generally, this function is called with a cheerio (`$`) section that * has HTML in it. The task is to structure that a little bit. * If the HTML inside the '$' is: * *

Foo

*

Bla bla

* * * then, the expected output is to return: * * [{ * type: "prose", * id: "foo", * title: "Foo" * content: "

Bla bla

\n

" * }] * * The reason it's always returning an array is because of special * sections. A special section is one where we try to transform it * first. For example BCD tables. If the input is this: * *

Browser Compat

*
...
* * Then, extract the ID, get the structured data and eventually return this: * * [{ * type: "browser_compatibility", * value: { * query: "foo.bar.thing", * id: "browser_compat", * title: "Browser Compat", * data: {....} * }] * * Another example is for the specification section. If the input is this: * *

Specifications

*
...
* * Then, extract the data-bcd-query and return this: * * [{ * type: "specifications", * value: { * query: "foo.bar.thing", * id: "specifications", * title: "Specifications", * specifications: {....} * }] */ async function addSections( $: cheerio.Cheerio ): Promise { const flaws: string[] = []; const countPotentialSpecialDivs = $.find("div.bc-data, div.bc-specs").length; if (countPotentialSpecialDivs) { /** If there's exactly 1 special table the only section to add is something * like this: * { * "type": "browser_compatibility", * "value": { * "title": "Browser compatibility", * "id": "browser_compatibility", * "query": "html.elements.video", * "data": {....} * } * * Where the 'title' and 'id' values comes from the

tag (if available). * * However, if there are **multiple special tables**, * it needs to return something like this: * * [{ * "type": "prose", * "value": { * "id": "browser_compatibility", * "title": "Browser compatibility" * "content": "Possible stuff before the table" * }, * { * "type": "browser_compatibility", * "value": { * "query": "html.elements.video", * "data": {.... * }, * { * "type": "prose", * "value": { * "content": "Any other stuff before table maybe" * }, */ if (countPotentialSpecialDivs > 1) { const subSections: Section[] = []; const section = cheerio .load("
", { // decodeEntities: false })("div") .eq(0); // Loop over each and every "root element" in the node and keep piling // them up in a buffer, until you encounter a `div.bc-data` or `div.bc-specs` then // add that to the stack, clear and repeat. const div = $[0] as cheerio.ParentNode; const iterable = [...(div.childNodes as cheerio.Element[])]; let c = 0; let countSpecialDivsFound = 0; for (const child of iterable) { if ( child.tagName === "div" && child.attribs && child.attribs.class && (child.attribs.class.includes("bc-data") || child.attribs.class.includes("bc-specs")) ) { countSpecialDivsFound++; if (c) { const [proseSections, proseFlaws] = _addSectionProse( section.clone() ); subSections.push(...proseSections); flaws.push(...proseFlaws); section.empty(); c = 0; // reset the counter } section.append(child); // XXX That `_addSingleSpecialSection(section.clone())` might return a // and empty array and that means it failed and we should // bail. subSections.push( ...(await _addSingleSpecialSection(section.clone())) ); section.empty(); } else { section.append(child); c++; } } if (c) { const [proseSections, proseFlaws] = _addSectionProse(section.clone()); subSections.push(...proseSections); flaws.push(...proseFlaws); } if (countSpecialDivsFound !== countPotentialSpecialDivs) { const leftoverCount = countPotentialSpecialDivs - countSpecialDivsFound; const explanation = `${leftoverCount} 'div.bc-data' or 'div.bc-specs' element${ leftoverCount > 1 ? "s" : "" } found but deeply nested.`; flaws.push(explanation); } return [subSections, flaws]; } const specialSections = await _addSingleSpecialSection($); // The _addSingleSpecialSection() function will have sucked up the

or

// and the `div.bc-data` or `div.bc-specs` to turn it into a special section. // First remove that, then put whatever HTML is left as a prose // section underneath. $.find("div.bc-data, h2, h3").remove(); $.find("div.bc-specs, h2, h3").remove(); const [proseSections, proseFlaws] = _addSectionProse($); specialSections.push(...proseSections); flaws.push(...proseFlaws); if (specialSections.length) { return [specialSections, flaws]; } } // all else, leave as is const [proseSections, proseFlaws] = _addSectionProse($); flaws.push(...proseFlaws); return [proseSections, flaws]; } async function _addSingleSpecialSection( $: cheerio.Cheerio ): Promise { let id: string | null = null; let title: string | null = null; let isH3 = false; const h2s = $.find("h2"); if (h2s.length === 1) { id = h2s.attr("id"); title = h2s.text(); } else { const h3s = $.find("h3"); if (h3s.length === 1) { id = h3s.attr("id"); title = h3s.text(); isH3 = true; } } let dataQuery = ""; let hasMultipleQueries = false; let specURLsString = ""; let specialSectionType: string | null = null; if ($.find("div.bc-data").length) { specialSectionType = "browser_compatibility"; const elem = $.find("div.bc-data"); // Macro adds "data-query", but some translated-content still uses "id". dataQuery = (elem.attr("data-query") || elem.attr("id")) ?? ""; hasMultipleQueries = elem.attr("data-multiple") === "true"; } else if ($.find("div.bc-specs").length) { specialSectionType = "specifications"; dataQuery = $.find("div.bc-specs").attr("data-bcd-query") ?? ""; specURLsString = $.find("div.bc-specs").attr("data-spec-urls") ?? ""; } // Some old legacy documents haven't been re-rendered yet, since it // was added, so the `div.bc-data` tag doesn't have a `id="bcd:..."` // or `data-bcd="..."` attribute. If that's the case, bail and fall // back on a regular prose section :( if (!dataQuery && specURLsString === "") { // I wish there was a good place to log this! return _addSectionProse($)[0]; } const query = dataQuery.replace(/^bcd:/, ""); if (specialSectionType === "browser_compatibility") { if (hasMultipleQueries) { title = query; id = query; isH3 = true; } return [ { type: specialSectionType, value: { title, id, isH3, query, }, }, ]; } else if (specialSectionType === "specifications") { const specifications = await extractSpecifications(query, specURLsString); return [ { type: "specifications", value: { title, id, isH3, specifications, query, }, }, ]; } throw new Error(`Unrecognized special section type '${specialSectionType}'`); } function _addSectionProse( $: cheerio.Cheerio ): SectionsAndFlaws { let id: string | null = null; let title: string | null = null; let isH3 = false; const flaws: string[] = []; // The way this works... // Given a section of HTML, try to extract a id, title, let h2found = false; const h2s = $.find("h2"); h2s.each((i) => { const h2 = h2s.eq(i); if (i) { // Excess! flaws.push( `Excess

tag that is NOT at root-level (id='${h2.attr( "id" )}', text='${h2.text()}')` ); } else { // First element id = h2.attr("id") ?? ""; title = h2.html() ?? ""; h2.remove(); } h2found = true; }); // If there was no

, look through all the

s. if (!h2found) { const h3s = $.find("h3"); h3s.each((i) => { const h3 = h3s.eq(i); if (i) { // Excess! flaws.push( `Excess

tag that is NOT at root-level (id='${h3.attr( "id" )}', text='${h3.text()}')` ); } else { id = h3.attr("id") ?? ""; title = h3.html() ?? ""; if (id && title) { isH3 = true; h3.remove(); } } }); } if (id) { // Remove trailing underscores (https://github.com/mdn/yari/issues/5492). id = id.replace(/_+$/g, ""); } const value: ProseSection["value"] = { id, title, isH3, content: $.html()?.trim(), }; const sections: ProseSection[] = []; if (value.content || value.title) { sections.push({ type: "prose", value, }); } return [sections, flaws]; }