import * as cheerio from "cheerio";
import { ProseSection, Section } from "../libs/types/document.js";
import { extractSpecifications } from "./extract-specifications.js";
/** Pair of the sections extracted so far and the flaw messages collected while extracting them. */
type SectionsAndFlaws = [Section[], string[]];
/**
 * Split the `<body>` of a loaded document into an ordered list of sections.
 *
 * The direct children of `<body>` are walked in order and a new group is
 * started at every root-level `<h2>` or `<h3>`. Each group is handed to
 * `addSections()`, which turns it into one or more `Section` objects.
 * Afterwards, duplicated section IDs (compared case-insensitively) are
 * rewritten to `<id>_<n>` and a flaw is recorded for every rewrite.
 *
 * @param $ - The cheerio document to extract sections from.
 * @returns A tuple of `[sections, flaws]`.
 */
export async function extractSections(
  $: cheerio.CheerioAPI
): Promise<[Section[], string[]]> {
  const flaws: string[] = [];
  const sections: Section[] = [];

  // Scratch container that the nodes of the current group get moved into.
  // The loaded markup must contain a <div> for `("div").eq(0)` to match.
  const section = cheerio
    .load("<div></div>", {
      // decodeEntities: false
    })("div")
    .eq(0);

  const body = $("body")[0] as cheerio.ParentNode;
  // Iterate over a copy of the child list; the loop below appends each child
  // to the scratch container, so the original list should not be relied on
  // while looping.
  const iterable = [...(body.childNodes as cheerio.Element[])];

  // Number of nodes currently sitting in the scratch container.
  let c = 0;
  for (const child of iterable) {
    if (child.tagName === "h2" || child.tagName === "h3") {
      // A new heading closes the group collected so far (if any).
      if (c) {
        const [subSections, subFlaws] = await addSections(section.clone());
        sections.push(...subSections);
        flaws.push(...subFlaws);
        section.empty();
      }
      c = 0;
    }
    // We *could* wrap this in something like `if (child.tagName) {`
    // which would exclude any node that isn't a tag, such as comments.
    // That might make the DOM nodes more compact and memory efficient.
    c++;
    section.append(child);
  }
  if (c) {
    // last straggler
    const [subSections, subFlaws] = await addSections(section);
    sections.push(...subSections);
    flaws.push(...subFlaws);
  }

  // Check for and mutate possible duplicated IDs.
  // If a HTML document has...:
  //
  //   <h2 id="examples">Check these examples</h2>
  //   ...
  //   <h2 id="examples">Examples</h2>
  //
  // then this can cause various problems. For example, the anchor links
  // won't work. The Table of Contents won't be able to do a loop with unique
  // `key={section.id}` values.
  // The reason we need to loop through to get a list of all existing IDs
  // first is because we might have this:
  //
  //   <h2 id="foo">Foo X</h2>
  //   <h2 id="foo">Foo Y</h2>
  //   <h2 id="foo_2">Foo Z</h2>
  //
  // So when you encounter `<h2 id="foo">Foo Y</h2>` you'll know that you
  // can't suggest it to be `<h2 id="foo_2">Foo Y</h2>` because that ID
  // is taken by another one, later.
  const allIDs = new Set(
    sections
      .map((candidate) => candidate.value.id)
      .filter((id): id is string => Boolean(id))
      .map((id) => id.toLowerCase())
  );

  const seenIDs = new Set<string>();
  for (const current of sections) {
    const originalID = current.value.id;
    if (!originalID) {
      // Not all sections have an ID. For example, prose sections that don't
      // start with a heading.
      // Since we're primarily concerned about *uniqueness* here, let's just
      // skip worrying about these.
      continue;
    }
    // We normalize all IDs to lowercase so that `id="Foo"` === `id="foo"`.
    const id = originalID.toLowerCase();
    if (!seenIDs.has(id)) {
      seenIDs.add(id);
      continue;
    }

    // That's bad! We have to come up with a new ID but it can't be one
    // that's used by any other section, earlier or later.
    let increment = 2;
    let newID = `${originalID}_${increment}`;
    while (
      seenIDs.has(newID.toLowerCase()) ||
      allIDs.has(newID.toLowerCase())
    ) {
      increment++;
      newID = `${originalID}_${increment}`;
    }
    current.value.id = newID;
    seenIDs.add(newID.toLowerCase());
    flaws.push(
      `'${originalID}' is not a unique ID in this HTML (temporarily changed to ${current.value.id})`
    );
  }

  return [sections, flaws];
}
/** Return an array of new sections to be added to the complete document.
 *
 * Generally, this function is called with a cheerio (`$`) section that
 * has HTML in it. The task is to structure that a little bit.
 * If the HTML inside the '$' is:
 *
 *   <h2 id="foo">Foo</h2>
 *   <p>Bla bla</p>
 *
 * then, the expected output is to return:
 *
 *   [{
 *       type: "prose",
 *       value: {
 *         id: "foo",
 *         title: "Foo",
 *         isH3: false,
 *         content: "<p>Bla bla</p>"
 *       }
 *   }]
 *
 * The reason it's always returning an array is because of special
 * sections. A special section is one where we try to transform it
 * first. For example BCD tables. If the input is this:
 *
 *   <h2 id="browser_compat">Browser Compat</h2>
 *   <div class="bc-data" data-query="foo.bar.thing">...</div>
 *
 * Then, extract the heading and the query and eventually return this:
 *
 *   [{
 *     type: "browser_compatibility",
 *     value: {
 *        query: "foo.bar.thing",
 *        id: "browser_compat",
 *        title: "Browser Compat",
 *        isH3: false
 *     }
 *   }]
 *
 * Another example is for the specification section. If the input is this:
 *
 *   <h2 id="specifications">Specifications</h2>
 *   <div class="bc-specs" data-bcd-query="foo.bar.thing">...</div>
 *
 * Then, extract the data-bcd-query and return this:
 *
 *   [{
 *     type: "specifications",
 *     value: {
 *        query: "foo.bar.thing",
 *        id: "specifications",
 *        title: "Specifications",
 *        isH3: false,
 *        specifications: {....}
 *     }
 *   }]
 *
 * @param $ - Container element whose children make up one group of HTML.
 *   It is mutated: headings and special divs may be removed from it.
 * @returns A tuple of `[sections, flaws]`.
 */
async function addSections(
  $: cheerio.Cheerio<cheerio.Element>
): Promise<[Section[], string[]]> {
  const flaws: string[] = [];

  const countPotentialSpecialDivs = $.find("div.bc-data, div.bc-specs").length;
  if (countPotentialSpecialDivs) {
    /** If there's exactly 1 special div, the first section to add is
     * something like this:
     *
     *    {
     *     "type": "browser_compatibility",
     *     "value": {
     *       "title": "Browser compatibility",
     *       "id": "browser_compatibility",
     *       "isH3": false,
     *       "query": "html.elements.video"
     *     }
     *    }
     *
     * Where the 'title' and 'id' values come from the <h2> or <h3> tag
     * (if available). Whatever HTML is left next to the special div is
     * added after it as a prose section.
     *
     * However, if there are **multiple special divs**,
     * it needs to return something like this:
     *
     *   [{
     *     "type": "prose",
     *     "value": {
     *       "id": "browser_compatibility",
     *       "title": "Browser compatibility",
     *       "content": "Possible stuff before the table"
     *     }
     *   },
     *   {
     *     "type": "browser_compatibility",
     *     "value": {
     *       "query": "html.elements.video",
     *       ...
     *     }
     *   },
     *   {
     *     "type": "prose",
     *     "value": {
     *       "content": "Any other stuff before table maybe"
     *     }
     *   }]
     */
    if (countPotentialSpecialDivs > 1) {
      const subSections: Section[] = [];
      // Scratch container used as a buffer. The loaded markup must contain
      // a <div> for `("div").eq(0)` to match.
      const section = cheerio
        .load("<div></div>", {
          // decodeEntities: false
        })("div")
        .eq(0);

      // Loop over each and every "root element" in the node and keep piling
      // them up in a buffer, until you encounter a `div.bc-data` or
      // `div.bc-specs`, then add that to the stack, clear and repeat.
      const div = $[0] as cheerio.ParentNode;
      const iterable = [...(div.childNodes as cheerio.Element[])];
      // Number of non-special nodes currently in the buffer.
      let c = 0;
      let countSpecialDivsFound = 0;
      for (const child of iterable) {
        const isSpecialDiv =
          child.tagName === "div" &&
          child.attribs &&
          child.attribs.class &&
          (child.attribs.class.includes("bc-data") ||
            child.attribs.class.includes("bc-specs"));

        if (!isSpecialDiv) {
          section.append(child);
          c++;
          continue;
        }

        countSpecialDivsFound++;
        if (c) {
          // Flush the prose that was buffered before this special div.
          const [proseSections, proseFlaws] = _addSectionProse(
            section.clone()
          );
          subSections.push(...proseSections);
          flaws.push(...proseFlaws);
          section.empty();
          c = 0; // reset the counter
        }
        section.append(child);
        // XXX That `_addSingleSpecialSection(section.clone())` might return
        // an empty array and that means it failed and we should
        // bail.
        subSections.push(...(await _addSingleSpecialSection(section.clone())));
        section.empty();
      }
      if (c) {
        // Prose that came after the last special div.
        const [proseSections, proseFlaws] = _addSectionProse(section.clone());
        subSections.push(...proseSections);
        flaws.push(...proseFlaws);
      }
      if (countSpecialDivsFound !== countPotentialSpecialDivs) {
        // The selector count includes nested divs; the loop above only sees
        // direct children.
        const leftoverCount = countPotentialSpecialDivs - countSpecialDivsFound;
        const explanation = `${leftoverCount} 'div.bc-data' or 'div.bc-specs' element${
          leftoverCount > 1 ? "s" : ""
        } found but deeply nested.`;
        flaws.push(explanation);
      }
      return [subSections, flaws];
    }

    const specialSections = await _addSingleSpecialSection($);

    // The _addSingleSpecialSection() function will have read the <h2> or
    // <h3> and the `div.bc-data` or `div.bc-specs` to turn it into a special
    // section. First remove those, then put whatever HTML is left as a prose
    // section underneath.
    $.find("div.bc-data, div.bc-specs, h2, h3").remove();
    const [proseSections, proseFlaws] = _addSectionProse($);
    specialSections.push(...proseSections);
    flaws.push(...proseFlaws);

    if (specialSections.length) {
      return [specialSections, flaws];
    }
  }

  // all else, leave as is
  const [proseSections, proseFlaws] = _addSectionProse($);
  flaws.push(...proseFlaws);

  return [proseSections, flaws];
}
/**
 * Turn a group of HTML that contains a `div.bc-data` or `div.bc-specs` into
 * a single special section.
 *
 * The title and ID are taken from the only `<h2>` in the group, or failing
 * that from the only `<h3>`. If no query (and no spec URLs) can be read from
 * the special div, the group is returned as regular prose instead.
 *
 * @param $ - Container element holding the heading and the special div.
 * @returns An array with one "browser_compatibility" or "specifications"
 *   section, or the prose sections produced by the fallback.
 * @throws Error if a query was found but the section type is neither of the
 *   two known ones.
 */
async function _addSingleSpecialSection(
  $: cheerio.Cheerio<cheerio.Element>
): Promise<Section[]> {
  let id: string | null = null;
  let title: string | null = null;
  let isH3 = false;

  // Only use a heading when it is unambiguous, i.e. there is exactly one.
  const h2s = $.find("h2");
  if (h2s.length === 1) {
    id = h2s.attr("id");
    title = h2s.text();
  } else {
    const h3s = $.find("h3");
    if (h3s.length === 1) {
      id = h3s.attr("id");
      title = h3s.text();
      isH3 = true;
    }
  }

  let dataQuery = "";
  let hasMultipleQueries = false;
  let specURLsString = "";
  let specialSectionType: "browser_compatibility" | "specifications" | null =
    null;

  const bcdDiv = $.find("div.bc-data");
  if (bcdDiv.length) {
    specialSectionType = "browser_compatibility";
    // Macro adds "data-query", but some translated-content still uses "id".
    dataQuery = (bcdDiv.attr("data-query") || bcdDiv.attr("id")) ?? "";
    hasMultipleQueries = bcdDiv.attr("data-multiple") === "true";
  } else {
    const specsDiv = $.find("div.bc-specs");
    if (specsDiv.length) {
      specialSectionType = "specifications";
      dataQuery = specsDiv.attr("data-bcd-query") ?? "";
      specURLsString = specsDiv.attr("data-spec-urls") ?? "";
    }
  }

  // Some old legacy documents haven't been re-rendered yet, so the special
  // div has neither a query attribute (`data-query`/`id` for `div.bc-data`,
  // `data-bcd-query` for `div.bc-specs`) nor `data-spec-urls`. If that's the
  // case, bail and fall back on a regular prose section :(
  if (!dataQuery && specURLsString === "") {
    // I wish there was a good place to log this!
    return _addSectionProse($)[0];
  }

  // The query may carry a "bcd:" prefix; strip it.
  const query = dataQuery.replace(/^bcd:/, "");

  if (specialSectionType === "browser_compatibility") {
    if (hasMultipleQueries) {
      // With multiple queries the query itself serves as title and ID.
      title = query;
      id = query;
      isH3 = true;
    }
    return [
      {
        type: "browser_compatibility",
        value: {
          title,
          id,
          isH3,
          query,
        },
      },
    ];
  }

  if (specialSectionType === "specifications") {
    const specifications = await extractSpecifications(query, specURLsString);
    return [
      {
        type: "specifications",
        value: {
          title,
          id,
          isH3,
          specifications,
          query,
        },
      },
    ];
  }

  throw new Error(`Unrecognized special section type '${specialSectionType}'`);
}
/**
 * Turn a group of HTML into at most one "prose" section.
 *
 * The first `<h2>` supplies the section's ID and title and is removed from
 * the HTML. If there is no `<h2>` at all, the first `<h3>` is used instead
 * (and only removed when it has both an ID and a title). Any further
 * heading of the same level is reported as a flaw and left in the content.
 *
 * @param $ - Container element holding the HTML of the section. It is
 *   mutated: the heading used for the title is removed from it.
 * @returns A tuple of `[sections, flaws]`; `sections` is empty when there is
 *   neither content nor a title.
 */
function _addSectionProse(
  $: cheerio.Cheerio<cheerio.Element>
): SectionsAndFlaws {
  let id: string | null = null;
  let title: string | null = null;
  let isH3 = false;

  const flaws: string[] = [];

  // The way this works...
  // Given a section of HTML, try to extract an id and a title from its
  // headings. Plain index loops (rather than `.each()` callbacks) keep the
  // assignments visible to TypeScript's control-flow analysis.
  const h2s = $.find("h2");
  for (let i = 0; i < h2s.length; i++) {
    const h2 = h2s.eq(i);
    if (i === 0) {
      // First element
      id = h2.attr("id") ?? "";
      title = h2.html() ?? "";
      h2.remove();
    } else {
      // Excess!
      flaws.push(
        `Excess tag that is NOT at root-level (id='${h2.attr(
          "id"
        )}', text='${h2.text()}')`
      );
    }
  }

  // If there was no <h2>, look through all the <h3>s.
  if (h2s.length === 0) {
    const h3s = $.find("h3");
    for (let i = 0; i < h3s.length; i++) {
      const h3 = h3s.eq(i);
      if (i === 0) {
        id = h3.attr("id") ?? "";
        title = h3.html() ?? "";
        // Only treat it as the section heading when it is usable as one;
        // otherwise it stays in the content.
        if (id && title) {
          isH3 = true;
          h3.remove();
        }
      } else {
        // Excess!
        flaws.push(
          `Excess tag that is NOT at root-level (id='${h3.attr(
            "id"
          )}', text='${h3.text()}')`
        );
      }
    }
  }

  if (id) {
    // Remove trailing underscores (https://github.com/mdn/yari/issues/5492).
    id = id.replace(/_+$/g, "");
  }

  const value: ProseSection["value"] = {
    id,
    title,
    isH3,
    content: $.html()?.trim(),
  };

  const sections: ProseSection[] = [];
  // Skip groups that ended up completely empty.
  if (value.content || value.title) {
    sections.push({
      type: "prose",
      value,
    });
  }

  return [sections, flaws];
}