import cheerio from "cheerio";

import * as Parser from "./parser.js";
import { Document, Redirect } from "../../content/index.js";
import { translationsOf } from "../../content/translations.js";
import { isValidLocale } from "../../libs/locale-utils/index.js";
import { m2hSync } from "../../markdown/index.js";
import {
  findPostFileBySlug,
  getSlugByBlogPostUrl,
} from "../../build/utils.js";

// Only used so that the URL constructor accepts relative URLs; the origin
// itself is never part of any returned value.
const DUMMY_BASE_URL = "https://example.com";

// Macros (lowercase names) that are dropped entirely when they show up inside
// a summary snippet, even when they carry arguments.
const MACROS_IN_SUMMARY_TO_IGNORE = new Set([
  "apiref",
  "jsref",
  "compat",
  "page",
  "deprecated_header",
  "previous",
  "previousmenu",
  "previousnext",
  "previousmenunext",
  "quicklinkswithsubpages",
]);

// Macros (lowercase names) that are replaced by their first argument when they
// show up inside a summary snippet.
const MACROS_IN_SUMMARY_TO_REPLACE_WITH_FIRST_ARGUMENT = new Set(["glossary"]);

/**
 * Returns a lowercase URI with common irregularities repaired:
 * a leading "/" is ensured, runs of "/" are collapsed, a valid locale is
 * ensured as the first segment (falling back to "en-us"), and "docs" is
 * ensured as the second segment.
 *
 * @param url The URI (path) to repair.
 * @returns The repaired, lowercase URI.
 */
function repairURL(url) {
  let repaired = url.trim().toLowerCase();
  if (!repaired.startsWith("/")) {
    // Ensure the URI starts with a "/".
    repaired = `/${repaired}`;
  }
  // Remove redundant forward slashes, like "//".
  repaired = repaired.replace(/\/{2,}/g, "/");

  // Ensure the URI starts with a valid locale.
  const maybeLocale = repaired.split("/")[1];
  if (!isValidLocale(maybeLocale)) {
    if (maybeLocale === "en") {
      // Converts URI's like "/en/..." to "/en-us/...".
      // A string pattern only replaces the first occurrence, which is the
      // leading segment here.
      repaired = repaired.replace(`/${maybeLocale}`, "/en-us");
    } else {
      // Converts URI's like "/web/..." to "/en-us/web/...", or
      // URI's like "/docs/..." to "/en-us/docs/...".
      repaired = `/en-us${repaired}`;
    }
  }

  // Ensure the locale is followed by "/docs".
  const [locale, maybeDocs] = repaired.split("/").slice(1, 3);
  if (maybeDocs !== "docs") {
    // Converts URI's like "/en-us/web/..." to "/en-us/docs/web/...".
    repaired = repaired.replace(`/${locale}`, `/${locale}/docs`);
  }
  return repaired;
}

/**
 * Helpers for looking up documents and page information by URL.
 *
 * Members refer to each other through `info` rather than `this`, so they keep
 * working when a method is detached from the object before being called.
 */
export const info = {
  /**
   * Returns just the pathname of the given "url", removing any trailing "/".
   */
  getPathname(url) {
    return new URL(url, DUMMY_BASE_URL).pathname.replace(/\/$/, "");
  },

  /**
   * Returns just the lowercase, repaired pathname of the given "url",
   * removing any trailing "/".
   *
   * @param url Absolute or relative URL.
   * @param followRedirects When true (the default), the repaired URL is passed
   *   through `Redirect.resolve()` and the result is used instead.
   */
  cleanURL(url, followRedirects = true) {
    // The DUMMY_BASE_URL is not important here, since we're only after the
    // path of any incoming "url", but it's required by the URL constructor
    // when the incoming "url" is relative.
    const repairedURL = repairURL(
      new URL(url, DUMMY_BASE_URL).pathname.replace(/\/$/, "").toLowerCase()
    );

    if (followRedirects) {
      const resolvedURL = Redirect.resolve(repairedURL);
      if (resolvedURL !== repairedURL) {
        // The `Redirect.resolve()` returned an actual redirect, and that needs
        // to be "repaired" as well.
        // Remember, it defaults to the URL you passed in if nothing was found
        // in the redirects lookup.
        return repairURL(resolvedURL);
      }
      return resolvedURL;
    }

    return repairedURL;
  },

  /**
   * Returns the cleaned URL, followed by a note naming the original "url"
   * whenever the cleaned form differs from the lowercased input.
   */
  getDescription(url) {
    const cleanedURL = info.cleanURL(url);
    let description = `${cleanedURL}`;
    if (cleanedURL !== url.toLowerCase()) {
      description += ` (derived from "${url}")`;
    }
    return description;
  },

  /**
   * Returns the sub-pages of the page at "url", or a one-element list holding
   * the page itself when "includeSelf" is truthy.
   *
   * Throws if no document exists for "url".
   */
  getChildren(url, includeSelf) {
    // We don't need "depth" since it's handled dynamically (lazily).
    // The caller can keep requesting "subpages" as deep as the
    // hierarchy goes, and they'll be provided on-demand.
    // IMPORTANT: The list returned does not need to be frozen since
    // it's re-created for each caller (so one caller can't mess with
    // another), but also should NOT be frozen since some macros sort
    // the list in-place.
    const page = info.getPageByURL(url, { throwIfDoesNotExist: true });
    if (includeSelf) {
      return [page];
    }
    return page.subpages;
  },

  // TODO
  getTranslations(url: string) {
    // function buildTranslationObjects(data) {
    //   // Builds a list of translation objects suitable for
    //   // consumption by Kumascript macros, using the translation
    //   // information from the given "data" as well as the "pageInfoByUri".
    //   const result = [];
    //   let rawTranslations = data.translations || [];
    //   if (!rawTranslations.length && data.translation_of) {
    //     const englishUri = `/en-US/docs/${data.translation_of}`;
    //     const englishData = pageInfoByUri.get(englishUri.toLowerCase());
    //     if (englishData) {
    //       // First, add the English translation for this non-English locale.
    //       result.push(
    //         Object.freeze({
    //           url: englishUri,
    //           locale: "en-US",
    //           title: englishData.title,
    //           summary: englishData.summary,
    //         })
    //       );
    //       rawTranslations = englishData.translations || [];
    //     }
    //   }
    //   for (const { locale, slug } of rawTranslations) {
    //     if (locale !== data.locale) {
    //       // A locale is never a translation of itself.
    //       const uri = `/${locale}/docs/${slug}`;
    //       const pageData = pageInfoByUri.get(uri.toLowerCase());
    //       result.push(
    //         Object.freeze({
    //           url: uri,
    //           locale: locale,
    //           title: pageData.title,
    //           summary: pageData.summary,
    //         })
    //       );
    //     }
    //   }
    //   return result;
    // }
    return info.getPageByURL(url, { throwIfDoesNotExist: true }).translations();
  },

  /**
   * Looks up the document for "url" and returns its page object.
   *
   * @param url The URL to look up.
   * @param options.throwIfDoesNotExist Throw instead of returning an empty
   *   object when no document is found.
   * @param options.followRedirects Retry the lookup with redirects followed
   *   when the direct lookup finds nothing.
   * @returns The page object, or an empty object when nothing was found and
   *   "throwIfDoesNotExist" is false.
   */
  getPageByURL(
    url: string,
    { throwIfDoesNotExist = false, followRedirects = true } = {}
  ) {
    // Always start by looking it up *without* following redirects.
    let document = Document.findByURL(info.cleanURL(url, false));
    // Usually, `followRedirects` is disabled if the caller definitely is
    // not interested in following redirects (e.g. listing sub-pages)
    if (!document && followRedirects) {
      document = Document.findByURL(info.cleanURL(url, true));
    }

    if (!document) {
      // The macros expect an empty object if the URL does not exist, so
      // "throwIfDoesNotExist" should only be used within "info" itself.
      if (throwIfDoesNotExist) {
        throw new Error(
          `${info.getDescription(url)} (url: ${url}) does not exist`
        );
      }
      return {};
    }

    return info.getPage(document);
  },

  /**
   * Builds the page object exposed to macros from a document.
   *
   * Passing a string is deprecated: it logs a trace and is forwarded to
   * `getPageByURL()`.
   */
  getPage(document) {
    if (typeof document === "string") {
      console.trace(
        "getPage() was called with a string, presumably a URL. " +
          "This is deprecated in favor of getPageByURL()."
      );
      return info.getPageByURL(document);
    }

    const { locale, slug, title, status, tags } = document.metadata;
    const { rawBody, isMarkdown } = document;
    return {
      url: document.url,
      locale,
      slug,
      title,
      short_title: document.metadata["short-title"],
      status: status || [],
      tags: tags || [],
      pageType: document.metadata["page-type"],
      // Let translations be lazy loaded.
      translations() {
        return translationsOf(document.metadata.slug, document.metadata.locale);
      },
      summary() {
        // Back in the old Kuma days we used to store the summary as another piece
        // of metadata on each document. It was always available, with any kumascript
        // macros rendered out.
        // In Yari, this is not possible. We don't duplicate the summary in every
        // document. Instead, we extract it from the document when we build it.
        // So, to avoid the whole chicken-and-egg problem, instead, we're going to
        // try to extract it on-the-fly, from raw HTML or Markdown.
        // A lot of times, you'll actually find that the first paragraph isn't
        // a paragraph element. But often, in those cases it'll have that
        // `seoSummary` span (or a `.summary` element), so that's why we
        // always start by looking for those first.
        let summary = "";
        try {
          const $ = cheerio.load(
            isMarkdown ? m2hSync(rawBody, { locale }) : rawBody
          );

          $("span.seoSummary, .summary").each((i, element) => {
            // Only the first element yielding a non-empty snippet is used.
            if (!summary) {
              // NOTE(review): in the reviewed copy these two replacements
              // substitute a character with itself. They look like they once
              // decoded HTML entities for the double quote and the apostrophe
              // and lost their patterns -- confirm against version control.
              const html = $(element)
                .html()
                .replace(/"/g, '"')
                .replace(/'/g, "'");
              summary = postProcessSummaryHTMLSnippet(html, document);
            }
          });

          if (!summary) {
            // Fallback when no dedicated summary element produced anything.
            // NOTE(review): the original comment and the head of this
            // statement were garbled in the reviewed copy. The "p" selector
            // is reconstructed from the surviving callback body and the
            // surrounding comments about paragraphs. The lost text also
            // mentioned avoiding paragraphs nested inside some wrapper
            // element, so a removal/filter step may be missing here --
            // confirm against version control.
            $("p").each((i, element) => {
              if (!summary) {
                // NOTE(review): same self-replacing patterns as above.
                const html = $(element)
                  .html()
                  .replace(/"/g, '"')
                  .replace(/'/g, "'");
                summary = postProcessSummaryHTMLSnippet(html, document);
              }
            });
          }
        } catch (er) {
          // Best effort: a failure here leaves the summary empty rather than
          // failing the caller.
          console.warn(
            `Cheerio on document.rawBody (${document.url}) failed to parse`,
            er
          );
        }
        return summary;
      },
      get subpages() {
        return Document.findChildren(document.url)
          .map((document) => info.getPage(document))
          .filter((p) => p?.url);
      },
    };
  },

  /**
   * Returns true when a document is found for the cleaned "url", or when the
   * URL maps to a blog post slug for which a post file is found.
   */
  hasPage(url): boolean {
    if (Document.findByURL(info.cleanURL(url))) {
      return true;
    }

    const slug = getSlugByBlogPostUrl(url);
    return Boolean(slug) && Boolean(findPostFileBySlug(slug));
  },
};

/**
 * Return the HTML string as if we had it KumaScript rendered. When we extract
 * a summary from the raw HTML, we sometimes get things like KS macros in it.
 * We can't fully/properly render these because it can easily get us into an
 * infinite recursion problem. So we have to make the most of it we can from
 * the raw HTML.
 *
 * @param text The summary as taken from the raw HTML.
 * @param document The document the summary belongs to; only its `url` is
 *   read, for the warning message.
 * @returns The trimmed snippet with macro tokens dropped or replaced by their
 *   first argument; the unmodified "text" if it cannot be parsed; "" if the
 *   input is blank.
 */
function postProcessSummaryHTMLSnippet(text, document) {
  if (!text.trim()) {
    return "";
  }

  let tokens;
  try {
    tokens = Parser.parse(text);
  } catch (e) {
    // Unfortunate, but not the right time to flag this as a flaw.
    console.warn(
      `(${document.url}) Unable to Parser.parse() text '${text}' due to error:`,
      e
    );
    return text;
  }

  let output = "";
  for (const token of tokens) {
    if (token.type !== "MACRO") {
      // If it isn't a MACRO token, it's a TEXT token.
      output += token.chars;
      continue;
    }

    if (!token.args.length) {
      // Any macro that doesn't have arguments should just be ignored.
      // Examples are:
      //   {{AddonsSidebar }}
      //   {{ SeeCompatTable() }}
      //   {{Non-standard_header}}
      //   {{Non-standard_header()}}
      continue;
    }

    const macroName = token.name.toLowerCase();
    // Some macros do have arguments, but there's no good reason to render them out
    // for the benefit of a summary, in this context.
    if (MACROS_IN_SUMMARY_TO_IGNORE.has(macroName)) {
      continue;
    }

    if (MACROS_IN_SUMMARY_TO_REPLACE_WITH_FIRST_ARGUMENT.has(macroName)) {
      output += token.args[0];
    } else {
      // NOTE(review): in the reviewed copy this branch appends exactly what
      // the branch above does. It presumably wrapped the argument in some
      // markup that was lost -- confirm against version control.
      output += `${token.args[0]}`;
    }
  }

  return output.trim();
}

export default info;