/**
 * @file Builds a small hierarchical API-surface tree from already-fetched documentation pages. The
 * module is intentionally parser-light: Task 22 compiles crawl/extract outputs, while richer raw
 * Markdown/RST/source parsing belongs to Task 21 and symbol filtering to Task 23.
 */

import { selectOne } from "css-select";
import type { AnyNode } from "domhandler";
import { textContent } from "domutils";
import { parseDocument } from "htmlparser2";
import { parseMarkdown as sharedMarkdownParse } from "../../parse/markup/doc.ts";
import type { ScrapeResult } from "../../scrape/pipeline.ts";
import { extractHeadingSections, firstTextBySelector } from "../doc-structure.ts";
import { cleanText, stripUndefined, titleCase, truncateText } from "../text.ts";
import type {
  ApiSurfaceParameter,
  ApiSurfaceFunction,
  ApiSurfaceClass,
  ApiSurfaceModule,
  ApiSurfaceTree,
  ApiSurfaceInputPage,
} from "./types.ts";

/** Maximum number of characters kept for any description emitted into the tree. */
const DESCRIPTION_LIMIT = 700;

/** Maximum number of code examples attached to a single function entry. */
const MAX_EXAMPLES = 3;

/** Placeholder URL used when a scrape result carries neither `url` nor `finalUrl`. */
const UNKNOWN_URL = "unknown";

/** Deepest heading level that is still treated as a potential API symbol section. */
const MAX_HEADING_LEVEL = 4;

/** A heading section as consumed by the function/class extractors in this module. */
interface SectionLike {
  heading: string;
  anchor?: string;
  content?: string;
  codeBlocks?: Array<{ language?: string; code: string }>;
}

/** Loosely-typed view of a pre-structured "docsite" payload found in `page.data`. */
interface DocsiteLike {
  title?: string;
  summary?: string;
  version?: string;
  sections?: SectionLike[];
  apiSignature?: {
    name: string;
    signature: string;
    parameters?: ApiSurfaceParameter[];
    returns?: { type?: string; description?: string };
  };
}

/** Result of parsing one page's raw content (HTML or Markdown/text). */
interface ParsedPage {
  title?: string;
  description?: string;
  functions: ApiSurfaceFunction[];
  classes: ApiSurfaceClass[];
}

/**
 * Adapts scrape pipeline results to {@link ApiSurfaceInputPage} and builds the tree.
 *
 * @param pages Scrape results; a page with an `error` is forwarded with only its code and message.
 * @returns The tree produced by {@link buildApiSurface}.
 */
export function buildApiSurfaceFromScrapes(pages: ScrapeResult[]): ApiSurfaceTree {
  return buildApiSurface(
    pages.map((page) => ({
      // Prefer the requested URL, then the resolved one, then a sentinel.
      url: page.url ?? page.finalUrl ?? UNKNOWN_URL,
      finalUrl: page.finalUrl,
      title: page.data.title,
      description: page.data.description,
      html: page.data.html,
      markdown: page.data.markdown,
      text: page.data.text,
      data: page.data.json,
      error: page.error && {
        code: page.error.code,
        message: page.error.message,
      },
    })),
  );
}

/**
 * Builds an API-surface tree from input pages.
 *
 * Pages carrying an `error` are reported under `errors` (tagged with their URL) and contribute no
 * module. When no module contains any function or class, a `flat-markdown` fallback marker is
 * attached so consumers know the modules are page-level documentation only.
 *
 * @param pages Pages to compile, in the order modules should appear.
 * @returns The assembled tree, passed through `stripUndefined`.
 */
export function buildApiSurface(pages: ApiSurfaceInputPage[]): ApiSurfaceTree {
  const modules: ApiSurfaceModule[] = [];
  const errors: ApiSurfaceTree["errors"] = [];

  for (const page of pages) {
    if (page.error) {
      errors.push({ ...page.error, url: page.finalUrl ?? page.url });
      continue;
    }
    modules.push(moduleFromPage(page));
  }

  const anySymbols = modules.some((module) => hasApiSymbols(module));

  const tree: ApiSurfaceTree = stripUndefined({
    project: inferProject(pages),
    version: firstVersion(modules, pages),
    modules,
    errors: errors.length > 0 ? errors : undefined,
    fallback: anySymbols
      ? undefined
      : {
          kind: "flat-markdown",
          reason: "No API signatures were detected; returned page-level documentation modules.",
          pageCount: modules.length,
        },
  });
  return tree;
}

/**
 * Converts one successfully fetched page into a module entry.
 *
 * Structured docsite data (see {@link docsiteData}) takes precedence; otherwise the raw page
 * content is parsed. Page-level metadata wins over parsed values for name and description.
 */
function moduleFromPage(page: ApiSurfaceInputPage): ApiSurfaceModule {
  const url = page.finalUrl ?? page.url;

  const docsite = docsiteData(page.data);
  if (docsite) return moduleFromDocsite(page, docsite, url);

  const parsed = parsePageContent(page, url);
  return stripUndefined({
    name: page.title ?? parsed.title ?? moduleNameFromUrl(url),
    description: page.description ?? parsed.description,
    url,
    functions: parsed.functions,
    classes: parsed.classes.length > 0 ? parsed.classes : undefined,
  });
}

/**
 * Builds a module from a structured docsite payload.
 *
 * When the payload carries an `apiSignature`, that single signature is the module's only function;
 * otherwise functions are derived from the payload's sections. Classes always come from sections.
 */
function moduleFromDocsite(
  page: ApiSurfaceInputPage,
  docsite: DocsiteLike,
  url: string,
): ApiSurfaceModule {
  // `docsite` is an unvalidated cast of external data: a payload accepted via
  // `source.provider === "docsite"` may carry a non-array `sections`, so guard before iterating.
  const sections = Array.isArray(docsite.sections) ? docsite.sections : [];

  const functions = docsite.apiSignature
    ? [signatureFunction(docsite.apiSignature, url)]
    : functionsFromSections(sections, url);
  const classes = classesFromSections(sections, url);

  return stripUndefined({
    name: docsite.title ?? page.title ?? moduleNameFromUrl(url),
    description: docsite.summary ?? page.description,
    url,
    functions,
    classes: classes.length > 0 ? classes : undefined,
  });
}

/**
 * Parses a page's raw content, preferring HTML, then Markdown, then plain text (parsed as
 * Markdown). A page with no content at all is parsed as an empty Markdown document.
 */
function parsePageContent(page: ApiSurfaceInputPage, url: string): ParsedPage {
  if (page.html) return parseHtml(page.html, url);
  return parseMarkdownPage(page.markdown ?? page.text ?? "", url);
}

/**
 * Extracts title, first-paragraph description and symbol sections from an HTML page. Only heading
 * sections up to level {@link MAX_HEADING_LEVEL} are considered.
 */
function parseHtml(html: string, url: string): ParsedPage {
  const document = parseDocument(html, {
    lowerCaseAttributeNames: true,
    lowerCaseTags: true,
  });

  const title = firstTextBySelector(document, ["main h1", "article h1", "h1", "title"]);
  const sections = extractHeadingSections(document as AnyNode).filter(
    (section) => section.level <= MAX_HEADING_LEVEL,
  );

  return {
    title,
    description: firstParagraph(document),
    functions: functionsFromSections(sections, url),
    classes: classesFromSections(sections, url),
  };
}

/**
 * Splits a Markdown document into heading sections and extracts symbols from them.
 *
 * Each section spans from the line after its heading up to (not including) the next heading of
 * level <= {@link MAX_HEADING_LEVEL}; deeper headings are folded into the enclosing section. The
 * first section supplies the page title and description.
 */
function parseMarkdownPage(markdown: string, url: string): ParsedPage {
  const doc = sharedMarkdownParse(markdown);
  const lines = markdown.split(/\r?\n/u);
  const sections: SectionLike[] = [];
  const relevantHeadings = doc.headings.filter((heading) => heading.level <= MAX_HEADING_LEVEL);

  // Cursor into doc.codeBlocks; each block is consumed once, by the first section whose end lies
  // beyond it (so blocks preceding the first heading land in the first section).
  let cbIdx = 0;

  for (const [index, heading] of relevantHeadings.entries()) {
    // NOTE(review): the slice arithmetic below assumes `line`/`lineEnd` are 1-based — confirm
    // against the shared Markdown parser.
    const nextLine = relevantHeadings[index + 1]?.line ?? lines.length + 1;

    const sectionCodeBlocks: Array<{ language?: string; code: string }> = [];
    while (cbIdx < doc.codeBlocks.length) {
      const cb = doc.codeBlocks[cbIdx];
      if (!cb || !(cb.lineEnd < nextLine)) break;
      if (cb.value) sectionCodeBlocks.push({ language: cb.language, code: cb.value });
      cbIdx += 1;
    }

    const contentLines = lines.slice(heading.line, nextLine - 1);
    sections.push({
      heading: heading.text,
      content: cleanText(contentLines.join(" ")),
      codeBlocks: sectionCodeBlocks.length > 0 ? sectionCodeBlocks : undefined,
    });
  }

  const lead = sections[0];
  return {
    title: lead?.heading,
    // Capped like the HTML path and per-symbol descriptions so module descriptions stay bounded.
    description: lead ? truncateText(lead.content, DESCRIPTION_LIMIT) : undefined,
    functions: functionsFromSections(sections, url),
    classes: classesFromSections(sections, url),
  };
}

/**
 * Derives function entries from heading sections.
 *
 * A section yields a function when a name can be found — from the first code block that looks like
 * a call/declaration, or failing that from a `name(` pattern in the heading — and the heading does
 * not look like a class. Duplicate names (case-insensitive) keep their first occurrence.
 */
function functionsFromSections(sections: SectionLike[], url: string): ApiSurfaceFunction[] {
  const functions: ApiSurfaceFunction[] = [];

  for (const section of sections) {
    const signature = signatureFromSection(section);
    const name = signature ? nameFromSignature(signature) : symbolName(section.heading);
    if (!name || looksLikeClass(section.heading)) continue;

    functions.push(
      stripUndefined({
        name,
        signature,
        description: truncateText(section.content, DESCRIPTION_LIMIT),
        examples: examples(section.codeBlocks),
        url: sectionUrl(url, section.anchor),
      }),
    );
  }

  return dedupeByName(functions);
}

/**
 * Derives class entries from sections whose heading mentions `class`.
 *
 * Methods are obtained by re-running function extraction on the same section with its heading
 * blanked: that bypasses the class guard and means a method name can only come from a code-block
 * signature. Duplicate class names (case-insensitive) keep their first occurrence.
 */
function classesFromSections(sections: SectionLike[], url: string): ApiSurfaceClass[] {
  const classes: ApiSurfaceClass[] = [];

  for (const section of sections) {
    if (!looksLikeClass(section.heading)) continue;

    classes.push(
      stripUndefined({
        name: section.heading.replace(/^class\s+/iu, "").trim(),
        description: truncateText(section.content, DESCRIPTION_LIMIT),
        methods: functionsFromSections([{ ...section, heading: "" }], url),
        url: sectionUrl(url, section.anchor),
      }),
    );
  }

  return dedupeByName(classes);
}

/** Maps a docsite `apiSignature` payload onto a function entry located at the page URL. */
function signatureFunction(
  signature: NonNullable<DocsiteLike["apiSignature"]>,
  url: string,
): ApiSurfaceFunction {
  return stripUndefined({
    name: signature.name,
    signature: signature.signature,
    parameters: signature.parameters,
    returns: signature.returns,
    url,
  });
}

/**
 * Returns `data` viewed as a docsite payload when it is an object that either declares
 * `source.provider === "docsite"` or has an array `sections`; otherwise `undefined`.
 *
 * The result is an unchecked cast — individual fields are not validated here.
 */
function docsiteData(data: unknown): DocsiteLike | undefined {
  if (!data || typeof data !== "object") return undefined;

  const value = data as {
    title?: unknown;
    sections?: unknown;
    source?: { provider?: unknown };
  };
  const isDocsite = value.source?.provider === "docsite" || Array.isArray(value.sections);
  return isDocsite ? (data as DocsiteLike) : undefined;
}

/**
 * Returns the full text of the first code block containing something shaped like `identifier(`,
 * or `undefined` when the section has no such block.
 */
function signatureFromSection(section: SectionLike): string | undefined {
  return section.codeBlocks
    ?.map((block) => block.code)
    .find((code) => /[A-Za-z_$][\w$]*(?:\.|#)?[\w$]*\s*\(/u.test(code));
}

/**
 * Extracts the callee name (optionally one level dotted, e.g. `ns.fn`) from the first
 * `name(` occurrence in a signature; a leading `function ` or `new ` keyword is skipped.
 */
function nameFromSignature(signature: string): string | undefined {
  return signature.match(
    /(?:function\s+|new\s+)?([A-Za-z_$][\w$]*(?:\.[A-Za-z_$][\w$]*)?)\s*\(/u,
  )?.[1];
}

/** Extracts a symbol name from a heading of the form `name(` (optionally backtick-quoted). */
function symbolName(heading: string): string | undefined {
  return heading.match(/`?([A-Za-z_$][\w$]*(?:\.[A-Za-z_$][\w$]*)?)\s*\(/u)?.[1];
}

/**
 * Reports whether a heading contains the standalone word `class` (case-insensitive). This also
 * covers headings that start with `class `, so no separate prefix check is needed.
 */
function looksLikeClass(heading: string): boolean {
  return /\bclass\b/iu.test(heading);
}

/** Returns up to {@link MAX_EXAMPLES} non-empty code snippets, or `undefined` when there are none. */
function examples(blocks: SectionLike["codeBlocks"]): string[] | undefined {
  const values = blocks?.map((block) => block.code).filter(Boolean) ?? [];
  return values.length > 0 ? values.slice(0, MAX_EXAMPLES) : undefined;
}

/** Reports whether a module exposes at least one function or class. */
function hasApiSymbols(module: ApiSurfaceModule): boolean {
  return module.functions.length > 0 || (module.classes?.length ?? 0) > 0;
}

/**
 * Finds the first version hint across pages: a docsite payload's `version`, else a version-like
 * path segment of the page URL. Falls back to the first module's URL.
 */
function firstVersion(
  modules: ApiSurfaceModule[],
  pages: ApiSurfaceInputPage[],
): string | undefined {
  for (const page of pages) {
    const version = docsiteData(page.data)?.version ?? versionFromUrl(page.finalUrl ?? page.url);
    if (version) return version;
  }
  return versionFromUrl(modules[0]?.url);
}

/**
 * Infers the project name as the hostname (without a leading `www.`) of the first page that has a
 * known URL. Returns `undefined` when there are no pages or the URL cannot be parsed.
 */
function inferProject(pages: ApiSurfaceInputPage[]): string | undefined {
  const known = pages.find((page) => page.url !== UNKNOWN_URL);
  // Fall back to the found page's own URL before the first page's: the first page may be the
  // "unknown" sentinel, which would fail URL parsing even though a usable URL exists.
  const first = known?.finalUrl ?? known?.url ?? pages[0]?.finalUrl ?? pages[0]?.url;
  if (!first) return undefined;

  try {
    return new URL(first).hostname.replace(/^www\./u, "");
  } catch {
    // Not an absolute URL (e.g. the "unknown" sentinel): no project can be inferred.
    return undefined;
  }
}

/**
 * Returns the first path segment that looks like a version: `v1`, `1.2.3` (up to four numeric
 * parts), or one of `latest` / `next` / `stable` (case-insensitive).
 */
function versionFromUrl(value?: string): string | undefined {
  if (!value) return undefined;

  try {
    return new URL(value).pathname
      .split("/")
      .find((part) => /^(?:v?\d+(?:\.\d+){0,3}|latest|next|stable)$/iu.test(part));
  } catch {
    // Unparseable URL: treat as "no version".
    return undefined;
  }
}

/**
 * Derives a display name from a URL: the title-cased last non-empty path segment, or the hostname
 * when the path is empty. An unparseable value is returned unchanged.
 */
function moduleNameFromUrl(value: string): string {
  try {
    const url = new URL(value);
    return titleCase(lastTruthyPart(url.pathname.split("/")) ?? url.hostname);
  } catch {
    return value;
  }
}

/** Returns the last non-empty string in `parts`, or `undefined` when every entry is empty. */
function lastTruthyPart(parts: string[]): string | undefined {
  for (let i = parts.length - 1; i >= 0; i -= 1) {
    const part = parts[i];
    if (part) return part;
  }
  return undefined;
}

/**
 * Returns the cleaned, truncated text of the first element matching `main p, article p, p`.
 * When nothing matches, the text of an empty node list is passed through the same helpers.
 */
function firstParagraph(document: AnyNode): string | undefined {
  const paragraph = selectOne("main p, article p, p", document);
  return truncateText(cleanText(textContent(paragraph ?? [])), DESCRIPTION_LIMIT);
}

/**
 * Appends a section anchor to the page URL by plain concatenation.
 *
 * NOTE(review): assumes `anchor` already includes its leading `#` — confirm against
 * `extractHeadingSections` and the docsite payload format.
 */
function sectionUrl(url: string, anchor?: string): string {
  return anchor ? `${url}${anchor}` : url;
}

/** Removes entries whose `name` (compared case-insensitively) was already seen, keeping order. */
function dedupeByName<T extends { name: string }>(items: T[]): T[] {
  const seen = new Set<string>();
  return items.filter((item) => {
    const key = item.name.toLowerCase();
    if (seen.has(key)) return false;
    seen.add(key);
    return true;
  });
}