/**
 * @file Post-parse symbol and section selection for deterministic extraction. The selector works
 * only on already prepared page text/markdown/html so fetch, robots, cache, SSRF, and mode policy
 * remain in the shared scrape/http boundary. It is a lightweight structural pass, not semantic
 * code analysis.
 */
import type { PatternSourceFormat } from "../pattern/index.ts";
import type {
  ExtractSchemaPreset,
  SymbolIncludeFilter,
  SymbolSelectionOptions,
  SymbolSelectionResult,
  SelectedSection,
  SelectedCodeBlock,
  SelectedTable,
  SelectedSymbol,
  SymbolIncludeType,
} from "./types.ts";

export type {
  SymbolIncludeType,
  ExtractSchemaPreset,
  SymbolIncludeFilter,
  SymbolSelectionOptions,
  SelectedSection,
  SelectedCodeBlock,
  SelectedTable,
  SelectedSymbol,
  SymbolSelectionResult,
} from "./types.ts";

/** Every selectable item found in one document, grouped by include type. */
interface ParsedContent {
  headings: SelectedSection[];
  sections: SelectedSection[];
  codeBlocks: SelectedCodeBlock[];
  tables: SelectedTable[];
  symbols: SelectedSymbol[];
}

/**
 * Selects headings, sections, code blocks, tables, and symbols from prepared page content.
 *
 * @param content - Page content as text, markdown, or html.
 * @param options - Preset and/or explicit include filters; `sourceFormat` defaults to `"text"`.
 * @returns The de-duplicated matches plus the filters that matched nothing, or `undefined` when
 *   neither the preset nor `options.include` contributes a filter.
 */
export function selectSymbolContent(
  content: string,
  options: SymbolSelectionOptions,
): SymbolSelectionResult | undefined {
  const include = normalizedInclude(options);
  if (include.length === 0) return;

  const parsed = parseSelectableContent(content, options.sourceFormat ?? "text");

  // Heading and section matches share the `sections` output list; the dedupe key includes the
  // item type, so a heading and the section it opens can both be returned.
  const sections = uniqueSections([
    ...matchesForType(parsed, include, "heading"),
    ...matchesForType(parsed, include, "section"),
  ]);
  const codeBlocks = uniqueBlocks(matchesForType(parsed, include, "code-block"));
  const tables = uniqueTables(matchesForType(parsed, include, "table"));
  const symbols = uniqueSymbols(matchesForType(parsed, include, "symbol"));

  return {
    extractSchema: options.extractSchema,
    include,
    sections,
    codeBlocks,
    tables,
    symbols,
    unmatched: include.filter((filter) => matchCountForFilter(parsed, filter) === 0),
  };
}

/** Preset filters first, followed by the caller's explicit filters. */
function normalizedInclude(options: SymbolSelectionOptions): SymbolIncludeFilter[] {
  return [...presetInclude(options.extractSchema), ...(options.include ?? [])];
}

/** Expands a schema preset into its include filters; unknown or missing presets yield none. */
function presetInclude(preset: ExtractSchemaPreset | undefined): SymbolIncludeFilter[] {
  if (preset === "api-reference")
    return [
      { type: "section", level: 2 },
      { type: "section", level: 3 },
      { type: "code-block" },
      { type: "table" },
    ];
  if (preset === "changelog")
    return [
      {
        type: "section",
        pattern: "(^|\\b)(v?\\d+\\.\\d+|changelog|release)",
      },
    ];
  if (preset === "faq") return [{ type: "section", pattern: "(faq|question|\\?)$" }];
  if (preset === "compatibility-table")
    return [
      {
        type: "table",
        pattern: "(browser|version|node|support|compat)",
      },
    ];
  return [];
}

/** Runs every structural extractor once over `content`. */
function parseSelectableContent(content: string, sourceFormat: PatternSourceFormat): ParsedContent {
  const headings = selectHeadings(content, sourceFormat);
  const sections = sectionsFromHeadings(content, headings);
  const codeBlocks = selectCodeBlocks(content, sourceFormat);
  const tables = extractTables(content, sourceFormat);

  // When there are no code blocks but the content itself looks like source code, scan the whole
  // content as if it were a single block.
  const codeBearingContent =
    codeBlocks.length === 0 && looksCodeBearing(content)
      ? symbolsFromCodeBlock({
          type: "code-block",
          code: content,
          start: 0,
          end: content.length,
        })
      : [];

  const symbols = uniqueSymbols([
    ...codeBlocks.flatMap((block) => symbolsFromCodeBlock(block)),
    ...codeBearingContent,
  ]);
  return { headings, sections, codeBlocks, tables, symbols };
}

/**
 * Finds markdown ATX headings and, for html sources, `<h1>`..`<h6>` elements, ordered by offset.
 *
 * NOTE(review): markdown `#` lines inside fenced code blocks are still reported as headings.
 */
function selectHeadings(content: string, sourceFormat: PatternSourceFormat): SelectedSection[] {
  const headings: SelectedSection[] = [];

  // `[ \t]+` (not `\s+`) keeps the separator on the heading line, so a bare "#" line followed by
  // text on a later line is not mistaken for a heading.
  for (const match of content.matchAll(/^(#{1,6})[ \t]+([^\n#].*)$/gmu)) {
    const start = match.index ?? 0;
    const title = stripMarkdown(match[2] ?? "").trim();
    headings.push({
      type: "heading",
      title,
      level: match[1]?.length ?? 1,
      start,
      end: start + match[0].length,
      text: title,
    });
  }

  if (sourceFormat === "html") {
    // Group 1 is the heading level, back-referenced by the closing tag; group 2 is the inner html.
    for (const match of content.matchAll(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/giu)) {
      const start = match.index ?? 0;
      const title = stripHtml(match[2] ?? "").trim();
      headings.push({
        type: "heading",
        title,
        level: Number(match[1]),
        start,
        end: start + match[0].length,
        text: title,
      });
    }
  }

  return headings.toSorted((a, b) => a.start - b.start);
}

/**
 * Turns each heading into a section that runs until the next heading of the same or a shallower
 * level, or to the end of the content.
 */
function sectionsFromHeadings(content: string, headings: SelectedSection[]): SelectedSection[] {
  return headings.map((heading, index) => {
    const next = headings.slice(index + 1).find((candidate) => candidate.level <= heading.level);
    const end = next?.start ?? content.length;
    return {
      type: "section",
      title: heading.title,
      level: heading.level,
      start: heading.start,
      end,
      text: content.slice(heading.start, end).trim(),
    };
  });
}

/** Finds fenced markdown code blocks and, for html sources, `<pre>` elements, ordered by offset. */
function selectCodeBlocks(content: string, sourceFormat: PatternSourceFormat): SelectedCodeBlock[] {
  const blocks: SelectedCodeBlock[] = [];

  for (const match of content.matchAll(/```([^\n`]*)\n([\s\S]*?)```/gu)) {
    const start = match.index ?? 0;
    blocks.push({
      type: "code-block",
      language: (match[1] ?? "").trim() || undefined,
      start,
      end: start + match[0].length,
      code: match[2] ?? "",
    });
  }

  if (sourceFormat === "html") {
    for (const match of content.matchAll(/<pre\b[^>]*>([\s\S]*?)<\/pre>/giu)) {
      const start = match.index ?? 0;
      blocks.push({
        type: "code-block",
        // The language is read from a `language-xxx` token anywhere in the matched markup.
        language: htmlCodeLanguage(match[0]),
        start,
        end: start + match[0].length,
        code: stripHtml(match[1] ?? ""),
      });
    }
  }

  return blocks.toSorted((a, b) => a.start - b.start);
}

/**
 * Finds markdown pipe tables (two or more consecutive `|...|` lines) and, for html sources,
 * `<table>` elements, ordered by offset.
 */
function extractTables(content: string, sourceFormat: PatternSourceFormat): SelectedTable[] {
  const tables: SelectedTable[] = [];

  for (const match of content.matchAll(/(?:^|\n)((?:\|[^\n]*\|\n?){2,})/gu)) {
    // The match may include the newline that precedes the table; skip it in the reported offset.
    const start = (match.index ?? 0) + (match[0].startsWith("\n") ? 1 : 0);
    const body = match[1] ?? "";
    tables.push({
      type: "table",
      start,
      end: start + body.length,
      text: body.trim(),
    });
  }

  if (sourceFormat === "html") {
    for (const match of content.matchAll(/<table\b[^>]*>[\s\S]*?<\/table>/giu)) {
      const start = match.index ?? 0;
      tables.push({
        type: "table",
        start,
        end: start + match[0].length,
        text: stripHtml(match[0]).trim(),
      });
    }
  }

  return tables.toSorted((a, b) => a.start - b.start);
}

/**
 * Extracts top-of-line declarations (function, class, interface, type, const/let/var) from a code
 * block, together with an optional leading doc comment.
 *
 * NOTE(review): offsets are computed as `block.start` plus the offset inside `block.code`, so for
 * fenced or html blocks they are relative to the start of the fence/markup, not the code itself.
 */
function symbolsFromCodeBlock(block: SelectedCodeBlock): SelectedSymbol[] {
  const symbols: SelectedSymbol[] = [];
  // Groups: 1 = doc comment, 2 = declaration keyword, 3 = identifier, 4 = rest of the line up to
  // `{`, `;`, or the newline.
  const declaration =
    /^(?:(\/\*\*[\s\S]*?\*\/|(?:\s*\/\/\/.*\n)+)\s*)?\s*(?:export\s+)?(?:async\s+)?(function|class|interface|type|const|let|var)\s+([A-Za-z_$][\w$]*)\s*([^\n{;]*)/gmu;

  for (const match of block.code.matchAll(declaration)) {
    if (!isCodeDeclaration(match[2], match[4])) continue;

    const start = block.start + (match.index ?? 0);
    const rawKind = match[2] || "variable";
    const kind =
      rawKind === "const" || rawKind === "let" || rawKind === "var" ? "variable" : rawKind;
    const name = match[3] ?? "";
    const tail = (match[4] ?? "").trim();

    symbols.push({
      type: "symbol",
      name,
      kind: kind as SelectedSymbol["kind"],
      signature: `${rawKind} ${name}${tail}`.trim(),
      description: docDescription(match[1]),
      language: block.language,
      start,
      end: start + match[0].length,
    });
  }

  return symbols;
}

/** True when the content contains at least one line that passes `isCodeDeclaration`. */
function looksCodeBearing(content: string): boolean {
  const declaration =
    /^\s*(?:export\s+)?(?:async\s+)?(function|class|interface|type|const|let|var)\s+[A-Za-z_$][\w$]*\s*([^\n{;]*)/gmu;
  return [...content.matchAll(declaration)].some((match) => isCodeDeclaration(match[1], match[2]));
}

/**
 * Filters out prose that merely starts with a keyword (e.g. "type of browser") by checking that
 * the text after the identifier has the shape expected for that declaration kind.
 */
function isCodeDeclaration(kind: string | undefined, tail: string | undefined): boolean {
  const value = (tail ?? "").trim();
  if (kind === "function") return value.startsWith("(");
  if (kind === "const" || kind === "let" || kind === "var") return /^[:=]/u.test(value);
  if (kind === "type") return value.startsWith("=");
  if (kind === "class" || kind === "interface")
    return value === "" || /^(extends|implements)\b/u.test(value);
  return false;
}

/** Maps an include type to the item type stored in the corresponding `ParsedContent` list. */
type ExtractedFor<T extends SymbolIncludeType> = T extends "code-block"
  ? SelectedCodeBlock
  : T extends "table"
    ? SelectedTable
    : T extends "symbol"
      ? SelectedSymbol
      : SelectedSection;

/** Collects, in filter order, every parsed item of `type` accepted by a filter of that type. */
function matchesForType<T extends SymbolIncludeType>(
  parsed: ParsedContent,
  include: SymbolIncludeFilter[],
  type: T,
): ExtractedFor<T>[] {
  return include
    .filter((filter) => filter.type === type)
    .flatMap((filter) =>
      collectionForType(parsed, type).filter((item) => matchesFilter(item, filter)),
    );
}

/** Returns the parsed list for an include type; any other value falls through to symbols. */
function collectionForType<T extends SymbolIncludeType>(
  parsed: ParsedContent,
  type: T,
): ExtractedFor<T>[] {
  if (type === "heading") return parsed.headings as ExtractedFor<T>[];
  if (type === "section") return parsed.sections as ExtractedFor<T>[];
  if (type === "code-block") return parsed.codeBlocks as ExtractedFor<T>[];
  if (type === "table") return parsed.tables as ExtractedFor<T>[];
  return parsed.symbols as ExtractedFor<T>[];
}

/** Number of parsed items a single filter accepts; used to report unmatched filters. */
function matchCountForFilter(parsed: ParsedContent, filter: SymbolIncludeFilter): number {
  return collectionForType(parsed, filter.type).filter((item) => matchesFilter(item, filter))
    .length;
}

/**
 * Applies the level, language, name, and pattern constraints of one filter to one item.
 * Constraints that the filter does not set, or that the item has no field for, are skipped.
 */
function matchesFilter(
  item: SelectedSection | SelectedCodeBlock | SelectedTable | SelectedSymbol,
  filter: SymbolIncludeFilter,
): boolean {
  if ("level" in item && filter.level !== undefined && item.level !== filter.level) return false;
  if ("language" in item && filter.language && item.language !== filter.language) return false;

  const haystack = searchableText(item);
  if (filter.name && !haystack.toLowerCase().includes(filter.name.toLowerCase())) return false;

  if (filter.pattern) {
    const regex = safePattern(filter.pattern);
    // The haystack is "title\nbody" and patterns are compiled without the `m` flag, so an
    // end-anchored pattern such as the faq preset's `(faq|question|\?)$` could never match a
    // title that is followed by body text. Testing the title on its own fixes that while still
    // accepting everything the combined haystack accepts.
    const titleMatches = "title" in item && regex.test(item.title);
    if (!titleMatches && !regex.test(haystack)) return false;
  }
  return true;
}

/** Compiled user patterns keyed by source text; insertion order doubles as recency order. */
const safePatternCache = new Map<string, RegExp>();
const SAFE_PATTERN_CACHE_LIMIT = 50;

/**
 * Compiles a user-supplied pattern (case-insensitive, unicode) with a small LRU cache.
 * Invalid patterns compile to a regex that never matches instead of throwing.
 */
function safePattern(pattern: string): RegExp {
  const cached = safePatternCache.get(pattern);
  if (cached) {
    // Bump to most-recent (true LRU)
    safePatternCache.delete(pattern);
    safePatternCache.set(pattern, cached);
    return cached;
  }

  let regex: RegExp;
  try {
    // oxlint-disable-next-line security/detect-non-literal-regexp -- pattern is user-supplied but cached to avoid re-compilation across many items
    regex = new RegExp(pattern, "iu");
  } catch {
    // `$a` requires a character after end-of-input, so it can never match.
    regex = /$a/u;
  }

  if (safePatternCache.size >= SAFE_PATTERN_CACHE_LIMIT) {
    // Map iteration follows insertion order, so the first key is the least recently used.
    const oldestKey = safePatternCache.keys().next().value;
    if (oldestKey !== undefined) safePatternCache.delete(oldestKey);
  }
  safePatternCache.set(pattern, regex);
  return regex;
}

/** Text that name and pattern filters are matched against, chosen by the item's shape. */
function searchableText(
  item: SelectedSection | SelectedCodeBlock | SelectedTable | SelectedSymbol,
): string {
  if ("title" in item) return `${item.title}\n${item.text}`;
  if ("code" in item) return `${item.language ?? ""}\n${item.code}`;
  if ("name" in item) return `${item.name}\n${item.signature ?? ""}\n${item.description ?? ""}`;
  return item.text;
}

/** De-duplicates headings/sections by type and span. */
function uniqueSections(items: SelectedSection[]): SelectedSection[] {
  return uniqueBy(items, (item) => `${item.type}:${item.start}:${item.end}`);
}

/** De-duplicates code blocks by span and language. */
function uniqueBlocks(items: SelectedCodeBlock[]): SelectedCodeBlock[] {
  return uniqueBy(items, (item) => `${item.start}:${item.end}:${item.language ?? ""}`);
}

/** De-duplicates tables by span. */
function uniqueTables(items: SelectedTable[]): SelectedTable[] {
  return uniqueBy(items, (item) => `${item.start}:${item.end}`);
}

/** De-duplicates symbols by name and span. */
function uniqueSymbols(items: SelectedSymbol[]): SelectedSymbol[] {
  return uniqueBy(items, (item) => `${item.name}:${item.start}:${item.end}`);
}

/** Keeps the first item for each key, preserving input order. */
function uniqueBy<T>(items: T[], key: (item: T) => string): T[] {
  const seen = new Set<string>();
  return items.filter((item) => {
    const value = key(item);
    if (seen.has(value)) return false;
    seen.add(value);
    return true;
  });
}

/** Removes inline markdown punctuation (backticks, emphasis, brackets, parentheses). */
function stripMarkdown(value: string): string {
  return value.replaceAll(/[`*_~[\]()]/gu, "").trim();
}

/** Replaces tags with spaces and collapses whitespace; entities are left untouched. */
function stripHtml(value: string): string {
  return value
    .replaceAll(/<[^>]+>/gu, " ")
    .replaceAll(/\s+/gu, " ")
    .trim();
}

/** Extracts `xxx` from the first `language-xxx` token in the markup, if any. */
function htmlCodeLanguage(value: string): string | undefined {
  return value.match(/language-([A-Za-z0-9_-]+)/u)?.[1];
}

/** Strips comment delimiters and per-line comment prefixes from a doc comment. */
function docDescription(value: string | undefined): string | undefined {
  if (!value) return;
  return value
    .replaceAll(/^\s*\/\*\*|\*\/\s*$/gu, "")
    .replaceAll(/^\s*\* ?/gmu, "")
    .replaceAll(/^\s*\/\/\/ ?/gmu, "")
    .trim();
}