/**
 * @file Compile completed scrape outputs into bounded context. Context files are local,
 * deterministic summaries of already-fetched pages. They never fetch or infer beyond the scrape
 * payload; each entry keeps enough path/title/excerpt structure for downstream LLM context
 * without replacing the raw stored crawl or batch result.
 */
import { PI_TRUNCATION_LIMITS } from "../defaults.ts";
import type { ScrapeResult } from "../scrape/pipeline.ts";

/** Kind of scrape job the compiled pages came from. */
export type ContextSource = "crawl" | "batch";

/** One scraped page handed to {@link compileContext}. */
export interface ContextPage {
  /** Requested URL; used only when the result carries neither `finalUrl` nor `url`. */
  url: string;
  result: ScrapeResult;
  /** Preferred value for the node's `contentRef`; falls back to the result's own `responseId`. */
  responseId?: string;
}

/** Package-level bookkeeping emitted alongside the tree. */
export interface ContextMetadata {
  source: ContextSource;
  crawlId?: string;
  batchId?: string;
  /** Caller-supplied timestamp, or the ISO-8601 time of compilation when omitted. */
  createdAt: string;
  /** Number of input pages whose result has no `error`. */
  urlCount: number;
  /** Sum of the content-text lengths (UTF-16 code units) of all non-error pages. */
  totalChars: number;
  /** True when the package had to be shrunk to fit the byte budget. */
  truncated: boolean;
}

/** Summary entry for a single page. */
export interface ContextNode {
  url: string;
  title?: string;
  /** Hostname, decoded path segments and (when different from the last crumb) the title. */
  breadcrumbs?: string[];
  /** Whitespace-normalized content clipped to `SUMMARY_CHARS`. */
  summary?: string;
  /** Pages whose nearest compiled ancestor (by URL path) is this node. */
  children?: Array<{ url: string; title?: string }>;
  contentRef?: string;
  /** Whitespace-normalized content clipped to the current excerpt budget. */
  excerpt?: string;
}

export interface CompiledContext {
  package: ContextMetadata;
  tree: ContextNode[];
}

export interface CompileContextInput {
  source: ContextSource;
  crawlId?: string;
  batchId?: string;
  pages: readonly ContextPage[];
  createdAt?: string;
  /** Byte budget for the serialized package; defaults to `PI_TRUNCATION_LIMITS.maxBytes`. */
  maxBytes?: number;
}

const DEFAULT_EXCERPT_CHARS = 800;
const MIN_EXCERPT_CHARS = 120;
const SUMMARY_CHARS = 220;

/** Excerpt budgets tried, in order, when the full-size package exceeds the byte budget. */
const FALLBACK_EXCERPT_CHARS: readonly number[] = [400, MIN_EXCERPT_CHARS, 0];

/**
 * Build a bounded context package from scraped pages.
 *
 * Pages whose result carries an `error` are dropped. The remaining pages become
 * {@link ContextNode} entries with the default excerpt size, and the package is then shrunk
 * by `boundPackage` if its JSON serialization exceeds the byte budget.
 *
 * @param input Pages plus package identifiers; `createdAt` and `maxBytes` are optional.
 * @returns The compiled package; `package.truncated` reports whether shrinking was needed.
 */
export function compileContext(input: CompileContextInput): CompiledContext {
  const maxBytes = input.maxBytes ?? PI_TRUNCATION_LIMITS.maxBytes;
  const pages = input.pages.filter((page) => !page.result.error);
  const totalChars = pages.reduce((total, page) => total + contentText(page.result).length, 0);
  const tree = attachChildren(pages.map((page) => entryForPage(page, DEFAULT_EXCERPT_CHARS)));
  const base = {
    package: {
      source: input.source,
      crawlId: input.crawlId,
      batchId: input.batchId,
      createdAt: input.createdAt ?? new Date().toISOString(),
      urlCount: pages.length,
      totalChars,
      truncated: false,
    },
    tree,
  } satisfies CompiledContext;
  return boundPackage(base, input, maxBytes);
}

/**
 * Return `base` unchanged when it fits in `maxBytes`; otherwise rebuild the tree with
 * progressively smaller excerpts (finally none) and, if even that is too large, fall back to
 * `truncateEntries`. Every shrunk result has `package.truncated` set to true.
 */
function boundPackage(
  base: CompiledContext,
  input: CompileContextInput,
  maxBytes: number,
): CompiledContext {
  if (byteLength(base) <= maxBytes) return base;
  for (const excerptChars of FALLBACK_EXCERPT_CHARS) {
    // Entries are rebuilt from the input pages so each attempt starts with fresh nodes
    // (attachChildren mutates the nodes it is given).
    const tree = attachChildren(
      input.pages
        .filter((page) => !page.result.error)
        .map((page) => entryForPage(page, excerptChars)),
    );
    const candidate = {
      ...base,
      package: { ...base.package, truncated: true },
      tree,
    };
    if (byteLength(candidate) <= maxBytes) return candidate;
  }
  return truncateEntries(base, maxBytes);
}

/**
 * Last-resort shrinking: reduce every entry to url/title/breadcrumbs/contentRef and keep the
 * longest prefix of entries that fits in `maxBytes`.
 *
 * The first entry is always kept, even if it alone exceeds the budget, so a non-empty tree
 * never becomes empty. `urlCount` and `totalChars` are carried over from `base` and therefore
 * still describe all compiled pages, not just the retained entries.
 */
function truncateEntries(base: CompiledContext, maxBytes: number): CompiledContext {
  const tree: ContextNode[] = [];
  const packageMeta = { ...base.package, truncated: true };
  for (const entry of base.tree) {
    const compact = withoutUndefined({
      url: entry.url,
      title: entry.title,
      breadcrumbs: entry.breadcrumbs,
      contentRef: entry.contentRef,
    });
    const candidate = { package: packageMeta, tree: [...tree, compact] };
    if (byteLength(candidate) > maxBytes && tree.length > 0) break;
    tree.push(compact);
  }
  return { package: packageMeta, tree };
}

/**
 * Build the node for one page.
 *
 * @param page Scraped page; the node URL prefers `finalUrl`, then `result.url`, then `page.url`.
 * @param excerptChars Maximum excerpt length; `0` (or less) omits the excerpt entirely.
 */
function entryForPage(page: ContextPage, excerptChars: number): ContextNode {
  const result = page.result;
  const url = result.finalUrl ?? result.url ?? page.url;
  const text = contentText(result);
  return withoutUndefined({
    url,
    title: result.data.title,
    breadcrumbs: breadcrumbs(url, result.data.title),
    summary: clipSummary(text),
    contentRef: page.responseId ?? responseIdFromResult(result),
    excerpt: excerptChars > 0 ? clip(text, excerptChars) : undefined,
  });
}

/**
 * Link each entry to its nearest ancestor (see `parentUrl`) by appending a `{ url, title }`
 * stub to the ancestor's `children`. Mutates the given nodes and returns the same array.
 * When several entries share a URL, the last one is the one that receives children.
 */
function attachChildren(entries: ContextNode[]): ContextNode[] {
  const byUrl = new Map(entries.map((entry) => [entry.url, entry]));
  for (const entry of entries) {
    const parent = parentUrl(entry.url, byUrl);
    if (!parent) continue;
    const children = parent.children ?? [];
    children.push(withoutUndefined({ url: entry.url, title: entry.title }));
    parent.children = children;
  }
  return entries;
}

/**
 * Find the closest compiled ancestor of `url` by walking up its path one segment at a time.
 *
 * At each level the directory form (`…/docs/`) is looked up first and the slash-less form
 * (`…/docs`) second, because crawled URLs commonly appear in either shape. Query string and
 * fragment are ignored for the ancestor lookup. Lookups are exact string matches against the
 * stored entry URLs.
 *
 * @returns The matching entry, or `undefined` when none exists or `url` cannot be parsed.
 */
function parentUrl(url: string, entries: Map<string, ContextNode>): ContextNode | undefined {
  try {
    const current = new URL(url);
    const parts = current.pathname.split("/").filter(Boolean);
    while (parts.length > 0) {
      parts.pop();
      const parent = new URL(current);
      parent.pathname = parts.length > 0 ? `/${parts.join("/")}/` : "/";
      parent.search = "";
      parent.hash = "";
      // With search and hash cleared the serialization ends in the path's trailing slash.
      const withSlash = parent.toString();
      const candidates = withSlash.endsWith("/")
        ? [withSlash, withSlash.slice(0, -1)]
        : [withSlash];
      for (const candidate of candidates) {
        const match = entries.get(candidate);
        if (match && match.url !== url) return match;
      }
    }
  } catch {
    /* ignore: an unparsable URL simply has no parent */
  }
  return undefined;
}

/** Percent-decode one path segment, returning it unchanged when the escapes are malformed. */
function decodePathPart(part: string): string {
  try {
    return decodeURIComponent(part);
  } catch {
    return part;
  }
}

/**
 * Derive breadcrumbs from a URL: hostname, then each path segment decoded with runs of `-`/`_`
 * turned into spaces, then the title unless it equals the last crumb.
 *
 * @returns The crumbs; `[title]` when the URL cannot be parsed and a title exists; otherwise
 *   `undefined`.
 */
function breadcrumbs(url: string, title?: string): string[] | undefined {
  try {
    const parsed = new URL(url);
    const parts = parsed.pathname
      .split("/")
      .filter(Boolean)
      // Decode per segment so one malformed escape does not discard every crumb.
      .map((part) => decodePathPart(part).replaceAll(/[-_]+/gu, " "));
    const crumbs = [parsed.hostname, ...parts];
    if (title && crumbs.at(-1) !== title) crumbs.push(title);
    return crumbs;
  } catch {
    return title ? [title] : undefined;
  }
}

/**
 * Pick the textual payload of a result: markdown, then text, then html, then the JSON
 * serialization of `data.json`; an empty string when none is present.
 */
function contentText(result: ScrapeResult): string {
  const data = result.data;
  if (typeof data.markdown === "string") return data.markdown;
  if (typeof data.text === "string") return data.text;
  if (typeof data.html === "string") return data.html;
  // oxlint-disable-next-line typescript/no-unnecessary-condition -- capture group/optional field may be undefined at runtime
  if (data.json !== undefined) return JSON.stringify(data.json) ?? "";
  return "";
}

/** Summary for a page: normalized text clipped to `SUMMARY_CHARS`, or `undefined` when blank. */
function clipSummary(text: string): string | undefined {
  const normalized = normalizeText(text);
  return normalized ? clip(normalized, SUMMARY_CHARS) : undefined;
}

/**
 * Normalize whitespace and shorten to at most `maxChars` UTF-16 code units, ending with `…`
 * when anything was cut. The cut never lands between the two halves of a surrogate pair.
 */
function clip(text: string, maxChars: number): string {
  const normalized = normalizeText(text);
  if (normalized.length <= maxChars) return normalized;
  let end = Math.max(0, maxChars - 1);
  if (end > 0) {
    const last = normalized.charCodeAt(end - 1);
    // A trailing high surrogate means its low half was cut off; drop it too.
    if (last >= 0xd800 && last <= 0xdbff) end -= 1;
  }
  return `${normalized.slice(0, end).trimEnd()}…`;
}

/** Collapse every whitespace run to a single space and trim both ends. */
function normalizeText(text: string): string {
  return text.replaceAll(/\s+/gu, " ").trim();
}

/**
 * Read an optional `responseId` off the result. Only string values are returned, so a
 * non-string value cannot end up in `contentRef`.
 */
function responseIdFromResult(result: ScrapeResult): string | undefined {
  const candidate = (result as { responseId?: unknown }).responseId;
  return typeof candidate === "string" ? candidate : undefined;
}

/** Size in bytes of the value's JSON serialization (UTF-8, `Buffer.byteLength` default). */
function byteLength(value: unknown): number {
  return Buffer.byteLength(JSON.stringify(value));
}

/** Shallow copy of `value` without the own enumerable properties whose value is `undefined`. */
function withoutUndefined<T extends Record<string, unknown>>(value: T): T {
  return Object.fromEntries(Object.entries(value).filter(([, entry]) => entry !== undefined)) as T;
}