import { randomUUID } from "node:crypto";
import { loadEffectiveConfig } from "../config.ts";
import type {
  VerticalExtractionResult,
  VerticalExtractorPage,
} from "../extract/vertical/capabilities.ts";
import type { ManifestRegistry } from "../extract/vertical/manifest-registry.ts";
import type { ManifestDiagnostic } from "../extract/vertical/manifest-types.ts";
import { matchManifestUrl } from "../extract/vertical/matcher.ts";
import type {
  createBrowserReadClient as createBrowserReadClientFn,
  listExtractorCapabilities as listExtractorCapabilitiesFn,
  runVerticalExtractor as runVerticalExtractorFn,
} from "../extract/vertical/registry.ts";
import type { HttpClient } from "../http/client.ts";
import type { ExtractorCapability } from "../types.ts";
/**
 * @file Web_extract action="vertical" and action="list" handlers — deterministic extractor
 * capabilities and vertical extraction.
 */
import type { ToolUpdate } from "./infra/define.ts";
import { emitProgress } from "./infra/progress.ts";
import { inputErrorResult, toolResult } from "./infra/result.ts";
import type { Params, WebExtractToolOptions } from "./web-extract.ts";

/** Separator placed between the parts of a one-line result summary. */
const PART_SEPARATOR = " \u00B7 ";

/** Reason shown for a blocked source that did not report one itself. */
const DEFAULT_BLOCKED_REASON = "structured endpoint unavailable";

/** Optional marker attached to a result when the extraction ran through a browser session. */
interface VerticalBrowserFallbackMetadata {
  browserFallback?: {
    used: boolean;
    backend: string;
  };
}

type VerticalResultWithMetadata = VerticalExtractionResult & VerticalBrowserFallbackMetadata;

type VerticalError = NonNullable<VerticalExtractionResult["error"]>;

/** The subset of the lazily imported vertical registry module that this file uses. */
type VerticalRegistryModule = {
  createBrowserReadClient: typeof createBrowserReadClientFn;
  listExtractorCapabilities: typeof listExtractorCapabilitiesFn;
  runVerticalExtractor: typeof runVerticalExtractorFn;
  buildManifestRegistry: (includeProject?: boolean) => Promise<ManifestRegistry>;
};

let verticalRegistryPromise: Promise<VerticalRegistryModule> | undefined;

/** Import the vertical registry module on first use and reuse the same promise afterwards. */
function loadVerticalRegistry(): Promise<VerticalRegistryModule> {
  verticalRegistryPromise ??= import("../extract/vertical/registry.ts");
  return verticalRegistryPromise;
}

/**
 * Handle action="list": merge manifest extractors with the registered capability flags and return
 * both a human-readable listing and the merged data.
 *
 * Capability flags win over manifest `requirements`; a flag missing from both defaults to false.
 * Manifest registry errors are appended to the text as a "Diagnostics:" line.
 */
export async function listDeterministicExtractors() {
  const { listExtractorCapabilities, buildManifestRegistry } = await loadVerticalRegistry();
  const capabilities = listExtractorCapabilities();
  const registry = await buildManifestRegistry();
  const { listManifestExtractors } = await import("../extract/vertical/manifest-registry.ts");
  const manifestItems = listManifestExtractors(registry);

  const merged = manifestItems.map((item) => {
    const cap = capabilities.find((c: ExtractorCapability) => c.name === item.name);
    return {
      ...item,
      requiresBrowser: cap?.requiresBrowser ?? item.requirements?.requiresBrowser ?? false,
      requiresLLM: cap?.requiresLLM ?? item.requirements?.requiresLLM ?? false,
      requiresCloud: cap?.requiresCloud ?? item.requirements?.requiresCloud ?? false,
    };
  });

  const diagnostics =
    registry.errors.length > 0
      ? `\nDiagnostics: ${registry.errors.map((e: ManifestDiagnostic) => e.message).join("; ")}`
      : "";

  const listing = merged
    .map((item) => {
      const patterns =
        item.urlPatterns.length > 0
          ? ` [${item.urlPatterns.join(", ")}]`
          : " [content-based, no URL]";
      const desc = item.description ? ` — ${item.description}` : "";
      return `- ${item.name}${desc}\n ${patterns}`;
    })
    .join("\n");

  return toolResult({
    text: `${merged.length} extractor(s):\n${listing}${diagnostics}`,
    data: merged,
    format: "json",
    summary: `Listed ${merged.length} deterministic extractor capabilities.`,
    assistantGuidance:
      "The list above shows each extractor's declared URL patterns. Use action=vertical with extractor=<name> only when the target URL matches the corresponding pattern — the extractor hits that site's structured API rather than scraping HTML. If the URL doesn't match any pattern, use web_scrape instead. Extractors marked [content-based, no URL] expect raw content via the content parameter, not a URL. For all other extraction (regex, selectors, excerpts, schema) use action=pattern or action=adhoc.",
  });
}

/**
 * Handle action="vertical": run one named extractor against one URL.
 *
 * Steps, in order: validate that both `extractor` and `url` are present, reject an extractor/URL
 * pairing that a sibling extractor clearly owns, decide whether to escalate to a browser session,
 * run the extractor, and wrap the outcome as a tool result. A browser session opened for the call
 * is always closed, including when the extractor throws.
 *
 * @param params Tool parameters; `extractor` and `url` are required for this action.
 * @param options Tool options; may supply `openBrowserFetchSession`.
 * @param signal Abort signal forwarded to the browser session and the extractor.
 * @param onUpdate Optional progress sink.
 */
export async function runDeterministicExtractor(
  params: Params,
  options: WebExtractToolOptions,
  signal: AbortSignal,
  onUpdate?: ToolUpdate,
) {
  if (!params.extractor || !params.url) {
    return inputErrorResult(
      "EXTRACT_INPUT_MISSING",
      "vertical_extract",
      "web_extract action=vertical requires both extractor and url.",
      "Provide extractor and url for vertical extraction.",
    );
  }
  const extractor: string = params.extractor;
  const url: string = params.url;

  // Checked before any browser work so a mismatch never pays for a navigation.
  const mismatch = await suggestExtractorForUrl(extractor, url);
  if (mismatch) return mismatch;

  const effectiveParams = await resolveVerticalBrowserParams(params, extractor);
  const config = await loadEffectiveConfig();
  await emitProgress(onUpdate, {
    state: "processing",
    url,
    message: `extractor ${extractor}`,
  });

  const browser = await maybeOpenVerticalBrowser(url, effectiveParams, options, signal, onUpdate);
  try {
    const { runVerticalExtractor } = await loadVerticalRegistry();
    const result = await runVerticalExtractor(
      extractor,
      url,
      {
        prerenderedPage: browser?.prerenderedPage,
        httpClient: browser?.client,
        requestOptions: {
          cacheTtlSeconds: config.scrapeDefaults.cacheTtlSeconds,
          maxAgeSeconds: config.scrapeDefaults.maxAgeSeconds,
          refresh: config.scrapeDefaults.refresh,
          respectRobots: params.respectRobots,
        },
        onProgress: onUpdate
          ? (progress) =>
              emitProgress(onUpdate, {
                state: progress.state as "waiting" | "loading" | "processing" | "done" | "error",
                message: progress.message,
                url: progress.url,
              })
          : undefined,
      },
      signal,
    );

    const resultWithMetadata: VerticalResultWithMetadata = browser
      ? {
          ...result,
          browserFallback: {
            used: true,
            backend: effectiveParams.browserBackend ?? "cloak",
          },
        }
      : result;

    return toolResult({
      text: verticalExtractorText(extractor, resultWithMetadata),
      data: resultWithMetadata,
      url,
      format: "json",
      sources: result.sources,
      summary: verticalExtractorSummary(extractor, resultWithMetadata),
      error: result.error && {
        ...result.error,
        phase: "vertical_extract",
        url,
      },
      assistantGuidance: verticalExtractorGuidance(resultWithMetadata),
    });
  } finally {
    await browser?.close();
  }
}

/**
 * Guard against the agent pairing an extractor with a URL it cannot handle (e.g. extractor:"reddit"
 * with a subreddit listing URL, which belongs to reddit_listing). Runs before browser escalation so
 * a mismatch never pays for a cloaked-browser navigation. Only suggests a sibling that matches the
 * URL on a _literal_ host, so greedy wildcard-host manifests (e.g. gitlab's
 * https://:host/:owner/:repo) can't claim unrelated URLs; anything else falls through to the normal
 * not-found/unsupported errors.
 *
 * @returns An input-error result naming the better extractor, or undefined when the URL is
 *   unparseable, the requested extractor matches, or no literal-host sibling matches.
 */
async function suggestExtractorForUrl(extractor: string, url: string) {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    // An unparseable URL is left for the extractor itself to report.
    return;
  }

  const { buildManifestRegistry } = await loadVerticalRegistry();
  const registry = await buildManifestRegistry();
  const requested = registry.get(extractor);
  if (requested && matchManifestUrl(requested.manifest, parsed)) return;

  const host = parsed.hostname.toLowerCase();
  const alt = registry.entries.find(
    (entry) =>
      entry.manifest.name !== extractor &&
      manifestLiteralHosts(entry.manifest).has(host) &&
      matchManifestUrl(entry.manifest, parsed),
  );
  if (!alt) return;

  const patterns = alt.manifest.urlPatterns.join(", ");
  return inputErrorResult(
    "EXTRACTOR_URL_MISMATCH",
    "vertical_extract",
    `extractor="${extractor}" does not match this URL. Use extractor="${alt.manifest.name}" — it matches: ${patterns}`,
  );
}

/**
 * Concrete (non-wildcard) hosts a manifest's patterns pin to, for high-confidence sibling
 * suggestions. A host segment containing ":" or "*" is treated as a placeholder and skipped.
 */
function manifestLiteralHosts(manifest: { urlPatterns: string[] }): Set<string> {
  const hosts = new Set<string>();
  for (const pattern of manifest.urlPatterns) {
    const host = pattern.replace(/^https?:\/\//u, "").split("/")[0] ?? "";
    if (host && !host.includes(":") && !host.includes("*")) hosts.add(host.toLowerCase());
  }
  return hosts;
}

/** An open browser session prepared for one vertical extraction. */
interface VerticalBrowserSession {
  prerenderedPage: VerticalExtractorPage;
  // NOTE(review): the type arguments of this Pick were missing from the reviewed text; "fetch" is
  // assumed from how the client is used here — confirm against HttpClient.
  client: Pick<HttpClient, "fetch">;
  close(): Promise<void>;
}

/**
 * Manifests declaring requirements.requiresBrowser:true (e.g. Reddit, which 403s plain HTTP)
 * escalate to mode:"browser" + cloak backend — but only when the caller did not pick a mode, so
 * explicit overrides like mode:"fingerprint" still opt out. Vertical-agnostic: any blocked vertical
 * opts in via its manifest, no per-name branching here.
 */
async function resolveVerticalBrowserParams(params: Params, extractor: string): Promise<Params> {
  if (params.mode !== undefined) return params;
  const { listExtractorCapabilities } = await loadVerticalRegistry();
  const requiresBrowser =
    listExtractorCapabilities().find((cap) => cap.name === extractor)?.requiresBrowser ?? false;
  if (!requiresBrowser) return params;
  // ponytail: backend pinned to cloak (Reddit's wall needs it); add a manifest backend field if a vertical needs playwright.
  return { ...params, mode: "browser", browserBackend: params.browserBackend ?? "cloak" };
}

/**
 * For mode:"browser", open one browser session and navigate to the vertical URL so the page carries
 * cookies + the JS-challenge pass, then return a browser-backed fetch client + the rendered page.
 * Vertical API/page fetches run via in-page fetch() and beat fingerprint/JS blocks (e.g. Reddit
 * 403). Propagates if the browser backend is unavailable — mode:"browser" is an explicit request
 * for it.
 *
 * @returns The session wrapper, or undefined when `params.mode` is not "browser".
 */
async function maybeOpenVerticalBrowser(
  url: string,
  params: Params,
  options: WebExtractToolOptions,
  signal: AbortSignal,
  onUpdate?: ToolUpdate,
): Promise<VerticalBrowserSession | undefined> {
  if (params.mode !== "browser") return;
  await emitProgress(onUpdate, {
    state: "loading",
    message: "opening browser session for vertical fetch",
  });
  const openBrowserFetchSession =
    options.openBrowserFetchSession ??
    (await import("../browser/playwright.ts")).openBrowserFetchSession;
  const { createBrowserReadClient } = await loadVerticalRegistry();
  // ponytail: per-call ephemeral session so close() can safely destroy it; wire params.sessionId reuse if auth needed.
  const session = await openBrowserFetchSession(
    { url, sessionId: `vertical-${randomUUID()}`, browserBackend: params.browserBackend },
    signal,
  );
  try {
    const { rendered } = session;
    return {
      prerenderedPage: {
        requestedUrl: url,
        finalUrl: rendered.finalUrl,
        status: rendered.status ?? 200,
        text: rendered.html,
        html: rendered.html,
      },
      client: createBrowserReadClient((req, sig) => session.pageFetch(req, sig)),
      close: () => session.close(),
    };
  } catch (error) {
    // The caller never receives the session in this case, so close it here to avoid a leak.
    // A failure while closing is dropped so the original error is the one that surfaces.
    await Promise.resolve(session.close()).catch(() => undefined);
    throw error;
  }
}

/** Label appended to a success line when the extraction went through a browser session. */
function browserFallbackLabel(
  fallback: VerticalBrowserFallbackMetadata["browserFallback"] | undefined,
): string | undefined {
  return fallback?.used ? `browser fallback · ${fallback.backend}` : undefined;
}

/** One-line notice for a source that only yielded URL metadata. */
function blockedLine(name: string, reason: string | undefined): string {
  return `${name} returned URL metadata only (${reason ?? DEFAULT_BLOCKED_REASON})`;
}

/** One-line failure notice; the error code and message are included only when present. */
function failureLine(name: string, error: VerticalError): string {
  return [`\u2514\u2500 \u2715 ${name} failed`, error.code, error.message]
    .filter(Boolean)
    .join(PART_SEPARATOR);
}

/** One-line success notice: preview meta line plus browser label, skipping empty parts. */
function successLine(name: string, result: VerticalResultWithMetadata): string {
  const [metaLine] = extractorPreview(result.data);
  return [`\u2514\u2500 \u2713 ${name} done`, metaLine, browserFallbackLabel(result.browserFallback)]
    .filter(Boolean)
    .join(PART_SEPARATOR);
}

/** Plain-text summary for the call result line (theme applied by renderResult). */
function verticalExtractorSummary(
  extractor: string | undefined,
  result: VerticalResultWithMetadata,
): string {
  const name = extractor ?? result.extractor;
  const blocked = blockedSource(result.data);
  if (blocked) return blockedLine(name, blocked.reason);
  if (result.error) return failureLine(name, result.error);
  return successLine(name, result);
}

/**
 * Plain-text answer context (theme applied by renderResult). Blocked and failed results list the
 * attempted URLs; a successful result with a transcript includes up to 2000 characters of it.
 */
function verticalExtractorText(
  extractor: string | undefined,
  result: VerticalResultWithMetadata,
): string {
  const name = extractor ?? result.extractor;
  const blocked = blockedSource(result.data);
  if (blocked) {
    return [
      blockedLine(name, blocked.reason),
      attemptedText(blocked.attemptedEndpoints ?? result.sources?.map((source) => source.url)),
    ]
      .filter(Boolean)
      .join("\n");
  }
  if (result.error) {
    return [
      failureLine(name, result.error),
      attemptedText(result.sources?.map((source) => source.url)),
    ]
      .filter(Boolean)
      .join("\n");
  }

  const headline = successLine(name, result);
  const { text } = readTranscript(result.data);
  return text ? `${headline}\n\u2502 ${squashAndTruncate(text, 2000)}` : headline;
}

/**
 * Collapse whitespace runs to single spaces, trim, and cut to at most `maxLength` UTF-16 code
 * units, appending an ellipsis when anything was removed. The cut backs off by one unit when it
 * would otherwise split a surrogate pair.
 */
function squashAndTruncate(raw: string, maxLength: number): string {
  const text = raw.replaceAll(/\s+/gu, " ").trim();
  if (text.length <= maxLength) return text;
  const lastUnit = text.charCodeAt(maxLength - 1);
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  return `${text.slice(0, endsOnHighSurrogate ? maxLength - 1 : maxLength)}\u2026`;
}

/**
 * Read `data.transcript` defensively: `text` is returned only when it is a non-empty string and
 * `segments` only when it is an array.
 */
function readTranscript(data: unknown): {
  text: string | undefined;
  segments: unknown[] | undefined;
} {
  const empty = { text: undefined, segments: undefined };
  if (typeof data !== "object" || data === null) return empty;
  const transcript = (data as { transcript?: unknown }).transcript;
  if (typeof transcript !== "object" || transcript === null) return empty;
  const { text, segments } = transcript as { text?: unknown; segments?: unknown };
  return {
    text: typeof text === "string" && text ? text : undefined,
    segments: Array.isArray(segments) ? segments : undefined,
  };
}

/** Format a view count: grouped integer below one million, otherwise millions with an "M". */
function formatViews(views: number): string {
  if (views < 1_000_000) return views.toLocaleString();
  return `${(views / 1_000_000).toFixed(views >= 100_000_000 ? 0 : 1)}M`;
}

/**
 * Build a compact inline preview from common vertical data fields. Returns [metaLine: string,
 * transcriptSnippet?: string]. When a transcript is present the meta line holds only the parts
 * gathered up to that point and may be empty; otherwise it falls back to "extracted JSON".
 */
function extractorPreview(data: unknown): [string, string | undefined] {
  if (typeof data !== "object" || data === null) return ["extracted JSON", undefined];
  const d = data as Record<string, unknown>;
  const parts: string[] = [];

  // Title (used by youtube, npm, github, reddit, most verticals)
  if (typeof d.title === "string" && d.title) parts.push(d.title);

  // Views (youtube, stackoverflow, etc.)
  if (typeof d.viewCount === "number" && d.viewCount > 0) {
    parts.push(`${d.viewCount.toLocaleString()} views`);
  } else if (typeof d.views === "number" && d.views > 0) {
    parts.push(`${formatViews(d.views)} views`);
  } else if (typeof d.views === "string" && d.views) {
    parts.push(`${d.views} views`);
  }

  // Answers (stackoverflow)
  const answers = d.answers;
  if (Array.isArray(answers) && answers.length > 0) {
    parts.push(`${answers.length} answers`);
  }

  // Transcript preview (youtube)
  const transcript = readTranscript(d);
  if (transcript.segments) {
    parts.push(`${transcript.segments.length} segments`);
  }
  if (transcript.text) {
    return [parts.join(PART_SEPARATOR), squashAndTruncate(transcript.text, 120)];
  }

  // Description preview fallback (any vertical)
  if (typeof d.description === "string" && d.description) {
    parts.push(squashAndTruncate(d.description, 120));
  }

  // Comments count (youtube, reddit)
  const comments = d.comments;
  if (Array.isArray(comments) && comments.length > 0) {
    parts.push(`${comments.length} comments`);
  }

  // Transcript tracks (youtube)
  const tracks = d.transcriptTracks;
  if (Array.isArray(tracks) && tracks.length > 1) {
    parts.push(`${tracks.length} languages`);
  }

  return [parts.length > 0 ? parts.join(PART_SEPARATOR) : "extracted JSON", undefined];
}

/** Guidance for the assistant: the blocked reason when there is one, else the error message. */
function verticalExtractorGuidance(result: VerticalExtractionResult): string | undefined {
  const blocked = blockedSource(result.data);
  if (blocked?.reason) return blocked.reason;
  return result.error?.message;
}

/** Render the de-duplicated, non-empty attempted URLs as a bullet list, or undefined if none. */
function attemptedText(urls: string[] | undefined): string | undefined {
  const uniqueUrls = [...new Set(urls?.filter(Boolean) ?? [])];
  return uniqueUrls.length > 0 ? `attempted:\n - ${uniqueUrls.join("\n - ")}` : undefined;
}

/** Return `data.source` when it is an object flagged `blocked`, otherwise undefined. */
function blockedSource(
  data: unknown,
): { blocked?: boolean; reason?: string; attemptedEndpoints?: string[] } | undefined {
  const source = (data as { source?: unknown } | undefined)?.source;
  if (!source || typeof source !== "object") return;
  const typed = source as {
    blocked?: boolean;
    reason?: string;
    attemptedEndpoints?: string[];
  };
  return typed.blocked ? typed : undefined;
}