/** @file Pi tool adapter for single-URL scraping, snapshot writing, and diffing. */
import { Type, type Static } from "typebox";
import { DEFAULT_BROWSER_BACKEND } from "../defaults.ts";
import {
  diffScrapeResult,
  saveSnapshot,
  type SnapshotDiffResult,
  updateSnapshotReference,
} from "../diff/snapshots.ts";
import { saveBodyToDownloads } from "../http/download-storage.ts";
import { resolveProxyParam } from "../http/proxy-pool.ts";
import { getOrCreateSession } from "../http/session.ts";
import { describeScrapeResult, formatAge } from "../scrape/describe.ts";
import { filterLines } from "../scrape/line-filter.ts";
import { formatLineMatchPreview } from "../scrape/line-preview.ts";
import { resolveScrapeOptions } from "../scrape/options.ts";
import type { ScrapeResult } from "../scrape/pipeline.ts";
import { freshnessFromTimestamp } from "../storage/cache/freshness.ts";
import { storeResponseWithId } from "../storage/responses/store.ts";
import { toolCall } from "../tui/index.ts";
import { renderWebDiffResult } from "../tui/renderers/diff.ts";
import { renderWebScrapeResult } from "../tui/renderers/scrape.ts";
import { qualityFromCache, refreshUrlAction, storedTraceContext } from "./infra/agentic-context.ts";
import { defineWebTool, type WebTool } from "./infra/define.ts";
import { emitProgress } from "./infra/progress.ts";
import { inputErrorResult, toolResult } from "./infra/result.ts";
import {
  outputFormatSchema,
  scrapeModeOptionSchema,
  sessionOptionSchema,
  urlProperty,
} from "./infra/schemas.ts";
import { sessionLifecycle } from "./infra/session-lifecycle.ts";

// NOTE(review): several generic type arguments in this file look like they were
// stripped somewhere upstream (`Type.Unsafe({})`, `Type.Unsafe>({})`, `Static;`,
// `Exclude;`, `Parameters["execute"]>[3]`, `Record;`, `ReturnType,`). They are kept
// exactly as found; restore them from version control before type-checking.

/**
 * TypeBox schema for the parameters accepted by the `web_scrape` tool.
 *
 * Every property declared directly here is wrapped in `Type.Optional`; `url` is only
 * enforced at runtime by `readScrape` / `diffScrape`. The properties contributed by
 * the `scrapeModeOptionSchema` and `sessionOptionSchema` spreads are defined elsewhere.
 */
export const webScrapeSchema = Type.Object({
  url: Type.Optional(urlProperty()),
  ...scrapeModeOptionSchema,
  format: Type.Optional(outputFormatSchema),
  include: Type.Optional(Type.Unsafe({})),
  exclude: Type.Optional(Type.Unsafe({})),
  onlyMainContent: Type.Optional(Type.Unsafe({})),
  timeoutSeconds: Type.Optional(Type.Unsafe({})),
  maxBytes: Type.Optional(Type.Unsafe({})),
  maxChars: Type.Optional(Type.Unsafe({})),
  headers: Type.Optional(Type.Unsafe>({})),
  proxy: Type.Optional(Type.Unsafe({ type: ["string", "array"] })),
  respectRobots: Type.Optional(Type.Unsafe({})),
  refresh: Type.Optional(Type.Unsafe({})),
  followAlternates: Type.Optional(Type.Unsafe({})),
  followMetaRefresh: Type.Optional(Type.Unsafe({})),
  saveToFile: Type.Optional(
    Type.Unsafe({
      description: "true|{dir,name,bytes}",
    }),
  ),
  snapshotName: Type.Optional(Type.Unsafe({})),
  snapshotTag: Type.Optional(Type.Unsafe({})),
  diff: Type.Optional(
    Type.Unsafe<
      | boolean
      | {
          snapshotName?: string;
          snapshotTag?: string;
          compareTag?: string;
          maxSnapshotAgeSeconds?: number;
        }
    >({
      description: "true|{name,tag,compare,age}",
    }),
  ),
  linesMatching: Type.Optional(Type.Unsafe({})),
  contextLines: Type.Optional(Type.Unsafe({})),
  caseSensitive: Type.Optional(Type.Unsafe({})),
  chunks: Type.Optional(Type.Unsafe({})),
  maxTokens: Type.Optional(Type.Unsafe({})),
  overlapTokens: Type.Optional(Type.Unsafe({})),
  ...sessionOptionSchema,
  stealth: Type.Optional(Type.Unsafe({})),
  autoWait: Type.Optional(Type.Unsafe({})),
  browserBackend: Type.Optional(
    Type.Unsafe<"cloak" | "playwright">({ enum: ["cloak", "playwright"] }),
  ),
});

type Params = Static;
type DiffParams = Exclude;

/**
 * Builds the `web_scrape` tool definition.
 *
 * `execute` takes the snapshot-diff path when `params.diff` is truthy (`true` or an
 * options object) and the plain read path otherwise, so an explicit `diff: false`
 * behaves the same as leaving `diff` unset.
 *
 * @returns The tool produced by `defineWebTool`.
 */
export function createWebScrapeTool(): WebTool {
  return defineWebTool({
    name: "web_scrape",
    label: "Scrape",
    description: "Read URL",
    parameters: webScrapeSchema,
    async execute(_toolCallId, params: Params, signal, onUpdate) {
      // Truthiness (not `!== undefined`) so `diff: false` / `null` does not start a diff.
      if (params.diff) return await diffScrape(params, signal, onUpdate);
      return await readScrape(params, signal, onUpdate);
    },
    renderCall: (args, theme, _context) =>
      toolCall("web_scrape", renderScrapeCallParts(args), theme),
    renderResult: (result, { expanded }, theme) => {
      // Diff results are tagged with `kind: "diff"` by diffScrape; everything else is a read.
      const details = result.details as Partial<{ kind: string }>;
      if (details.kind === "diff") return renderWebDiffResult(result, expanded, theme);
      return renderWebScrapeResult(result, expanded, theme);
    },
  });
}

/** Module-level `web_scrape` tool instance, created once when this file is loaded. */
export const webScrapeTool = createWebScrapeTool();

/**
 * Formats the one-part call summary shown for a `web_scrape` invocation.
 *
 * @param params Tool parameters; only `mode` and `format` are read.
 * @returns A single-element array of the form `(<mode> → <format>)`, falling back to
 *   `auto` and `markdown` when the respective parameter is nullish.
 */
function renderScrapeCallParts(params: Params): string[] {
  return [`(${params.mode ?? "auto"} → ${params.format ?? "markdown"})`];
}

/** Extracts a printable message from an arbitrary thrown value. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Read path of `web_scrape`: scrapes `params.url`, optionally filters lines, chunks the
 * content, stores the response, writes a snapshot, and moves a downloaded file into
 * download storage, then returns a tool result.
 *
 * Snapshot and `saveToFile` failures are soft: the scrape result is still returned, and
 * each failure is appended to the result text as a `warning:` line.
 *
 * @param params Tool parameters. `url` is required here; an input-error result is
 *   returned when it is missing.
 * @param signal Abort signal forwarded to `scrapeUrl`.
 * @param onUpdate Optional progress callback forwarded to `emitProgress`.
 * @returns The value produced by `inputErrorResult` or `toolResult`.
 */
async function readScrape(
  params: Params,
  signal: AbortSignal,
  onUpdate?: Parameters["execute"]>[3],
) {
  if (!params.url) {
    return inputErrorResult(
      "SCRAPE_URL_MISSING",
      "scrape",
      "web_scrape task=read requires url.",
      "Provide url for web_scrape task=read.",
    );
  }
  const { loadEffectiveConfig } = await import("../config.ts");
  const config = await loadEffectiveConfig();
  const resolvedProxy = resolveProxyParam(params.proxy);
  const session = params.sessionId ? await getOrCreateSession(params.sessionId) : undefined;
  if (session) {
    // Values supplied on this call become the session's defaults before options are resolved.
    const extra = params as Record;
    if (extra.browserProfile) session.defaultBrowserProfile = extra.browserProfile as string;
    if (resolvedProxy) session.defaultProxy = resolvedProxy;
    if (params.mode) session.defaultMode = params.mode;
    if (extra.headers)
      session.defaultHeaders = {
        ...session.defaultHeaders,
        ...(extra.headers as Record),
      };
  }
  const cleanParams = { ...params, proxy: resolvedProxy };
  const scrapeOptions = resolveScrapeOptions(cleanParams, config, session);
  await emitProgress(onUpdate, {
    state: "loading",
    url: params.url,
    message: `scraping ${scrapeOptions.mode}`,
    checklist: [
      { id: "validated", label: "URL validated", state: "done" },
      { id: "robots", label: "robots checked", state: "pending" },
      { id: "fetch", label: "fetching page", state: "pending" },
      { id: "parse", label: "parsing content", state: "pending" },
      { id: "store", label: "storing result", state: "pending" },
    ],
  });
  const { scrapeUrl } = await import("../scrape/pipeline.ts");
  let result = await scrapeUrl(params.url, scrapeOptions, {}, signal);
  // Derive display mode: show "cloak" or "playwright" for browser mode
  const displayMode =
    result.mode === "browser"
      ? ((scrapeOptions.browserBackend as string | undefined) ?? DEFAULT_BROWSER_BACKEND)
      : result.mode;
  // Line filtering: attach matches for `linesMatching` to the result data.
  const needles = params.linesMatching;
  if (needles && needles.length > 0 && !result.error) {
    const text = result.data.rawText ?? result.data.text ?? "";
    const matches = filterLines(text, needles, params.contextLines, params.caseSensitive);
    result = { ...result, data: { ...result.data, matches } };
  }
  // Chunking: paragraph-bounded, token-budgeted segments for RAG workflows
  if (params.chunks && !result.error) {
    const sourceText = result.data.markdown ?? result.data.text ?? "";
    if (sourceText.length > 0) {
      const { chunkMarkdown } = await import("../parse/chunker.ts");
      const chunks = chunkMarkdown(sourceText, {
        maxTokens: params.maxTokens,
        overlapTokens: params.overlapTokens,
      });
      result = { ...result, data: { ...result.data, chunks } };
    }
  }
  await emitProgress(onUpdate, {
    state: result.error ? "error" : "done",
    url: result.finalUrl ?? params.url,
    message: result.error?.message,
    checklist: [
      { id: "validated", label: "URL validated", state: "done" },
      { id: "robots", label: "robots checked", state: "done" },
      {
        id: "fetch",
        label: result.cache?.cached ? "cache hit" : "fetched page",
        state: result.error ? "failed" : "done",
      },
      {
        id: "parse",
        label: "parsed content",
        state: result.error ? "failed" : "done",
      },
      { id: "store", label: "storing result", state: "pending" },
    ],
  });
  const { storeResponse } = await import("../storage/responses/store.ts");
  const stored = await storeResponse(result);
  // Soft failures collected below are surfaced in the result text instead of being dropped.
  const warnings: string[] = [];
  let snapshotSaved: { name: string; tag?: string; path: string } | undefined;
  if (params.snapshotName && !result.error && result.url) {
    try {
      const snapOptions = { snapshotName: params.snapshotName, snapshotTag: params.snapshotTag };
      const saved = await saveSnapshot(result, snapOptions);
      snapshotSaved = {
        name: params.snapshotName,
        tag: params.snapshotTag,
        path: saved.path,
      };
      await updateSnapshotReference(result.url, stored, snapOptions);
    } catch (error) {
      // Soft toolFailure — snapshot write failed but scrape succeeded; return with warning.
      // `snapshotSaved` may already be set if only the reference update failed.
      warnings.push(
        `snapshot "${params.snapshotName}" was not fully saved: ${errorMessage(error)}`,
      );
    }
  }
  // saveToFile: move from temp to content-addressed storage
  // NOTE(review): `stored` was written before `result.data.file` is rewritten below, so
  // the stored response presumably still carries the temp path — confirm that is intended.
  let savedFilePath: string | undefined;
  if (params.saveToFile && !result.error && result.data.file) {
    let stream: import("node:fs").ReadStream | undefined;
    try {
      const { createReadStream } = await import("node:fs");
      const { unlink } = await import("node:fs/promises");
      const fileInfo = result.data.file as { path: string; contentType?: string };
      const saveOpts = typeof params.saveToFile === "object" ? params.saveToFile : {};
      stream = createReadStream(fileInfo.path);
      const sourceUrl = result.url ?? result.finalUrl ?? "https://unknown";
      const dl = await saveBodyToDownloads(
        stream,
        fileInfo.contentType,
        sourceUrl,
        result.data.file as Record,
        saveOpts,
      );
      savedFilePath = dl.filePath;
      // Best-effort temp cleanup; a failed unlink must not fail the save.
      await unlink(fileInfo.path).catch(() => null);
      result = {
        ...result,
        data: {
          ...result.data,
          file: { ...result.data.file, path: dl.filePath, sha256: dl.sha256 },
        },
      };
    } catch (error) {
      // Soft toolFailure — the scrape itself succeeded; report the failed save as a warning.
      warnings.push(`saveToFile failed: ${errorMessage(error)}`);
    } finally {
      // Release the file descriptor even when the save threw mid-stream.
      stream?.destroy();
    }
  }
  const matchPreview = !result.error
    ? formatLineMatchPreview(result.data.matches, { maxChars: 4_000 })
    : undefined;
  const shaped = shapeScrapeResult(result, stored.responseId, matchPreview);
  const { notice: toolSessionNotice, suffix: sessionSuffix } = await sessionLifecycle(params);
  const description = describeScrapeResult(result, { displayMode });
  // With a match preview, keep only the first description line and show the preview below it.
  const scrapeText = matchPreview
    ? `${description.split("\n", 1)[0]}\n${matchPreview}`
    : description;
  const snapshotSuffix = snapshotSaved
    ? `\nsnapshot saved as "${snapshotSaved.name}"${snapshotSaved.tag ? ` (tag: ${snapshotSaved.tag})` : ""}`
    : "";
  const warningSuffix = warnings.map((warning) => `\nwarning: ${warning}`).join("");
  return toolResult({
    text: result.error
      ? `Scrape failed: ${result.error.message}`
      : `${scrapeText}${sessionSuffix}${snapshotSuffix}${savedFilePath ? `\nsaved to: ${savedFilePath}` : ""}${warningSuffix}`,
    data: result.data,
    url: result.url,
    finalUrl: result.finalUrl,
    status: result.status,
    mode: displayMode,
    format: result.format,
    timing: result.timing,
    truncated: result.truncated,
    contentType: result.contentType,
    headers: result.headers,
    downloadedBytes: result.downloadedBytes,
    cache: result.cache,
    responseId: stored.responseId,
    fullOutputPath: stored.fullOutputPath,
    snapshotSaved,
    savedFilePath,
    error: result.error,
    diagnostics: toolSessionNotice ? { toolSessionNotice } : undefined,
    ...shaped,
  });
}

/**
 * Diff path of `web_scrape`: scrapes `params.url`, compares it against a snapshot via
 * `diffScrapeResult`, stores the diff as a JSON response, updates the snapshot
 * reference, and returns a result tagged `kind: "diff"`.
 *
 * A boolean, `null`, or missing `params.diff` is treated as "no diff options".
 * NOTE(review): the top-level `params.snapshotName` / `params.snapshotTag` are not passed
 * to `diffScrapeResult`; only the fields inside the `diff` object are — confirm intent.
 *
 * @param params Tool parameters. `url` is required; an input-error result is returned
 *   when it is missing.
 * @param signal Abort signal forwarded to `scrapeUrl`.
 * @param onUpdate Optional progress callback forwarded to `emitProgress`.
 * @returns The value produced by `inputErrorResult` or `toolResult`.
 * @throws Re-throws any error from the diff/store steps that is not an object carrying
 *   a `structured` property.
 */
async function diffScrape(
  params: Params,
  signal: AbortSignal,
  onUpdate?: Parameters["execute"]>[3],
) {
  if (!params.url) {
    return inputErrorResult(
      "SCRAPE_URL_MISSING",
      "scrape",
      "web_scrape diff requires url.",
      "Provide url for web_scrape diff.",
    );
  }
  const { loadEffectiveConfig } = await import("../config.ts");
  const config = await loadEffectiveConfig();
  // Only a real object carries options; guarding here keeps the `in` checks below from
  // throwing on `undefined` / `null`.
  const diffOptions =
    typeof params.diff === "object" && params.diff !== null ? (params.diff as DiffParams) : {};
  const snapshotName = "snapshotName" in diffOptions ? diffOptions.snapshotName : undefined;
  const snapshotTag = "snapshotTag" in diffOptions ? diffOptions.snapshotTag : undefined;
  const compareTag = "compareTag" in diffOptions ? diffOptions.compareTag : undefined;
  const diffCleanParams = { ...params, proxy: resolveProxyParam(params.proxy) };
  const scrapeOptions = resolveScrapeOptions(diffCleanParams, config);
  await emitProgress(onUpdate, {
    state: "loading",
    url: params.url,
    message: snapshotName ? `diffing snapshot '${snapshotName}'` : "diffing against snapshot",
  });
  const { scrapeUrl } = await import("../scrape/pipeline.ts");
  const scrape = await scrapeUrl(params.url, scrapeOptions, {}, signal);
  if (scrape.error) {
    return toolResult({
      text: `Diff failed: ${scrape.error.message}`,
      data: {},
      url: params.url,
      kind: "diff",
      error: scrape.error,
    });
  }
  try {
    const diff = await diffScrapeResult(scrape, { snapshotName, snapshotTag, compareTag });
    // The response id is only known once storage allocates it, so it is written into the
    // diff from inside the callback before the diff is serialized.
    const { metadata: stored } = await storeResponseWithId(
      (responseId) => {
        diff.current.metadata.responseId = responseId;
        return diff;
      },
      { contentType: "application/json" },
    );
    diff.current.metadata.fullOutputPath = stored.fullOutputPath;
    await updateSnapshotReference(diff.current.url, stored, { snapshotName, snapshotTag });
    const baselineFreshness = baselineFreshnessFor(
      diff,
      (diffOptions as { maxSnapshotAgeSeconds?: number }).maxSnapshotAgeSeconds,
    );
    const text = renderDiffSummary(diff, stored.responseId);
    const shaped = shapeDiffResult(diff, stored.responseId, baselineFreshness);
    return toolResult({
      text,
      data: diff,
      url: params.url,
      finalUrl: diff.current.finalUrl,
      kind: "diff",
      mode: diff.current.metadata.mode,
      format: "json",
      responseId: stored.responseId,
      fullOutputPath: stored.fullOutputPath,
      contentType: "application/json",
      freshness: baselineFreshness,
      ...shaped,
    });
  } catch (error) {
    // Errors that carry a `structured` payload become a failed diff result; anything
    // else is unexpected and propagates.
    if (typeof error === "object" && error !== null && "structured" in error) {
      const err = error as {
        structured: { code: string; phase: string; message: string; retryable: boolean };
        message: string;
      };
      return toolResult({
        text: `Diff failed: ${err.message}`,
        data: {},
        url: params.url,
        kind: "diff",
        error: err.structured,
      });
    }
    throw error;
  }
}

/**
 * Builds the summary, answer context, trace context, and quality signals that
 * `readScrape` spreads into its tool result.
 *
 * @param result Scrape result (successful or failed).
 * @param responseId Id of the stored response, passed to `storedTraceContext`.
 * @param matchPreview Optional line-match preview; when present it is used as the answer
 *   context and takes priority for the source excerpt.
 * @returns An object with `summary`, `answerContext`, the fields returned by
 *   `storedTraceContext`, and `qualitySignals`.
 */
function shapeScrapeResult(result: ScrapeResult, responseId: string, matchPreview?: string) {
  const url = result.finalUrl ?? result.url ?? "about:blank";
  const source = result.cache?.cached
    ? `from cache fetched ${formatAge(result.cache.ageSeconds)} with staleness ${result.cache.staleness ?? "fresh"}`
    : "from a fresh network fetch";
  const summary = result.error
    ? `Scrape failed for ${url}: ${result.error.message}`
    : `Scraped ${url} ${source}.`;
  return {
    summary,
    answerContext: result.error
      ? `The scrape failed during ${result.error.phase}: ${result.error.message}`
      : (matchPreview ?? ""),
    ...storedTraceContext({
      responseId,
      source: {
        id: "page",
        title: result.data.title,
        uri: url,
        // First 240 characters of the best available text.
        excerpt: (
          matchPreview ??
          result.data.markdown ??
          result.data.text ??
          result.data.title ??
          ""
        ).slice(0, 240),
        relevance: "Primary scraped page content.",
        retrievedAt: result.cache?.fetchedAt ?? new Date().toISOString(),
        sourceType: "docs",
      },
      extraActions: [refreshUrlAction(url)],
    }),
    qualitySignals: qualityFromCache(result.cache),
  };
}

/**
 * Produces a one-sentence, human-readable interpretation of a snapshot diff.
 *
 * @param diff Diff result to describe.
 * @returns A sentence covering one of four cases: no previous snapshot, unchanged after
 *   normalization, all counts zero, or a breakdown of changed/added/removed lines plus
 *   heading and link changes.
 */
export function diffInterpretation(diff: SnapshotDiffResult): string {
  const name = diffLabel(diff);
  if (!diff.previous) return `No previous${name}; saved a baseline for future comparisons.`;
  if (diff.summary?.unchangedAfterNormalization)
    return `No meaningful content changes after normalization for${name}; prior content is effectively equivalent.`;
  const changed = diff.diff?.changedCount ?? 0;
  const added = diff.diff?.addedCount ?? 0;
  const removed = diff.diff?.removedCount ?? 0;
  const headingChanges =
    (diff.summary?.addedHeadings.length ?? 0) + (diff.summary?.removedHeadings.length ?? 0);
  const linkChanges =
    (diff.summary?.addedLinks.length ?? 0) + (diff.summary?.removedLinks.length ?? 0);
  if (changed === 0 && added === 0 && removed === 0 && headingChanges === 0 && linkChanges === 0) {
    return `No content changes detected for${name}; current and previous snapshots match.`;
  }
  return `Content changed for${name}: ${changed} changed, ${added} added, ${removed} removed line(s), ${headingChanges} heading change(s), ${linkChanges} link change(s).`;
}

/**
 * Computes freshness of the baseline (previous) snapshot.
 *
 * @param diff Diff result whose `previous.metadata.timestamp` is inspected.
 * @param maxSnapshotAgeSeconds Caller-supplied maximum age; sanitized with
 *   `toPositiveNumber` before being handed to `freshnessFromTimestamp`.
 * @returns `undefined` when there is no previous snapshot or no maximum age was given;
 *   otherwise whatever `freshnessFromTimestamp` returns.
 */
function baselineFreshnessFor(diff: SnapshotDiffResult, maxSnapshotAgeSeconds: unknown) {
  if (!diff.previous || maxSnapshotAgeSeconds === undefined) return;
  return freshnessFromTimestamp(
    diff.previous.metadata.timestamp,
    toPositiveNumber(maxSnapshotAgeSeconds),
  );
}

/**
 * Coerces a value to a finite number greater than zero.
 *
 * @param value Any value; non-numbers are converted with `Number(value)`.
 * @returns The number when it is finite and positive, otherwise `undefined`.
 */
function toPositiveNumber(value: unknown): number | undefined {
  const number = typeof value === "number" ? value : Number(value);
  return Number.isFinite(number) && number > 0 ? number : undefined;
}

/**
 * Renders the compact text summary returned for a diff, always ending with the
 * `responseId`.
 *
 * @param diff Diff result to summarize.
 * @param responseId Id of the stored diff response.
 * @returns A short sentence for the "no previous snapshot" and "unchanged after
 *   normalization" cases, otherwise the individual counts joined with " · ".
 */
function renderDiffSummary(diff: SnapshotDiffResult, responseId: string): string {
  const name = diffLabel(diff);
  if (!diff.previous) return `No previous${name}; saved baseline. responseId: ${responseId}`;
  if (diff.summary?.unchangedAfterNormalization)
    return `Only volatile content changed after normalization for${name}. responseId: ${responseId}`;
  const textDiff = diff.diff;
  const parts = [
    textDiff
      ? `${textDiff.changedCount} changed, ${textDiff.addedCount} added, ${textDiff.removedCount} removed, ${textDiff.unchanged} unchanged`
      : "No text diff",
    `${diff.summary?.addedHeadings.length ?? 0} added heading(s)`,
    `${diff.summary?.removedHeadings.length ?? 0} removed heading(s)`,
    `${diff.summary?.addedLinks.length ?? 0} added link(s)`,
    `${diff.summary?.removedLinks.length ?? 0} removed link(s)`,
    `${diff.summary?.changedMetadata.length ?? 0} metadata change(s)`,
    `responseId: ${responseId}`,
  ];
  return parts.join(" · ");
}

/** Returns `baselineLabel(diff)` with a leading space, ready to append after a word. */
function diffLabel(diff: SnapshotDiffResult): string {
  return ` ${baselineLabel(diff)}`;
}

/**
 * Describes which snapshot a diff was made against.
 *
 * @param diff Diff result; `snapshotName`, `snapshotTag`, and `compareTag` are read.
 * @returns `snapshot` (or `snapshot '<name>'`), followed by the optional
 *   ` tag '<tag>'` and ` compared to tag '<compareTag>'` parts.
 */
function baselineLabel(diff: SnapshotDiffResult): string {
  const snapshot = diff.snapshotName ? `snapshot '${diff.snapshotName}'` : "snapshot";
  const tag = diff.snapshotTag ? ` tag '${diff.snapshotTag}'` : "";
  const baseline = diff.compareTag ? ` compared to tag '${diff.compareTag}'` : "";
  return `${snapshot}${tag}${baseline}`;
}

/**
 * Builds the summary, answer context, trace context, and quality signals that
 * `diffScrape` spreads into its tool result.
 *
 * @param diff Diff result being reported.
 * @param responseId Id of the stored diff response.
 * @param baselineFreshness Optional freshness info for the baseline snapshot; when it is
 *   marked `stale`, a warning is added and confidence/freshness are downgraded.
 * @returns An object with `summary`, `answerContext`, the fields returned by
 *   `storedTraceContext`, and `qualitySignals`.
 */
function shapeDiffResult(
  diff: SnapshotDiffResult,
  responseId: string,
  baselineFreshness?: ReturnType,
) {
  const interpretation = diffInterpretation(diff);
  const sourceUrl = diff.current.finalUrl ?? diff.current.url;
  const baselineWarning = baselineFreshness?.stale
    ? `Baseline snapshot is ${formatAge(baselineFreshness.ageSeconds)} old; refresh or save a newer snapshot before relying on time-sensitive comparisons.`
    : undefined;
  return {
    summary: interpretation,
    // Falsy entries (an absent baseline warning) are dropped before joining.
    answerContext: [
      interpretation,
      diff.previous
        ? `Compared current content against ${baselineLabel(diff)}.`
        : "No previous snapshot existed; this run established the baseline.",
      baselineWarning,
      `Use responseId ${responseId} to inspect the full diff, hashes, headings, links, metadata changes, and snapshot metadata.`,
    ]
      .filter(Boolean)
      .join("\n"),
    ...storedTraceContext({
      responseId,
      source: {
        id: "current",
        uri: sourceUrl,
        excerpt: diff.current.content.text.slice(0, 240),
        relevance: "Current scraped page used for snapshot comparison.",
        retrievedAt: diff.current.metadata.timestamp,
        sourceType: "docs",
      },
      retrieveDescription: "Inspect the full stored diff result.",
      guidanceSuffix:
        "For changed diffs, inspect added/removed sections before answering from an older snapshot.",
    }),
    qualitySignals: {
      confidence: baselineFreshness?.stale ? ("medium" as const) : ("high" as const),
      freshness: baselineFreshness?.stale ? ("stale_possible" as const) : ("current" as const),
      coverage: "complete" as const,
      knownGaps: [
        !diff.previous
          ? "This was the first snapshot, so no previous content was available for comparison."
          : undefined,
        baselineWarning,
      ].filter(Boolean) as string[],
    },
  };
}