import { join, resolve } from "node:path";
import { cpus } from "node:os";
import { readFile, writeFile } from "node:fs/promises";
import { Type } from "typebox";
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
import { getLatestChromeProfile, DEFAULT_OS } from "../fetch.ts";
import { pullPageEnhanced } from "../content.ts";
import { discover } from "../discovery.ts";
import { storeContent, BASE_TEMP } from "../session-store.ts";
import { compileContextPackage } from "../context-package.ts";
import { RequestQueue, hasQueueFile } from "../request-queue.ts";
import { BrowserPool } from "../browser-pool.ts";
import { SessionRouter, parseRoutes } from "../session-router.ts";
import type { FetchOpts, ScrapeMode, Page } from "../types.ts";
import {
  frontmatter,
  writePage,
  rewriteLinks,
  runPullFromQueue,
} from "./utils.ts";

/**
 * Registers the `aio-webpull` tool on the given extension API.
 *
 * When executed, the tool:
 *  1. normalizes the target URL (a missing scheme defaults to `https://`),
 *  2. resumes an existing request queue in the output directory, or discovers
 *     page URLs and creates a new queue,
 *  3. optionally warms up the session against the root URL,
 *  4. pulls every queued page through `runPullFromQueue`, writing each page as
 *     markdown with frontmatter and recording it via `storeContent`,
 *  5. rewrites links in the written files when more than one page was pulled,
 *  6. optionally compiles the written pages into a context package.
 *
 * Progress is reported through `onUpdate`; the resolved value carries a text
 * summary plus a `details` object with counters, file list and stats.
 *
 * @param pi Extension API the tool is registered on.
 * @throws Error (from `execute`) with `Bad URL: …` when the URL cannot be
 *   parsed, or `No pages found.` when discovery returns no URLs.
 */
export function registerWebpullTool(pi: ExtensionAPI): void {
  pi.registerTool({
    name: "aio-webpull",
    label: "Webpull",
    description:
      "Pull any public website or docs site into local markdown files with anti-bot TLS fingerprinting. Discovers pages via sitemap, navigation links, or crawling. Writes files preserving URL structure with YAML frontmatter.",
    promptSnippet: "Pull an entire website into local markdown files",
    promptGuidelines: [
      "Use aio-websearch when the user wants to find information online. Returns compact search results.",
      "Use aio-webfetch when the user wants to download a specific URL or batch of URLs.",
      "After aio-webpull completes, use the built-in read tool to inspect the generated markdown files.",
    ],
    parameters: Type.Object({
      url: Type.String({
        description: "URL to pull (e.g. https://docs.example.com)",
      }),
      out: Type.Optional(
        Type.String({
          description:
            "Output directory under temp (default: the URL hostname)",
        }),
      ),
      max: Type.Optional(
        Type.Number({
          description: "Max pages to pull (default: 100)",
          default: 100,
        }),
      ),
      mode: Type.Optional(
        Type.String({
          description: `Scrape mode: "auto" (default), "fast", "fingerprint", or "browser". Auto escalates when bot protection is detected.`,
        }),
      ),
      browser: Type.Optional(
        Type.String({
          description: `Browser profile for TLS fingerprinting. Default: "${getLatestChromeProfile()}". Examples: chrome_145, firefox_147, safari_26, edge_145`,
        }),
      ),
      os: Type.Optional(
        Type.String({
          description: `OS profile for fingerprinting. Default: "${DEFAULT_OS}". Options: windows, macos, linux, android, ios`,
        }),
      ),
      proxy: Type.Optional(
        Type.String({
          description:
            "Proxy URL (e.g. http://user:pass@host:port or socks5://host:port)",
        }),
      ),
      compile: Type.Optional(
        Type.Boolean({
          description:
            "Compile pulled pages into a single context package after completion.",
        }),
      ),
      resume: Type.Optional(
        Type.Boolean({
          description:
            "Resume a previous pull from the output directory (default: auto-detect). Set to false to force a fresh pull.",
        }),
      ),
      routes: Type.Optional(
        Type.Array(
          Type.Object({
            pattern: Type.String({
              description:
                "URL pattern: path string, glob (*/docs/*), or regex (/^\\/api\\//)",
            }),
            mode: Type.Optional(
              Type.String({
                description:
                  "Fetcher mode: fast, fingerprint, browser, or auto",
              }),
            ),
            extractor: Type.Optional(
              Type.String({
                description:
                  "Vertical extractor name (e.g. npm, pypi, wikipedia)",
              }),
            ),
            browser: Type.Optional(
              Type.String({
                description: "Browser profile override for this route",
              }),
            ),
            os: Type.Optional(
              Type.String({
                description: "OS profile override for this route",
              }),
            ),
          }),
          {
            description:
              "Route definitions: URL pattern -> fetcher mode/extractor. Evaluated in order, first match wins.",
          },
        ),
      ),
      adaptive: Type.Optional(
        Type.Boolean({
          description:
            "Enable adaptive content selector — remembers element structure to survive site redesigns (default: false)",
        }),
      ),
      bypass: Type.Optional(
        Type.Boolean({
          description:
            "Enable paywall bypass on every page in the pull. If a fetched page looks paywalled, retry using a chain of strategies (Googlebot UA, archive.org Wayback, Playwright with paywall JS blocked) before recording an error.",
        }),
      ),
    }),

    async execute(_toolCallId, params, signal, onUpdate) {
      // Accept bare hosts such as "docs.example.com" by defaulting to https.
      let raw = params.url;
      if (!/^https?:\/\//i.test(raw)) raw = `https://${raw}`;
      let url: URL;
      try {
        url = new URL(raw);
      } catch {
        throw new Error(`Bad URL: ${params.url}`);
      }

      // NOTE(review): `resolve` lets an absolute or `..`-containing `out`
      // escape BASE_TEMP even though the parameter is documented as "under
      // temp" — confirm whether containment should be enforced here.
      const outDir = params.out
        ? resolve(BASE_TEMP, params.out)
        : join(BASE_TEMP, url.hostname);
      const max = params.max ?? 100;
      // At least four workers, otherwise two per reported CPU.
      const concurrency = Math.max(4, cpus().length * 2);
      const browser = (params.browser as string) ?? getLatestChromeProfile();
      const os = (params.os as string) ?? DEFAULT_OS;
      const proxy = (params.proxy as string) ?? undefined;
      const mode = (params.mode as ScrapeMode) ?? "auto";
      const compile = (params.compile as boolean) ?? false;
      // Resume unless the caller explicitly passed `false`.
      const resume = params.resume !== false;
      const routes = (params.routes ?? []) as {
        pattern: string;
        mode?: string;
        extractor?: string;
        browser?: string;
        os?: string;
      }[];
      // Tolerates the flag arriving as the string "true" as well as a boolean.
      const adaptive = params.adaptive === true || params.adaptive === "true";

      // One shared wreq-js session for the whole pull; stays null when the
      // module cannot be loaded or the session cannot be created.
      let wreqSession: any = null;
      try {
        const { createSession } = await import("wreq-js");
        wreqSession = await createSession({
          browser: browser as any,
          os: os as any,
          ...(proxy ? { proxy } : {}),
        });
      } catch {
        /* session creation failed — fall back to isolated fetches */
      }

      const fetchOpts: FetchOpts = {
        browser,
        os,
        proxy,
        mode,
        adaptive,
        wreqSession,
        bypass: params.bypass === true,
      };

      // Declared outside the guarded section so the `finally` below can
      // release them and the post-processing steps can read the results.
      let queue: RequestQueue | null = null;
      let browserPool: BrowserPool | null = null;
      let ok = 0;
      let err = 0;
      const files: string[] = [];
      const errors: string[] = [];
      const pageUrlToPath = new Map<string, string>();
      const pagePathToUrl = new Map<string, string>();
      const pagePathToTitle = new Map<string, string>();

      try {
        const router =
          routes.length > 0 ? new SessionRouter(parseRoutes(routes)) : null;

        if (resume && hasQueueFile(outDir)) {
          queue = await RequestQueue.resume(outDir);
          if (queue) {
            const s = queue.stats();
            onUpdate?.({
              content: [
                {
                  type: "text",
                  text: `🔄 Resuming pull: ${s.completed} done, ${s.queued} queued, ${s.failed} failed`,
                },
              ],
              details: { stage: "resume", stats: s },
            });
          }
        }

        // No resumable queue: discover URLs and start a fresh one.
        if (!queue) {
          onUpdate?.({
            content: [
              {
                type: "text",
                text: `🔍 Discovering pages for ${url.href} (${browser}/${os})...`,
              },
            ],
            details: { stage: "discover", browser, os },
          });
          const urls = await discover(url.href, max, fetchOpts);
          if (!urls.length) throw new Error("No pages found.");
          queue = await RequestQueue.create(outDir);
          await queue.add(urls);
          onUpdate?.({
            content: [
              {
                type: "text",
                text: `📄 Found ${urls.length} pages. Pulling with ${concurrency} workers...`,
              },
            ],
            details: { stage: "pull", total: urls.length, browser, os },
          });
        }
        // Stable non-null binding for use inside the worker closure.
        const activeQueue = queue;

        const needsBrowser = mode === "browser" || mode === "auto";
        const pool = needsBrowser
          ? new BrowserPool({ headless: true, channel: "chrome" })
          : null;
        browserPool = pool;

        // Session warm-up: hit root URL before deep links to establish
        // cookies, TLS state, and anti-bot clearance.
        if (mode !== "fast") {
          try {
            await pullPageEnhanced(url.href, {
              ...fetchOpts,
              ...(pool ? { browserPool: pool } : {}),
            });
            // Dwell: 800-1500ms jittered pause to mimic human behavior
            await new Promise((r) => setTimeout(r, 800 + Math.random() * 700));
          } catch {
            /* warm-up failed, proceed anyway */
          }
        }

        const initialStats = activeQueue.stats();
        const totalUrls =
          initialStats.queued +
          initialStats.inProgress +
          initialStats.completed;

        await runPullFromQueue(
          activeQueue,
          concurrency,
          async (pageUrl: string) => {
            if (signal?.aborted) return;

            const urlOpts: FetchOpts = {
              ...fetchOpts,
              ...(pool ? { browserPool: pool } : {}),
            };
            // Apply per-route overrides when a route matches this URL.
            const match = router?.match(pageUrl);
            if (match) {
              if (match.mode) urlOpts.mode = match.mode as ScrapeMode;
              if (match.browser) urlOpts.browser = match.browser;
              if (match.os) urlOpts.os = match.os;
            }

            const result = await pullPageEnhanced(pageUrl, urlOpts);
            if (!result.ok) {
              const reason = result.error ?? "Unknown error";
              const willRetry = await activeQueue.fail(pageUrl, reason);
              // Only count the failure once the queue gives up on the URL.
              if (!willRetry) {
                err++;
                errors.push(`${pageUrl}: ${reason}`);
              }
              return;
            }
            await activeQueue.complete(pageUrl);

            const page: Page = {
              url: result.url!,
              title: result.title || new URL(result.url!).pathname,
              markdown:
                frontmatter(result.title || "", result.url!, {
                  author: result.author,
                  published: result.published,
                  site: result.site,
                  language: result.language,
                  wordCount: result.wordCount,
                }) + (result.content ?? ""),
            };
            const rel = await writePage(page, outDir);
            files.push(rel);
            pageUrlToPath.set(page.url, rel);
            pagePathToUrl.set(rel, page.url);
            pagePathToTitle.set(rel, page.title || rel);
            ok++;

            storeContent(result.url!, result.title, page.markdown, undefined, {
              author: result.author,
              published: result.published,
              site: result.site,
              language: result.language,
              wordCount: result.wordCount,
            });

            const qStats = activeQueue.stats();
            onUpdate?.({
              content: [
                {
                  type: "text",
                  text: `⏳ ${ok + err}/${totalUrls} pages processed — pulled ${result.title || page.url} → ${rel}`,
                },
              ],
              details: {
                stage: "stream",
                ok,
                err,
                total: totalUrls,
                file: rel,
                title: result.title,
                url: result.url,
                wordCount: result.wordCount,
                queueStats: qStats,
              },
            });
          },
        );
      } finally {
        // Release resources even when discovery, queue setup or the pull
        // throws. Nested so that one failing release does not skip the rest.
        try {
          if (browserPool) {
            await browserPool.drain();
          }
        } finally {
          try {
            if (queue) {
              await queue.close();
            }
          } finally {
            if (wreqSession) {
              try {
                await wreqSession.close();
              } catch {
                /* best-effort */
              }
            }
          }
        }
      }

      // Rewrite links in the written files using the URL -> path map; only
      // attempted when more than one page was written.
      if (pageUrlToPath.size > 1) {
        let rewrites = 0;
        for (const rel of files) {
          const full = join(outDir, rel);
          try {
            const md = await readFile(full, "utf8");
            const rewritten = rewriteLinks(md, pageUrlToPath, rel);
            if (rewritten !== md) {
              await writeFile(full, rewritten, "utf8");
              rewrites++;
            }
          } catch {
            /* best effort */
          }
        }
        if (rewrites > 0) {
          onUpdate?.({
            content: [
              { type: "text", text: `🔗 Rewrote links in ${rewrites} files` },
            ],
            details: { stage: "rewrite", filesRewritten: rewrites },
          });
        }
      }

      // Human-readable summary: at most 30 files and 10 errors are listed.
      const summary = [
        `✅ Pulled ${ok} pages to ${outDir}`,
        err > 0 ? `⚠️ ${err} pages failed` : "",
        ``,
        `Files:`,
        ...files.slice(0, 30).map((f) => ` - ${f}`),
        files.length > 30 ? ` ... and ${files.length - 30} more` : "",
        errors.length > 0
          ? `\nErrors:\n${errors
              .slice(0, 10)
              .map((e) => ` - ${e}`)
              .join("\n")}`
          : "",
      ]
        .filter(Boolean)
        .join("\n");

      // Optional compile step; any failure leaves `packagePath` undefined.
      let packagePath: string | undefined;
      if (compile && ok > 0) {
        try {
          const pages = await Promise.all(
            files.map(async (rel) => {
              const filePath = join(outDir, rel);
              try {
                const content = await readFile(filePath, "utf8");
                return {
                  url: pagePathToUrl.get(rel) ?? rel,
                  title: pagePathToTitle.get(rel) ?? rel,
                  content,
                  relPath: rel,
                };
              } catch {
                return null;
              }
            }),
          );
          const validPages = pages.filter((p) => p !== null);
          if (validPages.length > 0) {
            const pkg = await compileContextPackage(
              validPages,
              join(outDir, "..", "packages"),
              {
                packageName: `${url.hostname}-${Date.now()}`,
              },
            );
            packagePath = pkg.packagePath;
          }
        } catch {
          /* best effort */
        }
      }

      const totalProcessed = ok + err;
      return {
        content: [
          {
            type: "text",
            text:
              summary +
              (packagePath ? `\n📦 Compiled package: ${packagePath}` : ""),
          },
        ],
        details: {
          outDir,
          total: totalProcessed,
          ok,
          err,
          files,
          errors,
          browser,
          os,
          proxy,
          packagePath,
          adaptive,
          queueStats: queue?.stats(),
          browserPoolStats: browserPool?.stats(),
        },
      };
    },
  });
}