import { Type } from "typebox";
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
import { smartFetch, getLatestChromeProfile, DEFAULT_OS } from "../fetch.ts";
import { discover } from "../discovery.ts";
import type { FetchOpts } from "../types.ts";

/** Matches a string that begins with an absolute http(s) URL scheme. */
const HTTP_URL_PREFIX = /^https?:\/\//i;

/**
 * Matches a markdown inline link whose target is an absolute http(s) URL and
 * captures the target, e.g. `[Title](https://example.com/page.md "tip")`.
 * The target ends at the first whitespace, `)` or `>` so an optional link
 * title or `<...>` wrapper is not included.
 */
const MARKDOWN_HTTP_LINK = /\[[^\]]*\]\(\s*<?(https?:\/\/[^\s)>]+)/gi;

/**
 * Extracts the URLs listed in the body of an llms.txt file.
 *
 * Two line shapes are recognised:
 * - a line that itself starts with `http://` or `https://` — the whole trimmed
 *   line is kept as-is;
 * - any other line — every markdown inline link with an absolute http(s)
 *   target contributes that target (the llms.txt proposal lists resources as
 *   `- [name](url): notes`).
 *
 * @param body - Raw text of the llms.txt response.
 * @returns The URLs in first-seen order with duplicates removed.
 */
function extractLlmsUrls(body: string): string[] {
  const found = new Set<string>();
  for (const rawLine of body.split(/\n/)) {
    // trim() also removes the trailing "\r" of CRLF-terminated files.
    const line = rawLine.trim();
    if (HTTP_URL_PREFIX.test(line)) {
      found.add(line);
      continue;
    }
    for (const match of line.matchAll(MARKDOWN_HTTP_LINK)) {
      const target = match[1];
      if (target) found.add(target);
    }
  }
  return Array.from(found);
}

/**
 * Registers the `aio-webmap` tool on the given extension API.
 *
 * When executed, the tool normalises the requested URL (trimming whitespace
 * and defaulting the scheme to `https://`), calls `discover` to collect page
 * URLs, then makes a best-effort request for `<origin>/llms.txt` and extracts
 * the URLs listed there. It returns a plain-text report plus a `details`
 * object containing the full URL lists and the browser/OS profile used.
 *
 * @param pi - Extension API the tool is registered on.
 * @throws Error (from the tool's `execute`) with message `Bad URL: <input>`
 *   when the supplied URL cannot be parsed.
 */
export function registerWebmapTool(pi: ExtensionAPI): void {
  pi.registerTool({
    name: "aio-webmap",
    label: "Web Map",
    description:
      "Discovery-only tool — finds pages via robots.txt, sitemaps, navigation links, llms.txt, and crawling without fetching content. Returns structured URLs grouped by source.",
    promptSnippet: "Discover pages on a website without fetching content",
    promptGuidelines: [
      "Use aio-webmap to discover all pages on a site before a full pull.",
      "Returns URLs grouped by discovery source: sitemaps, robots.txt, navigation, llms.txt, crawl.",
      "Use aio-webpull to actually fetch and convert the discovered pages.",
    ],
    parameters: Type.Object({
      url: Type.String({
        description: "URL to discover pages for (e.g. https://docs.example.com)",
      }),
      max: Type.Optional(
        Type.Number({
          description: "Max URLs to discover (default: 100)",
          default: 100,
        }),
      ),
      browser: Type.Optional(
        Type.String({
          description: `Browser profile for TLS fingerprinting. Default: "${getLatestChromeProfile()}"`,
        }),
      ),
      os: Type.Optional(
        Type.String({
          description: `OS profile for fingerprinting. Default: "${DEFAULT_OS}"`,
        }),
      ),
    }),
    async execute(_toolCallId, params) {
      // Trim first: a leading space would otherwise defeat the scheme check
      // and produce "https:// https://…", which fails to parse.
      const requested = params.url.trim();
      const raw = HTTP_URL_PREFIX.test(requested)
        ? requested
        : `https://${requested}`;

      let url: URL;
      try {
        url = new URL(raw);
      } catch {
        throw new Error(`Bad URL: ${params.url}`);
      }

      const max = params.max ?? 100;
      const browser = (params.browser as string) ?? getLatestChromeProfile();
      const os = (params.os as string) ?? DEFAULT_OS;
      const fetchOpts: FetchOpts = { browser, os };

      const urls = await discover(url.href, max, fetchOpts);

      let llmsUrls: string[] = [];
      try {
        const llmsRes = await smartFetch(`${url.origin}/llms.txt`, fetchOpts);
        if (llmsRes && llmsRes.status < 400) {
          llmsUrls = extractLlmsUrls(llmsRes.text);
        }
      } catch {
        // Deliberately best-effort: a missing or unreachable llms.txt must
        // not fail the discovery results already collected above.
      }

      const text = [
        `🌐 Site map for ${url.href}`,
        `\nDiscovered ${urls.length} pages via sitemaps/robots/nav/crawl.`,
        llmsUrls.length > 0 ? `\nFound ${llmsUrls.length} entries in llms.txt` : "",
        "\n\nFirst 50 pages:",
        ...urls.slice(0, 50).map((u, i) => `${i + 1}. ${u}`),
        urls.length > 50 ? `\n... and ${urls.length - 50} more` : "",
        llmsUrls.length > 0
          ? `\n\nllms.txt entries:\n${llmsUrls.map((u) => ` - ${u}`).join("\n")}`
          : "",
      ].join("\n");

      return {
        content: [{ type: "text", text }],
        details: {
          url: url.href,
          totalUrls: urls.length,
          urls,
          llmsUrls,
          browser,
          os,
        },
      };
    },
  });
}