/** @file Map discover module. */
import type { HttpClient } from "../http/client.ts";
import { createHttpClient } from "../http/client.ts";
import { normalizeUrl } from "../url/normalize.ts";
import { llmsUrlForSite, parseLlmsLinks } from "./llms.ts";
import {
  defaultSitemapUrl,
  parseRobotsSitemaps,
  parseSitemapXml,
  robotsUrlForSite,
} from "./sitemaps.ts";

/** A single URL found during discovery, together with where it was found. */
export interface MapUrlEntry {
  url: string;
  /** Kind of document that listed this URL. */
  source: "robots" | "sitemap" | "llms";
  /** URL of the document that listed this URL. */
  sourceUrl: string;
  lastmod?: string;
  title?: string;
}

/** Result of {@link discoverSiteUrls}. */
export interface SiteMapResult {
  /** The normalized seed URL discovery started from. */
  seedUrl: string;
  /** Discovered entries, sorted by URL. */
  urls: MapUrlEntry[];
  /** Discovered URLs grouped by their first path segment (`"/"` for root-level URLs). */
  tree: Record<string, string[]>;
  /** Every sitemap URL that was discovered, whether or not it was fetched. */
  sitemaps: string[];
}

/** Tuning knobs for {@link discoverSiteUrls}. */
export interface SiteMapOptions {
  /** Maximum number of sitemap fetch attempts. Defaults to 20. */
  maxSitemaps?: number;
  /** Forwarded unchanged to the HTTP client. */
  cacheTtlSeconds?: number;
  /** Forwarded unchanged to the HTTP client. */
  maxAgeSeconds?: number;
  /** Forwarded unchanged to the HTTP client. */
  refresh?: boolean;
}

/** Injectable collaborators for {@link discoverSiteUrls}. */
export interface SiteMapDeps {
  /** HTTP client to use; a default client is created when omitted. */
  httpClient?: Pick<HttpClient, "fetchUrl">;
}

/** The only part of the HTTP client this module relies on. */
type FetchClient = Pick<HttpClient, "fetchUrl">;

/** Response shape, derived from the client so the helpers below track it. */
type FetchResult = Awaited<ReturnType<FetchClient["fetchUrl"]>>;

/** Upper bound on the size of any single document fetched by this module (2 MiB). */
const MAX_FETCH_BYTES = 2 * 1024 * 1024;

/** Sitemap fetch budget used when `options.maxSitemaps` is not provided. */
const DEFAULT_MAX_SITEMAPS = 20;

/**
 * Discovers URLs of a site from its robots file, its sitemaps and its llms file.
 *
 * Sitemaps listed by the robots file plus the site's default sitemap URL are
 * crawled breadth-first, following nested sitemap references, until
 * `options.maxSitemaps` fetch attempts have been made. Links from the llms
 * file are merged afterwards and replace sitemap entries that share the same URL.
 *
 * Individual fetch failures are ignored, so an unreachable document simply
 * contributes nothing to the result.
 *
 * @param seed - Any URL of the site to map; it is normalized before use.
 * @param options - Fetch budget and cache settings.
 * @param deps - Optional collaborators, mainly for testing.
 * @param signal - Forwarded to every HTTP request.
 * @returns The discovered URLs, a per-section grouping, and all known sitemap URLs.
 */
export async function discoverSiteUrls(
  seed: string,
  options: SiteMapOptions = {},
  deps: SiteMapDeps = {},
  signal?: AbortSignal,
): Promise<SiteMapResult> {
  const seedUrl = normalizeUrl(seed);
  const client = deps.httpClient ?? createHttpClient();
  const found = new Map<string, MapUrlEntry>();
  const sitemaps = new Set<string>();

  // Seed the crawl with sitemaps advertised by the robots file, then the default location.
  const robotsUrl = robotsUrlForSite(seedUrl);
  const robots = await fetchText(client, robotsUrl, options, signal);
  if (robots) {
    for (const sitemap of parseRobotsSitemaps(robots, robotsUrl).sitemaps) {
      sitemaps.add(sitemap);
    }
  }
  sitemaps.add(defaultSitemapUrl(seedUrl));

  // The queue grows while it is walked: nested sitemaps are appended as they are found.
  // Failed fetches still consume budget because the budget is counted by queue position.
  const queue = [...sitemaps];
  const maxSitemaps = options.maxSitemaps ?? DEFAULT_MAX_SITEMAPS;
  for (let index = 0; index < queue.length && index < maxSitemaps; index += 1) {
    const sitemapUrl = queue[index];
    if (sitemapUrl === undefined) continue;

    const body = await fetchSitemap(client, sitemapUrl, options, signal);
    if (!body) continue;

    const parsed = parseSitemapXml(body, sitemapUrl);
    for (const nested of parsed.sitemaps) {
      if (!sitemaps.has(nested)) {
        sitemaps.add(nested);
        queue.push(nested);
      }
    }
    for (const entry of parsed.urls) {
      found.set(entry.url, {
        url: entry.url,
        source: "sitemap",
        sourceUrl: entry.source,
        lastmod: entry.lastmod,
      });
    }
  }

  // llms links are merged last, so they win over sitemap entries for the same URL.
  const llmsUrl = llmsUrlForSite(seedUrl);
  const llms = await fetchText(client, llmsUrl, options, signal);
  if (llms) {
    for (const entry of parseLlmsLinks(llms, llmsUrl)) {
      found.set(entry.url, {
        url: entry.url,
        source: "llms",
        sourceUrl: entry.source,
        title: entry.title,
      });
    }
  }

  return {
    seedUrl,
    urls: [...found.values()].toSorted((a, b) => a.url.localeCompare(b.url)),
    tree: buildTree([...found.keys()]),
    sitemaps: [...sitemaps],
  };
}

/**
 * Groups absolute URLs by the first non-empty segment of their path.
 *
 * URLs whose path has no segment (for example the site root) are grouped under `"/"`.
 *
 * @param urls - Absolute URLs; `new URL` throws on anything it cannot parse.
 * @returns A plain object mapping each section name to its URLs, in input order.
 */
function buildTree(urls: string[]): Record<string, string[]> {
  // Section names come from remote content, so collect them in a Map: a plain
  // object would resolve names such as "constructor" or "__proto__" to
  // inherited members instead of a missing bucket.
  const sections = new Map<string, string[]>();
  for (const url of urls) {
    const section = new URL(url).pathname.split("/").find(Boolean) ?? "/";
    const bucket = sections.get(section);
    if (bucket) bucket.push(url);
    else sections.set(section, [url]);
  }
  // Object.fromEntries defines own data properties, so even "__proto__" is stored as a key.
  return Object.fromEntries(sections);
}

/**
 * Fetches a document with this module's shared request settings.
 *
 * Any failure, including an aborted request, is swallowed so that one missing
 * document never fails the whole discovery.
 *
 * @returns The client's response, or `undefined` when the request failed.
 */
async function fetchQuietly(
  client: FetchClient,
  url: string,
  options: SiteMapOptions,
  signal?: AbortSignal,
): Promise<FetchResult | undefined> {
  try {
    return await client.fetchUrl(
      url,
      {
        respectRobots: false,
        forceText: true,
        maxBytes: MAX_FETCH_BYTES,
        cacheTtlSeconds: options.cacheTtlSeconds,
        maxAgeSeconds: options.maxAgeSeconds,
        refresh: options.refresh,
      },
      signal,
    );
  } catch {
    return undefined;
  }
}

/**
 * Fetches a text document such as the robots or llms file.
 *
 * @returns The response text, or `undefined` when the request failed.
 */
async function fetchText(
  client: FetchClient,
  url: string,
  options: SiteMapOptions,
  signal?: AbortSignal,
): Promise<FetchResult["text"] | undefined> {
  const result = await fetchQuietly(client, url, options, signal);
  return result?.text;
}

/**
 * Fetches a sitemap document.
 *
 * For URLs ending in `.gz` the response body is preferred when present;
 * otherwise the text is returned, falling back to the body.
 *
 * @returns The sitemap payload, or `undefined` when the request failed.
 */
async function fetchSitemap(
  client: FetchClient,
  url: string,
  options: SiteMapOptions,
  signal?: AbortSignal,
): Promise<FetchResult["text"] | FetchResult["body"] | undefined> {
  const result = await fetchQuietly(client, url, options, signal);
  if (!result) return undefined;
  if (url.endsWith(".gz") && result.body) return result.body;
  return result.text ?? result.body;
}