/** @file Parse llms module. */ import { dedupeBy } from "../../url/dedupe.ts"; import { normalizeUrl } from "../../url/normalize.ts"; export interface AgentReadableCandidate { url: string; kind: "markdown_sibling" | "llms_txt" | "llms_entry"; } export function likelyAgentReadableUrls(input: string | URL): AgentReadableCandidate[] { const url = new URL(normalizeUrl(input)); const withoutSlash = url.pathname.replace(/\/$/u, ""); const markdown = new URL(url); markdown.pathname = `${withoutSlash || "/index"}.md`; markdown.search = ""; markdown.hash = ""; const llms = new URL("/llms.txt", url.origin); return [ { url: markdown.toString(), kind: "markdown_sibling" }, { url: llms.toString(), kind: "llms_txt" }, ]; } export function parseLlmsTxt(text: string, baseUrl: string): AgentReadableCandidate[] { const candidates: AgentReadableCandidate[] = []; for (const line of text.split(/\r?\n/u)) { const urls = [...line.matchAll(/https?:\/\/\S+|\[[^\]]+\]\(([^)]+)\)/giu)]; for (const match of urls) { const raw = match[1] || match[0]; try { candidates.push({ url: new URL(raw, baseUrl).toString(), kind: "llms_entry", }); } catch { // Ignore malformed llms.txt entries; callers can continue with other candidates. } } } return dedupeBy(candidates, (item) => item.url); }