// ─── arXiv extractor ───────────────────────────────────────────────
// Uses the Atom export feed API.

import type { VerticalResult } from "./types.js";

/** The five predefined XML entities, keyed by name. */
const XML_NAMED_ENTITIES: Readonly<Record<string, string>> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

/**
 * Decodes the predefined XML entities and numeric character references.
 * Done in a single pass so `&amp;lt;` correctly becomes `&lt;`, not `<`.
 */
function decodeXmlEntities(text: string): string {
  return text.replace(
    /&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g,
    (whole: string, body: string) => {
      if (body.startsWith("#")) {
        const codePoint =
          body[1] === "x"
            ? parseInt(body.slice(2), 16)
            : parseInt(body.slice(1), 10);
        // String.fromCodePoint throws on out-of-range values; leave those as-is.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
      }
      return XML_NAMED_ENTITIES[body] ?? whole;
    },
  );
}

/**
 * Returns the trimmed inner text of the first `<tag …>…</tag>` element in
 * `xmlStr`, or `""` when no such element exists.
 *
 * String-based on purpose (no XML parser dependency). `tag` must be a plain
 * element name without regex metacharacters.
 */
function extractTag(xmlStr: string, tag: string): string {
  const tagRe = new RegExp(`<${tag}(?:\\s[^>]*)?>([\\s\\S]*?)</${tag}>`);
  return tagRe.exec(xmlStr)?.[1]?.trim() ?? "";
}

/**
 * Finds the href of the `<link>` element whose title is "pdf", regardless of
 * attribute order. Returns `""` when there is none.
 */
function findPdfLink(entryXml: string): string {
  for (const linkMatch of entryXml.matchAll(/<link\b[^>]*>/gi)) {
    const linkTag = linkMatch[0];
    if (!/\btitle="pdf"/i.test(linkTag)) continue;
    const href = /\bhref="([^"]*)"/i.exec(linkTag)?.[1];
    if (href) return decodeXmlEntities(href);
  }
  return "";
}

/**
 * Tells whether `url` is an arXiv abstract page with a new-style numeric
 * identifier (e.g. `https://arxiv.org/abs/2301.12345`).
 */
export function matchesArxiv(url: string): boolean {
  return /^https?:\/\/arxiv\.org\/abs\/\d+\.\d+/i.test(url);
}

/**
 * Fetches arXiv metadata for an abstract URL through the Atom query API and
 * renders it as Markdown (title heading, metadata bullets, abstract).
 *
 * @param url - arXiv abstract URL; an optional version suffix (`v2`) is kept.
 * @param fetchText - Fetches a URL and resolves to its body text, or a falsy
 *   value when nothing could be fetched.
 * @returns The extraction result, or `null` when the URL carries no arXiv id,
 *   the fetch yields nothing, or the feed contains no `<entry>`.
 */
export async function extractArxiv(
  url: string,
  fetchText: (url: string) => Promise<string | null | undefined>,
): Promise<VerticalResult | null> {
  const id = /arxiv\.org\/abs\/(\d+\.\d+(?:v\d+)?)/i.exec(url)?.[1];
  if (!id) return null;

  // Use the Atom API
  const atomUrl = `https://export.arxiv.org/api/query?id_list=${encodeURIComponent(id)}`;
  const xml = await fetchText(atomUrl);
  if (!xml) return null;

  // The feed itself carries its own <title> and <updated> (query echo and
  // query time), so every field must be read from inside the paper's <entry>.
  const entry = extractTag(xml, "entry");
  if (!entry) return null;

  // Titles are hard-wrapped in the feed; collapse whitespace so the Markdown
  // heading stays on a single line.
  const title = decodeXmlEntities(extractTag(entry, "title")).replace(/\s+/g, " ");
  const summary = decodeXmlEntities(extractTag(entry, "summary"));
  const published = extractTag(entry, "published");
  const updated = extractTag(entry, "updated");

  // Authors
  const authors: string[] = [];
  for (const m of entry.matchAll(/<name\b[^>]*>\s*([^<]*)<\/name>/gi)) {
    const name = decodeXmlEntities(m[1] ?? "").trim();
    if (name) authors.push(name);
  }

  // Categories. `<category` does not match `<arxiv:primary_category`, so the
  // primary category is not listed twice.
  const categories: string[] = [];
  for (const m of entry.matchAll(/<category\b[^>]*\bterm="([^"]*)"/gi)) {
    const term = decodeXmlEntities(m[1] ?? "").trim();
    if (term) categories.push(term);
  }

  // PDF link, falling back to the canonical location when the feed has none.
  const pdfLink = findPdfLink(entry) || `https://arxiv.org/pdf/${id}.pdf`;

  const parts: string[] = [`# ${title}\n\n`];
  if (published) parts.push(`- **Published:** ${published}\n`);
  if (updated && updated !== published) parts.push(`- **Updated:** ${updated}\n`);
  if (authors.length) parts.push(`- **Authors:** ${authors.join(", ")}\n`);
  if (categories.length) parts.push(`- **Categories:** ${categories.join(", ")}\n`);
  parts.push(`- **PDF:** ${pdfLink}\n`);
  if (summary) parts.push(`\n## Abstract\n\n${summary}\n`);

  return {
    ok: true,
    url,
    title,
    content: parts.join(""),
  };
}