import { parseHTML } from "linkedom";
import type { RenderResult, SpecialHandler } from "./types";
import { buildResult, loadPage } from "./types";
import { convertWithMarkit, fetchBinary } from "./utils";
/**
 * Handle arXiv abstract and PDF URLs via the arXiv export API.
 *
 * Recognizes `/abs/<id>` and `/pdf/<id>[.pdf]` paths on `arxiv.org` and
 * `www.arxiv.org` (for example `/abs/1234.56789` or `/abs/cs/0123456`),
 * fetches the paper's Atom entry and renders its title, authors, publication
 * date, categories and abstract as markdown. For PDF URLs it additionally
 * tries to append the PDF converted to markdown.
 *
 * This handler never throws: every failure is reported as `null`.
 *
 * @param url - URL to handle.
 * @param timeout - Timeout forwarded unchanged to `loadPage`, `fetchBinary` and `convertWithMarkit`.
 * @param signal - Optional abort signal forwarded to the same helpers.
 * @returns The rendered result, or `null` when the URL is not a recognized
 * arXiv paper URL, the API request fails, the response has no `entry`
 * element, or an unexpected error occurs.
 */
export const handleArxiv: SpecialHandler = async (
	url: string,
	timeout: number,
	signal?: AbortSignal,
): Promise<RenderResult | null> => {
	try {
		const parsed = new URL(url);
		// URL parsing lowercases the hostname, so plain comparisons are enough.
		if (parsed.hostname !== "arxiv.org" && parsed.hostname !== "www.arxiv.org") return null;

		// Extract the paper ID from the supported URL formats:
		//   /abs/1234.56789, /pdf/1234.56789, /pdf/1234.56789.pdf, /abs/cs/0123456
		// A ".pdf" suffix and trailing slashes are not part of the ID.
		const match = parsed.pathname.match(/\/(abs|pdf)\/(.+?)(?:\.pdf)?\/*$/);
		if (!match) return null;
		const [, kind, paperId] = match;
		if (!paperId) return null;

		const fetchedAt = new Date().toISOString();
		const notes: string[] = [];

		// Fetch metadata via the arXiv API. The ID comes from the caller-supplied
		// URL, so encode it to keep characters such as "&" from altering the query.
		const apiUrl = `https://export.arxiv.org/api/query?id_list=${encodeURIComponent(paperId)}`;
		const result = await loadPage(apiUrl, { timeout, signal });
		if (!result.ok) return null;

		// Parse the Atom feed response.
		// NOTE(review): the feed is XML but is parsed with linkedom's HTML parser, as before.
		const doc = parseHTML(result.content).document;
		const entry = doc.querySelector("entry");
		if (!entry) return null;

		// Titles may be wrapped across lines in the feed; collapse whitespace runs.
		const title = entry.querySelector("title")?.textContent?.trim().replace(/\s+/g, " ");
		const summary = entry.querySelector("summary")?.textContent?.trim();
		const authors = Array.from(entry.querySelectorAll("author name") as Iterable<{ textContent: string | null }>)
			.map(n => n.textContent?.trim())
			.filter((name): name is string => Boolean(name));
		// Keep only the date part of the ISO timestamp.
		const published = entry.querySelector("published")?.textContent?.trim().split("T")[0];
		const categories = Array.from(
			entry.querySelectorAll("category") as Iterable<{ getAttribute: (name: string) => string | null }>,
		)
			.map(c => c.getAttribute("term"))
			.filter((term): term is string => Boolean(term));
		const pdfLink = entry.querySelector('link[title="pdf"]')?.getAttribute("href");

		// `||` (not `??`) on purpose: an empty title or abstract also gets the placeholder.
		let md = `# ${title || "arXiv Paper"}\n\n`;
		if (authors.length) md += `**Authors:** ${authors.join(", ")}\n`;
		if (published) md += `**Published:** ${published}\n`;
		if (categories.length) md += `**Categories:** ${categories.join(", ")}\n`;
		md += `**arXiv:** ${paperId}\n\n`;
		md += `---\n\n## Abstract\n\n${summary || "No abstract available."}\n\n`;

		// If it was a PDF link, try to fetch and convert the PDF for full content.
		const wantsFullText = kind === "pdf" || parsed.pathname.includes(".pdf");
		if (wantsFullText && pdfLink) {
			notes.push("Fetching PDF for full content...");
			try {
				const pdfResult = await fetchBinary(pdfLink, timeout, signal);
				if (pdfResult.ok) {
					const converted = await convertWithMarkit(pdfResult.buffer, ".pdf", timeout, signal);
					// Conversions of 500 characters or fewer are discarded.
					if (converted.ok && converted.content.length > 500) {
						md += `---\n\n## Full Paper\n\n${converted.content}\n`;
						notes.push("PDF converted via markit");
					}
				}
			} catch {
				// A cancelled request yields no result, exactly as before.
				if (signal?.aborted) return null;
				// Otherwise the PDF is an optional extra: keep the metadata and
				// abstract that were already fetched instead of discarding them.
				notes.push("PDF fetch or conversion failed; returning abstract only");
			}
		}

		return buildResult(md, {
			url,
			method: "arxiv",
			fetchedAt,
			notes: notes.length ? notes : ["Fetched via arXiv API"],
		});
	} catch {
		// Deliberate best-effort: an invalid URL or any unexpected failure is
		// reported as "not handled" (null) rather than thrown.
	}
	return null;
};