import { load } from "cheerio";
import { buildFetchOptions } from "./proxy";
import { extractFromHtml, bodyTrimmedLength } from "./extract";
import { sanitizeMarkdown, hasCodeSnippets, bodyLength } from "./sanitize";
import type { HeadlessRenderer } from "./headless";
import type { CrawlConfig, CrawlResult } from "./types";
/**
 * Substrings whose presence in a page's HTML is treated as a hint that the
 * page is rendered client-side. `shouldUseHeadless` lower-cases both the HTML
 * and each entry before comparing, so matching is case-insensitive.
 *
 * NOTE(review): the two single-quoted entries that follow app-root contain
 * nothing but a line break as written here. They look like truncated markup
 * literals; confirm their intended contents.
 */
const SPA_INDICATORS = [
// Framework-specific attribute/identifier fragments (the names suggest
// Next.js, React, Angular and Vue).
"__next_data__",
"data-nextjs",
"data-reactroot",
"_react",
"ng-app",
"ng-version",
"data-v-",
"data-server-rendered",
"app-root",
'
',
'
',
// Phrases telling the visitor that JavaScript is needed.
"enable javascript",
"requires javascript",
"javascript is required",
"javascript must be enabled",
];
export class WebsiteCrawler {
/**
 * Pool of desktop-browser User-Agent strings; `randomUserAgent` picks one
 * entry per call. The list is fixed at construction, hence `readonly`.
 */
private readonly userAgents = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
];
/**
 * @param proxyEndpoint Optional proxy endpoint, handed to `buildFetchOptions`
 *   for every outbound request this crawler makes.
 * @param headlessRenderer Optional renderer used by `fetch` to re-render pages
 *   that appear to need JavaScript. When omitted, `fetch` never renders
 *   headlessly, even if the heuristics ask for it.
 */
constructor(
  private readonly proxyEndpoint?: string,
  private readonly headlessRenderer?: HeadlessRenderer,
) {}
/**
 * Picks a User-Agent string at random from the pool, falling back to the
 * first entry if the computed slot is empty.
 */
randomUserAgent(): string {
  const pool = this.userAgents;
  const slot = Math.floor(Math.random() * pool.length);
  return pool[slot] ?? pool[0];
}
/**
 * Reports whether the last path segment of `url`, compared in lower case,
 * is a file name of the form `llms*.txt`.
 *
 * @param url Absolute URL to inspect.
 * @returns `true` for a matching file name; `false` otherwise, including
 *   when `url` cannot be parsed.
 */
isLlmsManifest(url: string): boolean {
  let pathname: string;
  try {
    pathname = new URL(url).pathname;
  } catch {
    return false;
  }
  const segments = pathname.split("/");
  const lastSegment = (segments[segments.length - 1] ?? "").toLowerCase();
  if (lastSegment === "") return false;
  return /^llms.*\.txt$/.test(lastSegment);
}
/**
 * Downloads an llms.txt-style manifest and returns the in-scope page URLs it
 * lists.
 *
 * Only list items (lines starting with `-`) are considered; blank lines,
 * lines starting with `#` and lines containing `|` are ignored. An item
 * contributes a link when it is either a Markdown link with an absolute
 * http(s) target or a bare, whitespace-free URL. Each candidate is resolved
 * against the manifest URL, normalized, and kept only if it is at most 255
 * characters long and passes `inScope`.
 *
 * @param url Manifest URL; it must normalize successfully and be in scope.
 * @param config Crawl configuration used for the scope checks.
 * @returns De-duplicated, normalized links in order of first appearance.
 * @throws Error if the URL is invalid or out of scope, or if the response
 *   status is not OK.
 */
async fetchLlmsManifest(url: string, config: CrawlConfig): Promise<string[]> {
  const normalized = this.normalizeUrl(url);
  if (!normalized) throw new Error("Invalid URL");
  if (!this.inScope(normalized, config)) {
    throw new Error(`URL ${normalized} is outside the configured scope.`);
  }

  const opts = buildFetchOptions(this.proxyEndpoint, 15000);
  // Inside a method body the bare identifier `fetch` is the global fetch,
  // not this class's `fetch` method.
  const response = await fetch(normalized, {
    ...opts,
    headers: {
      "User-Agent": this.randomUserAgent(),
      "Accept": "text/plain;q=1.0,*/*;q=0.8",
    },
  });
  if (!response.ok) {
    throw new Error(`Failed to fetch ${normalized}: ${response.status}`);
  }

  const body = await response.text();
  const base = new URL(normalized);
  const links: string[] = [];

  for (const line of body.split(/\r?\n/)) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;
    if (trimmed.includes("|")) continue;
    if (!trimmed.startsWith("-")) continue;

    let extractedUrl: string | null = null;
    const linkMatch = trimmed.match(/^-\s*\[.*?\]\((https?:\/\/[^\s)]+)\)/);
    if (linkMatch) {
      extractedUrl = linkMatch[1] ?? null;
    } else {
      const clean = trimmed.replace(/^-\s*/, "");
      // A whitespace-free item is taken as a bare URL, unless it is Markdown
      // link/image syntax that failed the absolute-target match above:
      // resolving the literal "[text](target)" against the base would
      // produce a bogus URL.
      const looksLikeMarkdownLink = /^!?\[/.test(clean);
      if (!/\s/.test(clean) && !looksLikeMarkdownLink) extractedUrl = clean;
    }
    if (!extractedUrl) continue;

    try {
      const resolved = new URL(extractedUrl, base);
      const normalizedLink = this.normalizeUrl(resolved.href);
      if (normalizedLink && normalizedLink.length <= 255 && this.inScope(normalizedLink, config)) {
        links.push(normalizedLink);
      }
    } catch {
      // Unresolvable entry: skip it and keep processing the manifest.
    }
  }

  return [...new Set(links)];
}
/**
 * Fetches one page and converts it into a crawl result.
 *
 * Markdown content negotiation is attempted first. When that yields nothing,
 * the page is fetched as HTML, converted to Markdown and sanitized. If
 * `shouldUseHeadless` asks for it and a renderer was supplied, the page is
 * rendered headlessly and the rendered HTML (when non-empty) replaces the
 * plain response for both link and content extraction.
 *
 * @param url Page URL; it must normalize successfully and be in scope.
 * @param config Crawl configuration (scope, minimum body size, code-snippet
 *   requirement, headless flags).
 * @returns The normalized URL, title, sanitized Markdown, relative output
 *   path and in-scope links.
 * @throws Error if the URL is invalid or out of scope, the HTTP status is not
 *   OK, the sanitized body is shorter than `config.minBodyCharacters`, or
 *   code snippets are required but missing.
 */
async fetch(url: string, config: CrawlConfig): Promise<CrawlResult> {
  const normalized = this.normalizeUrl(url);
  if (!normalized) throw new Error("Invalid URL");
  if (!this.inScope(normalized, config)) {
    throw new Error(`URL ${normalized} is outside the configured scope.`);
  }
  const userAgent = this.randomUserAgent();

  // Prefer a native Markdown representation when the server offers one.
  const markdownResult = await this.tryMarkdownContentNegotiation(normalized, userAgent, config);
  if (markdownResult) return markdownResult;

  // Otherwise fall back to HTML processing. The bare identifier `fetch`
  // below is the global fetch, not this method.
  const opts = buildFetchOptions(this.proxyEndpoint, 20000);
  const response = await fetch(normalized, {
    ...opts,
    headers: {
      "User-Agent": userAgent,
      "Accept": "text/html;q=0.9,*/*;q=0.8",
    },
  });
  if (!response.ok) {
    throw new Error(`Failed to fetch ${normalized}: ${response.status}`);
  }

  // Runs link extraction and HTML-to-Markdown conversion over one document,
  // so the plain and the rendered HTML go through exactly the same steps.
  const processHtml = (html: string) => {
    const links = this.extractLinks(html, normalized, config);
    const extracted = extractFromHtml(html, normalized, { minBodyChars: config.minBodyCharacters });
    const markdown = sanitizeMarkdown(extracted.markdown);
    return { links, extracted, markdown };
  };

  const body = await response.text();
  let page = processHtml(body);

  // Re-render with the headless browser when the static HTML looks incomplete.
  const headlessCheck = this.shouldUseHeadless(body, page.markdown, page.links, config);
  if (headlessCheck.shouldUse && this.headlessRenderer) {
    const renderedHtml = await this.headlessRenderer.render(normalized, userAgent);
    if (renderedHtml) {
      page = processHtml(renderedHtml);
    }
  }

  if (bodyLength(page.markdown) < config.minBodyCharacters) {
    throw new Error("Document too small after sanitization.");
  }
  if (config.requireCodeSnippets && !hasCodeSnippets(page.markdown)) {
    throw new Error("Document missing code snippets.");
  }

  return {
    url: normalized,
    title: page.extracted.title,
    markdown: page.markdown,
    path: this.buildRelativePath(normalized, page.extracted.title),
    links: page.links,
  };
}
/**
 * Asks the server for a Markdown representation of `url` and, if one is
 * returned, turns it into a crawl result.
 *
 * The response counts as Markdown when its Content-Type names
 * `text/markdown` or `text/x-markdown`, or names `text/plain` and the body
 * passes `looksLikeMarkdown`.
 *
 * @param url Already-normalized page URL.
 * @param userAgent User-Agent header value to send.
 * @param config Crawl configuration forwarded to `processMarkdownResponse`.
 * @returns The crawl result, or `null` on any failure: a network error, a
 *   non-OK status, a non-Markdown response, or an error thrown while
 *   processing the Markdown. The caller then falls back to HTML.
 */
private async tryMarkdownContentNegotiation(
  url: string,
  userAgent: string,
  config: CrawlConfig,
): Promise<CrawlResult | null> {
  try {
    const opts = buildFetchOptions(this.proxyEndpoint, 20000);
    const response = await fetch(url, {
      ...opts,
      headers: {
        "User-Agent": userAgent,
        "Accept": "text/markdown;q=1.0,text/x-markdown;q=0.9,text/plain;q=0.5",
      },
    });
    if (!response.ok) return null;

    // Media type names are case-insensitive, so compare in lower case.
    const contentType = (response.headers.get("Content-Type") ?? "").toLowerCase();
    const body = await response.text();
    const isMarkdown =
      contentType.includes("text/markdown") ||
      contentType.includes("text/x-markdown") ||
      (contentType.includes("text/plain") && this.looksLikeMarkdown(body));
    if (!isMarkdown) return null;

    return this.processMarkdownResponse(url, body, config);
  } catch {
    // Best effort by design: any failure means "no Markdown available".
    return null;
  }
}
/**
 * Builds a crawl result from a raw Markdown response body.
 *
 * The body is sanitized and validated; the title is taken from the sanitized
 * text, while links are extracted from the raw body.
 *
 * @throws Error if the sanitized body is shorter than
 *   `config.minBodyCharacters`, or if code snippets are required but missing.
 */
private processMarkdownResponse(url: string, body: string, config: CrawlConfig): CrawlResult {
  const cleaned = sanitizeMarkdown(body);

  const cleanedLength = bodyLength(cleaned);
  if (cleanedLength < config.minBodyCharacters) {
    throw new Error("Document too small after sanitization.");
  }

  const snippetsMissing = config.requireCodeSnippets && !hasCodeSnippets(cleaned);
  if (snippetsMissing) {
    throw new Error("Document missing code snippets.");
  }

  const pageTitle = this.extractTitleFromMarkdown(cleaned, url);
  const outgoingLinks = this.extractLinksFromMarkdown(body, url, config);
  const relativePath = this.buildRelativePath(url, pageTitle);

  return {
    url,
    title: pageTitle,
    markdown: cleaned,
    path: relativePath,
    links: outgoingLinks,
  };
}
/**
 * Collects the in-scope link targets of every inline Markdown link
 * (`[text](target)`) found in `markdown`.
 *
 * Fragment-only, `mailto:` and `tel:` targets are ignored. Each remaining
 * target is resolved against `currentUrl`, normalized, and kept when it is
 * at most 255 characters long and passes `inScope`.
 *
 * @returns Unique normalized URLs in order of first appearance.
 */
private extractLinksFromMarkdown(markdown: string, currentUrl: string, config: CrawlConfig): string[] {
  const unique = new Set<string>();
  const baseUrl = new URL(currentUrl);
  const ignoredPrefixes = ["#", "mailto:", "tel:"];

  for (const [, target] of markdown.matchAll(/\[(?:[^\]]*)\]\(([^)\s]+)\)/g)) {
    if (!target) continue;
    if (ignoredPrefixes.some((prefix) => target.startsWith(prefix))) continue;

    try {
      const candidate = this.normalizeUrl(new URL(target, baseUrl).href);
      const acceptable = candidate && candidate.length <= 255 && this.inScope(candidate, config);
      if (acceptable) unique.add(candidate);
    } catch {
      // Target could not be resolved to a URL: ignore it.
    }
  }

  return [...unique];
}
/**
 * Decides whether a page should be re-rendered with the headless renderer.
 *
 * Checks run in this order; the first that applies determines the result:
 * `config.forceHeadless` (yes); headless not allowed or renderer not enabled
 * (no); sanitized body shorter than 400 characters (yes); fewer than 3 links
 * (yes); any `SPA_INDICATORS` entry found in the HTML, case-insensitively
 * (yes); otherwise no.
 *
 * @returns The decision together with a human-readable reason.
 */
private shouldUseHeadless(
  html: string,
  markdown: string,
  links: string[],
  config: CrawlConfig,
): { shouldUse: boolean; reason: string } {
  const yes = (reason: string) => ({ shouldUse: true, reason });
  const no = (reason: string) => ({ shouldUse: false, reason });

  if (config.forceHeadless) return yes("force_headless enabled");

  const rendererUsable = config.allowHeadless && this.headlessRenderer?.isEnabled();
  if (!rendererUsable) return no("headless disabled");

  const sparseThreshold = 400;
  const length = bodyLength(markdown);
  if (length < sparseThreshold) {
    return yes(`body too short (${length} < ${sparseThreshold})`);
  }

  if (links.length < 3) {
    return yes(`too few links extracted (${links.length})`);
  }

  const haystack = html.toLowerCase();
  const hit = SPA_INDICATORS.find((marker) => haystack.includes(marker.toLowerCase()));
  if (hit !== undefined) {
    return yes(`SPA indicator found: ${hit}`);
  }

  return no("no signals detected");
}
/**
 * Derives a page title: the text of the first level-one heading (`# ...`)
 * if there is one, otherwise the URL's last path segment without a trailing
 * `.md`, otherwise the URL's host. If the URL cannot be parsed, the raw
 * `url` string is returned.
 */
private extractTitleFromMarkdown(markdown: string, url: string): string {
  const heading = /^#\s+(.+)$/m.exec(markdown)?.[1];
  if (heading) return heading.trim();

  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }

  const lastSegment = parsed.pathname.split("/").pop() ?? "";
  const stem = lastSegment.replace(/\.md$/, "");
  return stem || parsed.host;
}
/**
 * Heuristic used for text/plain responses to decide whether the body is
 * Markdown. An empty or whitespace-only body never qualifies.
 *
 * NOTE(review): from the prefix.includes( line onward this span appears
 * truncated. The rest of this method and the beginning of the following
 * allow/deny path check are not visible here; confirm against the complete
 * file before changing anything in this region.
 */
private looksLikeMarkdown(body: string): boolean {
const trimmed = body.trimStart();
if (!trimmed) return false;
// Only the first 200 characters, lower-cased, are inspected.
const prefix = trimmed.slice(0, 200).toLowerCase();
if (prefix.includes(" 0) {
const allowed = config.allowedPaths.some((p) => path.startsWith(p.toLowerCase()));
if (!allowed) return false;
}
if (config.deniedPaths.length > 0) {
const denied = config.deniedPaths.some((p) => path.startsWith(p.toLowerCase()));
if (denied) return false;
}
return true;
} catch {
return false;
}
}
/**
 * Collects the in-scope targets of every `<a>` element in `html`.
 *
 * Anchors without an `href` and `javascript:` links are ignored. Each
 * remaining target is resolved against `currentUrl`, normalized, and kept
 * when it is at most 255 characters long and passes `inScope`.
 *
 * @returns Unique normalized URLs in document order of first appearance.
 */
private extractLinks(html: string, currentUrl: string, config: CrawlConfig): string[] {
  const $ = load(html);
  const unique = new Set<string>();
  const baseUrl = new URL(currentUrl);

  for (const anchor of $("a").toArray()) {
    const href = $(anchor).attr("href");
    if (!href) continue;
    if (href.startsWith("javascript:")) continue;

    try {
      const candidate = this.normalizeUrl(new URL(href, baseUrl).href);
      const acceptable = candidate && candidate.length <= 255 && this.inScope(candidate, config);
      if (acceptable) unique.add(candidate);
    } catch {
      // href could not be resolved to a URL: ignore it.
    }
  }

  return [...unique];
}
/**
 * Maps a page URL to a relative `.md` file path.
 *
 * The URL's pathname loses one leading and one trailing slash; an empty
 * result becomes `index`. A trailing `.htm`/`.html` and then a trailing
 * `.md` (both case-insensitive) are stripped before `.md` is appended.
 *
 * @param url Page URL.
 * @param title Accepted for interface compatibility; not used by the current
 *   implementation.
 * @returns The relative path, or `page.md` if `url` cannot be parsed.
 */
private buildRelativePath(url: string, title: string | null): string {
  let pathname: string;
  try {
    pathname = new URL(url).pathname;
  } catch {
    return "page.md";
  }

  const withoutSlashes = pathname.replace(/^\//, "").replace(/\/$/, "");
  const stem = (withoutSlashes || "index").replace(/\.html?$/i, "").replace(/\.md$/i, "");
  return stem.endsWith(".md") ? stem : `${stem}.md`;
}
}
export type { HeadlessRenderer } from "./headless";
export { createHeadlessRenderer, findChromeBinaryPath, getChromeVersion, getInstallInstructions } from "./headless";