// ─── Structured bot-block detection ────────────────────────────────
// Categorized detection for known anti-bot systems. Returns structured
// metadata so callers can decide whether to retry, escalate, or give up.

/** Structured outcome of {@link detectBotBlock}. */
export interface BotBlockResult {
  /** True when the content looks like an anti-bot / blocking page. */
  blocked: boolean;
  /** Known blocker type, or "generic" for heuristic-only matches. */
  blockerType?:
    | "cloudflare"
    | "anubis"
    | "perimeterx"
    | "datadome"
    | "incapsula"
    | "akamai"
    | "generic";
  /** Human-readable explanation. */
  message: string;
  /** Whether retrying with a different profile/browser may help. */
  retryable: boolean;
  /** The phase of the page that triggered detection. */
  phase:
    | "javascript_challenge"
    | "captcha"
    | "rate_limit"
    | "proof_of_work"
    | "generic_block";
  /** Raw confidence score 0–1 based on signal strength. */
  confidence: number;
}

/** Every blocker type that has a dedicated signature (i.e. not "generic"). */
type KnownBlocker = Exclude<
  NonNullable<BotBlockResult["blockerType"]>,
  "generic"
>;

/**
 * One recognizable blocking page.
 *
 * All `required` / `optional` strings are compared against a lower-cased
 * sample of the page, so they MUST be written in lower case.
 */
interface BlockerSignature {
  type: KnownBlocker;
  phase: BotBlockResult["phase"];
  retryable: boolean;
  /** Required string(s) that must ALL be present. */
  required: string[];
  /** Optional string(s); at least one must be present if provided. */
  optional?: string[];
  /** Weight for confidence scoring. */
  weight: number;
}

/** Signatures are tried in order; the first full match wins. */
const BLOCKER_SIGNATURES: BlockerSignature[] = [
  {
    type: "cloudflare",
    phase: "javascript_challenge",
    retryable: true,
    required: ["cf-browser-verification", "checking your browser"],
    weight: 0.95,
  },
  {
    type: "cloudflare",
    phase: "captcha",
    retryable: false,
    required: ["attention required"],
    optional: ["cloudflare", "captcha", "verify you are human"],
    weight: 0.95,
  },
  {
    type: "cloudflare",
    phase: "rate_limit",
    retryable: true,
    required: ["rate limited"],
    optional: ["cloudflare", "too many requests", "ray id"],
    weight: 0.9,
  },
  {
    type: "anubis",
    phase: "proof_of_work",
    retryable: true,
    required: ["protected by anubis"],
    optional: ["proof-of-work", "making sure you're not a bot"],
    weight: 0.98,
  },
  {
    type: "perimeterx",
    phase: "javascript_challenge",
    retryable: true,
    required: ["perimeterx"],
    optional: ["px-captcha", "please verify"],
    weight: 0.95,
  },
  {
    type: "datadome",
    phase: "javascript_challenge",
    retryable: true,
    required: ["datadome"],
    optional: ["please verify", "captcha", "checking your browser"],
    weight: 0.95,
  },
  {
    type: "incapsula",
    phase: "javascript_challenge",
    retryable: true,
    required: ["incap_ses"],
    // Lower case on purpose: the sample is lower-cased before matching, so a
    // capitalised marker could never match.
    optional: ["incapsula", "javascript is required"],
    weight: 0.9,
  },
  {
    type: "akamai",
    phase: "javascript_challenge",
    retryable: true,
    required: ["akamai"],
    optional: ["checking your browser", "please verify"],
    weight: 0.9,
  },
];

/** Human-readable message reported for each known blocker type. */
const BLOCKER_MESSAGES: Record<KnownBlocker, string> = {
  cloudflare: "Cloudflare bot protection detected",
  anubis: "Anubis proof-of-work challenge detected",
  perimeterx: "PerimeterX bot protection detected",
  datadome: "DataDome bot protection detected",
  incapsula: "Incapsula bot protection detected",
  akamai: "Akamai bot protection detected",
};

// Generic heuristic markers (used when no specific signature matches)
const GENERIC_BOT_MARKERS = [
  "making sure you're not a bot",
  "checking your browser",
  "just a moment",
  "enable javascript and cookies to continue",
  "attention required",
  "verify you are human",
  "unusual traffic",
  "before you continue",
  "please enable javascript",
  "browser check",
  "security check",
  "captcha",
  "ddos protection",
  "bot detection",
];

/** Only this many leading characters of the page are scanned for bot markers. */
const BOT_SAMPLE_LENGTH = 8000;

/** Only this many leading characters of the page are scanned for login wording. */
const LOGIN_SAMPLE_LENGTH = 20000;

/**
 * Analyze HTML/text content for known anti-bot / blocking pages.
 * Returns structured metadata instead of a simple boolean so callers
 * can make intelligent retry/escalation decisions.
 *
 * Detection runs in two stages:
 * 1. The known signatures, in declaration order — the first one whose
 *    required strings are all present (and, if it lists optional strings,
 *    at least one of those too) is returned with its own weight as confidence.
 * 2. A generic fallback that fires on two or more generic markers, or on a
 *    single marker combined with a status code of 400 or above.
 *
 * @param text - Page HTML or text; falsy values are treated as empty.
 * @param statusCode - Optional HTTP status of the response.
 * @returns The detection result; `blocked` is false when nothing matched.
 */
export function detectBotBlock(
  text: string,
  statusCode?: number,
): BotBlockResult {
  const sample = String(text || "")
    .slice(0, BOT_SAMPLE_LENGTH)
    .toLowerCase();

  // Check specific blocker signatures
  for (const sig of BLOCKER_SIGNATURES) {
    const hasRequired = sig.required.every((r) => sample.includes(r));
    if (!hasRequired) continue;

    const hasOptional =
      !sig.optional || sig.optional.some((o) => sample.includes(o));
    if (!hasOptional) continue;

    return {
      blocked: true,
      blockerType: sig.type,
      message: BLOCKER_MESSAGES[sig.type],
      retryable: sig.retryable,
      phase: sig.phase,
      confidence: sig.weight,
    };
  }

  // Generic heuristic fallback
  const matches = GENERIC_BOT_MARKERS.filter((m) => sample.includes(m));
  const isErrorStatus = statusCode !== undefined && statusCode >= 400;
  if (matches.length >= 2 || (matches.length >= 1 && isErrorStatus)) {
    return {
      blocked: true,
      blockerType: "generic",
      message: `Bot protection likely detected (${matches.join(", ")})`,
      retryable: true,
      phase: "generic_block",
      // More markers raise confidence, capped below the signature weights.
      confidence: Math.min(0.5 + matches.length * 0.15, 0.85),
    };
  }

  return {
    blocked: false,
    message: "No bot protection detected",
    retryable: false,
    phase: "generic_block",
    confidence: 0,
  };
}

// ─── Login-redirect detection ──────────────────────────────────────

/** Known authentication/login domains. */
const AUTH_DOMAINS = [
  "accounts.google.com",
  "login.microsoftonline.com",
  "login.live.com",
  "auth0.com",
  "okta.com",
  "signin.aws.amazon.com",
  "console.aws.amazon.com",
];

/** Hostname prefixes that indicate an auth/login service. */
const AUTH_HOSTNAME_PREFIXES = [
  "login.",
  "signin.",
  "auth.",
  "sso.",
  "accounts.",
  "idp.",
];

/** Content patterns that indicate a login wall when combined with a hostname redirect. */
const LOGIN_CONTENT_PATTERNS = [
  "sign in",
  "log in",
  "authentication required",
  "create an account to continue",
  "login to continue",
  "please sign in",
  "please log in",
];

/**
 * Detects redirect-to-login pages: sites that return 200 but redirect to an
 * auth domain or serve a login form instead of the requested content.
 *
 * Only triggers when the response was redirected to a different hostname —
 * this avoids false positives on pages that legitimately mention "sign in".
 *
 * @param requestedUrl - URL that was originally requested.
 * @param finalUrl - URL the response was ultimately served from.
 * @param html - Body of the final response; falsy values are treated as empty.
 * @returns A short description of the redirect, or `undefined` when the
 *   hostname did not change, no login signal was found, or either URL could
 *   not be parsed.
 */
export function detectLoginRedirect(
  requestedUrl: string,
  finalUrl: string,
  html: string,
): string | undefined {
  let requested: URL;
  let final: URL;
  try {
    requested = new URL(requestedUrl);
    final = new URL(finalUrl);
  } catch {
    // URL parsing failures are not login redirects.
    return undefined;
  }

  const finalHost = final.hostname.toLowerCase();
  if (requested.hostname.toLowerCase() === finalHost) {
    return undefined;
  }

  // Exact auth domain or any subdomain of one (e.g. tenant.okta.com).
  if (
    AUTH_DOMAINS.some((d) => finalHost === d || finalHost.endsWith(`.${d}`))
  ) {
    return `redirected to login (${final.hostname})`;
  }

  if (AUTH_HOSTNAME_PREFIXES.some((p) => finalHost.startsWith(p))) {
    return `redirected to login (${final.hostname})`;
  }

  // Weakest signal: login wording on a page served from a different host.
  const sample = String(html || "")
    .slice(0, LOGIN_SAMPLE_LENGTH)
    .toLowerCase();
  if (LOGIN_CONTENT_PATTERNS.some((p) => sample.includes(p))) {
    return `redirected to login page (${final.hostname})`;
  }

  return undefined;
}

/**
 * Backward-compatible boolean check. Use detectBotBlock() for full metadata.
 *
 * @param text - Page HTML or text to inspect.
 * @returns True when detectBotBlock() reports the content as blocked.
 */
export function isLikelyBotProtection(text: string): boolean {
  return detectBotBlock(text).blocked;
}