/** * `browser analyze ` — turn site-recon guesswork into deterministic CLI output. * * When an agent starts a new adapter, the first question is "which pattern am * I looking at?" (A/B/C/D/E from site-recon docs) and "will Node-side fetch * work, or will anti-bot middleware block me?". Today the agent has to open * the page, poke `network`, try cURL, fail, guess again. This module condenses * that into one call that returns a classification + evidence. * * Kept pure (no page imports) so the bulk is unit-testable; the CLI wrapper * drives a real page, feeds the resulting signals here, and prints the verdict. */ import type { CliCommand } from '../registry.js'; export interface PageSignals { /** URL we navigated to (may redirect; both fields help agents notice that). */ requestedUrl: string; finalUrl: string; /** document.cookie split into names; value not needed for detection. */ cookieNames: string[]; /** * Response bodies captured during the navigation + first few seconds. * We only need enough body text to spot WAF markers; the CLI truncates * per-entry before feeding us. */ networkEntries: Array<{ url: string; status: number; contentType: string; /** First N chars of body; null when not available. */ bodyPreview: string | null; }>; /** * Which globals the page exposes on `window`. We don't care about the values, * just presence — distinguishes Pattern B (SSR state) from Pattern A. */ initialState: { __INITIAL_STATE__: boolean; __NUXT__: boolean; __NEXT_DATA__: boolean; __APOLLO_STATE__: boolean; }; /** Document title — only for the human-debug `summary` field. */ title: string; } export type AntiBotVendor = 'aliyun_waf' | 'cloudflare' | 'akamai' | 'geetest' | 'unknown'; export interface AntiBotVerdict { detected: boolean; vendor: AntiBotVendor | null; evidence: string[]; /** One-line imperative instruction for the agent. */ implication: string; } export declare function detectAntiBot(signals: PageSignals): AntiBotVerdict; export type Pattern = 'A' | 'B' | 'C' | 'D' | 'E' | 'unknown'; export interface PatternVerdict { pattern: Pattern; reason: string; /** How many JSON XHR/fetch responses we saw during navigation. */ json_responses: number; /** Count of non-2xx API responses — hint for token-gated (Pattern D). */ auth_failures: number; } /** * Apply the decision tree from `site-recon.md` mechanically. * * B beats A when initial-state globals are present: even if the page fetches * more data via XHR afterwards, the SSR payload is the highest-leverage source. * D (token-gated) dominates when we see 401/403 on what looks like API * endpoints — without that, an authenticated route looks identical to A. */ export declare function classifyPattern(signals: PageSignals): PatternVerdict; export interface NearestAdapter { site: string; example_commands: string[]; reason: string; } /** * Find existing adapters that target the same site. * * Keep the hostname match simple — agents extend naming conventions * differently per site, so we match on the registered `domain` field and fall * back to site-name containment. Returning `null` is fine; agents can always * read site-memory docs. */ export declare function findNearestAdapter(finalUrl: string, registry: Map): NearestAdapter | null; export interface AnalyzeReport { requested_url: string; final_url: string; title: string; pattern: PatternVerdict; anti_bot: AntiBotVerdict; initial_state: PageSignals['initialState']; nearest_adapter: NearestAdapter | null; recommended_next_step: string; } /** * Synthesize the verdict from collected signals + registry. * * The `recommended_next_step` is deliberately a single imperative * sentence — agents act on it directly instead of re-deriving advice from * the structured fields. */ export declare function analyzeSite(signals: PageSignals, registry: Map): AnalyzeReport;