/**
 * Spider Skill - Fast web scraping and crawling capabilities.
 *
 * Tier 2 built-in skill. Port of the Python `SpiderSkill` that performs
 * HTTP scraping via `fetch`, lightweight HTML-to-text extraction, and
 * breadth-first crawling with configurable limits. Supports three tools:
 *
 * - `scrape_url` — single-page text/markdown extraction
 * - `crawl_site` — breadth-first crawl from a start URL
 * - `extract_structured_data` — CSS/XPath-like structured extraction
 *
 * Multiple instances are supported; use `tool_name` in config to differentiate.
 */
import { SkillBase } from '../SkillBase.js';
import type { SkillToolDefinition, SkillConfig, ParameterSchemaEntry } from '../SkillBase.js';

/**
 * Fast web scraping skill optimized for speed and token efficiency.
 *
 * Multi-instance capable. Port of the Python `SpiderSkill` with three tools:
 * `scrape_url`, `crawl_site`, and `extract_structured_data`. Configuration
 * mirrors the Python schema (delay, concurrent_requests, timeout, max_pages,
 * max_depth, extract_type, max_text_length, clean_text, selectors,
 * follow_patterns, user_agent, headers, follow_robots_txt, cache_enabled).
 *
 * @example
 * ```ts
 * agent.addSkill('spider', { max_pages: 5, max_depth: 2 });
 * ```
 */
export declare class SpiderSkill extends SkillBase {
  // --- Static skill metadata -------------------------------------------------
  static SKILL_NAME: string;
  static SKILL_DESCRIPTION: string;
  static SKILL_VERSION: string;
  static REQUIRED_PACKAGES: readonly string[];
  static REQUIRED_ENV_VARS: readonly string[];
  static SUPPORTS_MULTIPLE_INSTANCES: boolean;

  /**
   * Describes the configuration parameters this skill accepts.
   *
   * NOTE(review): the type arguments were missing here (a bare `Record` does
   * not compile). `Record<string, ParameterSchemaEntry>` is restored on the
   * basis of the otherwise-unused `ParameterSchemaEntry` import; confirm the
   * key type against `SkillBase`.
   *
   * @returns A map of parameter schema entries.
   */
  static getParameterSchema(): Record<string, ParameterSchemaEntry>;

  // --- Private state ---------------------------------------------------------
  // Types are not declared for these private members. The names appear to be
  // the camelCase counterparts of the configuration keys listed in the class
  // comment (e.g. `max_pages` -> `maxPages`) — verify against the
  // implementation.
  private delay;
  private concurrentRequests;
  private timeout;
  private maxPages;
  private maxDepth;
  private extractType;
  private maxTextLength;
  private cleanText;
  private cacheEnabled;
  private followRobotsTxt;
  private userAgent;
  private headers;
  private selectors;
  private compiledFollowPatterns;
  // Response cache and its size bound; presumably only consulted when
  // `cacheEnabled` is set — confirm in the implementation.
  private cache;
  private readonly cacheMaxSize;

  // --- Skill lifecycle -------------------------------------------------------

  /**
   * @returns The key identifying this skill instance. Presumably derived from
   * the `tool_name` config value mentioned in the file header — confirm.
   */
  getInstanceKey(): string;

  /**
   * Asynchronous skill initialization.
   *
   * NOTE(review): the type argument was missing here (a bare `Promise` does
   * not compile). `Promise<boolean>` is assumed (success flag, as in the
   * Python `setup()`); confirm against the `SkillBase.setup` signature.
   */
  setup(): Promise<boolean>;

  /** @returns Hint strings contributed by this skill. */
  getHints(): string[];

  /**
   * Asynchronous teardown.
   *
   * NOTE(review): the type argument was missing here; `Promise<void>` is
   * assumed — confirm against the `SkillBase.cleanup` signature.
   */
  cleanup(): Promise<void>;

  /** @returns Three tools: `scrape_url`, `crawl_site`, and `extract_structured_data`. */
  getTools(): SkillToolDefinition[];

  // --- Private helpers -------------------------------------------------------

  /**
   * Redirect a target URL through `SPIDER_BASE_URL` when set. Used by the
   * porting-sdk's `audit_skills_dispatch.py` to point every outbound fetch
   * at its loopback fixture without requiring the test URL to resolve.
   * Preserves the path (and any query/fragment) from the original target.
   */
  private static _redirectForAudit;

  /** Fetch a URL with caching and timeout handling. Returns null on failure. */
  private _fetchUrl;

  /**
   * Extract plain text from a fetched response using simple regex stripping.
   *
   * Takes a {@link CachedResponse} (TS equivalent of `requests.Response`) to
   * match Python's `_fast_text_extract(response)` capability surface — the
   * response-like object exposes `url`, `status`, and `body` for callers that
   * need to branch on status/content-type.
   */
  private _fastTextExtract;

  /**
   * Extract a value using a CSS selector (via cheerio) or XPath-like
   * selector (`//tag`). Mirrors Python `_structured_extract`'s selector
   * branching: selectors starting with `/` are treated as XPath (tag-name
   * subset), everything else is full CSS via cheerio's querySelector-style
   * engine.
   */
  private _applySelector;

  /** Markdown extraction using cheerio. Mirrors Python `_markdown_extract`. */
  private _markdownExtract;

  // The members below have no declared types or docs; going by their names
  // they are the structured-extraction helper and the handlers behind the
  // three tools returned by `getTools()` — confirm in the implementation.
  private _structuredExtract;
  private _scrapeUrlHandler;
  private _crawlSiteHandler;
  private _extractStructuredHandler;
}

/**
 * Factory function for creating SpiderSkill instances.
 * @param config - Optional skill configuration.
 * @returns A new SpiderSkill instance.
 */
export declare function createSkill(config?: SkillConfig): SpiderSkill;