/**
 * Web Search Skill - Searches the web using the Google Custom Search API.
 *
 * Tier 3 built-in skill: requires `GOOGLE_SEARCH_API_KEY` and
 * `GOOGLE_SEARCH_ENGINE_ID` (legacy alias: `GOOGLE_SEARCH_CX`) environment
 * variables, or the equivalent config params. Uses the Google Custom Search
 * JSON API to perform web searches and return formatted results.
 */
import { SkillBase } from '../SkillBase.js';
import type { SkillToolDefinition, SkillPromptSection, SkillConfig, ParameterSchemaEntry } from '../SkillBase.js';

/**
 * Run the HTML → plain-text extraction pipeline that mirrors Python's
 * `GoogleSearchScraper.extract_html_content` (skill.py:204-282).
 *
 * Pipeline:
 *   1. Pre-process `<![CDATA[...]]>` sections to plain text (Python html.parser
 *      keeps CDATA content; cheerio/htmlparser2 drops it).
 *   2. Walk {@link CONTENT_CANDIDATES} in order, picking the first match.
 *      Fall back to `<body>` or the raw document.
 *   3. Clone the picked subtree so tag/pattern removal only affects it —
 *      analogous to Python's `content_soup = BeautifulSoup(str(main_content))`.
 *      The selector-first-then-filter order is load-bearing: if the real
 *      content is wrapped in a sidebar-pattern div, Python still finds it
 *      because the selector runs before removal.
 *   4. Remove {@link UNWANTED_TAGS} and every element matching any of
 *      {@link UNWANTED_PATTERNS} on either `class` or `id` (case-insensitive).
 *   5. Collapse whitespace and trim.
 *
 * Exported so `tests/skills/web-search-parity.test.ts` can verify byte-
 * identical behavior against Python BeautifulSoup fixtures; not re-exported
 * from the skills barrel, so this stays internal to the skill module.
 *
 * @param html - Raw HTML document to extract text from.
 * @returns The extracted plain text, whitespace-collapsed and trimmed.
 *
 * @internal
 */
export declare function extractTextFromHtml(html: string): string;

/**
 * Searches the web using the Google Custom Search JSON API.
 *
 * Tier 3 built-in skill. Credentials can be supplied via the `api_key` and
 * `search_engine_id` params or `GOOGLE_SEARCH_API_KEY` /
 * `GOOGLE_SEARCH_ENGINE_ID` (legacy: `GOOGLE_SEARCH_CX`) environment variables.
 *
 * The handler mirrors Python's `search_and_scrape_best` pipeline: fetches
 * `oversample_factor × num_results` candidates from Google, scrapes each
 * result page (SSRF-guarded, cheerio-based text extraction), scores for
 * quality (length + query relevance + boilerplate penalty), deduplicates by
 * domain, and returns the top `num_results` above `min_quality_score` with
 * full page content. If every scrape fails or falls below the threshold the
 * handler falls back to raw API snippets so the agent still has something
 * to say.
 *
 * Supported config: `tool_name`, `num_results`, `no_results_message`,
 * `safe_search`, `delay`, `max_content_length`, `oversample_factor`,
 * `min_quality_score`.
 *
 * @example
 * ```ts
 * agent.addSkill('web_search', {
 *   api_key: process.env.GOOGLE_SEARCH_API_KEY,
 *   search_engine_id: process.env.GOOGLE_SEARCH_ENGINE_ID,
 *   num_results: 3,
 * });
 * ```
 */
export declare class WebSearchSkill extends SkillBase {
  static SKILL_NAME: string;
  static SKILL_DESCRIPTION: string;
  static SKILL_VERSION: string;
  static REQUIRED_PACKAGES: readonly string[];
  static REQUIRED_ENV_VARS: readonly string[];
  static SUPPORTS_MULTIPLE_INSTANCES: boolean;

  /** @returns The parameter schema for this skill, keyed by parameter name. */
  static getParameterSchema(): Record<string, ParameterSchemaEntry>;

  /*
   * NOTE(review): no declaration follows the comment below in this file;
   * presumably it described a manifest accessor that is no longer declared
   * here. Confirm, then remove it or re-attach it to the right member.
   *
   * @returns Manifest declaring Google Search credentials as required env vars.
   * Reports `GOOGLE_SEARCH_ENGINE_ID` as the canonical name; `GOOGLE_SEARCH_CX`
   * is still accepted as a legacy fallback at runtime.
   */

  /**
   * Validate required credentials before the skill becomes active.
   *
   * Mirrors Python's `setup()` (skill.py:559-600) which checks `api_key` and
   * `search_engine_id` and returns `False` (logging an error) if either is
   * absent. In the TS SDK credentials may also arrive via environment variables
   * (`GOOGLE_SEARCH_API_KEY` / `GOOGLE_SEARCH_ENGINE_ID` or the legacy alias
   * `GOOGLE_SEARCH_CX`), so both config params and env vars are checked.
   * @returns `true` if all required credentials are present, `false` otherwise.
   */
  setup(): Promise<boolean>;

  /**
   * Instance key for the SkillManager. Includes the configured
   * `search_engine_id` (or `"default"`) and `tool_name` (or `"web_search"`)
   * to match Python's `"{SKILL_NAME}_{search_engine_id}_{tool_name}"` scheme.
   */
  getInstanceKey(): string;

  /** Global data injected into the agent's SWML context (mirrors Python). */
  getGlobalData(): Record<string, unknown>;

  /** Resolve the tool name (defaults to `web_search`, matches Python default). */
  private getToolName;

  /**
   * @returns A single tool (named via `tool_name`) that performs a Google
   * Custom Search and returns formatted results.
   */
  getTools(): SkillToolDefinition[];

  /** Apply the `{query}` template to the no-results message. */
  private static _formatNoResultsMessage;

  /**
   * Check whether the URL points at Reddit. Python parity: `is_reddit_url`
   * (skill.py:66).
   */
  private static _isRedditUrl;

  /**
   * Fetch a Reddit URL via the `.json` endpoint and build a structured summary
   * of the post + top comments.
   *
   * Python parity: `extract_reddit_content` (skill.py:71-190). Matches the
   * post-title/author/score/comments assembly and the top-20 → valid → top-5
   * comment pipeline. Returns just the compiled text — Python's
   * `search_and_scrape_best` unconditionally overwrites Reddit's
   * engagement-score metrics with the 6-factor `_calculate_content_quality`
   * (skill.py:447-448), so we skip computing the dead engagement score here
   * and let the handler score the text via `_qualityMetrics`.
   *
   * Falls through to HTML extraction on JSON fetch failure or malformed
   * payload, matching Python's `except Exception: fall back` behavior.
   */
  private _extractRedditContent;

  /**
   * Fetch a URL and extract clean text content, then score it with the
   * 6-factor `_qualityMetrics`. Reddit URLs are routed to the JSON extractor
   * first; the compiled Reddit text is scored with the same 6-factor formula
   * to match Python's `search_and_scrape_best` overwrite behavior
   * (skill.py:447-448).
   *
   * Python parity: `extract_text_from_url` (skill.py:192-202). Returns `null`
   * on any failure (network, non-200, parse error, or SSRF rejection).
   */
  private _scrapeUrl;

  /**
   * Six-factor content quality metrics (score + sub-metrics used in the
   * per-result output). Combines content length, word diversity,
   * boilerplate penalty, sentence structure, domain reputation, and query
   * relevance.
   *
   * Python parity: `_calculate_content_quality` (skill.py:284-414). Preserves
   * the weights (0.25/0.10/0.10/0.15/0.15/0.25), the 26-phrase boilerplate
   * list, the quality/low-quality domain lists, and the phrase-match bonus
   * on relevance. Also returns the same metric fields Python exposes
   * (`text_length`, `sentence_count`, `query_relevance`, `query_words_found`,
   * `domain`) so the handler can render the full Python output format.
   */
  private static _qualityMetrics;

  /** @returns Prompt section describing web search capabilities and usage guidance. */
  protected _getPromptSections(): SkillPromptSection[];
}

/**
 * Factory function for creating WebSearchSkill instances.
 * @param config - Optional skill configuration.
 * @returns A new WebSearchSkill instance.
 */
export declare function createSkill(config?: SkillConfig): WebSearchSkill;