/**
 * Content extraction types.
 *
 * @remarks
 * Types for article content extraction using Mozilla Readability.
 *
 * @packageDocumentation
 */
/**
 * Options for content extraction.
 */
interface ContentExtractionOptions {
    /**
     * Base URL for resolving relative links and images.
     * Highly recommended for proper link resolution.
     */
    baseUrl?: string;
    /**
     * Minimum character count for article content.
     * Articles shorter than this are considered too short.
     * @default 500
     */
    charThreshold?: number;
    /**
     * Maximum number of elements to parse.
     * Set to 0 for no limit.
     * @default 0
     */
    maxElemsToParse?: number;
    /**
     * Whether to preserve CSS classes in extracted HTML.
     * @default false
     */
    keepClasses?: boolean;
    /**
     * CSS classes to preserve when keepClasses is false.
     */
    classesToPreserve?: string[];
    /**
     * Whether to skip JSON-LD parsing for metadata.
     * @default false
     */
    disableJSONLD?: boolean;
    /**
     * Check if content is probably readerable before extraction.
     * If true and content is not readerable, returns early with failure.
     * @default false
     */
    checkReadability?: boolean;
    /**
     * Enable debug logging.
     * @default false
     */
    debug?: boolean;
}
/**
 * Successfully extracted content.
*/
interface ExtractedContent {
    /** Extraction succeeded */
    success: true;
    /** Article title */
    title: string;
    /** Cleaned HTML content */
    content: string;
    /** Plain text content (HTML stripped) */
    textContent: string;
    /** Article excerpt/summary */
    excerpt: string;
    /** Author byline */
    byline?: string;
    /** Site name */
    siteName?: string;
    /** Content language code (e.g., 'en', 'de') */
    lang?: string;
    /** Text direction */
    dir?: 'ltr' | 'rtl';
    /** Published time (ISO 8601 string if available) */
    publishedTime?: string;
    /** Character count of text content */
    length: number;
    /** Word count */
    wordCount: number;
    /** Estimated reading time in minutes */
    readingTime: number;
    /** Whether content passed readability check */
    readerable: boolean;
    /** Extraction time in milliseconds */
    extractionTime: number;
}
/**
 * Error types for extraction failures.
 */
type ExtractionErrorType = 'NOT_READERABLE' | 'PARSE_ERROR' | 'EXTRACTION_FAILED' | 'INVALID_HTML' | 'UNKNOWN';
/**
 * Failed content extraction.
 */
interface ExtractionFailure {
    /** Extraction failed */
    success: false;
    /** Error message */
    error: string;
    /** Categorized error type */
    errorType: ExtractionErrorType;
    /** Whether content passed readability check (if checked) */
    readerable: boolean;
    /** Extraction time in milliseconds */
    extractionTime: number;
    /** Original error details (if available) */
    details?: unknown;
}
/**
 * Result of content extraction.
 *
 * @remarks
 * Always returns a result, never throws exceptions.
 * The union is discriminated by the literal `success` field (`true` on
 * {@link ExtractedContent}, `false` on {@link ExtractionFailure}), so checking
 * `result.success` narrows to the matching member.
 */
type ContentResult = ExtractedContent | ExtractionFailure;
/**
 * Quality assessment metrics.
*/
interface ContentQuality {
    /** Word count */
    wordCount: number;
    /** Character count */
    charCount: number;
    /** Estimated reading time in minutes */
    readingTime: number;
    /** Average words per sentence */
    avgWordsPerSentence: number;
    /** Paragraph count */
    paragraphCount: number;
    /** Image count in content */
    imageCount: number;
    /** Link count in content */
    linkCount: number;
    /** Link density (ratio of link text to total text) */
    linkDensity: number;
    /** Overall quality score (0-100) */
    qualityScore: number;
}
/**
 * Main content extraction module.
 *
 * @remarks
 * Extracts article content from HTML using Mozilla Readability.
 * Never throws exceptions - always returns a ContentResult.
 *
 * @packageDocumentation
 */
/**
 * Extract article content from HTML.
 *
 * @remarks
 * Uses Mozilla Readability to extract clean article content from a pre-parsed Document.
 * This function never throws exceptions - always returns a ContentResult.
 *
 * Error handling:
 * - Returns success: false for any extraction failure
 * - Categorizes errors by type for better handling
 * - Includes extraction time even for failures
 *
 * @param doc - Pre-parsed Document to extract content from
 * @param options - Extraction options
 * @returns Extraction result (success or failure)
 *
 * @example
 * ```typescript
 * import { parseHTML } from '../utils/html-parser.js';
 * import { extractSEO } from '../metadata/index.js';
 *
 * const doc = parseHTML(html);
 * const metadata = extractSEO(doc);
 * const content = extractContent(doc, {
 *   baseUrl: 'https://example.com/article',
 *   charThreshold: 300,
 *   checkReadability: true,
 * });
 *
 * if (content.success) {
 *   console.log(content.title);
 *   console.log(content.wordCount);
 *   console.log(`${content.readingTime} min read`);
 * } else {
 *   console.error(content.error);
 * }
 * ```
 */
declare function extractContent(doc: Document, options?: ContentExtractionOptions): ContentResult;
/**
 * HTML to text conversion types.
*
 * @remarks
 * Types for converting HTML to plain text with the `htmlToText` function.
 *
 * @packageDocumentation
 */
/**
 * Options for HTML to plain text conversion.
 */
interface HtmlToTextOptions {
    /**
     * How to treat the input HTML.
     *
     * @remarks
     * - `"fragment"`: Treat as HTML fragment (default)
     * - `"document"`: Treat as full document (ignores `<head>` content)
     *
     * @defaultValue `"fragment"`
     */
    mode?: 'fragment' | 'document';
    /**
     * How to render anchor (`<a>`) tags.
     *
     * @remarks
     * - `"text"`: Show only the link text (default)
     * - `"inline"`: Show text followed by URL in parentheses, e.g., "Click here (https://example.com)"
     * - `"remove"`: Remove links entirely
     *
     * @defaultValue `"text"`
     */
    links?: 'text' | 'inline' | 'remove';
    /**
     * How to render image (`<img>`) tags.
     *
     * @remarks
     * - `"alt"`: Show the alt text (default)
     * - `"remove"`: Remove images entirely
     *
     * @defaultValue `"alt"`
     */
    images?: 'alt' | 'remove';
    /**
     * Collapse consecutive whitespace outside preserved tags.
     *
     * @remarks
     * When `true`, multiple spaces, tabs, and line breaks are collapsed into single spaces.
     * Whitespace inside preserved tags (e.g., `<pre>`, `<code>`) is always kept intact.
     *
     * @defaultValue `true`
     */
    collapseWhitespace?: boolean;
    /**
     * Maximum consecutive newlines allowed after compaction.
     *
     * @remarks
     * Limits runs of newlines to this value. Set to `1` for single spacing,
     * `2` for double spacing (default), or higher values as needed.
     *
     * @defaultValue `2`
     */
    maxNewlines?: number;
    /**
     * Optional hard-wrap column width.
     *
     * @remarks
     * When set to a positive number, lines will be wrapped at this column width.
     * Does not wrap inside preserved tags like `<pre>` or `<code>`.
     * Set to `null` to disable wrapping (default).
     *
     * @defaultValue `null`
     */
    wrap?: number | null;
    /**
     * Separator between table cells.
     *
     * @remarks
     * - `"tab"`: Use tab character (default)
     * - `"space"`: Use space character
     *
     * @defaultValue `"tab"`
     */
    tableCellSeparator?: 'tab' | 'space';
    /**
     * HTML tags to exclude entirely along with their contents.
     *
     * @remarks
     * By default excludes: `script`, `style`, `noscript`, `template`, `svg`, `canvas`
     *
     * @defaultValue `["script", "style", "noscript", "template", "svg", "canvas"]`
     */
    excludeTags?: string[];
    /**
     * Decode HTML entities.
     *
     * @remarks
     * When `true`, decodes entities like `&amp;`, `&lt;`, `&mdash;`, etc.
     *
     * @defaultValue `true`
     */
    decodeEntities?: boolean;
    /**
     * Tags whose internal whitespace is preserved.
     *
     * @remarks
     * These tags will not have their whitespace collapsed, allowing proper
     * formatting of code blocks and preformatted text.
     *
     * @defaultValue `["pre", "code", "textarea"]`
     */
    preserveTags?: string[];
    /**
     * Trim leading and trailing whitespace from the result.
     *
     * @defaultValue `true`
     */
    trim?: boolean;
}

/**
 * HTML to text conversion.
 *
 * @remarks
 * Convert HTML to plain text using a zero-dependency streaming tokenizer.
 * Pure, deterministic transformation suitable for logs, previews, classification,
 * and search indexing. Preserves essential structure by inserting newlines at
 * block boundaries, handles entities, and provides configurable options.
 *
 * @packageDocumentation
 */

/**
 * Convert an HTML string to plain text.
 *
 * @remarks
 * This function uses a streaming tokenizer to parse HTML and extract text content.
 * It handles block elements, whitespace preservation, HTML entities, tables, and more.
 *
 * Features:
 * - Preserves document structure with appropriate line breaks
 * - Handles HTML entities (numeric and common named entities)
 * - Configurable link and image handling
 * - Table rendering with configurable cell separators
 * - Whitespace preservation for code/pre blocks
 * - Optional hard-wrapping at column width
 *
 * @param html - HTML string (fragment or full document)
 * @param options - Conversion options
 * @returns Plain text string
 *
 * @throws {TypeError} If html is not a string
 *
 * @example
 * ```typescript
 * const html = '<p>Hello</p><p>World!</p>';
 * const text = htmlToText(html);
 * console.log(text); // "Hello\n\nWorld!"
 * ```
 *
 * @example
 * ```typescript
 * const html = '<a href="https://example.com">Visit</a>';
 * const text = htmlToText(html, { links: 'inline' });
 * console.log(text); // "Visit (https://example.com)"
 * ```
 */
declare function htmlToText(html: string, options?: HtmlToTextOptions): string;
/**
 * Content quality assessment.
 *
 * @remarks
 * Analyzes extracted content to provide quality metrics.
 *
 * @packageDocumentation
 */
/**
 * Calculate word count from text.
 *
 * @param text - Text to count words in
 * @returns Number of words
 */
declare function countWords(text: string): number;
/**
 * Calculate reading time in minutes.
 *
 * @remarks
 * Uses average reading speed of 200 words per minute.
 *
 * @param wordCount - Number of words
 * @returns Estimated reading time in minutes
 */
declare function calculateReadingTime(wordCount: number): number;
/**
 * Assess content quality.
 *
 * @remarks
 * Analyzes extracted content and returns comprehensive quality metrics.
 *
 * @param content - Extracted content
 * @returns Quality assessment
 *
 * @example
 * ```typescript
 * // extractContent takes a pre-parsed Document, not an HTML string
 * const doc = parseHTML(html);
 * const content = extractContent(doc);
 * if (content.success) {
 *   const quality = assessContentQuality(content);
 *   console.log(`Quality score: ${quality.qualityScore}/100`);
 *   console.log(`Reading time: ${quality.readingTime} minutes`);
 * }
 * ```
 */
declare function assessContentQuality(content: ExtractedContent): ContentQuality;
/**
 * Mozilla Readability wrapper with linkedom.
 *
 * @remarks
 * Provides a clean interface to Mozilla Readability using linkedom as the DOM implementation.
 *
 * @packageDocumentation
 */
/**
 * Check if HTML content is probably readerable.
 *
 * @remarks
 * Quick check to determine if content extraction is likely to succeed.
 * This is a heuristic check and may produce false positives/negatives.
*
 * @param doc - Pre-parsed Document to check
 * @param options - Readability check options
 * @returns True if content appears to be an article
 *
 * @example
 * ```typescript
 * import { parseHTML } from '../utils/html-parser.js';
 *
 * const doc = parseHTML(html);
 * if (isProbablyReaderable(doc)) {
 *   const result = extractContent(doc);
 * }
 * ```
 */
declare function isProbablyReaderable(doc: Document, options?: {
    minContentLength?: number;
    minScore?: number;
}): boolean;
/**
 * Feed format detection utilities.
 *
 * @packageDocumentation
 */
/**
 * Feed format type.
 *
 * @remarks
 * Represents the detected or expected format of a feed.
 * - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
 * - `'atom'` - Atom 1.0
 * - `'json-feed'` - JSON Feed 1.0 or 1.1
 * - `'sitemap'` - XML Sitemap (urlset or sitemapindex)
 * - `'unknown'` - Format could not be determined
 */
type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'sitemap' | 'unknown';
/**
 * Detect feed format from content string.
 *
 * @remarks
 * Analyzes the content to determine if it's RSS, Atom, or JSON Feed.
 * Detection is based on root elements, namespaces, and structure.
 *
 * Detection priority:
 * 1. JSON Feed (checks for JSON with jsonfeed.org version)
 * 2. RSS (checks for `<rss>` or `<rdf:RDF>` root elements)
 * 3. Atom (checks for `<feed>` root element with Atom namespace)
 *
 * @param content - Feed content as string
 * @returns Detected format or 'unknown' if format cannot be determined
 *
 * @example
 * ```typescript
 * const format = detectFormat(feedContent);
 * if (format === 'rss') {
 *   console.log('This is an RSS feed');
 * }
 * ```
 */
declare function detectFormat(content: string): FeedFormat;
/**
 * Check if content is a valid feed (any format).
 *
 * @param content - Feed content as string
 * @returns `true` if content is RSS, Atom, or JSON Feed
 *
 * @example
 * ```typescript
 * if (isFeed(content)) {
 *   const result = parseFeed(content);
 * }
 * ```
 */
declare function isFeed(content: string): boolean;
/**
 * Check if content is RSS format.
*
 * @param content - Feed content as string
 * @returns `true` if content is RSS (any version)
 */
declare function isRSS(content: string): boolean;
/**
 * Check if content is Atom format.
 *
 * @param content - Feed content as string
 * @returns `true` if content is Atom 1.0
 */
declare function isAtom(content: string): boolean;
/**
 * Check if content is JSON Feed format.
 *
 * @param content - Feed content as string
 * @returns `true` if content is JSON Feed (1.0 or 1.1)
 */
declare function isJSONFeed(content: string): boolean;
/**
 * Unified feed types - normalized interface across all feed formats.
 *
 * @remarks
 * These types provide a consistent interface for working with feeds regardless
 * of the original format (RSS, Atom, or JSON Feed). All format-specific data
 * is normalized to this structure by the parser.
 *
 * @packageDocumentation
 */
/**
 * Feed author information.
 *
 * @remarks
 * Represents author/contributor information normalized across all feed formats.
 * Not all formats provide all fields.
 */
interface FeedAuthor {
    /** Author's name */
    name?: string;
    /** Author's email address */
    email?: string;
    /** Author's website URL */
    url?: string;
}
/**
 * Feed enclosure (attached file).
 *
 * @remarks
 * Represents attached files like audio, video, or documents. Commonly used
 * for podcasts and media feeds.
 */
interface FeedEnclosure {
    /** URL of the attached file */
    url: string;
    /** MIME type of the file (e.g., 'audio/mpeg', 'video/mp4') */
    type?: string;
    /** File size in bytes */
    length?: number;
}
/**
 * Feed item (entry/article/post).
 *
 * @remarks
 * Represents a single item in a feed. Items are normalized across all formats
 * to provide a consistent interface. Not all fields are available in all formats.
*/
interface FeedItem {
    /** Unique identifier for the item (GUID, ID, or URL) */
    id: string;
    /** Item title */
    title?: string;
    /** Canonical URL for the item */
    url?: string;
    /** External URL for linked posts (when different from canonical URL) */
    externalUrl?: string;
    /** Full HTML content of the item */
    contentHtml?: string;
    /** Plain text content of the item */
    contentText?: string;
    /** Short summary or description */
    summary?: string;
    /** Publication date in ISO 8601 format */
    published?: string;
    /** Last modified date in ISO 8601 format */
    modified?: string;
    /** Item authors (may be empty if using feed-level authors) */
    authors?: FeedAuthor[];
    /** Tags, categories, or keywords */
    tags?: string[];
    /** Featured image URL */
    image?: string;
    /** Attached files (audio, video, documents) */
    enclosures?: FeedEnclosure[];
}
/**
 * Normalized feed data.
 *
 * @remarks
 * The main feed object containing metadata and items. This is the recommended
 * interface for working with feeds as it provides a consistent structure
 * regardless of the original format.
 */
interface Feed {
    /** Original feed format */
    format: 'rss' | 'atom' | 'json-feed' | 'sitemap';
    /** Feed title (required) */
    title: string;
    /** Feed description or subtitle */
    description?: string;
    /** Feed's home page URL */
    url?: string;
    /** Feed's own URL (self-reference) */
    feedUrl?: string;
    /** Feed language code (e.g., 'en', 'de') */
    language?: string;
    /** Feed icon or logo URL */
    image?: string;
    /** Feed-level authors */
    authors?: FeedAuthor[];
    /** Last update date in ISO 8601 format */
    updated?: string;
    /** Feed items (entries/articles/posts) */
    items: FeedItem[];
}
/**
 * Parse result containing both normalized and original data.
 *
 * @remarks
 * Returned by {@link parseFeed}. Contains both the normalized feed data
 * (recommended for most use cases) and the original format-specific data
 * (for advanced use cases requiring format-specific fields).
*/
interface ParseResult {
    /** Normalized feed data (recommended) */
    feed: Feed;
    /** Original format-specific data (advanced use) */
    original: unknown;
}
/**
 * Unified feed parser with automatic format detection.
 *
 * @packageDocumentation
 */
/**
 * Parse any feed format with automatic format detection.
 *
 * @remarks
 * This is the main entry point for feed parsing. It automatically detects whether
 * the content is RSS, Atom, or JSON Feed, parses it, and returns a normalized
 * output structure along with the original format-specific data.
 *
 * All relative URLs in the feed are converted to absolute URLs if a base URL is provided.
 * This is essential for feed readers that need to fetch images, enclosures, or follow links.
 *
 * @param content - Feed content as string (XML or JSON)
 * @param baseUrl - Optional base URL for resolving relative URLs (string or URL object)
 * @returns Object containing normalized feed data and original format-specific data
 * @throws Error if format cannot be detected or parsing fails
 *
 * @example
 * ```typescript
 * const feedContent = await fetch('https://example.com/feed.xml').then(r => r.text());
 * const result = parseFeed(feedContent, 'https://example.com/feed.xml');
 *
 * console.log(result.feed.title);
 * console.log(result.feed.items[0].title);
 * console.log(result.feed.items[0].url); // Absolute URL
 * ```
 */
declare function parseFeed(content: string, baseUrl?: string | URL): ParseResult;
/**
 * Types for high-level gathering functionality.
 *
 * @packageDocumentation
 */
/**
 * Gathered website data.
 *
 * @remarks
 * This interface represents the complete gathered data from a website,
 * including the authoritative URL and all extracted metadata.
 * It will be extended incrementally with more properties.
 */
interface Website {
    /**
     * Authoritative URL for the page.
     *
     * @remarks
     * Uses canonical URL if present, otherwise the final URL after redirects.
     */
    url: URL;
    /** Discovered feed URLs (RSS, Atom, JSON Feed) as URL objects */
    feeds: URL[];
    /**
     * Page title (cleaned, from best available source).
     *
     * @remarks
     * Collects titles from multiple sources, cleans them, and picks the longest.
     * Sources: OpenGraph, Twitter Card, HTML title tag, First H1
     */
    title?: string;
    /**
     * Page description (from best available source).
     *
     * @remarks
     * Collects descriptions from metadata and picks the longest.
     * Sources: OpenGraph, Twitter Card, HTML meta description
     */
    description?: string;
    /**
     * Page keyvisual/image URL (from best available source).
     *
     * @remarks
     * Priority: OpenGraph > Twitter Card > Largest Apple Touch Icon > Favicon
     * Returns the URL object of the best visual representation of the site.
     */
    image?: URL;
    /**
     * Best available icon/favicon for the site.
     *
     * @remarks
     * Priority: Largest Apple Touch Icon > Safari mask icon > Favicon > Shortcut icon > MS tile > Fluid icon
     * Returns the highest quality icon available, preferring modern, high-resolution formats.
     */
    icon?: URL;
    /**
     * Primary language code (ISO 639-1).
     *
     * @remarks
     * Extracted from HTML lang attribute, content-language meta tag, or OpenGraph locale.
     * Normalized to lowercase ISO 639-1 format (e.g., 'en', 'de', 'fr', 'ja').
     */
    language?: string;
    /**
     * Region code (ISO 3166-1 alpha-2).
     *
     * @remarks
     * Only present if the language includes a region specifier.
     * Normalized to uppercase ISO 3166-1 alpha-2 format (e.g., 'US', 'GB', 'DE').
     */
    region?: string;
    /**
     * Raw HTML content of the page (UTF-8).
     *
     * @remarks
     * The complete HTML source after fetching and decoding to UTF-8.
     * Useful for custom processing or caching.
     */
    html: string;
    /**
     * Plain text content extracted from the HTML.
     *
     * @remarks
     * Automatically converted from HTML using the `htmlToText` function.
     * Removes all tags, decodes entities, and preserves document structure
     * with appropriate line breaks.
     */
    text: string;
    /**
     * Internal links found on the page (same domain, excluding current URL).
     *
     * @remarks
     * All links are URL objects. The current page URL is excluded to avoid
     * self-references. Useful for site crawling and navigation analysis.
     */
    internalLinks: URL[];
    /**
     * External links found on the page (different domains).
     *
     * @remarks
     * All links are URL objects. Useful for analyzing outbound links,
     * citations, and external resources.
     */
    externalLinks: URL[];
}
/**
 * Gathered article data.
 *
 * @remarks
 * This interface represents the complete gathered data from an article page,
 * including the authoritative URL, raw HTML, and extracted content.
 * It will be extended incrementally with more properties.
 */
interface Article {
    /**
     * Authoritative URL for the article.
     *
     * @remarks
     * Uses canonical URL if present, otherwise the final URL after redirects.
     */
    url: URL;
    /**
     * Raw HTML content of the article page (UTF-8).
     *
     * @remarks
     * The complete HTML source after fetching and decoding to UTF-8.
     * Useful for custom processing or caching.
     */
    html: string;
    /**
     * Plain text content extracted from the HTML.
     *
     * @remarks
     * Automatically converted from HTML using the `htmlToText` function.
     * Removes all tags, decodes entities, and preserves document structure
     * with appropriate line breaks.
     */
    text: string;
    /**
     * Cleaned article content (plain text).
     *
     * @remarks
     * Extracted using Mozilla Readability (cleaned HTML), then converted to
     * plain text using `htmlToText` for proper formatting.
     * This is the main article body without navigation, ads, or other clutter.
     * Falls back to undefined if Readability extraction fails.
     */
    content?: string;
    /**
     * Article title.
     *
     * @remarks
     * Extracted from Mozilla Readability if available.
     * Falls back to metadata (Schema.org, OpenGraph, Twitter Card, HTML title)
     * if Readability extraction fails or title is empty.
     */
    title?: string;
    /**
     * Article description/excerpt.
     *
     * @remarks
     * Extracted from Mozilla Readability's excerpt if available.
     * Falls back to metadata (OpenGraph, Twitter Card, HTML meta description)
     * if Readability excerpt is empty or extraction fails.
     */
    description?: string;
    /**
     * Article keyvisual/image URL (from best available source).
     *
     * @remarks
     * Priority: Schema.org NewsArticle/Article (largest) > OpenGraph > Twitter Card > Largest Apple Touch Icon > Favicon
     * Returns the URL object of the best visual representation of the article.
     */
    image?: URL;
    /**
     * Primary language code (ISO 639-1).
     *
     * @remarks
     * Extracted from HTML lang attribute, Content-Language meta, or OpenGraph locale.
     * Returns lowercase 2-letter ISO 639-1 code (e.g., 'en', 'de', 'fr').
     */
    language?: string;
    /**
     * Region/country code (ISO 3166-1 alpha-2).
     *
     * @remarks
     * Extracted from language tags like 'en-US' or 'de-DE'.
     * Returns uppercase 2-letter ISO 3166-1 alpha-2 code (e.g., 'US', 'GB', 'DE').
     */
    region?: string;
    /**
     * Internal links found in the article (same domain/subdomain).
     *
     * @remarks
     * Links pointing to pages within the same domain.
     * Automatically excludes the current article URL.
     * All URLs are absolute and normalized.
     */
    internalLinks: URL[];
    /**
     * External links found in the article (different domains).
     *
     * @remarks
     * Links pointing to external domains (useful for citations, references).
     * All URLs are absolute and normalized.
     */
    externalLinks: URL[];
    /**
     * Word count of the article.
     *
     * @remarks
     * Calculated from `content` if available (Readability-cleaned content),
     * otherwise calculated from `text` (full page text).
     * Based on whitespace-separated word boundaries.
     */
    wordCount: number;
    /**
     * Estimated reading time in minutes.
     *
     * @remarks
     * Calculated from word count using average reading speed of 200 words per minute.
     * Minimum value is 1 minute.
     */
    readingTime: number;
}
/**
 * High-level article gathering functionality.
 *
 * @packageDocumentation
 */
/**
 * Gather article data from a URL in one convenient call.
*
 * @remarks
 * This is a high-level convenience method that fetches an article page and extracts
 * relevant data. It handles encoding detection, redirects, and provides
 * a unified interface for all article data.
 *
 * This method will be extended incrementally to include metadata extraction,
 * content extraction, and more.
 *
 * @param url - Article URL as string or URL object
 * @returns Gathered article data including URL, content, metadata, language, and links
 * @throws Error if URL is invalid or fetch fails
 *
 * @example
 * ```typescript
 * // Fetch an article and get its data
 * const article = await gatherArticle('https://example.com/article');
 * console.log(article.url); // Final URL after redirects
 * console.log(article.html); // Raw HTML content (UTF-8)
 * console.log(article.text); // Plain text (full page HTML converted)
 * console.log(article.content); // Cleaned article content (Readability + htmlToText)
 * console.log(article.title); // Article title (from Readability or metadata)
 * console.log(article.description); // Article excerpt or description
 * console.log(article.image); // Article keyvisual/image (from best source)
 * console.log(article.language); // Language code (ISO 639-1, e.g., 'en')
 * console.log(article.region); // Region code (ISO 3166-1 alpha-2, e.g., 'US')
 * console.log(article.internalLinks); // Array of internal link URLs
 * console.log(article.externalLinks); // Array of external link URLs
 * console.log(article.wordCount); // Word count (from content or text)
 * console.log(article.readingTime); // Estimated reading time in minutes
 * ```
 */
declare function gatherArticle(url: string | URL): Promise<Article>;
/**
 * High-level feed gathering functionality.
 *
 * @packageDocumentation
 */
/**
 * Gather and parse a feed from a URL in one convenient call.
 *
 * @remarks
 * This is a high-level convenience method that combines fetching and parsing.
 * It handles encoding detection, redirects, and feed format detection automatically.
 * Falls back to sitemap parsing when standard feed formats aren't detected.
 *
 * @param url - Feed URL as string or URL object
 * @returns Normalized feed data
 * @throws Error if URL is invalid, fetch fails, or feed cannot be parsed
 *
 * @example
 * ```typescript
 * // Fetch and parse a feed
 * const feed = await gatherFeed('https://example.com/feed.xml');
 *
 * console.log(feed.title);
 * console.log(feed.items[0].title);
 * console.log(feed.items[0].url);
 * ```
 */
declare function gatherFeed(url: string | URL): Promise<Feed>;
/**
 * High-level website gathering functionality.
 *
 * @packageDocumentation
 */
/**
 * Gather website data from a URL in one convenient call.
 *
 * @remarks
 * This is a high-level convenience method that fetches a website and extracts
 * all relevant data. It handles encoding detection, redirects, and provides
 * a unified interface for all website data.
 *
 * This method will be extended incrementally to include metadata extraction,
 * content extraction, and more.
*
 * @param url - Website URL as string or URL object
 * @returns Gathered website data including final URL, title, description, image, icon, language, html, text, feeds, and links
 * @throws Error if URL is invalid or fetch fails
 *
 * @example
 * ```typescript
 * // Fetch a website and get its data
 * const site = await gatherWebsite('https://example.com');
 * console.log(site.url); // Final URL after redirects
 * console.log(site.title); // Page title (cleaned, from best source)
 * console.log(site.description); // Page description (from best source)
 * console.log(site.image); // Page image/keyvisual (from best source)
 * console.log(site.icon); // Best available icon/favicon
 * console.log(site.language); // Primary language code (ISO 639-1)
 * console.log(site.region); // Region code (ISO 3166-1 alpha-2)
 * console.log(site.html); // Raw HTML content (UTF-8)
 * console.log(site.text); // Plain text content (extracted from HTML)
 * console.log(site.feeds); // Array of feed URL objects
 * console.log(site.internalLinks); // Array of internal link URL objects
 * console.log(site.externalLinks); // Array of external link URL objects
 * ```
 */
declare function gatherWebsite(url: string | URL): Promise<Website>;
/**
 * HTML parsing utilities using linkedom.
 *
 * @remarks
 * This module provides a simple wrapper around linkedom for consistent
 * HTML parsing across all metadata extraction modules. Parsing should happen
 * once at the top level and the parsed document passed to all extractors.
 *
 * @packageDocumentation
 */
/**
 * Parse HTML string into a DOM document.
 *
 * @remarks
 * Parses HTML using linkedom, providing a standards-compliant DOM implementation.
 * This should be called once per document, with the result passed to all metadata
 * extractors for performance.
 *
 * Never throws - returns a document even for malformed HTML.
 *
 * @param html - HTML string to parse
 * @param baseUrl - Optional base URL for resolving relative URLs
 * @returns Parsed DOM document
 *
 * @example
 * ```typescript
 * const doc = parseHTML('<html><head><title>Test</title></head></html>');
 * const title = doc.querySelector('title')?.textContent;
 * ```
 */
declare function parseHTML(html: string, baseUrl?: string): Document;
/**
 * Alias for the DOM `Document` type.
 */
type HTMLDocument = Document;
/**
 * Input type that accepts either a parsed Document or raw HTML string.
 * This allows extractor functions to be more forgiving.
 */
type DocumentInput = Document | string;
/**
 * Analytics and tracking types.
 *
 * @remarks
 * Types for analytics service detection (IDs only, no tracking).
 *
 * @packageDocumentation
 */
/**
 * Analytics metadata.
 *
 * @remarks
 * Contains detected analytics service IDs. Privacy-conscious - only extracts IDs,
 * doesn't perform any tracking.
 */
interface AnalyticsMetadata {
    /** Google Analytics tracking IDs (UA-, G-, GT- prefixes) */
    googleAnalytics?: string[];
    /** Google Tag Manager container IDs */
    googleTagManager?: string[];
    /** Facebook Pixel IDs */
    facebookPixel?: string[];
    /** Matomo/Piwik site IDs */
    matomo?: string[];
    /** Plausible Analytics domains */
    plausible?: string[];
    /** Adobe Analytics (Omniture) IDs */
    adobe?: string[];
    /** Cloudflare Web Analytics tokens */
    cloudflare?: string[];
    /** Fathom Analytics site IDs */
    fathom?: string[];
}
/**
 * Analytics and tracking extraction.
 *
 * @remarks
 * Detects analytics service IDs from HTML documents.
 * Privacy-conscious - only extracts IDs, doesn't perform any tracking.
 *
 * @packageDocumentation
 */
/**
 * Extract analytics metadata from HTML.
 *
 * @remarks
 * Detects analytics service IDs by examining script tags and their content.
 * Only extracts identifiers, does not track or collect user data.
*
 * @param input - Parsed HTML document or raw HTML string
 * @returns Analytics metadata
 *
 * @example
 * ```typescript
 * // With parsed document (recommended for multiple extractions)
 * const doc = parseHTML(htmlString);
 * const analytics = extractAnalytics(doc);
 *
 * // Or directly with HTML string
 * const analytics = extractAnalytics(htmlString);
 * ```
 */
declare function extractAnalytics(input: DocumentInput): AnalyticsMetadata;
/**
 * Assets extraction types.
 *
 * @remarks
 * Types for categorized asset URLs extracted from HTML documents.
 *
 * @author Anonyfox
 * @license MIT
 * @see {@link https://github.com/Anonyfox/ravenjs}
 * @see {@link https://ravenjs.dev}
 * @see {@link https://anonyfox.com}
 *
 * @packageDocumentation
 */
/**
 * Categorized assets extracted from HTML.
 *
 * @remarks
 * Contains all external assets referenced in the document, organized by type.
 * All URLs are normalized to absolute format if a base URL is available.
 */
interface AssetsMetadata {
    /** Image URLs from img, picture, srcset, and meta tags */
    images?: string[];
    /** Stylesheet URLs from link tags */
    stylesheets?: string[];
    /** Script URLs from script tags */
    scripts?: string[];
    /** Font URLs extracted from CSS */
    fonts?: string[];
    /** Media URLs from video, audio, source, and track elements */
    media?: string[];
    /** Web app manifest URLs */
    manifests?: string[];
    /** Preload/prefetch resource hints */
    preloads?: PreloadResource[];
    /** DNS prefetch and preconnect hints */
    connectionHints?: ConnectionHint[];
}
/**
 * Preload or prefetch resource hint.
 */
interface PreloadResource {
    /** Resource URL */
    url: string;
    /** Resource type (script, style, font, image, etc.) */
    as?: string;
    /** MIME type */
    type?: string;
    /** Crossorigin attribute */
    crossorigin?: string;
    /** Whether this is a prefetch (true) or preload (false) */
    prefetch?: boolean;
}
/**
 * DNS prefetch or preconnect hint.
*/
interface ConnectionHint {
    /** Domain URL */
    url: string;
    /** Whether this is a preconnect (true) or dns-prefetch (false) */
    preconnect?: boolean;
    /** Crossorigin attribute */
    crossorigin?: string;
}
/**
 * Assets extraction.
 *
 * @remarks
 * Extracts categorized asset URLs from HTML documents.
 *
 * @author Anonyfox
 * @license MIT
 * @see {@link https://github.com/Anonyfox/ravenjs}
 * @see {@link https://ravenjs.dev}
 * @see {@link https://anonyfox.com}
 *
 * @packageDocumentation
 */
/**
 * Extract assets metadata from HTML.
 *
 * @remarks
 * Extracts all external assets referenced in the document, organized by type.
 * All URLs are normalized to absolute format based on the document's base URL.
 *
 * The extractor finds assets from:
 * - Images: ``, ``, `srcset`, OpenGraph meta tags
 * - Stylesheets: ``
 * - Scripts: `