/**
 * Content extraction types.
 *
 * @remarks
 * Type definitions for article content extraction using Mozilla Readability.
 *
 * @packageDocumentation
 */
/**
 * Options for content extraction.
 *
 * @remarks
 * Every field is optional.
 */
interface ContentExtractionOptions {
  /**
   * Base URL used to resolve relative links and images.
   * Highly recommended for proper link resolution.
   */
  baseUrl?: string;
  /**
   * Minimum character count for article content.
   * Articles shorter than this are considered too short.
   *
   * @defaultValue `500`
   */
  charThreshold?: number;
  /**
   * Maximum number of elements to parse.
   * Set to `0` for no limit.
   *
   * @defaultValue `0`
   */
  maxElemsToParse?: number;
  /**
   * Whether to preserve CSS classes in the extracted HTML.
   *
   * @defaultValue `false`
   */
  keepClasses?: boolean;
  /**
   * CSS classes to preserve when `keepClasses` is `false`.
   */
  classesToPreserve?: string[];
  /**
   * Whether to skip JSON-LD parsing for metadata.
   *
   * @defaultValue `false`
   */
  disableJSONLD?: boolean;
  /**
   * Check whether the content is probably readerable before extraction.
   * If `true` and the content is not readerable, extraction returns early
   * with a failure.
   *
   * @defaultValue `false`
   */
  checkReadability?: boolean;
  /**
   * Enable debug logging.
   *
   * @defaultValue `false`
   */
  debug?: boolean;
}
/**
 * Successfully extracted content.
*/
interface ExtractedContent {
  /** Discriminant: extraction succeeded. */
  success: true;
  /** Article title. */
  title: string;
  /** Cleaned HTML content. */
  content: string;
  /** Plain text content (HTML stripped). */
  textContent: string;
  /** Article excerpt/summary. */
  excerpt: string;
  /** Author byline. */
  byline?: string;
  /** Site name. */
  siteName?: string;
  /** Content language code (e.g., 'en', 'de'). */
  lang?: string;
  /** Text direction. */
  dir?: 'ltr' | 'rtl';
  /** Published time (ISO 8601 string if available). */
  publishedTime?: string;
  /** Character count of the text content. */
  length: number;
  /** Word count. */
  wordCount: number;
  /** Estimated reading time in minutes. */
  readingTime: number;
  /** Whether the content passed the readability check. */
  readerable: boolean;
  /** Extraction time in milliseconds. */
  extractionTime: number;
}
/**
 * Error types for extraction failures.
 */
type ExtractionErrorType =
  | 'NOT_READERABLE'
  | 'PARSE_ERROR'
  | 'EXTRACTION_FAILED'
  | 'INVALID_HTML'
  | 'UNKNOWN';
/**
 * Failed content extraction.
 */
interface ExtractionFailure {
  /** Discriminant: extraction failed. */
  success: false;
  /** Error message. */
  error: string;
  /** Categorized error type. */
  errorType: ExtractionErrorType;
  /** Whether the content passed the readability check (if checked). */
  readerable: boolean;
  /** Extraction time in milliseconds. */
  extractionTime: number;
  /** Original error details (if available); `unknown`, so narrow before use. */
  details?: unknown;
}
/**
 * Result of content extraction.
 *
 * @remarks
 * Always returns a result, never throws exceptions. The union is
 * discriminated by `success`.
 */
type ContentResult = ExtractedContent | ExtractionFailure;
/**
 * Quality assessment metrics.
*/
interface ContentQuality {
  /** Word count. */
  wordCount: number;
  /** Character count. */
  charCount: number;
  /** Estimated reading time in minutes. */
  readingTime: number;
  /** Average words per sentence. */
  avgWordsPerSentence: number;
  /** Paragraph count. */
  paragraphCount: number;
  /** Image count in content. */
  imageCount: number;
  /** Link count in content. */
  linkCount: number;
  /** Link density (ratio of link text to total text). */
  linkDensity: number;
  /** Overall quality score (0-100). */
  qualityScore: number;
}
/**
 * Main content extraction module.
 *
 * @remarks
 * Extracts article content from HTML using Mozilla Readability.
 * Never throws exceptions - always returns a ContentResult.
 *
 * @packageDocumentation
 */
/**
 * Extract article content from HTML.
 *
 * @remarks
 * Uses Mozilla Readability to extract clean article content from a
 * pre-parsed Document. This function never throws exceptions - it always
 * returns a ContentResult.
 *
 * Error handling:
 * - Returns `success: false` for any extraction failure
 * - Categorizes errors by type for better handling
 * - Includes extraction time even for failures
 *
 * @param doc - Pre-parsed Document to extract content from
 * @param options - Extraction options
 * @returns Extraction result (success or failure)
 *
 * @example
 * ```typescript
 * import { parseHTML } from '../utils/html-parser.js';
 * import { extractSEO } from '../metadata/index.js';
 *
 * const doc = parseHTML(html);
 * const metadata = extractSEO(doc);
 * const content = extractContent(doc, {
 *   baseUrl: 'https://example.com/article',
 *   charThreshold: 300,
 *   checkReadability: true,
 * });
 *
 * if (content.success) {
 *   console.log(content.title);
 *   console.log(content.wordCount);
 *   console.log(`${content.readingTime} min read`);
 * } else {
 *   console.error(content.error);
 * }
 * ```
 */
declare function extractContent(doc: Document, options?: ContentExtractionOptions): ContentResult;
/**
 * HTML to text conversion types.
*
 * @remarks
 * Types for converting HTML to plain text with the `htmlToText` function.
 *
 * @packageDocumentation
 */
/**
 * Options for HTML to plain text conversion.
 *
 * @remarks
 * NOTE(review): an upstream markup-stripping pass removed literal tag names
 * and entities from these comments. The restored names (`<head>`, `<a>`,
 * `<pre>`, `<code>`, `&amp;`, ...) are reconstructions — confirm against the
 * original source.
 */
interface HtmlToTextOptions {
  /**
   * How to treat the input HTML.
   *
   * @remarks
   * - `"fragment"`: Treat as HTML fragment (default)
   * - `"document"`: Treat as full document (ignores `<head>` content)
   *
   * @defaultValue `"fragment"`
   */
  mode?: 'fragment' | 'document';
  /**
   * How to render anchor (`<a>`) tags.
   *
   * @remarks
   * - `"text"`: Show only the link text (default)
   * - `"inline"`: Show text followed by URL in parentheses, e.g., "Click here (https://example.com)"
   * - `"remove"`: Remove links entirely
   *
   * @defaultValue `"text"`
   */
  links?: 'text' | 'inline' | 'remove';
  /**
   * Whether whitespace is collapsed.
   *
   * @remarks
   * Whitespace inside preserved tags (see `preserveTags`) is always kept intact.
   *
   * NOTE(review): the original comment at this position began with
   * "How to render image (...)" and ran straight into this option's text, so
   * an image-handling option and the first part of this description were
   * probably lost in extraction. Restore both from the original source.
   *
   * @defaultValue `true`
   */
  collapseWhitespace?: boolean;
  /**
   * Maximum consecutive newlines allowed after compaction.
   *
   * @remarks
   * Limits runs of newlines to this value. Set to `1` for single spacing,
   * `2` for double spacing (default), or higher values as needed.
   *
   * @defaultValue `2`
   */
  maxNewlines?: number;
  /**
   * Optional hard-wrap column width.
   *
   * @remarks
   * When set to a positive number, lines will be wrapped at this column width.
   * Does not wrap inside preserved tags like `<pre>` or `<code>`.
   * Set to `null` to disable wrapping (default).
   *
   * @defaultValue `null`
   */
  wrap?: number | null;
  /**
   * Separator between table cells.
   *
   * @remarks
   * - `"tab"`: Use tab character (default)
   * - `"space"`: Use space character
   *
   * @defaultValue `"tab"`
   */
  tableCellSeparator?: 'tab' | 'space';
  /**
   * HTML tags to exclude entirely along with their contents.
   *
   * @remarks
   * By default excludes: `script`, `style`, `noscript`, `template`, `svg`, `canvas`
   *
   * @defaultValue `["script", "style", "noscript", "template", "svg", "canvas"]`
   */
  excludeTags?: string[];
  /**
   * Decode HTML entities.
   *
   * @remarks
   * When `true`, decodes entities like `&amp;`, `&lt;`, `&mdash;`, etc.
   *
   * @defaultValue `true`
   */
  decodeEntities?: boolean;
  /**
   * Tags whose internal whitespace is preserved.
   *
   * @remarks
   * These tags will not have their whitespace collapsed, allowing proper
   * formatting of code blocks and preformatted text.
   *
   * @defaultValue `["pre", "code", "textarea"]`
   */
  preserveTags?: string[];
  /**
   * Trim leading and trailing whitespace from the result.
   *
   * @defaultValue `true`
   */
  trim?: boolean;
}
/**
* HTML to text conversion.
*
* @remarks
* Convert HTML to plain text using a zero-dependency streaming tokenizer.
* Pure, deterministic transformation suitable for logs, previews, classification,
* and search indexing. Preserves essential structure by inserting newlines at
* block boundaries, handles entities, and provides configurable options.
*
* @packageDocumentation
*/
/**
 * Convert an HTML string to plain text.
 *
 * @remarks
 * This function uses a streaming tokenizer to parse HTML and extract text content.
 * It handles block elements, whitespace preservation, HTML entities, tables, and more.
 *
 * Features:
 * - Preserves document structure with appropriate line breaks
 * - Handles HTML entities (numeric and common named entities)
 * - Configurable link and image handling
 * - Table rendering with configurable cell separators
 * - Whitespace preservation for code/pre blocks
 * - Optional hard-wrapping at column width
 *
 * @param html - HTML string (fragment or full document)
 * @param options - Conversion options
 * @returns Plain text string
 *
 * @throws {TypeError} If html is not a string
 *
 * @example
 * ```typescript
 * // NOTE(review): the markup in this literal was stripped from the original
 * // comment; it is reconstructed from the documented output.
 * const html = '<p>Hello</p><p>World!</p>';
 * const text = htmlToText(html);
 * console.log(text); // "Hello\n\nWorld!"
 * ```
 *
 * @example
 * ```typescript
 * // NOTE(review): markup reconstructed from the documented output.
 * const html = '<a href="https://example.com">Visit</a>';
 * const text = htmlToText(html, { links: 'inline' });
 * console.log(text); // "Visit (https://example.com)"
 * ```
 */
declare function htmlToText(html: string, options?: HtmlToTextOptions): string;
/**
* Content quality assessment.
*
* @remarks
* Analyzes extracted content to provide quality metrics.
*
* @packageDocumentation
*/
/**
 * Calculate word count from text.
 *
 * @remarks
 * NOTE(review): the tokenization rule is not visible from this declaration.
 * `Article.wordCount` documents whitespace-separated word boundaries;
 * presumably the same rule applies here — confirm against the implementation.
 *
 * @param text - Text to count words in
 * @returns Number of words
 */
declare function countWords(text: string): number;
/**
 * Calculate reading time in minutes.
 *
 * @remarks
 * Uses average reading speed of 200 words per minute.
 *
 * NOTE(review): rounding is not visible from this declaration.
 * `Article.readingTime` documents a minimum of 1 minute; confirm whether that
 * floor is applied here or by the caller.
 *
 * @param wordCount - Number of words
 * @returns Estimated reading time in minutes
 */
declare function calculateReadingTime(wordCount: number): number;
/**
 * Assess content quality.
 *
 * @remarks
 * Analyzes extracted content and returns comprehensive quality metrics.
 * Only a successful extraction (`ExtractedContent`) is accepted, so a
 * `ContentResult` must be narrowed on `success` first.
 *
 * @param content - Extracted content
 * @returns Quality assessment
 *
 * @example
 * ```typescript
 * // extractContent takes a pre-parsed Document, not an HTML string.
 * const doc = parseHTML(html);
 * const content = extractContent(doc);
 * if (content.success) {
 *   const quality = assessContentQuality(content);
 *   console.log(`Quality score: ${quality.qualityScore}/100`);
 *   console.log(`Reading time: ${quality.readingTime} minutes`);
 * }
 * ```
 */
declare function assessContentQuality(content: ExtractedContent): ContentQuality;
/**
* Mozilla Readability wrapper with linkedom.
*
* @remarks
* Provides a clean interface to Mozilla Readability using linkedom as the DOM implementation.
*
* @packageDocumentation
*/
/**
 * Check if HTML content is probably readerable.
 *
 * @remarks
 * Quick check to determine if content extraction is likely to succeed.
 * This is a heuristic check and may produce false positives/negatives.
 *
 * @param doc - Pre-parsed Document to check
 * @param options - Readability check options
 * @returns True if content appears to be an article
 *
 * @example
 * ```typescript
 * import { parseHTML } from '../utils/html-parser.js';
 *
 * const doc = parseHTML(html);
 * if (isProbablyReaderable(doc)) {
 *   const result = extractContent(doc);
 * }
 * ```
 */
declare function isProbablyReaderable(doc: Document, options?: {
  // NOTE(review): presumably forwarded to Mozilla Readability's
  // isProbablyReaderable (minimum node content length to consider) — confirm
  // semantics and default against the implementation.
  minContentLength?: number;
  // NOTE(review): presumably Readability's minimum cumulated score for a
  // positive result — confirm semantics and default.
  minScore?: number;
}): boolean;
/**
* Feed format detection utilities.
*
* @packageDocumentation
*/
/**
 * Feed format type.
 *
 * @remarks
 * Represents the detected or expected format of a feed.
 * - `'rss'` - RSS 2.0, 0.9x, or RSS 1.0 (RDF)
 * - `'atom'` - Atom 1.0
 * - `'json-feed'` - JSON Feed 1.0 or 1.1
 * - `'sitemap'` - XML Sitemap (urlset or sitemapindex)
 * - `'unknown'` - Format could not be determined
 *
 * `Feed.format` uses the same literals except `'unknown'`.
 */
type FeedFormat = 'rss' | 'atom' | 'json-feed' | 'sitemap' | 'unknown';
/**
 * Detect feed format from content string.
 *
 * @remarks
 * Analyzes the content to determine if it's RSS, Atom, or JSON Feed.
 * Detection is based on root elements, namespaces, and structure.
 *
 * Detection priority:
 * 1. JSON Feed (checks for JSON with jsonfeed.org version)
 * 2. RSS (checks for `<rss>` or `<rdf:RDF>` root elements)
 * 3. Atom (checks for `<feed>` root element with Atom namespace)
 *
 * NOTE(review): the element names above were stripped from the original
 * comment and are reconstructed — confirm. The return type also includes
 * `'sitemap'`, which this list does not mention; confirm whether and where
 * sitemap detection happens.
 *
 * @param content - Feed content as string
 * @returns Detected format or 'unknown' if format cannot be determined
 *
 * @example
 * ```typescript
 * const format = detectFormat(feedContent);
 * if (format === 'rss') {
 *   console.log('This is an RSS feed');
 * }
 * ```
 */
declare function detectFormat(content: string): FeedFormat;
/**
 * Check if content is a valid feed (any format).
 *
 * @remarks
 * NOTE(review): whether sitemap content counts as a feed here is not stated —
 * confirm against the implementation.
 *
 * @param content - Feed content as string
 * @returns `true` if content is RSS, Atom, or JSON Feed
 *
 * @example
 * ```typescript
 * if (isFeed(content)) {
 *   const result = parseFeed(content);
 * }
 * ```
 */
declare function isFeed(content: string): boolean;
/**
 * Check if content is RSS format.
 *
 * @remarks
 * "Any version" presumably covers the variants listed on `FeedFormat`
 * (RSS 2.0, 0.9x, RSS 1.0/RDF) — confirm.
 *
 * @param content - Feed content as string
 * @returns `true` if content is RSS (any version)
 */
declare function isRSS(content: string): boolean;
/**
 * Check if content is Atom format.
 *
 * @param content - Feed content as string (XML)
 * @returns `true` if content is Atom 1.0
 */
declare function isAtom(content: string): boolean;
/**
 * Check if content is JSON Feed format.
 *
 * @param content - Feed content as string (JSON text, not a parsed object)
 * @returns `true` if content is JSON Feed (1.0 or 1.1)
 */
declare function isJSONFeed(content: string): boolean;
/**
* Unified feed types - normalized interface across all feed formats.
*
* @remarks
* These types provide a consistent interface for working with feeds regardless
* of the original format (RSS, Atom, or JSON Feed). All format-specific data
* is normalized to this structure by the parser.
*
* @packageDocumentation
*/
/**
 * Feed author information.
 *
 * @remarks
 * Represents author/contributor information normalized across all feed formats.
 * Not all formats provide all fields, so every field is optional.
 */
interface FeedAuthor {
  /** Author's name */
  name?: string;
  /** Author's email address */
  email?: string;
  /** Author's website URL */
  url?: string;
}
/**
 * Feed enclosure (attached file).
 *
 * @remarks
 * Represents attached files like audio, video, or documents. Commonly used
 * for podcasts and media feeds. Only `url` is required.
 */
interface FeedEnclosure {
  /** URL of the attached file */
  url: string;
  /** MIME type of the file (e.g., 'audio/mpeg', 'video/mp4') */
  type?: string;
  /** File size in bytes */
  length?: number;
}
/**
 * Feed item (entry/article/post).
 *
 * @remarks
 * Represents a single item in a feed. Items are normalized across all formats
 * to provide a consistent interface. Not all fields are available in all
 * formats; only `id` is required.
 */
interface FeedItem {
  /** Unique identifier for the item (GUID, ID, or URL) */
  id: string;
  /** Item title */
  title?: string;
  /** Canonical URL for the item */
  url?: string;
  /** External URL for linked posts (when different from canonical URL) */
  externalUrl?: string;
  /** Full HTML content of the item */
  contentHtml?: string;
  /** Plain text content of the item */
  contentText?: string;
  /** Short summary or description */
  summary?: string;
  /** Publication date in ISO 8601 format */
  published?: string;
  /** Last modified date in ISO 8601 format */
  modified?: string;
  /** Item authors (may be empty if using feed-level authors) */
  authors?: FeedAuthor[];
  /** Tags, categories, or keywords */
  tags?: string[];
  /** Featured image URL */
  image?: string;
  /** Attached files (audio, video, documents) */
  enclosures?: FeedEnclosure[];
}
/**
 * Normalized feed data.
 *
 * @remarks
 * The main feed object containing metadata and items. This is the recommended
 * interface for working with feeds as it provides a consistent structure
 * regardless of the original format.
 */
interface Feed {
  /** Original feed format (same literals as `FeedFormat`, without `'unknown'`) */
  format: 'rss' | 'atom' | 'json-feed' | 'sitemap';
  /** Feed title (required) */
  title: string;
  /** Feed description or subtitle */
  description?: string;
  /** Feed's home page URL */
  url?: string;
  /** Feed's own URL (self-reference) */
  feedUrl?: string;
  /** Feed language code (e.g., 'en', 'de') */
  language?: string;
  /** Feed icon or logo URL */
  image?: string;
  /** Feed-level authors */
  authors?: FeedAuthor[];
  /** Last update date in ISO 8601 format */
  updated?: string;
  /** Feed items (entries/articles/posts); always an array, may be empty */
  items: FeedItem[];
}
/**
 * Parse result containing both normalized and original data.
 *
 * @remarks
 * Returned by {@link parseFeed}. Contains both the normalized feed data
 * (recommended for most use cases) and the original format-specific data
 * (for advanced use cases requiring format-specific fields).
 */
interface ParseResult {
  /** Normalized feed data (recommended) */
  feed: Feed;
  /** Original format-specific data (advanced use); typed `unknown`, so narrow before use */
  original: unknown;
}
/**
* Unified feed parser with automatic format detection.
*
* @packageDocumentation
*/
/**
 * Parse any feed format with automatic format detection.
 *
 * @remarks
 * This is the main entry point for feed parsing. It automatically detects whether
 * the content is RSS, Atom, or JSON Feed, parses it, and returns a normalized
 * output structure along with the original format-specific data.
 *
 * All relative URLs in the feed are converted to absolute URLs if a base URL is provided.
 * This is essential for feed readers that need to fetch images, enclosures, or follow links.
 *
 * Unlike the content-extraction functions, this function throws on failure.
 *
 * @param content - Feed content as string (XML or JSON)
 * @param baseUrl - Optional base URL for resolving relative URLs (string or URL object)
 * @returns Object containing normalized feed data and original format-specific data
 * @throws Error if format cannot be detected or parsing fails
 *
 * @example
 * ```typescript
 * const feedContent = await fetch('https://example.com/feed.xml').then(r => r.text());
 * const result = parseFeed(feedContent, 'https://example.com/feed.xml');
 *
 * console.log(result.feed.title);
 * // `items` may be empty, so guard the first element.
 * console.log(result.feed.items[0]?.title);
 * console.log(result.feed.items[0]?.url); // Absolute URL
 * ```
 */
declare function parseFeed(content: string, baseUrl?: string | URL): ParseResult;
/**
* Types for high-level gathering functionality.
*
* @packageDocumentation
*/
/**
 * Gathered website data.
 *
 * @remarks
 * This interface represents the complete gathered data from a website,
 * including the authoritative URL and all extracted metadata.
 * It will be extended incrementally with more properties.
 */
interface Website {
  /**
   * Authoritative URL for the page.
   *
   * @remarks
   * Uses canonical URL if present, otherwise the final URL after redirects.
   */
  url: URL;
  /** Discovered feed URLs (RSS, Atom, JSON Feed) as URL objects */
  feeds: URL[];
  /**
   * Page title (cleaned, from best available source).
   *
   * @remarks
   * Collects titles from multiple sources, cleans them, and picks the longest.
   * Sources: OpenGraph, Twitter Card, HTML title tag, First H1
   */
  title?: string;
  /**
   * Page description (from best available source).
   *
   * @remarks
   * Collects descriptions from metadata and picks the longest.
   * Sources: OpenGraph, Twitter Card, HTML meta description
   */
  description?: string;
  /**
   * Page keyvisual/image URL (from best available source).
   *
   * @remarks
   * Priority: OpenGraph > Twitter Card > Largest Apple Touch Icon > Favicon
   * Returns the URL object of the best visual representation of the site.
   */
  image?: URL;
  /**
   * Best available icon/favicon for the site.
   *
   * @remarks
   * Priority: Largest Apple Touch Icon > Safari mask icon > Favicon > Shortcut icon > MS tile > Fluid icon
   * Returns the highest quality icon available, preferring modern, high-resolution formats.
   */
  icon?: URL;
  /**
   * Primary language code (ISO 639-1).
   *
   * @remarks
   * Extracted from HTML lang attribute, Content-Language meta tag, or OpenGraph locale.
   * Normalized to lowercase ISO 639-1 format (e.g., 'en', 'de', 'fr', 'ja').
   */
  language?: string;
  /**
   * Region code (ISO 3166-1 alpha-2).
   *
   * @remarks
   * Only present if the language includes a region specifier.
   * Normalized to uppercase ISO 3166-1 alpha-2 format (e.g., 'US', 'GB', 'DE').
   */
  region?: string;
  /**
   * Raw HTML content of the page (UTF-8).
   *
   * @remarks
   * The complete HTML source after fetching and decoding to UTF-8.
   * Useful for custom processing or caching.
   */
  html: string;
  /**
   * Plain text content extracted from the HTML.
   *
   * @remarks
   * Automatically converted from HTML using the `htmlToText` function.
   * Removes all tags, decodes entities, and preserves document structure
   * with appropriate line breaks.
   */
  text: string;
  /**
   * Internal links found on the page (same domain, excluding current URL).
   *
   * @remarks
   * All links are URL objects. The current page URL is excluded to avoid
   * self-references. Useful for site crawling and navigation analysis.
   */
  internalLinks: URL[];
  /**
   * External links found on the page (different domains).
   *
   * @remarks
   * All links are URL objects. Useful for analyzing outbound links,
   * citations, and external resources.
   */
  externalLinks: URL[];
}
/**
 * Gathered article data.
 *
 * @remarks
 * This interface represents the complete gathered data from an article page,
 * including the authoritative URL, raw HTML, and extracted content.
 * It will be extended incrementally with more properties.
 */
interface Article {
  /**
   * Authoritative URL for the article.
   *
   * @remarks
   * Uses canonical URL if present, otherwise the final URL after redirects.
   */
  url: URL;
  /**
   * Raw HTML content of the article page (UTF-8).
   *
   * @remarks
   * The complete HTML source after fetching and decoding to UTF-8.
   * Useful for custom processing or caching.
   */
  html: string;
  /**
   * Plain text content extracted from the HTML.
   *
   * @remarks
   * Automatically converted from HTML using the `htmlToText` function.
   * Removes all tags, decodes entities, and preserves document structure
   * with appropriate line breaks. This is the full page text; see `content`
   * for the cleaned article body.
   */
  text: string;
  /**
   * Cleaned article content (plain text).
   *
   * @remarks
   * Extracted using Mozilla Readability (cleaned HTML), then converted to
   * plain text using `htmlToText` for proper formatting.
   * This is the main article body without navigation, ads, or other clutter.
   * Falls back to undefined if Readability extraction fails.
   */
  content?: string;
  /**
   * Article title.
   *
   * @remarks
   * Extracted from Mozilla Readability if available.
   * Falls back to metadata (Schema.org, OpenGraph, Twitter Card, HTML title)
   * if Readability extraction fails or title is empty.
   */
  title?: string;
  /**
   * Article description/excerpt.
   *
   * @remarks
   * Extracted from Mozilla Readability's excerpt if available.
   * Falls back to metadata (OpenGraph, Twitter Card, HTML meta description)
   * if Readability excerpt is empty or extraction fails.
   */
  description?: string;
  /**
   * Article keyvisual/image URL (from best available source).
   *
   * @remarks
   * Priority: Schema.org NewsArticle/Article (largest) > OpenGraph > Twitter Card > Largest Apple Touch Icon > Favicon
   * Returns the URL object of the best visual representation of the article.
   */
  image?: URL;
  /**
   * Primary language code (ISO 639-1).
   *
   * @remarks
   * Extracted from HTML lang attribute, Content-Language meta tag, or OpenGraph locale.
   * Returns lowercase 2-letter ISO 639-1 code (e.g., 'en', 'de', 'fr').
   */
  language?: string;
  /**
   * Region/country code (ISO 3166-1 alpha-2).
   *
   * @remarks
   * Extracted from language tags like 'en-US' or 'de-DE'.
   * Returns uppercase 2-letter ISO 3166-1 alpha-2 code (e.g., 'US', 'GB', 'DE').
   */
  region?: string;
  /**
   * Internal links found in the article (same domain/subdomain).
   *
   * @remarks
   * Links pointing to pages within the same domain.
   * Automatically excludes the current article URL.
   * All URLs are absolute and normalized.
   */
  internalLinks: URL[];
  /**
   * External links found in the article (different domains).
   *
   * @remarks
   * Links pointing to external domains (useful for citations, references).
   * All URLs are absolute and normalized.
   */
  externalLinks: URL[];
  /**
   * Word count of the article.
   *
   * @remarks
   * Calculated from `content` if available (Readability-cleaned content),
   * otherwise calculated from `text` (full page text).
   * Based on whitespace-separated word boundaries.
   */
  wordCount: number;
  /**
   * Estimated reading time in minutes.
   *
   * @remarks
   * Calculated from word count using average reading speed of 200 words per minute.
   * Minimum value is 1 minute.
   */
  readingTime: number;
}
/**
* High-level article gathering functionality.
*
* @packageDocumentation
*/
/**
 * Gather article data from a URL in one convenient call.
 *
 * @remarks
 * This is a high-level convenience method that fetches an article page and extracts
 * relevant data. It handles encoding detection, redirects, and provides
 * a unified interface for all article data.
 *
 * This method will be extended incrementally to include metadata extraction,
 * content extraction, and more.
 *
 * @param url - Article URL as string or URL object
 * @returns Promise resolving to the gathered article data including URL, content, metadata, language, and links
 * @throws Error if URL is invalid or fetch fails
 *
 * @example
 * ```typescript
 * // Fetch an article and get its data
 * const article = await gatherArticle('https://example.com/article');
 * console.log(article.url); // Final URL after redirects
 * console.log(article.html); // Raw HTML content (UTF-8)
 * console.log(article.text); // Plain text (full page HTML converted)
 * console.log(article.content); // Cleaned article content (Readability + htmlToText)
 * console.log(article.title); // Article title (from Readability or metadata)
 * console.log(article.description); // Article excerpt or description
 * console.log(article.image); // Article keyvisual/image (from best source)
 * console.log(article.language); // Language code (ISO 639-1, e.g., 'en')
 * console.log(article.region); // Region code (ISO 3166-1 alpha-2, e.g., 'US')
 * console.log(article.internalLinks); // Array of internal link URLs
 * console.log(article.externalLinks); // Array of external link URLs
 * console.log(article.wordCount); // Word count (from content or text)
 * console.log(article.readingTime); // Estimated reading time in minutes
 * ```
 */
// `Promise` requires a type argument; the stripped `<Article>` is restored.
declare function gatherArticle(url: string | URL): Promise<Article>;
/**
* High-level feed gathering functionality.
*
* @packageDocumentation
*/
/**
 * Gather and parse a feed from a URL in one convenient call.
 *
 * @remarks
 * This is a high-level convenience method that combines fetching and parsing.
 * It handles encoding detection, redirects, and feed format detection automatically.
 * Falls back to sitemap parsing when standard feed formats aren't detected.
 *
 * @param url - Feed URL as string or URL object
 * @returns Promise resolving to the normalized feed data
 * @throws Error if URL is invalid, fetch fails, or feed cannot be parsed
 *
 * @example
 * ```typescript
 * // Fetch and parse a feed
 * const feed = await gatherFeed('https://example.com/feed.xml');
 *
 * console.log(feed.title);
 * // `items` may be empty, so guard the first element.
 * console.log(feed.items[0]?.title);
 * console.log(feed.items[0]?.url);
 * ```
 */
// `Promise` requires a type argument; the stripped `<Feed>` is restored
// (the result is used as a `Feed` in the example: `feed.title`, `feed.items`).
declare function gatherFeed(url: string | URL): Promise<Feed>;
/**
* High-level website gathering functionality.
*
* @packageDocumentation
*/
/**
 * Gather website data from a URL in one convenient call.
 *
 * @remarks
 * This is a high-level convenience method that fetches a website and extracts
 * all relevant data. It handles encoding detection, redirects, and provides
 * a unified interface for all website data.
 *
 * This method will be extended incrementally to include metadata extraction,
 * content extraction, and more.
 *
 * @param url - Website URL as string or URL object
 * @returns Promise resolving to the gathered website data including final URL, title, description, image, icon, language, html, text, feeds, and links
 * @throws Error if URL is invalid or fetch fails
 *
 * @example
 * ```typescript
 * // Fetch a website and get its data
 * const site = await gatherWebsite('https://example.com');
 * console.log(site.url); // Final URL after redirects
 * console.log(site.title); // Page title (cleaned, from best source)
 * console.log(site.description); // Page description (from best source)
 * console.log(site.image); // Page image/keyvisual (from best source)
 * console.log(site.icon); // Best available icon/favicon
 * console.log(site.language); // Primary language code (ISO 639-1)
 * console.log(site.region); // Region code (ISO 3166-1 alpha-2)
 * console.log(site.html); // Raw HTML content (UTF-8)
 * console.log(site.text); // Plain text content (extracted from HTML)
 * console.log(site.feeds); // Array of feed URL objects
 * console.log(site.internalLinks); // Array of internal link URL objects
 * console.log(site.externalLinks); // Array of external link URL objects
 * ```
 */
// `Promise` requires a type argument; the stripped `<Website>` is restored.
declare function gatherWebsite(url: string | URL): Promise<Website>;
/**
* HTML parsing utilities using linkedom.
*
* @remarks
* This module provides a simple wrapper around linkedom for consistent
* HTML parsing across all metadata extraction modules. Parsing should happen
* once at the top level and the parsed document passed to all extractors.
*
* @packageDocumentation
*/
/**
 * Parse HTML string into a DOM document.
 *
 * @remarks
 * Parses HTML using linkedom, providing a standards-compliant DOM implementation.
 * This should be called once per document, with the result passed to all metadata
 * extractors for performance.
 *
 * Never throws - returns a document even for malformed HTML.
 *
 * @param html - HTML string to parse
 * @param baseUrl - Optional base URL for resolving relative URLs
 * @returns Parsed DOM document
 *
 * @example
 * ```typescript
 * // NOTE(review): the markup in this literal was stripped from the original
 * // comment; it is reconstructed so the `title` query below is meaningful.
 * const doc = parseHTML('<html><head><title>Test</title></head></html>');
 * const title = doc.querySelector('title')?.textContent;
 * ```
 */
declare function parseHTML(html: string, baseUrl?: string): Document;
/**
 * Alias for the DOM `Document` type.
 */
type HTMLDocument = Document;
/**
 * Input type that accepts either a parsed Document or raw HTML string.
 *
 * @remarks
 * This allows extractor functions to be more forgiving. Passing an
 * already-parsed Document is recommended when running several extractors on
 * the same page.
 */
type DocumentInput = Document | string;
/**
* Analytics and tracking types.
*
* @remarks
* Types for analytics service detection (IDs only, no tracking).
*
* @packageDocumentation
*/
/**
 * Analytics metadata.
 *
 * @remarks
 * Contains detected analytics service IDs. Privacy-conscious - only extracts IDs,
 * doesn't perform any tracking. Every field is optional.
 */
interface AnalyticsMetadata {
  /** Google Analytics tracking IDs (UA-, G-, GT- prefixes) */
  googleAnalytics?: string[];
  /** Google Tag Manager container IDs */
  googleTagManager?: string[];
  /** Facebook Pixel IDs */
  facebookPixel?: string[];
  /** Matomo/Piwik site IDs */
  matomo?: string[];
  /** Plausible Analytics domains */
  plausible?: string[];
  /** Adobe Analytics (Omniture) IDs */
  adobe?: string[];
  /** Cloudflare Web Analytics tokens */
  cloudflare?: string[];
  /** Fathom Analytics site IDs */
  fathom?: string[];
}
/**
* Analytics and tracking extraction.
*
* @remarks
* Detects analytics service IDs from HTML documents.
* Privacy-conscious - only extracts IDs, doesn't perform any tracking.
*
* @packageDocumentation
*/
/**
 * Extract analytics metadata from HTML.
 *
 * @remarks
 * Detects analytics service IDs by examining script tags and their content.
 * Only extracts identifiers, does not track or collect user data.
 *
 * @param input - Parsed HTML document or raw HTML string
 * @returns Analytics metadata
 *
 * @example
 * ```typescript
 * // With parsed document (recommended for multiple extractions)
 * const doc = parseHTML(htmlString);
 * const analytics = extractAnalytics(doc);
 *
 * // Or directly with HTML string
 * const analyticsFromString = extractAnalytics(htmlString);
 * ```
 */
declare function extractAnalytics(input: DocumentInput): AnalyticsMetadata;
/**
* Assets extraction types.
*
* @remarks
* Types for categorized asset URLs extracted from HTML documents.
*
* @author Anonyfox
* @license MIT
* @see {@link https://github.com/Anonyfox/ravenjs}
* @see {@link https://ravenjs.dev}
* @see {@link https://anonyfox.com}
*
* @packageDocumentation
*/
/**
 * Categorized assets extracted from HTML.
 *
 * @remarks
 * Contains all external assets referenced in the document, organized by type.
 * All URLs are normalized to absolute format if a base URL is available.
 * Every category is optional.
 */
interface AssetsMetadata {
  /** Image URLs from img, picture, srcset, and meta tags */
  images?: string[];
  /** Stylesheet URLs from link tags */
  stylesheets?: string[];
  /** Script URLs from script tags */
  scripts?: string[];
  /** Font URLs extracted from CSS */
  fonts?: string[];
  /** Media URLs from video, audio, source, and track elements */
  media?: string[];
  /** Web app manifest URLs */
  manifests?: string[];
  /** Preload/prefetch resource hints (see {@link PreloadResource}) */
  preloads?: PreloadResource[];
  /** DNS prefetch and preconnect hints (see {@link ConnectionHint}) */
  connectionHints?: ConnectionHint[];
}
/**
 * Preload or prefetch resource hint.
 *
 * @remarks
 * Only `url` is required.
 */
interface PreloadResource {
  /** Resource URL */
  url: string;
  /** Resource type (script, style, font, image, etc.) */
  as?: string;
  /** MIME type */
  type?: string;
  /** Crossorigin attribute */
  crossorigin?: string;
  /** Whether this is a prefetch (true) or preload (false) */
  prefetch?: boolean;
}
/**
 * DNS prefetch or preconnect hint.
 *
 * @remarks
 * Only `url` is required.
 */
interface ConnectionHint {
  /** Domain URL */
  url: string;
  /** Whether this is a preconnect (true) or dns-prefetch (false) */
  preconnect?: boolean;
  /** Crossorigin attribute */
  crossorigin?: string;
}
/**
* Assets extraction.
*
* @remarks
* Extracts categorized asset URLs from HTML documents.
*
* @author Anonyfox
* @license MIT
* @see {@link https://github.com/Anonyfox/ravenjs}
* @see {@link https://ravenjs.dev}
* @see {@link https://anonyfox.com}
*
* @packageDocumentation
*/
/**
* Extract assets metadata from HTML.
*
* @remarks
* Extracts all external assets referenced in the document, organized by type.
* All URLs are normalized to absolute format based on the document's base URL.
*
* The extractor finds assets from:
* - Images: `
`, ``, `srcset`, OpenGraph meta tags
* - Stylesheets: ``
* - Scripts: `