/**
 * Article extraction via Readability — generic `page → article HTML` pipeline.
 *
 * Complements `src/browser/extract.ts`: that one takes a caller-supplied
 * selector. This one works with zero configuration on arbitrary article pages
 * (blogs, news, docs) by running `@mozilla/readability` inside the page
 * context via CDP evaluate.
 *
 * Pipeline:
 *   1. Short-circuit non-HTML documents (`text/plain`, JSON, XML) — a page
 *      renderer wrapping a plain-text file would pollute the DOM pipeline.
 *   2. Short-circuit the "body is a single `<pre>`" case, which browsers use
 *      when loading *.txt / *.md over file:// or raw.githubusercontent.com.
 *   3. Deep-clone the document, apply caller-supplied `cleanSelectors` to the
 *      clone (preserves live page state for subsequent snapshot/click).
 *   4. Inject Readability + isProbablyReaderable sources into the page,
 *      parse on the clone. `isProbablyReaderable` gates the parse unless
 *      `force: true`.
 *   5. On Readability miss, walk a fallback selector chain
 *      (main → [role="main"] → #main-content → … → body) and return the
 *      first root with >80 characters of text.
 *
 * Readability runs in the page's own window because it needs real DOM APIs
 * (getComputedStyle, treeWalker). Running it Node-side would require jsdom —
 * a heavy dep the rest of OpenCLI doesn't need.
 */
/**
 * Optional knobs for the article extraction pipeline. Every field may be
 * omitted.
 */
export interface ExtractArticleOptions {
    /**
     * CSS selectors whose matches are removed from the cloned document before
     * Readability runs.
     */
    cleanSelectors?: string[];
    /**
     * Selector chain tried when Readability fails. When omitted, a default
     * chain of common structural ids is used.
     */
    fallbackSelectors?: string[];
    /** Bypass the `isProbablyReaderable` gate and always attempt a parse. */
    force?: boolean;
}
/**
 * Tag carried on an {@link ExtractedArticle} naming where its content came from.
 *
 * NOTE(review): presumably `'readability'` is a successful Readability parse,
 * `'fallback'` a hit in the fallback selector chain, `'raw-text'` a non-HTML
 * document and `'pre'` a body consisting of a single `<pre>` — confirm against
 * the implementation.
 */
export type ExtractSource = 'readability' | 'fallback' | 'raw-text' | 'pre';
/**
 * Result of a successful extraction: the article markup plus whatever
 * metadata was available. Only `html`, `title` and `source` are always
 * present.
 */
export interface ExtractedArticle {
    /** Extracted article content as an HTML string. */
    html: string;
    /** Article title. */
    title: string;
    /** Author line, when one was found. */
    byline?: string;
    /**
     * Publication time, when one was found.
     * NOTE(review): string format is not specified here — confirm with the implementation.
     */
    publishedTime?: string;
    /** Name of the publishing site, when one was found. */
    siteName?: string;
    /** Which extraction path produced this result. */
    source: ExtractSource;
}
/**
 * Built-in list of fallback CSS selectors.
 *
 * NOTE(review): presumably this is the chain used when
 * `ExtractArticleOptions.fallbackSelectors` is omitted — confirm against the
 * implementation. The array is declared mutable; callers should treat it as
 * read-only.
 */
export declare const DEFAULT_FALLBACK_SELECTORS: string[];
/**
 * Build the JS expression evaluated in-page to extract the article. Exported
 * for testability — callers on the host side should use `extractArticle`.
 *
 * @param options - Extraction options; may be omitted entirely.
 * @returns JavaScript source text intended to be passed to a page's
 *   `evaluate` method.
 */
export declare function buildExtractArticleJs(options?: ExtractArticleOptions): string;
/**
 * Minimal page surface required by the extractor: anything that can evaluate
 * a JavaScript string and hand back the result asynchronously.
 */
export interface PageLike {
    /**
     * Evaluate `js` and resolve with whatever it produces.
     *
     * The result is typed `unknown` because its shape depends entirely on the
     * evaluated expression; callers must narrow it before use.
     *
     * @param js - JavaScript source text to evaluate.
     * @returns A promise for the evaluation result.
     */
    evaluate(js: string): Promise<unknown>;
}
/**
 * Run the extract pipeline on the given page.
 *
 * @param page - Page on which the extraction script is evaluated.
 * @param options - Extraction options; may be omitted entirely.
 * @returns A promise for the extracted article, or for `null` when no usable
 *   content is found (Readability miss + empty fallback chain).
 */
export declare function extractArticle(page: PageLike, options?: ExtractArticleOptions): Promise<ExtractedArticle | null>;