/** * Article extraction via Readability — generic `page → article HTML` pipeline. * * Complements `src/browser/extract.ts`: that one takes a caller-supplied * selector. This one works with zero configuration on arbitrary article pages * (blogs, news, docs) by running `@mozilla/readability` inside the page * context via CDP evaluate. * * Pipeline: * 1. Short-circuit non-HTML documents (`text/plain`, JSON, XML) — a page * renderer wrapping a plain-text file would pollute the DOM pipeline. * 2. Short-circuit the "body is a single
" case, which browsers use
* when loading *.txt / *.md over file:// or raw.githubusercontent.com.
* 3. Deep-clone the document, apply caller-supplied `cleanSelectors` to the
* clone (preserves live page state for subsequent snapshot/click).
* 4. Inject Readability + isProbablyReaderable sources into the page,
* parse on the clone. `isProbablyReaderable` gates the parse unless
* `force: true`.
* 5. On Readability miss, walk a fallback selector chain
* (main → [role="main"] → #main-content → … → body) and return the
* first root with >80 characters of text.
*
* Readability runs in the page's own window because it needs real DOM APIs
* (getComputedStyle, treeWalker). Running it Node-side would require jsdom —
* a heavy dep the rest of OpenCLI doesn't need.
*/
export interface ExtractArticleOptions {
/** CSS selectors removed from the cloned document before Readability runs. */
cleanSelectors?: string[];
/** Fallback chain when Readability fails. Defaults to the common structural ids. */
fallbackSelectors?: string[];
/** Bypass `isProbablyReaderable` and always attempt a parse. */
force?: boolean;
}
export type ExtractSource = 'readability' | 'fallback' | 'raw-text' | 'pre';
export interface ExtractedArticle {
html: string;
title: string;
byline?: string;
publishedTime?: string;
siteName?: string;
source: ExtractSource;
}
export declare const DEFAULT_FALLBACK_SELECTORS: string[];
/**
* Build the JS expression evaluated in-page to extract the article. Exported
* for testability — callers on the host side should use `extractArticle`.
*/
export declare function buildExtractArticleJs(options?: ExtractArticleOptions): string;
export interface PageLike {
evaluate(js: string): Promise;
}
/**
* Run the extract pipeline on the given page. Returns `null` when no usable
* content is found (Readability miss + empty fallback chain).
*/
export declare function extractArticle(page: PageLike, options?: ExtractArticleOptions): Promise;