import { TextTools, Resource } from '../../index.js';
import { HtmlToTextOptions } from './get-plaintext.js';

/**
 * The content extracted from a page.
 *
 * @remarks
 * In addition to the named properties, arbitrary string keys are permitted.
 *
 * NOTE(review): the type arguments of `Record` were missing from this
 * declaration (a bare `Record` does not compile). `Record<string, unknown>`
 * is the most permissive reconstruction — confirm against the implementation.
 */
export interface PageContent extends Record<string, unknown> {
  /**
   * Plaintext version of the content, when one was generated.
   */
  text?: string;

  /**
   * Readability score for the content, when one was calculated.
   */
  readability?: TextTools.ReadabilityScore;
}

/**
 * Options to control the extraction of core content from an HTML page
 */
export interface PageContentOptions {
  /**
   * Generate a plaintext version of the HTML.
   *
   * @remarks
   * Other options like {@link PageContentOptions.selector | selector},
   * {@link PageContentOptions.allowMultipleContentElements | allowMultipleContentElements}, and
   * {@link PageContentOptions.defaultToFullDocument | defaultToFullDocument} will override the
   * equivalent values in this configuration object.
   *
   * @see {@link https://github.com/html-to-text/node-html-to-text/tree/master/packages/html-to-text | Html-To-Text docs} for details
   */
  htmlToText?: HtmlToTextOptions;

  /**
   * One or more CSS selectors used to find the markup's primary content.
   *
   * @remarks
   * This option is preferred over {@link HtmlToTextOptions.baseElements.selectors | baseElements.selectors}
   * on the {@link PageContentOptions.htmlToText | htmlToText} option. HtmlToText is good, but its support for
   * some selectors is limited and can generate surprising results. Whenever possible, use this option instead.
   */
  selector?: string | string[];

  /**
   * Allow multiple page elements to be treated as the markup's 'primary content'.
   *
   * @remarks
   * Setting this to `false` is equivalent to setting {@link HtmlToTextOptions.limits.maxBaseElements | limits.maxBaseElements}
   * on the {@link PageContentOptions.htmlToText | htmlToText} option to `1`.
   *
   * NOTE(review): this remark previously said `true` maps to a limit of `1`,
   * which reads inverted (a limit of one element forbids multiple content
   * elements) — confirm against the implementation.
   *
   * @defaultValue `false`
   */
  allowMultipleContentElements?: boolean;

  /**
   * Fall back to the full text of the page if the specified selectors have no
   * matches. This will include headers, footers, navigation elements, etc.
   *
   * @remarks
   * Setting this to `true` is equivalent to setting {@link HtmlToTextOptions.baseElements.returnDomByDefault | baseElements.returnDomByDefault}
   * on the {@link PageContentOptions.htmlToText | htmlToText} option.
   *
   * @defaultValue `false`
   */
  defaultToFullDocument?: boolean;

  /**
   * Trim surrounding whitespace around the content's plaintext.
   *
   * @defaultValue `true`
   */
  trim?: boolean;

  /**
   * Calculate the readability score for the page's main content.
   *
   * @remarks
   * Accepts either a boolean flag or a {@link TextTools.ReadabilityScoreOptions}
   * object.
   *
   * @defaultValue `true`
   */
  readability?: boolean | TextTools.ReadabilityScoreOptions;
}

/**
 * Extract the core content of an HTML page and return its plaintext, with
 * optional configuration options.
 *
 * @param input - The page to process: a string (presumably raw HTML markup —
 * confirm against the implementation), a parsed cheerio root, or a
 * {@link Resource}.
 * @param customOptions - Optional settings controlling how the content is
 * located and converted; see {@link PageContentOptions}.
 * @returns A promise for the extracted {@link PageContent}.
 *
 * NOTE(review): the type argument of the returned `Promise` was missing from
 * this declaration (a bare `Promise` does not compile). `PageContent` is the
 * reconstruction implied by this module's types — confirm against the
 * implementation, in particular whether `undefined` can also be resolved.
 */
export declare function getPageContent(
  input: string | cheerio.Root | Resource,
  customOptions?: PageContentOptions,
): Promise<PageContent>;
//# sourceMappingURL=get-page-content.d.ts.map