import { BlockTextBuilder } from "./lib/block-text-builder";

/**
 * Signature of the function returned by {@link compile}:
 * takes a single string and returns a string.
 */
export type compiledFunction = (str: string) => string;

/**
 * Untyped metadata value.
 *
 * Accepted as the optional third argument of {@link htmlToText} and received
 * as the second argument of `FormatOptions.pathRewrite` callbacks.
 */
export type metaData = any;

/**
 * Preprocess options, compile selectors into a decision tree,
 * return a function intended for batch processing.
 *
 * @param options Conversion options; may be omitted.
 * @returns A function that accepts a string and returns a string.
 */
export function compile(options?: HtmlToTextOptions): compiledFunction;

/**
 * Convert given HTML content to plain text string.
 *
 * @param html HTML content to convert.
 * @param options Conversion options; may be omitted.
 * @param metadata Optional metadata value. NOTE(review): presumably forwarded to
 * callbacks such as `FormatOptions.pathRewrite` - confirm against the implementation.
 * @returns The plain text string.
 *
 * @example
 * const { htmlToText } = require('html-to-text');
 * const text = htmlToText('<h1>Hello World</h1>', {
 *   wordwrap: 130
 * });
 * console.log(text); // HELLO WORLD
 */
export function htmlToText(html: string, options?: HtmlToTextOptions, metadata?: metaData): string;

// `convert` is exported as an alias of `htmlToText`.
export { htmlToText as convert };

/**
 * Options accepted by {@link compile} and {@link htmlToText}.
 */
export interface HtmlToTextOptions {
    /**
     * Options for narrowing down to informative parts of HTML document.
     */
    baseElements?: BaseElementsOptions | undefined;
    /**
     * Decode HTML entities found in the input HTML if true.
     * Otherwise preserve in output text.
     */
    decodeEntities?: boolean | undefined;
    /**
     * A dictionary with characters that should be replaced in the output
     * text and corresponding escape sequences.
     */
    encodeCharacters?: Record<string, string> | undefined;
    /**
     * A dictionary with custom formatting functions for specific kinds of elements.
     *
     * Keys are custom string identifiers, values are callbacks.
     */
    formatters?: Record<string, FormatCallback> | undefined;
    /**
     * Options for handling complex documents and limiting the output size.
     */
    limits?: LimitsOptions | undefined;
    /**
     * Describes how to wrap long words.
     */
    longWordSplit?: LongWordSplitOptions | undefined;
    /**
     * By default, any newlines `\n` from the input HTML are dropped.
     *
     * If `true`, these newlines will be preserved in the output.
     */
    preserveNewlines?: boolean | undefined;
    /**
     * Instructions for how to render HTML elements based on matched selectors.
     *
     * Use this to (re)define options for new or already supported tags.
     */
    selectors?: SelectorDefinition[] | undefined;
    /**
     * All characters that are considered whitespace.
     * Default is according to HTML specifications.
     */
    whitespaceCharacters?: string | undefined;
    /**
     * After how many chars a line break should follow in `p` elements.
     *
     * Set to `null` or `false` to disable word-wrapping.
     */
    wordwrap?: number | false | null | undefined;

    // The following are deprecated options. See the documentation.

    /**
     * @deprecated Use baseElements.selectors instead.
     */
    baseElement?: string | string[] | undefined;
    /**
     * @deprecated Use baseElements instead.
     */
    returnDomByDefault?: boolean | undefined;
    /**
     * @deprecated Use selectors with `format: 'dataTable'` instead.
     */
    tables?: string[] | boolean | undefined;
    /**
     * @deprecated Use selectors instead.
     */
    tags?: TagDefinitions | undefined;
}

/**
 * Options for narrowing down to informative parts of HTML document.
 */
export interface BaseElementsOptions {
    /**
     * The resulting text output will be composed from the text content of elements
     * matched with these selectors.
     */
    selectors?: string[] | undefined;
    /**
     * When multiple selectors are set, this option specifies
     * whether the selectors order has to be reflected in the output text.
     *
     * `'selectors'` (default) - matches for the first selector will appear first, etc;
     *
     * `'occurrence'` - all bases will appear in the same order as in input HTML.
     */
    orderBy?: "selectors" | "occurrence" | undefined;
    /**
     * Use the entire document if none of provided selectors matched.
     */
    returnDomByDefault?: boolean | undefined;
}

/**
 * Options for handling complex documents and limiting the output size.
 */
export interface LimitsOptions {
    /**
     * A string to put in place of skipped content.
     */
    ellipsis?: string | undefined;
    /**
     * Stop looking for more base elements after reaching this amount.
     *
     * Unlimited if undefined.
     */
    maxBaseElements?: number | undefined;
    /**
     * Maximum number of child nodes of a single node to be added to the
     * output.
     *
     * Unlimited if undefined.
     */
    maxChildNodes?: number | undefined;
    /**
     * Only go to a certain depth starting from `Options.baseElement`.
     *
     * Replace deeper nodes with ellipsis.
     *
     * No depth limit if undefined.
     */
    maxDepth?: number | undefined;
    /**
     * If the input string is longer than this value - it will be truncated
     * and a message will be sent to `stderr`.
     *
     * Ellipsis is not used in this case.
     */
    maxInputLength?: number | undefined;
}

/**
 * Describes how to wrap long words.
 */
export interface LongWordSplitOptions {
    /**
     * Break long words on the `Options.wordwrap` limit when there are no characters to wrap on.
     */
    forceWrapOnLimit?: boolean | undefined;
    /**
     * An array containing the characters that may be wrapped on.
     */
    wrapCharacters?: string[] | undefined;
}

/**
 * Describes how to handle tags matched by a selector.
 */
export interface SelectorDefinition {
    /**
     * CSS selector. Refer to README for notes on supported selectors etc.
     */
    selector: string;
    /**
     * Identifier of a {@link FormatCallback}, built-in or provided in `Options.formatters` dictionary.
     */
    format?: string | undefined;
    /**
     * Options to customize the formatter for this tag.
     */
    options?: FormatOptions | undefined;
}

/**
 * Describes how to handle a tag.
 */
export interface TagDefinition {
    /**
     * Identifier of a {@link FormatCallback}, built-in or provided in `Options.formatters` dictionary.
     */
    format?: string | undefined;
    /**
     * Options to customize the formatter for this tag.
     */
    options?: FormatOptions | undefined;
}

/**
 * Options specific to different formatters ({@link FormatCallback}).
 * This is an umbrella type definition. Each formatter supports its own subset of options.
 */
export interface FormatOptions {
    /**
     * Number of line breaks to separate previous block from this one.
     *
     * Note that N+1 line breaks are needed to make N empty lines.
     */
    leadingLineBreaks?: number | undefined;
    /**
     * Number of line breaks to separate this block from the next one.
     *
     * Note that N+1 line breaks are needed to make N empty lines.
     */
    trailingLineBreaks?: number | undefined;
    /**
     * (Only for: `anchor` and `image` formatters.) Server host for link `href` attributes and image `src` attributes
     * relative to the root (the ones that start with `/`).
     *
     * For example, with `baseUrl = 'http://asdf.com'` and `<a href='/dir/subdir'>...</a>`
     * the link in the text will be `http://asdf.com/dir/subdir`.
     *
     * Keep in mind that `baseUrl` should not end with a `/`.
     */
    baseUrl?: string | undefined;
    /**
     * Surround links with these brackets.
     *
     * Set to `false` or `['', '']` to disable.
     * @default ['[', ']']
     */
    linkBrackets?: [string, string] | false | undefined;
    /**
     * (Only for: `anchor` and `image` formatters.) A function to rewrite link
     * href attributes and image src attributes. Applied before baseUrl.
     */
    pathRewrite?: ((path: string, meta: metaData) => string) | undefined;
    /**
     * (Only for: `anchor` formatter.) By default links are translated in the following way:
     *
     * `<a href='link'>text</a>` => becomes => `text [link]`.
     *
     * If this option is set to `true` and `link` and `text` are the same,
     * `[link]` will be omitted and only `text` will be present.
     */
    hideLinkHrefIfSameAsText?: boolean | undefined;
    /**
     * (Only for: `anchor` formatter.) Ignore all links. Only process internal text of anchor tags.
     */
    ignoreHref?: boolean | undefined;
    /**
     * (Only for: `anchor` formatter.) Ignore anchor links (where `href='#...'`).
     */
    noAnchorUrl?: boolean | undefined;
    /**
     * (Only for: `unorderedList` formatter.) String prefix for each list item.
     */
    itemPrefix?: string | undefined;
    /**
     * (Only for: `heading` formatter.) By default, headings (`<h1>`, `<h2>`, etc) are uppercased.
     *
     * Set this to `false` to leave headings as they are.
     */
    uppercase?: boolean | undefined;
    /**
     * (Only for: `horizontalLine` formatter.) Length of the `<hr/>` line.
     *
     * If numeric value is provided - it is used.
     * Otherwise, if global `wordwrap` number is provided - it is used.
     * If neither is true, then the fallback value of 40 is used.
     */
    length?: number | undefined;
    /**
     * (Only for: `blockquote` formatter.) Trim empty lines from blockquote.
     */
    trimEmptyLines?: boolean | undefined;
    /**
     * (Only for: `table`, `dataTable` formatter.) By default, heading cells (`<th>`) are uppercased.
     *
     * Set this to `false` to leave heading cells as they are.
     */
    uppercaseHeaderCells?: boolean | undefined;
    /**
     * (Only for: `table`, `dataTable` formatter.) Data table cell content will be wrapped to fit this width
     * instead of global `wordwrap` limit.
     *
     * Set to `undefined` in order to fall back to `wordwrap` limit.
     */
    maxColumnWidth?: number | undefined;
    /**
     * (Only for: `table`, `dataTable` formatter.) Number of spaces between data table columns.
     */
    colSpacing?: number | undefined;
    /**
     * (Only for: `table`, `dataTable` formatter.) Number of empty lines between data table rows.
     */
    rowSpacing?: number | undefined;
    /**
     * (Only for: `blockString`, `inlineString` formatters.) A string to be inserted in place of a tag.
     */
    string?: string | undefined;
    /**
     * (Only for: `inlineSurround` formatter.) String prefix to be inserted before inline tag contents.
     */
    prefix?: string | undefined;
    /**
     * (Only for: `inlineSurround` formatter.) String suffix to be inserted after inline tag contents.
     */
    suffix?: string | undefined;
    /**
     * User defined values are supported.
     */
    [key: string]: any;
    /**
     * (Only for: `anchor` formatter.) Don't print brackets around links.
     *
     * @deprecated Use linkBrackets instead.
     */
    noLinkBrackets?: boolean | undefined;
}

/**
 * Simplified definition of [htmlparser2](https://github.com/fb55/htmlparser2) Node type.
 *
 * Makes no distinction between elements (tags) and data nodes (good enough for now).
 */
export interface DomNode {
    /**
     * Type of node - "text", "tag", "comment", "script", etc.
     */
    type: string;
    /**
     * Content of a data node.
     */
    data?: string | undefined;
    /**
     * Tag name.
     */
    name?: string | undefined;
    /**
     * Tag attributes dictionary.
     */
    attribs?: any;
    /**
     * Child nodes.
     * Not optional for typescript use.
     */
    children: DomNode[];
    /**
     * Parent node.
     */
    parent?: DomNode | undefined;
}

/**
 * A function to stringify a DOM node.
 *
 * Receives the node, a {@link RecursiveCallback} for processing child nodes,
 * the text builder, and the {@link FormatOptions} for the formatter.
 * Returns nothing.
 */
export type FormatCallback = (
    elem: DomNode,
    walk: RecursiveCallback,
    builder: BlockTextBuilder,
    formatOptions: FormatOptions,
) => void;

/**
 * A function to process child nodes.
 * Passed into a {@link FormatCallback} as an argument.
 */
export type RecursiveCallback = (nodes: DomNode[], builder: BlockTextBuilder) => void;

/**
 * Type of object passed to tags in the options.
 *
 * Each key is a tag name mapped to an optional {@link TagDefinition}.
 */
export interface TagDefinitions {
    // NOTE(review): the empty-string key presumably configures tags without a
    // dedicated entry - confirm against the library documentation.
    ""?: TagDefinition | undefined;
    a?: TagDefinition | undefined;
    article?: TagDefinition | undefined;
    aside?: TagDefinition | undefined;
    blockquote?: TagDefinition | undefined;
    br?: TagDefinition | undefined;
    div?: TagDefinition | undefined;
    footer?: TagDefinition | undefined;
    form?: TagDefinition | undefined;
    h1?: TagDefinition | undefined;
    h2?: TagDefinition | undefined;
    h3?: TagDefinition | undefined;
    h4?: TagDefinition | undefined;
    h5?: TagDefinition | undefined;
    h6?: TagDefinition | undefined;
    header?: TagDefinition | undefined;
    hr?: TagDefinition | undefined;
    img?: TagDefinition | undefined;
    main?: TagDefinition | undefined;
    nav?: TagDefinition | undefined;
    ol?: TagDefinition | undefined;
    p?: TagDefinition | undefined;
    pre?: TagDefinition | undefined;
    table?: TagDefinition | undefined;
    ul?: TagDefinition | undefined;
    wbr?: TagDefinition | undefined;
}