import { FileReader, Document } from '@llamaindex/core/schema';
import { Opts } from 'string-strip-html';

/**
 * Extract the significant text from an arbitrary HTML document.
 * The contents of any head, script, style, and xml tags are removed completely.
 * The URLs for a[href] tags are extracted, along with the inner text of the tag.
 * All other tags are removed, and the inner text is kept intact.
 * Html entities (e.g., `&amp;`) are not decoded.
 */
declare class HTMLReader extends FileReader {
  /**
   * Public method for this reader.
   * Required by BaseReader interface.
   * @param fileContent - The content of the file.
   * @returns `Promise<Document[]>` A Promise object, eventually yielding zero or one Document parsed from the HTML content of the specified file.
   */
  loadDataAsContent(fileContent: Uint8Array): Promise<Document[]>;

  /**
   * Wrapper for string-strip-html usage.
   * @param html - Raw HTML content to be parsed.
   * @param options - An object of options for the underlying library.
   * @see getOptions
   * @returns A Promise yielding the HTML content, stripped of unwanted tags and attributes.
   */
  parseContent(html: string, options?: Partial<Opts>): Promise<string>;

  /**
   * Wrapper for our configuration options passed to string-strip-html library.
   * @see https://codsen.com/os/string-strip-html/examples
   * @returns An object of options for the underlying library.
   */
  getOptions(): Partial<Opts>;
}

export { HTMLReader };