import { Resource } from '../../model/index.js';
import { MetaValues } from './parse-meta-tags.js';
import { ElementData } from './find-element-data.js';
import { JsonMap } from '@salesforce/ts-types';
/**
 * Options to control extraction of structured data from HTML pages.
 *
 * Every flag is optional. Defaults are decided by the implementation, which
 * is not part of this declaration file.
 */
export type PageDataOptions = {
    /**
     * Parse all available information, ignoring any other 'false' parameters set in
     * the options object.
     */
    all?: boolean;
    /**
     * Parse and list the attributes of the HTML `<body>` tag. Body classes and IDs are
     * often populated with contextual and content-related metadata by CMS templates.
     */
    attributes?: boolean;
    /**
     * Extract common HTML head sub-tags like `<title>`, `<base>`, and so on.
     */
    head?: boolean;
    /**
     * Parse and list any `<meta>` tags present in the document. These will be returned
     * as a dictionary keyed by the 'name', 'itemprop', and 'property' attributes of the
     * meta tags; keys with colons will be treated as nested; `<meta name="og:title" ...>`
     * for example will become `meta['og']['title'] = ...`
     */
    meta?: boolean;
    /**
     * Parse and list any `<link>` tags present in the document. These will be returned
     * as a dictionary keyed by the links' `rel` attributes.
     */
    links?: boolean;
    /**
     * Parse and list any `<noscript>` tags present in the document.
     */
    noscript?: boolean;
    /**
     * Parse and list any `<script>` tags in the document; JSON data will be parsed
     * and stored in a separate 'json' property of the results.
     */
    scripts?: boolean;
    /**
     * Parse and list any JSON or JSON+LD tags in the document, even if normal scripts
     * are ignored.
     */
    json?: boolean;
    /**
     * If present, move Schema.org JSON+LD data to a dedicated property in the results.
     */
    schemaOrg?: boolean;
    /**
     * Parse and list any CSS `<style>` tags present in the document.
     */
    styles?: boolean;
    /**
     * Parse and list any HTML `<template>` tags present in the document.
     */
    templates?: boolean;
    /**
     * Ignore, rather than modify, tags and structured data that are present but
     * incomplete or in the wrong location. Turning on 'strict' mode will throw away
     * quite a bit of data, because everyone on the planet outputs horribly malformed
     * meta tags, RDFa gunk, and so on. It's a plague. Nightmarish stuff, really.
     */
    strict?: boolean;
    /**
     * A list of HTML Meta tags whose attributes should be treated as comma-delimited
     * lists rather than strings.
     */
    metaArrayAttributes?: string[];
};
/**
 * Structured data parsed from an HTML document, as returned by `getPageData`.
 *
 * `getPageData` makes a best attempt to return accurate and complete data that's
 * actually easy to retrieve in code, without rewriting a parser. Nobody wants that.
 *
 * Every named property is optional. NOTE(review): each one presumably appears only
 * when the same-named `PageDataOptions` flag is enabled — confirm against the
 * implementation.
 */
export interface PageData {
    /** Open index signature: additional keys are allowed and are typed `unknown`. */
    [key: string]: unknown;
    /** Attribute data for an element; presumably the `<body>` tag (see the `attributes` option). */
    attributes?: ElementData;
    /** Presumably the document `<title>` text (see the `head` option) — confirm. */
    title?: string;
    /** Presumably taken from the `<base>` tag (see the `head` option) — confirm. */
    base?: string;
    /** Presumably the `<base>` tag's target (see the `head` option) — confirm. */
    baseTarget?: string;
    /** Parsed `<meta>` tag values; see the `meta` option for the keying rules. */
    meta?: MetaValues;
    /** `<link>` tag records grouped in a dictionary; the `links` option describes keying by `rel`. */
    links?: Record<string, Record<string, string | undefined>[]>;
    /** One record per `<template>` tag (see the `templates` option). */
    templates?: Record<string, string | undefined>[];
    /** One record per `<script>` tag (see the `scripts` option). */
    scripts?: Record<string, string | undefined>[];
    /** Parsed JSON / JSON+LD payloads (see the `scripts` and `json` options). */
    json?: Record<string, unknown>[];
    /** Schema.org JSON+LD data moved to its own property (see the `schemaOrg` option). */
    schemaOrg?: JsonMap;
    /** One record per `<style>` tag (see the `styles` option). */
    styles?: Record<string, string | undefined>[];
    /** One record per `<noscript>` tag (see the `noscript` option). */
    noscript?: Record<string, string | undefined>[];
}
/**
 * Extracts structured data from an HTML page.
 *
 * @param input - The page to inspect: a string, a cheerio root, or a `Resource`.
 *   NOTE(review): the string form is presumably raw HTML markup — confirm against
 *   the implementation, which is not visible in this declaration file.
 * @param customOptions - Optional flags selecting which categories of data to
 *   extract; see `PageDataOptions`.
 * @returns A promise that resolves to the extracted `PageData`.
 */
export declare function getPageData(input: string | cheerio.Root | Resource, customOptions?: PageDataOptions): Promise<PageData>;
//# sourceMappingURL=get-page-data.d.ts.map