import { Locator, Page } from "playwright-core";
import { BasicSchema } from "../ai-extractors/jsonSchema";
import { HtmlStrategy, ImageStrategy } from "../ai-extractors";

// NOTE(review): the generic type arguments in this file (`Record<...>`, `Promise<...>`) were missing from the
// reviewed text and have been restored from the surrounding declarations and the documented example outputs.
// Confirm the record value types (`string` vs `string | null`) against the implementation.

/**
 * Extracts an array of structured data from a web page in an optimized way. This function will use AI for the
 * first n times, until it collects multiple examples; then it will build reliable selectors in the background
 * to make the process more efficient.
 *
 * @param page - The Playwright Page object from which to extract the data.
 * @param options.label - A label for this extraction process, used for billing and monitoring.
 * @param options.itemEntityName - The name of the entity items being extracted. It must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
 * @param options.itemEntitySchema - The schema of the entity items being extracted.
 * @param options.strategy - Optional. The strategy to use for extraction. If not provided, the html strategy with claude haiku will be used.
 * @param options.prompt - Optional. A prompt to guide the extraction process.
 * @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties. It receives the extracted items and returns a list of strings.
 * @param options.variantKey - Optional. A variant key for the extraction process. Use this when the page has multiple variants/shapes.
 * @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
 * @returns A promise that resolves to a list of extracted data.
 *
 * @example
 * ```typescript extractArrayFromPage
 * import { extractArrayFromPage } from "@intuned/sdk/optimized-extractors";
 *
 * await page.goto("https://books.toscrape.com/")
 * const books = await extractArrayFromPage(page,
 *   {
 *     strategy: {
 *       model: "gpt4-turbo",
 *       type: "HTML"
 *     },
 *     itemEntityName: "book",
 *     label: "books-extraction",
 *     itemEntitySchema: {
 *       type: "object",
 *       required: ["name"],
 *       properties: {
 *         name: {
 *           type: "string",
 *           description: "book name",
 *           primary: true
 *         }
 *       }
 *     }
 *   },
 * )
 *
 * console.log(books)
 *
 * // output:
 * // [
 * //   ...
 * //   { name: 'Olio' },
 * //   { name: 'Mesaerion: The Best Science Fiction Stories 1800-1849' },
 * //   { name: 'Libertarianism for Beginners' },
 * //   { name: "It's Only the Himalayas" }
 * //   ...
 * // ]
 *
 * ```
 */
export declare function extractArrayFromPage(
  page: Page,
  options: {
    label: string;
    itemEntityName: string;
    itemEntitySchema: SimpleArrayItemSchema;
    strategy?: ImageStrategy | HtmlStrategy;
    prompt?: string;
    optionalPropertiesInvalidator?: (
      result: Record<string, string>[]
    ) => string[];
    variantKey?: string;
    apiKey?: string;
  }
): Promise<Record<string, string>[]>;

/**
 * Extracts an array of structured data from a locator.
 *
 * @param locator - The Playwright Locator object from which to extract the data.
 * @param options.label - A label for this extraction process, used for billing and monitoring.
 * @param options.itemEntityName - The name of the entity items being extracted. It must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
 * @param options.itemEntitySchema - The schema of the entity items being extracted.
 * @param options.strategy - Optional. The strategy to use for extraction. If not provided, the html strategy with claude haiku will be used.
 * @param options.prompt - Optional. A prompt to guide the extraction process.
 * @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties. It receives the extracted items and returns a list of strings.
 * @param options.variantKey - Optional. A variant key for the extraction process.
 * @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
 * @returns A promise that resolves to a list of extracted data.
 *
 * @example
 * ```typescript extractArrayFromLocator
 * import { extractArrayFromLocator } from "@intuned/sdk/optimized-extractors";
 *
 * await page.goto("https://books.toscrape.com/")
 * const books = await extractArrayFromLocator(page.locator("section"),
 *   {
 *     itemEntityName: "book",
 *     label: "books-extraction",
 *     itemEntitySchema: {
 *       type: "object",
 *       required: ["name"],
 *       properties: {
 *         name: {
 *           type: "string",
 *           description: "book name",
 *           primary: true
 *         }
 *       }
 *     }
 *   },
 * )
 *
 * console.log(books)
 *
 * // output:
 * // [
 * //   ...
 * //   { name: 'Olio' },
 * //   { name: 'Mesaerion: The Best Science Fiction Stories 1800-1849' },
 * //   { name: 'Libertarianism for Beginners' },
 * //   { name: "It's Only the Himalayas" }
 * //   ...
 * // ]
 *
 * ```
 */
export declare function extractArrayFromLocator(
  locator: Locator,
  options: {
    label: string;
    itemEntityName: string;
    itemEntitySchema: SimpleArrayItemSchema;
    strategy?: ImageStrategy | HtmlStrategy;
    prompt?: string;
    optionalPropertiesInvalidator?: (
      result: Record<string, string>[]
    ) => string[];
    variantKey?: string;
    apiKey?: string;
  }
): Promise<Record<string, string>[]>;

/**
 * Schema of a single string property inside a {@link SimpleObjectSchema}.
 * @interface SimpleObjectStringSchema
 * @extends BasicSchema
 * @property type - The type of the schema, which is always "string".
 */
interface SimpleObjectStringSchema extends BasicSchema {
  type: "string";
}

/**
 * Schema of a single string property inside a {@link SimpleArrayItemSchema}.
 * @interface SimpleArrayStringSchema
 * @extends BasicSchema
 * @property type - The type of the schema, which is always "string".
 * @property [primary] - Optional. Indicates whether this is a primary property.
 */
interface SimpleArrayStringSchema extends BasicSchema {
  type: "string";
  primary?: boolean;
}

/**
 * A simple object schema with properties.
 * @interface SimpleObjectSchema
 * @extends BasicSchema
 * @property type - The type of the schema, which is always "object".
 * @property properties - The properties of the object, keyed by property name.
 * @property required - The required properties of the object.
 */
export interface SimpleObjectSchema extends BasicSchema {
  type: "object";
  properties: Record<string, SimpleObjectStringSchema>;
  required: string[];
}

/**
 * A simple array item schema with properties.
 * @interface SimpleArrayItemSchema
 * @extends BasicSchema
 * @property type - The type of the schema, which is always "object".
 * @property properties - The properties of the array item, keyed by property name.
 * @property required - The required properties of the array item.
 */
export interface SimpleArrayItemSchema extends BasicSchema {
  type: "object";
  properties: Record<string, SimpleArrayStringSchema>;
  required: string[];
}

/**
 * Extracts a structured object from a web page.
 *
 * @param page - The Playwright Page object from which to extract the data.
 * @param options.label - A label for this extraction process, used for billing and monitoring.
 * @param options.entityName - The name of the entity being extracted. It must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
 * @param options.entitySchema - The schema of the entity being extracted.
 * @param options.strategy - Optional. The strategy to use for extraction. If not provided, the html strategy with claude haiku will be used.
 * @param options.prompt - Optional. A prompt to guide the extraction process.
 * @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties. It receives the extracted object (or null) and returns a list of strings.
 * @param options.variantKey - Optional. A variant key for the extraction process.
 * @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
 * @returns A promise that resolves to the extracted object, or null.
 *
 * @example
 * ```typescript extractObjectFromPage
 * import { extractObjectFromPage } from "@intuned/sdk/optimized-extractors";
 *
 * await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
 * const book = await extractObjectFromPage(page,
 *   {
 *     entityName: "book",
 *     label: "book-extraction",
 *     entitySchema: {
 *       type: "object",
 *       required: ["name","price","reviews"],
 *       properties: {
 *         name: {
 *           type: "string",
 *           description: "book name",
 *         },
 *         price: {
 *           type: "string",
 *           description: "book price"
 *         },
 *         reviews: {
 *           type: "string",
 *           description: "Number of reviews"
 *         }
 *
 *       }
 *     }
 *   },
 * )
 *
 * console.log(book)
 *
 * // output:
 * // { name: 'A Light in the Attic', price: '£51.77', reviews: '0' }
 *
 * ```
 */
export declare function extractObjectFromPage(
  page: Page,
  options: {
    label: string;
    entityName: string;
    entitySchema: SimpleObjectSchema;
    strategy?: ImageStrategy | HtmlStrategy;
    prompt?: string;
    optionalPropertiesInvalidator?: (
      result: Record<string, string | null> | null
    ) => string[];
    variantKey?: string;
    apiKey?: string;
  }
): Promise<Record<string, string | null> | null>;

/**
 * Extracts a structured object from a locator.
 *
 * @param locator - The Playwright Locator object from which to extract the data.
 * @param options.label - A label for this extraction process, used for billing and monitoring.
 * @param options.entityName - The name of the entity being extracted. It must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
 * @param options.entitySchema - The schema of the entity being extracted.
 * @param options.strategy - Optional. The strategy to use for extraction. If not provided, the html strategy with claude haiku will be used.
 * @param options.prompt - Optional. A prompt to guide the extraction process.
 * @param options.optionalPropertiesInvalidator - Optional. A function to invalidate optional properties. It receives the extracted object (or null) and returns a list of strings.
 * @param options.variantKey - Optional. A variant key for the extraction process.
 * @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
 * @returns A promise that resolves to the extracted object, or null.
 *
 * @example
 * ```typescript extractObjectFromLocator
 * import { extractObjectFromLocator } from "@intuned/sdk/optimized-extractors";
 *
 * await page.goto("https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
 * const book = await extractObjectFromLocator(page.locator(".page_inner"),
 *   {
 *     entityName: "book",
 *     label: "book-extraction",
 *     entitySchema: {
 *       type: "object",
 *       required: ["name","price","reviews"],
 *       properties: {
 *         name: {
 *           type: "string",
 *           description: "book name",
 *         },
 *         price: {
 *           type: "string",
 *           description: "book price"
 *         },
 *         reviews: {
 *           type: "string",
 *           description: "Number of reviews"
 *         }
 *
 *       }
 *     }
 *   },
 * )
 *
 * console.log(book)
 *
 * // output:
 * // { name: 'A Light in the Attic', price: '£51.77', reviews: '0' }
 *
 * ```
 */
export declare function extractObjectFromLocator(
  locator: Locator,
  options: {
    label: string;
    entityName: string;
    entitySchema: SimpleObjectSchema;
    strategy?: ImageStrategy | HtmlStrategy;
    prompt?: string;
    optionalPropertiesInvalidator?: (
      result: Record<string, string | null> | null
    ) => string[];
    variantKey?: string;
    apiKey?: string;
  }
): Promise<Record<string, string | null> | null>;