import { Locator, Page } from "playwright-core";
import { JsonSchema, ObjectSchema } from "./jsonSchema";

/**
 * Extracts structured data from a file.
 *
 * @param file - The file you want to extract the data from.
 * @param options.label - A label for this extraction process, used for billing and monitoring.
 * @param options.dataSchema - The JSON schema of the data you're trying to extract.
 * @param options.prompt - Optional. A prompt to guide the extraction process and provide more context.
 * @param options.strategy - Optional. The strategy to use for extraction. Use `IMAGE` if the info you're trying to extract is visual and cannot be converted to markdown. Defaults to the `MARKDOWN` strategy with the `gpt4-turbo` model.
 * @returns A promise that resolves to the extracted structured data. Its shape follows `options.dataSchema`.
 *
 * @example
 * ```typescript extractStructuredDataFromFile
 * import { extractStructuredDataFromFile } from "@intuned/sdk/ai-extractors";
 *
 * const movie = await extractStructuredDataFromFile({
 *   source: {
 *     type: "url",
 *     data: "https://example.com/file.pdf"
 *   },
 *   type: "pdf",
 *   // pages array is optional, do not pass it if you want to include all pages in the process
 *   pages: [1, 2]
 * }, {
 *   label: "extract_movie",
 *   dataSchema: {
 *     type: "object",
 *     properties: {
 *       "name": {
 *         type: "string",
 *         description: "movie name"
 *       },
 *       revenue: {
 *         type: "string",
 *         description: "movie revenue"
 *       }
 *     }
 *   }
 * })
 *
 * ```
 */
export declare function extractStructuredDataFromFile(
  file: ImageFile | PdfFile | SpreadsheetFile | DocumentFile,
  options: {
    label: string;
    dataSchema: JsonSchema;
    prompt?: string;
    strategy?: MarkdownFileStrategy | ImageFileStrategy;
  }
  // The result shape is dictated by `dataSchema` at call time, so it is intentionally left as `any`.
): Promise<any>;

/**
 * Represents a file source from a buffer.
 *
 * @interface
 * @property type - The type of the file source, which is always "buffer".
 * @property data - The buffer data of the file.
 */
interface FileBufferSource {
  type: "buffer";
  data: Buffer;
}

/**
 * Represents a file source from a URL.
 *
 * @interface
 * @property type - The type of the file source, which is always "url".
 * @property data - The URL of the file.
 */
interface FileUrlSource {
  type: "url";
  data: string;
}

/**
 * Represents a file source from a base64 string.
 *
 * @interface
 * @property type - The type of the file source, which is always "base64".
 * @property data - The base64 string of the file data.
 */
interface FileBase64Source {
  type: "base64";
  data: string;
}

/**
 * Represents an image file source.
 *
 * @interface
 * @property type - The type of the file, which is always "image".
 * @property source - The source of the file data.
 */
interface ImageFile {
  type: "image";
  source: FileBufferSource | FileUrlSource | FileBase64Source;
}

/**
 * Represents a PDF file source.
 *
 * @interface
 * @property type - The type of the file, which is always "pdf".
 * @property [pages] - Optional. The specific pages of the PDF to extract data from. If not provided, all pages will be included.
 * @property source - The source of the file data.
 */
export interface PdfFile {
  type: "pdf";
  pages?: number[];
  source: FileBufferSource | FileUrlSource | FileBase64Source;
}

/**
 * Represents a Spreadsheet file source. For now, only .xlsx Excel spreadsheets are supported.
 *
 * @interface
 * @property type - The type of the file, which is always "spreadsheet".
 * @property sheetName - The name of the sheet to extract data from.
 * @property source - The source of the file data.
 */
export interface SpreadsheetFile {
  type: "spreadsheet";
  sheetName: string;
  source: FileBufferSource | FileUrlSource | FileBase64Source;
}

/**
 * Represents a Document file source. For now, only .docx Word files are supported.
 *
 * @interface
 * @property type - The type of the file, which is always "document".
 * @property [pages] - Optional. The specific pages of the document to extract data from. If not provided, all pages will be included.
 * @property source - The source of the file data.
 */
export interface DocumentFile {
  type: "document";
  pages?: number[];
  source: FileBufferSource | FileUrlSource | FileBase64Source;
}

/**
 * Extracts tables from a file (image, PDF, spreadsheet or document).
 *
 * @param file - The file you want to extract the tables from.
 * @param options.label - A label for this extraction process, used for billing and monitoring.
 * @returns A promise that resolves to an array of extracted tables.
 *
 * @example
 * ```typescript extractTablesFromFile
 * import { extractTablesFromFile } from "@intuned/sdk/ai-extractors";
 *
 * const tables = await extractTablesFromFile({
 *   source: {
 *     type: "url",
 *     data: "https://example.com/file.pdf"
 *   },
 *   type: "pdf",
 *   // pages array is optional, do not pass it if you want to include all pages in the process
 *   pages: [1, 2]
 * }, {
 *   label: "extract_tables"
 * });
 *
 * console.log(tables);
 * ```
 */
export declare function extractTablesFromFile(
  file: ImageFile | PdfFile | SpreadsheetFile | DocumentFile,
  options: {
    label: string;
  }
): Promise<ExtractedTable[]>;

/**
 * Represents a table extracted from a file.
 *
 * @interface
 * @property title - The title of the table, or `null` if none was found.
 * @property content - A two-dimensional array containing the table values.
 */
interface ExtractedTable {
  title: string | null;
  content: (string | null)[][];
}

/**
 * Converts a file (image, PDF, spreadsheet or document) to markdown.
 *
 * @param file - The file you want to extract the markdown content from.
 * @param options.label - A label for this extraction process, used for billing and monitoring.
 * @returns A promise that resolves to the extracted markdown content as a string.
 *
 * @example
 * ```typescript extractMarkdownFromFile
 * import { extractMarkdownFromFile } from "@intuned/sdk/ai-extractors";
 *
 * const markdown = await extractMarkdownFromFile({
 *   source: {
 *     type: "url",
 *     data: "https://example.com/file.pdf"
 *   },
 *   type: "pdf",
 *   // pages array is optional, do not pass it if you want to include all pages in the process
 *   pages: [1, 2]
 * }, {
 *   label: "extract_markdown"
 * });
 *
 * console.log(markdown);
 * ```
 */
export declare function extractMarkdownFromFile(
  file: ImageFile | PdfFile | SpreadsheetFile | DocumentFile,
  options: {
    label: string;
  }
): Promise<string>;

/**
 * Extracts structured data from a web page.
 *
 * @param page - The Playwright Page from which to extract the structured data.
 * @param options
 * @param options.label - A label for this extraction process, used for billing and monitoring.
 * @param options.dataSchema - The JSON schema of the data you're trying to extract.
 * @param options.prompt - Optional. A prompt to guide the extraction process and provide more context.
 * @param options.strategy - Optional. The strategy to use for extraction. Use the `IMAGE` strategy if the info you're trying to extract is visual and does not exist in the HTML of the page.
 * @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
 * @returns A promise that resolves to the extracted structured data. Its shape follows `options.dataSchema`.
 *
 * @example
 * ```typescript extractStructuredDataFromPage
 * import { extractStructuredDataFromPage } from "@intuned/sdk/ai-extractors";
 *
 * await page.goto('https://example.com');
 *
 * const options = {
 *   label: "extract_page_data",
 *   dataSchema: {
 *     type: "object",
 *     properties: {
 *       title: { type: "string", description: "The title of the page" },
 *       date: { type: "string", description: "The date of the content" }
 *     }
 *   },
 * };
 *
 * const data = await extractStructuredDataFromPage(page, options);
 * console.log(data);
 *
 * ```
 */
export declare function extractStructuredDataFromPage(
  page: Page,
  options: {
    label: string;
    dataSchema: JsonSchema;
    prompt?: string;
    strategy?: ImageStrategy | HtmlStrategy;
    apiKey?: string;
  }
  // The result shape is dictated by `dataSchema` at call time, so it is intentionally left as `any`.
): Promise<any>;

/**
 * Extracts structured data from a Playwright locator.
 *
 * @param locator - The Playwright locator from which to extract the structured data.
 * @param options
 * @param options.label - A label for this extraction process, used for billing and monitoring.
 * @param options.dataSchema - The JSON schema of the data you're trying to extract.
 * @param options.prompt - Optional. A prompt to guide the extraction process and provide more context.
 * @param options.strategy - Optional. The strategy to use for extraction. Use the `IMAGE` strategy if the info you're trying to extract is visual and does not exist in the HTML of the locator.
 * @returns A promise that resolves to the extracted structured data. Its shape follows `options.dataSchema`.
 *
 * @example
 * ```typescript extractStructuredDataFromLocator
 * import { extractStructuredDataFromLocator } from "@intuned/sdk/ai-extractors";
 *
 * await page.goto('https://example.com');
 *
 * const options = {
 *   label: "extract_locator_data",
 *   dataSchema: {
 *     type: "object",
 *     properties: {
 *       title: { type: "string", description: "The title of the page" },
 *       date: { type: "string", description: "The date of the content" }
 *     }
 *   },
 * };
 *
 * const data = await extractStructuredDataFromLocator(page.locator(".section"), options);
 * console.log(data);
 *
 * ```
 */
export declare function extractStructuredDataFromLocator(
  locator: Locator,
  options: {
    label: string;
    dataSchema: JsonSchema;
    prompt?: string;
    strategy?: ImageStrategy | HtmlStrategy;
  }
  // The result shape is dictated by `dataSchema` at call time, so it is intentionally left as `any`.
): Promise<any>;

/**
 * This strategy uses a screenshot of the page/locator, with some processing, to extract the needed data.
 * It should be used when the information you're trying to extract is not present in the DOM as text but can be identified visually.
 *
 * @interface
 * @property model - The model to use in the extraction process.
 * @property type - The type of the strategy, which is always "IMAGE".
 */
export interface ImageStrategy {
  model:
    | "claude-3-haiku"
    | "claude-3-haiku-20240307"
    | "claude-3-opus"
    | "claude-3-opus-20240229"
    | "claude-3-sonnet"
    | "claude-3-sonnet-20240229"
    | "claude-3.5-sonnet"
    | "claude-3-5-sonnet-20240620"
    | "claude-3-5-sonnet-20241022"
    | "gpt4-turbo"
    | "gpt-4-turbo-2024-04-09"
    | "gpt-4o"
    | "gpt-4o-2024-05-13"
    | "gpt-4o-mini"
    | "gpt-4o-mini-2024-07-18"
    | "gemini-1.5-pro"
    | "gemini-1.5-pro-002"
    | "gemini-1.5-flash-8b"
    | "gemini-1.5-flash-8b-002"
    | "gemini-1.5-flash"
    | "gemini-1.5-flash-002"
    | "gemini-2.0-flash-exp";
  type: "IMAGE";
}

/**
 * This strategy uses the HTML of the page/locator to extract the needed data. Some attributes are filtered out to reduce context.
 * The only attributes included are: `aria-label` `data-name` `name` `type` `placeholder` `value` `role` `title` `href` `id` `alt`.
 *
 * @interface
 * @property model - The model to use in the extraction process.
 * @property type - The type of the strategy, which is always "HTML".
 */
export interface HtmlStrategy {
  model:
    | "claude-3-haiku"
    | "claude-3-haiku-20240307"
    | "claude-3-5-haiku"
    | "claude-3-5-haiku-20241022"
    | "claude-3-opus"
    | "claude-3-opus-20240229"
    | "claude-3-sonnet"
    | "claude-3-sonnet-20240229"
    | "claude-3.5-sonnet"
    | "claude-3-5-sonnet-20240620"
    | "claude-3-5-sonnet-20241022"
    | "gpt4-turbo"
    | "gpt-4-turbo-2024-04-09"
    | "gpt3.5-turbo"
    | "gpt-3.5-turbo-0125"
    | "gpt-4o"
    | "gpt-4o-2024-05-13"
    | "gpt-4o-mini"
    | "gpt-4o-mini-2024-07-18"
    | "gemini-1.5-pro"
    | "gemini-1.5-pro-002"
    | "gemini-1.5-flash-8b"
    | "gemini-1.5-flash-8b-002"
    | "gemini-1.5-flash"
    | "gemini-1.5-flash-002"
    | "gemini-2.0-flash-exp";
  type: "HTML";
}

/**
 * This strategy extracts markdown content from the file and then runs data extraction on it.
 *
 * @interface
 * @property model - The model to use in the extraction process.
 * @property type - The type of the strategy, which is always "MARKDOWN".
 */
export interface MarkdownFileStrategy {
  model:
    | "claude-3-haiku"
    | "claude-3-haiku-20240307"
    | "claude-3-5-haiku"
    | "claude-3-5-haiku-20241022"
    | "claude-3-opus"
    | "claude-3-opus-20240229"
    | "claude-3-sonnet"
    | "claude-3-sonnet-20240229"
    | "claude-3.5-sonnet"
    | "claude-3-5-sonnet-20240620"
    | "claude-3-5-sonnet-20241022"
    | "gpt4-turbo"
    | "gpt-4-turbo-2024-04-09"
    | "gpt3.5-turbo"
    | "gpt-3.5-turbo-0125"
    | "gpt-4o"
    | "gpt-4o-2024-05-13"
    | "gpt-4o-mini"
    | "gpt-4o-mini-2024-07-18"
    | "gemini-1.5-pro"
    | "gemini-1.5-pro-002"
    | "gemini-1.5-flash-8b"
    | "gemini-1.5-flash-8b-002"
    | "gemini-1.5-flash"
    | "gemini-1.5-flash-002"
    | "gemini-2.0-flash-exp";
  type: "MARKDOWN";
}

/**
 * This strategy uses the image content of the file to extract the needed data.
 * It should be used when the information you're trying to extract cannot be converted to markdown.
 * For example, a checkbox in a pdf file.
 *
 * @interface
 * @property model - The model to use in the extraction process.
 * @property type - The type of the strategy, which is always "IMAGE".
 */
export interface ImageFileStrategy {
  model:
    | "claude-3-haiku"
    | "claude-3-haiku-20240307"
    | "claude-3-opus"
    | "claude-3-opus-20240229"
    | "claude-3-sonnet"
    | "claude-3-sonnet-20240229"
    | "claude-3.5-sonnet"
    | "claude-3-5-sonnet-20240620"
    | "claude-3-5-sonnet-20241022"
    | "gpt4-turbo"
    | "gpt-4-turbo-2024-04-09"
    | "gpt-4o"
    | "gpt-4o-2024-05-13"
    | "gpt-4o-mini"
    | "gpt-4o-mini-2024-07-18"
    | "gemini-1.5-pro"
    | "gemini-1.5-pro-002"
    | "gemini-1.5-flash-8b"
    | "gemini-1.5-flash-8b-002"
    | "gemini-1.5-flash"
    | "gemini-1.5-flash-002"
    | "gemini-2.0-flash-exp";
  type: "IMAGE";
}

/**
 * Extracts structured data from content items (text or images).
 *
 * @param content - The content item, or array of content items, from which to extract the structured data.
 * @param options.label - A label for this extraction process, used for billing and monitoring.
 * @param options.dataSchema - The JSON schema of the data you're trying to extract.
 * @param [options.prompt] - Optional. A prompt to guide the extraction process.
 * @param options.model - The model to use for extraction.
 * @param options.apiKey - Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
 * @returns A promise that resolves to the extracted structured data. Its shape follows `options.dataSchema`.
 *
 * @example
 * ```typescript extractStructuredDataFromContent
 * import { extractStructuredDataFromContent } from "@intuned/sdk/ai-extractors";
 *
 * const content = [
 *   { type: "text", data: "Sample text data" },
 *   {
 *     type: "image-url",
 *     image_type: "jpeg",
 *     data: "https://example.com/image.jpg"
 *   }
 * ];
 *
 * const options = {
 *   label: "extract_contact_info",
 *   dataSchema: {
 *     type: "object",
 *     properties: {
 *       name: { type: "string", description: "contact name" },
 *       phone: { type: "string", description: "contact info" }
 *     }
 *   },
 *   model: "gpt4-turbo"
 * };
 *
 * const data = await extractStructuredDataFromContent(content, options);
 * console.log(data);
 * ```
 */
export declare function extractStructuredDataFromContent(
  content:
    | (TextContentItem | ImageBufferContentItem | ImageUrlContentItem)[]
    | TextContentItem
    | ImageBufferContentItem
    | ImageUrlContentItem,
  options: {
    label: string;
    dataSchema: ObjectSchema;
    prompt?: string;
    model:
      | "claude-3-haiku"
      | "claude-3-haiku-20240307"
      | "claude-3-5-haiku"
      | "claude-3-5-haiku-20241022"
      | "claude-3-opus"
      | "claude-3-opus-20240229"
      | "claude-3-sonnet"
      | "claude-3-sonnet-20240229"
      | "claude-3.5-sonnet"
      | "claude-3-5-sonnet-20240620"
      | "claude-3-5-sonnet-20241022"
      | "gpt4-turbo"
      | "gpt-4-turbo-2024-04-09"
      | "gpt3.5-turbo"
      | "gpt-3.5-turbo-0125"
      | "gpt-4o"
      | "gpt-4o-2024-05-13"
      | "gpt-4o-mini"
      | "gpt-4o-mini-2024-07-18"
      | "gemini-1.5-pro"
      | "gemini-1.5-pro-002"
      | "gemini-1.5-flash-8b"
      | "gemini-1.5-flash-8b-002"
      | "gemini-1.5-flash"
      | "gemini-1.5-flash-002"
      | "gemini-2.0-flash-exp";
    apiKey?: string;
  }
  // The result shape is dictated by `dataSchema` at call time, so it is intentionally left as `any`.
): Promise<any>;

/**
 * A plain-text content item.
 *
 * @interface
 * @property type - The type of the content item, which is always "text".
 * @property data - The text data.
 */
export interface TextContentItem {
  type: "text";
  data: string;
}

/**
 * An image content item backed by an in-memory buffer.
 *
 * @interface
 * @property type - The type of the content item, which is always "image-buffer".
 * @property image_type - The image format: "png", "jpeg", "gif" or "webp".
 * @property data - The buffer containing the image data.
 */
export interface ImageBufferContentItem {
  type: "image-buffer";
  image_type: "png" | "jpeg" | "gif" | "webp";
  data: Buffer;
}

/**
 * An image content item referenced by URL.
 *
 * @interface
 * @property type - The type of the content item, which is always "image-url".
 * @property image_type - The image format: "png", "jpeg", "gif" or "webp".
 * @property data - The URL of the image.
 */
export interface ImageUrlContentItem {
  type: "image-url";
  image_type: "png" | "jpeg" | "gif" | "webp";
  data: string;
}

/**
 * Extracts markdown content from a web page.
 *
 * @param page - The Playwright Page object from which to extract the markdown content.
 * @returns A promise that resolves to the extracted markdown content.
 *
 * @example
 * ```typescript extractMarkdownFromPage
 * import { extractMarkdownFromPage } from "@intuned/sdk/ai-extractors";
 *
 * await page.goto('https://example.com');
 * const markdown = await extractMarkdownFromPage(page);
 * console.log(markdown);
 *
 * ```
 */
export declare function extractMarkdownFromPage(page: Page): Promise<string>;

/**
 * Extracts markdown content from a specific locator within a web page.
 *
 * @param locator - The Playwright Locator object from which to extract the markdown content.
 * @returns A promise that resolves to the extracted markdown content.
 *
 * @example
 * ```typescript extractMarkdownFromLocator
 * import { extractMarkdownFromLocator } from "@intuned/sdk/ai-extractors";
 *
 * await page.goto('https://example.com');
 * const locator = page.locator('.article');
 * const markdown = await extractMarkdownFromLocator(locator);
 * console.log(markdown);
 *
 * ```
 */
export declare function extractMarkdownFromLocator(
  locator: Locator
): Promise<string>;