import type { Page, Browser, BrowserContext } from 'playwright';
import type { EventEmitter } from 'eventemitter3';

/**
 * Configuration for browser pool
 */
export interface BrowserPoolConfig {
  /** Maximum number of browser instances in the pool */
  maxSize: number;
  /** Maximum age of browser instances in milliseconds before they are recycled. */
  maxAge: number;
  /**
   * Browser launch options for Playwright.
   * @see https://playwright.dev/docs/api/class-browsertype#browser-type-launch
   */
  launchOptions: {
    /** Whether to run the browser in headless mode. Defaults to true. */
    headless: boolean;
    /** Additional arguments to pass to the browser instance. */
    args: string[];
    /** Maximum time in milliseconds to wait for the browser instance to start. Defaults to 30000 (30 seconds). */
    timeout: number;
  };
  /** Interval in milliseconds at which to check for and clean up old/unused browser instances. */
  cleanupInterval: number;
}

/**
 * Browser instance in the pool
 */
export interface BrowserInstance {
  /** Identifier of this pooled instance. */
  id: string;
  /** The Playwright `Browser` backing this instance. */
  browser: Browser;
  /** The Playwright `BrowserContext` associated with this instance. */
  context: BrowserContext;
  /** The Playwright `Page` associated with this instance. */
  page: Page;
  /** When the instance was last used. NOTE(review): presumably ms since epoch — confirm against the pool implementation. */
  lastUsed: number;
  /** Whether the instance is currently in use. */
  inUse: boolean;
  /** When the instance was created. NOTE(review): presumably ms since epoch, compared against `maxAge` — confirm. */
  createdAt: number;
}

/**
 * Represents the version details of the scraper toolkit.
 * This can include information about the toolkit version, dependencies, and development dependencies.
 */
export interface ToolkitVersionDetails {
  /** Version of the toolkit. */
  version: string;
  /** Dependencies, keyed by name. NOTE(review): value type reconstructed as string (version specifier) — confirm. */
  dependencies: Record<string, string>;
  /** Development dependencies, keyed by name. NOTE(review): value type reconstructed as string — confirm. */
  devDependencies: Record<string, string>;
}

/**
 * Scraper execution options
 */
export interface ScraperExecutionOptions {
  /** Number of retry attempts */
  retries: number;
  /** Delay between retries in milliseconds */
  retryDelay: number;
  /** Request timeout in milliseconds */
  timeout: number;
  /** Whether to use proxy rotation */
  useProxyRotation: boolean;
  /** Custom headers, keyed by header name. */
  headers: Record<string, string>;
  /** User agent string */
  userAgent: string;
  /** Whether to enable JavaScript */
  javascript: boolean;
  /** Whether to load images during page navigation. Defaults to false. */
  loadImages: boolean;
  /**
   * Viewport configuration for the browser page.
   * @see https://playwright.dev/docs/api/class-page#page-set-viewport-size
   */
  viewport: {
    /** Viewport width in pixels. */
    width: number;
    /** Viewport height in pixels. */
    height: number;
  };
}

/**
 * Defines the types of hooks available in the scraper lifecycle.
 * - `beforeRequest`: Executed before a scraping request is made (e.g., before navigation).
 * - `afterRequest`: Executed after a scraping request has completed, regardless of success or failure.
 * - `onSuccess`: Executed only if the `parse` function completes successfully and returns data.
 * - `onError`: Executed if any error occurs during the scraping process (navigation, parsing, etc.).
 * - `onRetry`: Executed before a retry attempt is made after a failure.
 */
export type ScraperHook = 'beforeRequest' | 'afterRequest' | 'onSuccess' | 'onError' | 'onRetry';

/**
 * Defines the signature for a hook handler function.
 * A handler may be synchronous or return a promise.
 * @template Input The type of the input data for the scraper.
 * @template Output The type of the output data from the scraper's parse function.
 * @param context The scraper context, providing access to input, page, options, etc.
 */
export type HookHandler<Input = unknown, Output = unknown> = (
  context: ScraperContext<Input, Output>,
) => Promise<void> | void;

/**
 * Context object passed to the `parse` function and to all hook handlers.
 * It provides access to the current scraping state and resources.
 * @template Input The type of the input data for the scraper.
 * @template Output The type of the output data from the scraper's parse function.
 */
export interface ScraperContext<Input = unknown, Output = unknown> {
  /** The input data provided for the current scraping task. */
  input: Input;
  /** The Playwright `Page` object for interacting with the browser. */
  page: Page;
  /** The current attempt number for this scraping task (1 for first attempt, 2 for first retry, etc.). */
  attempt: number;
  /** Timestamp (ms since epoch) when the current scraping attempt started. */
  startTime: number;
  /** The execution options applicable to the current scraping task. */
  options: ScraperExecutionOptions;
  /** A key-value store for sharing custom data between hooks or within a scraping lifecycle. */
  metadata: Record<string, unknown>;
  /**
   * The result from the `parse` function. Available in `afterRequest` and `onSuccess` hooks if parsing was successful.
   * Potentially available in `onError` if error occurred after parsing.
   */
  result?: Output;
  /**
   * The error object if an error occurred. Available in `onError` and `afterRequest` hooks if an error was thrown.
   * Also available in `onRetry` hook.
   */
  error?: Error;
  /** Logger instance for logging within the scraper context, e.g., in parse function or hooks. */
  log: Logger;
}

/**
 * Defines the strategy for navigating to the target URL(s).
 * - `direct`: Directly navigates to the provided URL.
 * - `form`: Interacts with a form (filling fields, submitting) to reach target content.
 * - `api`: Primarily focuses on intercepting or making API calls, browser navigation might be secondary.
 * - `custom`: A user-defined navigation logic.
 */
export interface NavigationStrategy {
  /** The type of navigation strategy to employ. */
  type: 'direct' | 'form' | 'api' | 'custom';
  /** Configuration object specific to the chosen navigation `type`. */
  config: Record<string, unknown>;
}

/**
 * Defines the strategy for determining when a page is considered "ready" after navigation
 * or an action, before proceeding with parsing or further interactions.
 * - `selector`: Waits for a specific DOM selector to be present and visible.
 * - `response`: Waits for a specific network response (e.g., an API call).
 * - `timeout`: Waits for a fixed duration.
 * - `custom`: A user-defined wait logic.
 */
export interface WaitStrategy {
  /** The type of wait strategy to employ. */
  type: 'selector' | 'response' | 'timeout' | 'custom';
  /** Configuration object specific to the chosen wait `type`. */
  config: Record<string, unknown>;
}

/**
 * Defines the structure and behavior of a scraper.
 * @template Input The type of the input data this scraper expects.
 * @template Output The type of the data this scraper is expected to output after parsing.
 */
export interface ScraperDefinition<Input = unknown, Output = unknown> {
  /** Unique identifier for the scraper */
  id: string;
  /** Human-readable name */
  name: string;
  /** Description of what this scraper does */
  description?: string;
  /** Base URL or URL template */
  url: string;
  /** Navigation strategy */
  navigation: NavigationStrategy;
  /** Wait strategy */
  waitStrategy: WaitStrategy;
  /** Parse function to extract data */
  parse: (context: ScraperContext<Input, Output>) => Promise<Output>;
  /**
   * Validation function for input.
   * NOTE(review): the `boolean | string` return presumably means `true` = valid and a string = error message — confirm.
   */
  validateInput?: (input: Input) => boolean | string;
  /**
   * Validation function for output.
   * NOTE(review): same `boolean | string` convention as `validateInput` is assumed — confirm.
   */
  validateOutput?: (output: Output) => boolean | string;
  /** Whether this scraper requires CAPTCHA solving */
  requiresCaptcha: boolean;
  /**
   * Rate limiting configuration for this specific scraper.
   * Overrides global rate limit settings if provided.
   */
  rateLimit?: {
    /** Maximum number of requests allowed within the specified period. */
    requests: number;
    /** Time period in milliseconds during which the request limit applies. */
    period: number;
  };
  /** Custom execution options specific to this scraper. Merged with global/profile options. */
  options?: Partial<ScraperExecutionOptions>;
  /**
   * Scraper-specific lifecycle hooks.
   * Keys are hook names (`ScraperHook`), and values are arrays of `HookHandler` functions.
   * These hooks are executed in addition to any global hooks.
   */
  hooks?: Partial<Record<ScraperHook, HookHandler<Input, Output>[]>>;
  /** Tags for categorizing or filtering scrapers. */
  tags?: string[];
  /** Version of the scraper definition */
  version?: string;
}

/**
 * Scraper execution result
 * @template Output The type of the scraped data.
 */
export interface ScraperResult<Output = unknown> {
  /** Indicates whether the scraping task was successful. */
  success: boolean;
  /** The scraped data, if successful. Matches the `Output` type of the `ScraperDefinition`. */
  data?: Output;
  /** The error object, if the scraping task failed. */
  error?: Error;
  /** The total number of attempts made for this task (including retries). */
  attempts: number;
  /** The total duration of the scraping task in milliseconds. */
  duration: number;
  /** Metadata associated with the scraper execution. */
  metadata: {
    /** The ID of the scraper definition used. */
    scraperId: string;
    /** Timestamp (ms since epoch) when the scraping task was initiated. */
    timestamp: number;
    /** The User-Agent string used for the requests. */
    userAgent: string;
    /** The final URL from which data was parsed (or attempted). */
    url: string;
  };
}

/**
 * Defines the interface for a ScraperPlugin.
 * Plugins can extend the functionality of the ScraperEngine,
 * for example, by adding new methods, hooks, or modifying behavior.
 */
export interface ScraperPlugin {
  /** The unique name of the plugin. */
  name: string;
  /** The version of the plugin (e.g., semver string). */
  version: string;
  /**
   * Called when the plugin is installed via `engine.use(plugin)`.
   * This method should implement the plugin's setup logic, such as
   * registering global hooks, modifying engine properties, etc.
   * @param engine The instance of the ScraperEngine.
   */
  install: (engine: ScraperEngine) => void;
  /**
   * Optional method called if the plugin system supports uninstallation.
   * Should clean up any resources or modifications made by the `install` method.
   * @param engine The instance of the ScraperEngine.
   */
  uninstall?: (engine: ScraperEngine) => void;
}

/**
 * Defines the structure of events emitted by the ScraperEngine.
 * Consumers can listen to these events using `engine.on('eventName', handler)`.
 */
export interface ScraperEvents {
  /** Emitted when a scraper execution starts. Payload includes scraper ID and input. */
  'scraper:start': {
    scraperId: string;
    input: unknown;
  };
  /** Emitted when a scraper execution succeeds. Payload includes scraper ID and the result. */
  'scraper:success': {
    scraperId: string;
    result: ScraperResult;
  };
  /** Emitted when a scraper execution fails. Payload includes scraper ID and the error. */
  'scraper:error': {
    scraperId: string;
    error: Error;
  };
  /** Emitted when a scraper execution attempt is being retried. Payload includes scraper ID and attempt number. */
  'scraper:retry': {
    scraperId: string;
    attempt: number;
    error: Error;
  };
  /** Emitted when a browser instance is acquired from the pool. Payload includes instance ID. */
  'pool:acquire': {
    instanceId: string;
  };
  /** Emitted when a browser instance is released back to the pool. Payload includes instance ID. */
  'pool:release': {
    instanceId: string;
  };
  /** Emitted when the browser pool performs a cleanup operation. Payload includes number of removed instances. */
  'pool:cleanup': {
    removedInstances: number;
  };
}

/**
 * Defines the core interface for the scraper engine.
 * It is responsible for managing scraper definitions, plugins, hooks,
 * and executing scraping tasks. It also emits events related to the scraping lifecycle.
 *
 * NOTE(review): `EventEmitter` is intentionally left unparameterized here, so event names and
 * payloads are not checked against `ScraperEvents` at the type level — confirm whether the
 * emitter should be tied to `ScraperEvents`.
 */
export interface ScraperEngine extends EventEmitter {
  /**
   * Execute a scraper with given input
   * @template Input The input type expected by the scraper definition.
   * @template Output The output type produced by the scraper's parse function.
   * @param definition The scraper definition to run.
   * @param input The input data for this run.
   * @param options Optional partial execution options for this run.
   * @returns A promise for the execution result carrying `Output` data.
   */
  execute<Input = unknown, Output = unknown>(
    definition: ScraperDefinition<Input, Output>,
    input: Input,
    options?: Partial<ScraperExecutionOptions>,
  ): Promise<ScraperResult<Output>>;
  /** Register a scraper definition */
  register<Input = unknown, Output = unknown>(definition: ScraperDefinition<Input, Output>): void;
  /** Get a registered scraper definition; `undefined` when no definition matches `id`. */
  getDefinition(id: string): ScraperDefinition | undefined;
  /** List all registered scrapers */
  listDefinitions(): ScraperDefinition[];
  /** Install a plugin */
  use(plugin: ScraperPlugin): void;
  /** Add a global hook */
  addHook(hook: ScraperHook, handler: HookHandler): void;
  /** Remove a global hook */
  removeHook(hook: ScraperHook, handler: HookHandler): void;
  /** Shutdown the engine and cleanup resources */
  shutdown(): Promise<void>;
}

/**
 * Configuration for the scraper engine
 */
export interface ScraperEngineConfig {
  /** Configuration for the browser pool. */
  browserPool: BrowserPoolConfig;
  /** Execution options used as defaults. NOTE(review): presumably overridden by per-scraper `options` — confirm merge order. */
  defaultOptions: ScraperExecutionOptions;
  /** List of plugin names or paths to be automatically installed upon engine initialization. */
  plugins: string[];
  /**
   * Global lifecycle hooks to be applied to all scrapers.
   * Keys are hook names (`ScraperHook`), and values are arrays of `HookHandler` functions.
   */
  globalHooks: Partial<Record<ScraperHook, HookHandler[]>>;
  /** Configuration for the engine's logger. */
  logging: {
    /** Minimum log level to output. */
    level: 'debug' | 'info' | 'warn' | 'error';
    /** Format of the log output. */
    format: 'json' | 'text';
  };
}

/**
 * Represents a logger instance that can be used throughout the toolkit.
 * This interface is compatible with common loggers like Winston or pino.
 */
export interface Logger {
  /** Log a message at debug level, with optional extra metadata values. */
  debug: (message: string, ...meta: unknown[]) => void;
  /** Log a message at info level, with optional extra metadata values. */
  info: (message: string, ...meta: unknown[]) => void;
  /** Log a message at warn level, with optional extra metadata values. */
  warn: (message: string, ...meta: unknown[]) => void;
  /** Log a message at error level, with optional extra metadata values. */
  error: (message: string, ...meta: unknown[]) => void;
}
//# sourceMappingURL=types.d.ts.map