import { ErrorClassification, ExtractionConfig, ExtractionResult, WorkerPool, WorkerPoolStats, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, EmbeddingConfig } from './types.js'; export { Chunk, ChunkingConfig, EmbeddingModelType, ExtractedImage, HtmlConversionOptions, HtmlOutputConfig, HtmlPreprocessingOptions, ImageExtractionConfig, KeywordConfig, LanguageDetectionConfig, OcrConfig, PageContent, PageExtractionConfig, PdfConfig, PostProcessorConfig, Table, TesseractConfig, TokenReductionConfig } from './types.js'; import { PanicContext } from './errors.js'; export { CacheError, EmbeddingError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js'; /** * Get the error code for the last FFI error. * * Returns the FFI error code as an integer. This is useful for programmatic error handling * and distinguishing between different types of failures in native code. * * Error codes: * - 0: Success (no error) * - 1: GenericError * - 2: Panic * - 3: InvalidArgument * - 4: IoError * - 5: ParsingError * - 6: OcrError * - 7: MissingDependency * - 8: Embedding * * @returns The integer error code * * @example * ```typescript * import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node'; * * try { * const result = await extractFile('document.pdf'); * } catch (error) { * const code = getLastErrorCode(); * if (code === ErrorCode.Panic) { * console.error('Native code panic detected'); * } * } * ``` */ declare function getLastErrorCode(): number; /** * Get panic context information if the last error was a panic. * * Returns detailed information about a panic in native code, or null if the last error was not a panic. * This provides debugging information when native code panics. 
* * @returns A `PanicContext` object with file, line, function, message, and timestamp_secs, or null if no panic context is available * * @example * ```typescript * import { extractFile, getLastPanicContext } from '@kreuzberg/node'; * * try { * const result = await extractFile('document.pdf'); * } catch (error) { * const context = getLastPanicContext(); * if (context) { * console.error(`Panic at ${context.file}:${context.line}`); * console.error(`In function: ${context.function}`); * console.error(`Message: ${context.message}`); * } * } * ``` */ declare function getLastPanicContext(): PanicContext | null; /** * Returns the human-readable name for an error code. * * Maps numeric error codes to their string names, providing a consistent way * to get error code names across all platforms. * * @param code - The numeric error code (0-8) * @returns The error code name as a string (e.g., "validation", "ocr", "embedding") * * @example * ```typescript * import { getErrorCodeName } from '@kreuzberg/node'; * * const name = getErrorCodeName(0); // returns "validation" * const name = getErrorCodeName(2); // returns "ocr" * const name = getErrorCodeName(99); // returns "unknown" * ``` */ declare function getErrorCodeName(code: number): string; /** * Returns the description for an error code. * * Retrieves user-friendly descriptions of error types from the FFI layer. * * @param code - The numeric error code (0-8) * @returns A brief description of the error type * * @example * ```typescript * import { getErrorCodeDescription } from '@kreuzberg/node'; * * const desc = getErrorCodeDescription(0); // returns "Input validation error" * const desc = getErrorCodeDescription(4); // returns "File system I/O error" * const desc = getErrorCodeDescription(99); // returns "Unknown error code" * ``` */ declare function getErrorCodeDescription(code: number): string; /** * Classifies an error message string into an error code category. 
* * This function analyzes the error message content and returns the most likely * error code (0-7) based on keyword patterns. Used to programmatically classify * errors for handling purposes. * * The classification is based on keyword matching: * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required" * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed" * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model" * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency" * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission" * - **Plugin (5)**: Keywords like "plugin", "register", "extension" * - **UnsupportedFormat (6)**: Keywords: unsupported, format, mime * - **Internal (7)**: Keywords: internal, bug, panic * - **Embedding (8)**: Keywords: embed, embedding, vector, inference * * @param errorMessage - The error message string to classify * @returns An object with the classification details * * @example * ```typescript * import { classifyError } from '@kreuzberg/node'; * * const result = classifyError("PDF file is corrupted"); * // Returns: { code: 1, name: "parsing", confidence: 0.95 } * * const result = classifyError("Tesseract not found"); * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 } * ``` */ declare function classifyError(errorMessage: string): ErrorClassification; /** * Batch extraction APIs for processing multiple documents. * * This module provides synchronous and asynchronous functions for extracting content * from multiple files or byte arrays in parallel. Batch operations offer better * performance and memory management compared to calling single extraction functions * in a loop. 
* * **Benefits of Batch Processing**: * - Parallel processing in Rust for maximum performance * - Optimized memory usage across all extractions * - More reliable for large-scale document processing * * @internal This module is part of Layer 2 (extraction APIs). */ /** * Extract content from multiple files in parallel (synchronous). * * **Recommended for**: Processing multiple documents efficiently with better * performance and memory management compared to individual `extractFileSync()` calls. * * **Benefits**: * - Parallel processing in Rust for maximum performance * - Optimized memory usage across all extractions * - More reliable for batch document processing * * @param paths - List of file paths to extract (absolute or relative paths) * @param config - Extraction configuration object. If null, uses default extraction settings. * @returns Array of ExtractionResults (one per file, in same order as input) * @throws {Error} If any file cannot be read or parsed * @throws {ParsingError} When any document format is invalid or corrupted * @throws {OcrError} When OCR processing fails (if OCR is enabled) * @throws {ValidationError} When any extraction result fails validation (if validators registered) * @throws {KreuzbergError} For other extraction-related failures * * @example * ```typescript * import { batchExtractFilesSync } from '@kreuzberg/node'; * * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx']; * const results = batchExtractFilesSync(files); * * results.forEach((result, i) => { * console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`); * }); * ``` */ declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfig | null): ExtractionResult[]; /** * Extract content from multiple files in parallel (asynchronous). * * **Recommended for**: Processing multiple documents efficiently with better * performance and memory management compared to individual `extractFile()` calls. 
* * **Benefits**: * - Parallel processing in Rust for maximum performance * - Optimized memory usage across all extractions * - More reliable for batch document processing * * @param paths - List of file paths to extract (absolute or relative paths) * @param config - Extraction configuration object. If null, uses default extraction settings. * @returns Promise resolving to array of ExtractionResults (one per file, in same order as input) * @throws {Error} If any file cannot be read or parsed * @throws {ParsingError} When any document format is invalid or corrupted * @throws {OcrError} When OCR processing fails (if OCR is enabled) * @throws {ValidationError} When any extraction result fails validation (if validators registered) * @throws {KreuzbergError} For other extraction-related failures * * @example * ```typescript * import { batchExtractFiles } from '@kreuzberg/node'; * * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf']; * const results = await batchExtractFiles(files, { * ocr: { backend: 'tesseract', language: 'eng' } * }); * * // Process all results * const totalAmount = results * .map(r => extractAmount(r.content)) * .reduce((a, b) => a + b, 0); * ``` */ declare function batchExtractFiles(paths: string[], config?: ExtractionConfig | null): Promise; /** * Extract content from multiple byte arrays in parallel (synchronous). * * **Recommended for**: Processing multiple documents from memory efficiently with better * performance and memory management compared to individual `extractBytesSync()` calls. * * **Benefits**: * - Parallel processing in Rust for maximum performance * - Optimized memory usage across all extractions * - More reliable for batch document processing * * @param dataList - List of file contents as Uint8Arrays (must be same length as mimeTypes) * @param mimeTypes - List of MIME types (one per data item, required for accurate format detection) * @param config - Extraction configuration object. 
If null, uses default extraction settings. * @returns Array of ExtractionResults (one per data item, in same order as input) * @throws {TypeError} When dataList contains non-Uint8Array items or length mismatch with mimeTypes * @throws {Error} If any data cannot be read or parsed * @throws {ParsingError} When any document format is invalid or corrupted * @throws {OcrError} When OCR processing fails (if OCR is enabled) * @throws {ValidationError} When any extraction result fails validation (if validators registered) * @throws {KreuzbergError} For other extraction-related failures * * @example * ```typescript * import { batchExtractBytesSync } from '@kreuzberg/node'; * import { readFileSync } from 'fs'; * * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx']; * const dataList = files.map(f => readFileSync(f)); * const mimeTypes = ['application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']; * * const results = batchExtractBytesSync(dataList, mimeTypes); * results.forEach((result, i) => { * console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`); * }); * ``` */ declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): ExtractionResult[]; /** * Extract content from multiple byte arrays in parallel (asynchronous). * * **Recommended for**: Processing multiple documents from memory efficiently with better * performance and memory management compared to individual `extractBytes()` calls. * * **Benefits**: * - Parallel processing in Rust for maximum performance * - Optimized memory usage across all extractions * - More reliable for batch document processing * * @param dataList - List of file contents as Uint8Arrays (must be same length as mimeTypes) * @param mimeTypes - List of MIME types (one per data item, required for accurate format detection) * @param config - Extraction configuration object. 
If null, uses default extraction settings. * @returns Promise resolving to array of ExtractionResults (one per data item, in same order as input) * @throws {TypeError} When dataList contains non-Uint8Array items or length mismatch with mimeTypes * @throws {Error} If any data cannot be read or parsed * @throws {ParsingError} When any document format is invalid or corrupted * @throws {OcrError} When OCR processing fails (if OCR is enabled) * @throws {ValidationError} When any extraction result fails validation (if validators registered) * @throws {KreuzbergError} For other extraction-related failures * * @example * ```typescript * import { batchExtractBytes } from '@kreuzberg/node'; * import { readFile } from 'fs/promises'; * * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf']; * const dataList = await Promise.all(files.map(f => readFile(f))); * const mimeTypes = files.map(() => 'application/pdf'); * * const results = await batchExtractBytes(dataList, mimeTypes, { * ocr: { backend: 'tesseract', language: 'eng' } * }); * * // Process all results * const totalAmount = results * .map(r => extractAmount(r.content)) * .reduce((a, b) => a + b, 0); * ``` */ declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise; /** * PDF page rendering functions. * * Render individual PDF pages or iterate over all pages as PNG images. */ /** * Render a single PDF page to a PNG buffer (synchronous). * * @param filePath - Path to the PDF file * @param pageIndex - Zero-based page index * @param options - Optional settings * @param options.dpi - DPI for rendering (default 150) * @returns Buffer containing PNG image data */ declare function renderPdfPageSync(filePath: string, pageIndex: number, options?: { dpi?: number; }): Buffer; /** * Render a single PDF page to a PNG buffer (asynchronous). 
* * @param filePath - Path to the PDF file * @param pageIndex - Zero-based page index * @param options - Optional settings * @param options.dpi - DPI for rendering (default 150) * @returns Promise resolving to a Buffer containing PNG image data */ declare function renderPdfPage(filePath: string, pageIndex: number, options?: { dpi?: number; }): Promise; /** A rendered PDF page with its index and PNG data. */ interface PdfPageResult { pageIndex: number; data: Buffer; } /** * Collect all PDF pages as PNG images (synchronous). * * @param filePath - Path to the PDF file * @param options - Optional settings * @param options.dpi - DPI for rendering (default 150) * @returns Array of PdfPageResult objects */ declare function iteratePdfPagesSync(filePath: string, options?: { dpi?: number; }): PdfPageResult[]; /** * Collect all PDF pages as PNG images (asynchronous). * * @param filePath - Path to the PDF file * @param options - Optional settings * @param options.dpi - DPI for rendering (default 150) * @returns Promise resolving to an array of PdfPageResult objects */ declare function iteratePdfPages(filePath: string, options?: { dpi?: number; }): Promise; /** * Get the number of pages in a PDF file. * * @param filePath - Path to the PDF file * @returns Number of pages */ declare function pdfPageCount(filePath: string): number; /** * Lazy PDF page iterator. Renders one page at a time via `.next()`. * Call `.close()` when done to free native resources. * * @example * ```typescript * const iter = new PdfPageIterator("doc.pdf", { dpi: 150 }); * let result; * while ((result = iter.next()) !== null) { * const { pageIndex, data } = result; * // process page... * } * iter.close(); * ``` */ declare class PdfPageIterator { private inner; constructor(filePath: string, options?: { dpi?: number; }); /** Advance and return the next page, or null when exhausted. */ next(): PdfPageResult | null; /** Total number of pages in the PDF. */ pageCount(): number; /** Free native resources. 
Safe to call multiple times. */ close(): void; } /** * Single-document extraction APIs. * * This module provides synchronous and asynchronous functions for extracting content * from a single file or byte array. These are convenience wrappers around the native * binding that handle config normalization and result conversion. * * **Usage Note**: For processing multiple files, prefer batch extraction functions * (`batchExtractFiles`, `batchExtractFilesSync`) which provide better performance * and memory management. * * @internal This module is part of Layer 2 (extraction APIs). */ /** * Extract content from a single file (synchronous). * * **Usage Note**: For processing multiple files, prefer `batchExtractFilesSync()` which * provides better performance and memory management. * * @param filePath - Path to the file to extract (string). Can be absolute or relative. * @param mimeTypeOrConfig - Optional MIME type hint or extraction configuration. * If a string, treated as MIME type. If an object, treated as ExtractionConfig. * If null, MIME type is auto-detected from file extension or content. * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings. * Only used if second parameter is a MIME type string. 
* @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read * @throws {ParsingError} When document format is invalid or corrupted * @throws {OcrError} When OCR processing fails (if OCR is enabled) * @throws {ValidationError} When extraction result fails validation (if validators registered) * @throws {KreuzbergError} For other extraction-related failures * * @example * ```typescript * import { extractFileSync } from '@kreuzberg/node'; * * // Basic usage * const result = extractFileSync('document.pdf'); * console.log(result.content); * * // With explicit MIME type * const result2 = extractFileSync('document.pdf', 'application/pdf'); * * // With configuration * const result3 = extractFileSync('document.pdf', { * chunking: { * maxChars: 1000, * maxOverlap: 200, * }, * }); * ``` */ declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): ExtractionResult; /** * Extract content from a single file (asynchronous). * * **Usage Note**: For processing multiple files, prefer `batchExtractFiles()` which * provides better performance and memory management. * * @param filePath - Path to the file to extract (string). Can be absolute or relative. * @param mimeTypeOrConfig - Optional MIME type hint or extraction configuration. * If a string, treated as MIME type. If an object, treated as ExtractionConfig. * If null, MIME type is auto-detected from file extension or content. * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings. * Only used if second parameter is a MIME type string. 
* @returns Promise containing extracted content, metadata, tables, and optional chunks/images * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read * @throws {ParsingError} When document format is invalid or corrupted * @throws {OcrError} When OCR processing fails (if OCR is enabled) * @throws {ValidationError} When extraction result fails validation (if validators registered) * @throws {KreuzbergError} For other extraction-related failures * * @example * ```typescript * import { extractFile } from '@kreuzberg/node'; * * // Basic usage * const result = await extractFile('document.pdf'); * console.log(result.content); * * // With chunking enabled * const config = { * chunking: { * maxChars: 1000, * maxOverlap: 200, * }, * }; * const result2 = await extractFile('long_document.pdf', null, config); * console.log(result2.chunks); // Array of text chunks * ``` */ declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): Promise; /** * Extract content from raw bytes (synchronous). * * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytesSync()` * which provides better performance and memory management. * * @param data - File content as Uint8Array (Buffer will be converted) * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string. * @param config - Extraction configuration object. If null, uses default extraction settings. 
* @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images * @throws {TypeError} When data is not a valid Uint8Array * @throws {Error} When file cannot be read or parsed * @throws {ParsingError} When document format is invalid or corrupted * @throws {OcrError} When OCR processing fails (if OCR is enabled) * @throws {ValidationError} When extraction result fails validation (if validators registered) * @throws {KreuzbergError} For other extraction-related failures * * @example * ```typescript * import { extractBytesSync } from '@kreuzberg/node'; * import { readFileSync } from 'fs'; * * const data = readFileSync('document.pdf'); * const result = extractBytesSync(data, 'application/pdf'); * console.log(result.content); * ``` */ declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig | null): ExtractionResult; /** * Extract content from raw bytes (asynchronous). * * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytes()` * which provides better performance and memory management. * * @param data - File content as Uint8Array (Buffer will be converted) * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string. * @param config - Extraction configuration object. If null, uses default extraction settings. 
* @returns Promise containing extracted content, metadata, tables, and optional chunks/images * @throws {TypeError} When data is not a valid Uint8Array * @throws {Error} When file cannot be read or parsed * @throws {ParsingError} When document format is invalid or corrupted * @throws {OcrError} When OCR processing fails (if OCR is enabled) * @throws {ValidationError} When extraction result fails validation (if validators registered) * @throws {KreuzbergError} For other extraction-related failures * * @example * ```typescript * import { extractBytes } from '@kreuzberg/node'; * import { readFile } from 'fs/promises'; * * const data = await readFile('document.pdf'); * const result = await extractBytes(data, 'application/pdf'); * console.log(result.content); * ``` */ declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig | null): Promise; /** * Worker pool management for concurrent document extraction. * * This module provides utilities for creating and managing worker pools that enable * concurrent extraction of documents using Node.js worker threads. Worker pools allow * multiple extraction operations to run in parallel with configurable pool sizes. * * **Usage Pattern**: * 1. Create a pool with `createWorkerPool(size)` * 2. Submit tasks with `extractFileInWorker()` or `batchExtractFilesInWorker()` * 3. Close the pool with `closeWorkerPool()` when done * * @internal This module is part of Layer 2 (extraction APIs). */ /** * Create a new worker pool for concurrent extraction operations. * * Creates a pool of worker threads that can process extraction tasks concurrently. * The pool manages a queue of pending tasks and distributes them across available workers. * * @param size - Optional number of workers in the pool. If not specified, defaults to the number of CPU cores. 
* @returns WorkerPool instance that can be used with extraction functions * * @example * ```typescript * import { createWorkerPool } from '@kreuzberg/node'; * * // Create pool with default size (number of CPU cores) * const pool = createWorkerPool(); * * // Create pool with 4 workers * const pool4 = createWorkerPool(4); * ``` */ declare function createWorkerPool(size?: number): WorkerPool; /** * Get statistics about a worker pool. * * Returns information about the pool's current state, including the number of active workers, * queued tasks, and total processed tasks. * * @param pool - The worker pool instance * @returns WorkerPoolStats with pool information * * @example * ```typescript * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node'; * * const pool = createWorkerPool(4); * const stats = getWorkerPoolStats(pool); * * console.log(`Pool size: ${stats.size}`); * console.log(`Active workers: ${stats.activeWorkers}`); * console.log(`Queued tasks: ${stats.queuedTasks}`); * ``` */ declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats; /** * Extract content from a single file using a worker pool (asynchronous). * * Submits an extraction task to the worker pool. The task is executed by one of the * available workers in the background, allowing other tasks to be processed concurrently. * * @param pool - The worker pool instance * @param filePath - Path to the file to extract * @param mimeTypeOrConfig - Optional MIME type or extraction configuration. * If a string, treated as MIME type. If an object, treated as ExtractionConfig. * If null, MIME type is auto-detected from file extension or content. * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings. * Only used if second parameter is a MIME type string. 
* @returns Promise containing extracted content and metadata * * @throws {Error} If the file cannot be read or extraction fails * * @example * ```typescript * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node'; * * const pool = createWorkerPool(4); * * try { * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx']; * const results = await Promise.all( * files.map(f => extractFileInWorker(pool, f)) * ); * * results.forEach((r, i) => { * console.log(`${files[i]}: ${r.content.substring(0, 100)}...`); * }); * } finally { * await closeWorkerPool(pool); * } * ``` */ declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): Promise; /** * Extract content from multiple files in parallel using a worker pool (asynchronous). * * Submits multiple extraction tasks to the worker pool for concurrent processing. * This is more efficient than using `extractFileInWorker` multiple times sequentially. * * @param pool - The worker pool instance * @param paths - Array of file paths to extract * @param config - Extraction configuration object (applies to all files). If null, uses default extraction settings. 
* @returns Promise array of results (one per file, in same order) * * @throws {Error} If any file cannot be read or extraction fails * * @example * ```typescript * import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node'; * * const pool = createWorkerPool(4); * * try { * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf']; * const results = await batchExtractFilesInWorker(pool, files, { * ocr: { backend: 'tesseract', language: 'eng' } * }); * * const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0); * console.log(`Total: $${total}`); * } finally { * await closeWorkerPool(pool); * } * ``` */ declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig | null): Promise; /** * Close a worker pool and shut down all worker threads. * * Should be called when the pool is no longer needed to clean up resources * and gracefully shut down worker threads. Any pending tasks will be cancelled. * * @param pool - The worker pool instance to close * @returns Promise that resolves when the pool is fully closed * * @throws {Error} If pool shutdown fails * * @example * ```typescript * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node'; * * const pool = createWorkerPool(4); * * try { * const result = await extractFileInWorker(pool, 'document.pdf'); * console.log(result.content); * } finally { * // Clean up the pool * await closeWorkerPool(pool); * } * ``` */ declare function closeWorkerPool(pool: WorkerPool): Promise; /** * Register a custom post-processor. * * Post-processors allow you to hook into the extraction pipeline and transform * the extraction results. They run after the core extraction is complete. * * Post-processors are async and can modify extraction results before they are * returned to the caller. 
* * @param processor - Post-processor implementing PostProcessorProtocol * * @example * ```typescript * import { registerPostProcessor, extractFile } from '@kreuzberg/node'; * * class CustomProcessor { * name() { * return 'custom_processor'; * } * processingStage() { * return 'post'; * } * async process(result) { * // Add custom metadata * result.metadata.customField = 'custom_value'; * return result; * } * } * * // Use async extraction (required for custom processors) * const result = await extractFile('document.pdf'); * console.log(result.metadata.customField); // 'custom_value' * ``` */ declare function registerPostProcessor(processor: PostProcessorProtocol): void; /** * Unregister a postprocessor by name. * * Removes a previously registered postprocessor from the registry. * If the processor doesn't exist, this is a no-op (does not throw). * * @param name - Name of the processor to unregister (case-sensitive) * * @example * ```typescript * import { unregisterPostProcessor } from '@kreuzberg/node'; * * unregisterPostProcessor('my_processor'); * ``` */ declare function unregisterPostProcessor(name: string): void; /** * Clear all registered postprocessors. * * Removes all postprocessors from the registry. Useful for test cleanup or resetting state. * If no postprocessors are registered, this is a no-op. * * @example * ```typescript * import { clearPostProcessors } from '@kreuzberg/node'; * * clearPostProcessors(); * ``` */ declare function clearPostProcessors(): void; /** * List all registered post-processors. * * Returns the names of all currently registered post-processors (both built-in and custom). * * @returns Array of post-processor names (empty array if none registered) * * @example * ```typescript * import { listPostProcessors } from '@kreuzberg/node'; * * const names = listPostProcessors(); * console.log('Registered post-processors:', names); * ``` */ declare function listPostProcessors(): string[]; /** * Register a custom validator. 
* * Validators check extraction results for quality, completeness, or correctness. * Unlike post-processors, validator errors **fail fast** - if a validator throws an error, * the extraction fails immediately. * * Validators are async and run after post-processors in the extraction pipeline. * * @param validator - Validator implementing ValidatorProtocol * * @example * ```typescript * import { registerValidator, extractFile } from '@kreuzberg/node'; * * class MinLengthValidator { * name() { * return 'min_length_validator'; * } * * priority() { * return 100; * } * * async validate(result) { * if (result.content.length < 10) { * throw new Error('Content too short'); * } * } * } * * registerValidator(new MinLengthValidator()); * ``` */ declare function registerValidator(validator: ValidatorProtocol): void; /** * Unregister a validator by name. * * Removes a previously registered validator from the global registry. * If the validator doesn't exist, this is a no-op (does not throw). * * @param name - Validator name to unregister (case-sensitive) * * @example * ```typescript * import { unregisterValidator } from '@kreuzberg/node'; * * unregisterValidator('min_length_validator'); * ``` */ declare function unregisterValidator(name: string): void; /** * Clear all registered validators. * * Removes all validators from the global registry. Useful for test cleanup * or resetting state. * * @example * ```typescript * import { clearValidators } from '@kreuzberg/node'; * * clearValidators(); * ``` */ declare function clearValidators(): void; /** * List all registered validators. * * Returns the names of all currently registered validators (both built-in and custom). 
* * @returns Array of validator names (empty array if none registered) * * @example * ```typescript * import { listValidators } from '@kreuzberg/node'; * * const names = listValidators(); * console.log('Registered validators:', names); * ``` */ declare function listValidators(): string[]; /** * Register a custom OCR backend. * * This function registers a JavaScript OCR backend that will be used by Kreuzberg's * extraction pipeline when OCR is enabled. The backend must implement the * {@link OcrBackendProtocol} interface. * * ## Usage * * 1. Create a class implementing {@link OcrBackendProtocol} * 2. Call `initialize()` on your backend instance (if needed) * 3. Register the backend with `registerOcrBackend()` * 4. Use the backend name in extraction config * * ## Thread Safety * * The registered backend must be thread-safe as it may be called concurrently * from multiple Rust async tasks. Ensure your implementation handles concurrent * calls properly. * * @param backend - OcrBackendProtocol implementation with name(), supportedLanguages(), and processImage() * @throws {Error} If backend is missing required methods (name, supportedLanguages, or processImage) * @throws {Error} If backend name is empty string or contains invalid characters * @throws {Error} If a backend with the same name is already registered * @throws {Error} If registration fails due to FFI issues * * @example * ```typescript * import { extractFile } from '@kreuzberg/node'; * * // PaddleOCR is built into the native Rust core - just use the backend name * const result = await extractFile('scanned.pdf', null, { * ocr: { backend: 'paddle-ocr', language: 'en' } * }); * console.log(result.content); * ``` * * @example * ```typescript * import { registerOcrBackend } from '@kreuzberg/node'; * * class MyOcrBackend { * name() { * return 'my-ocr'; * } * * supportedLanguages(): string[] { * return ['en', 'de', 'fr']; * } * * async processImage(imageBytes: Uint8Array, language: string) { * const text = await 
myCustomOcrEngine(imageBytes, language); * return { * content: text, * mime_type: 'text/plain', * metadata: { confidence: 0.95, language }, * tables: [] * }; * } * } * * registerOcrBackend(new MyOcrBackend()); * ``` */
declare function registerOcrBackend(backend: OcrBackendProtocol): void;

/**
 * List all registered OCR backends.
 *
 * Returns an array of names of all currently registered OCR backends,
 * including built-in backends like "tesseract".
 *
 * @returns Array of OCR backend names (empty array if none registered)
 *
 * @example
 * ```typescript
 * import { listOcrBackends } from '@kreuzberg/node';
 *
 * const backends = listOcrBackends();
 * console.log(backends); // ['tesseract', 'my-custom-backend', ...]
 * ```
 */
declare function listOcrBackends(): string[];

/**
 * Unregister an OCR backend by name.
 *
 * Removes the specified OCR backend from the registry. If the backend doesn't exist,
 * this operation is a no-op (does not throw an error).
 *
 * @param name - Name of the OCR backend to unregister
 *
 * @example
 * ```typescript
 * import { unregisterOcrBackend } from '@kreuzberg/node';
 *
 * // Unregister a custom backend
 * unregisterOcrBackend('my-custom-ocr');
 * ```
 */
declare function unregisterOcrBackend(name: string): void;

/**
 * Clear all registered OCR backends.
 *
 * Removes all OCR backends from the registry, including built-in backends.
 * Use with caution as this will make OCR functionality unavailable until
 * backends are re-registered. If no backends are registered, this is a no-op.
 *
 * @example
 * ```typescript
 * import { clearOcrBackends } from '@kreuzberg/node';
 *
 * clearOcrBackends();
 * ```
 */
declare function clearOcrBackends(): void;

/** * List all registered document extractors. * * Returns an array of names of all currently registered document extractors, * including built-in extractors for PDF, Office documents, images, etc.
* * @returns Array of document extractor names (empty array if none registered) * * @example * ```typescript * import { listDocumentExtractors } from '@kreuzberg/node'; * * const extractors = listDocumentExtractors(); * console.log(extractors); // ['pdf', 'docx', 'xlsx', 'custom-extractor', ...] * ``` */
declare function listDocumentExtractors(): string[];

/**
 * Unregister a document extractor by name.
 *
 * Removes the specified document extractor from the registry. If the extractor
 * doesn't exist, this operation is a no-op (does not throw an error).
 *
 * @param name - Name of the document extractor to unregister
 *
 * @example
 * ```typescript
 * import { unregisterDocumentExtractor } from '@kreuzberg/node';
 *
 * // Unregister a custom extractor
 * unregisterDocumentExtractor('MyCustomExtractor');
 * ```
 */
declare function unregisterDocumentExtractor(name: string): void;

/**
 * Clear all registered document extractors.
 *
 * Removes all document extractors from the registry, including built-in extractors.
 * Use with caution as this will make document extraction unavailable until
 * extractors are re-registered.
 *
 * @example
 * ```typescript
 * import { clearDocumentExtractors } from '@kreuzberg/node';
 *
 * clearDocumentExtractors();
 * ```
 */
declare function clearDocumentExtractors(): void;

/**
 * Load extraction configuration from a file.
 *
 * @param filePath - Path to the configuration file
 * @returns ExtractionConfig object loaded from the file
 *
 * @deprecated Use ExtractionConfig.fromFile() instead
 */
declare function loadConfigFile(filePath: string): ExtractionConfig;

/**
 * Load extraction configuration from a specified path.
 *
 * @param path - Path to the configuration file or directory
 * @returns ExtractionConfig object or null
 *
 * @deprecated Use ExtractionConfig.fromFile() or ExtractionConfig.discover() instead
 */
declare function loadConfigFromPath(path: string): ExtractionConfig | null;

/** * Detect MIME type from raw bytes.
* * Uses content inspection (magic bytes) to determine MIME type. * This is more accurate than extension-based detection but requires * reading the file content. * * @param bytes - Raw file content as Buffer * @returns The detected MIME type string * * @throws {Error} If MIME type cannot be determined from content * * @example * ```typescript * import { detectMimeType } from '@kreuzberg/node'; * import * as fs from 'fs'; * * // Read file content * const content = fs.readFileSync('document.pdf'); * * // Detect MIME type from bytes * const mimeType = detectMimeType(content); * console.log(mimeType); // 'application/pdf' * ``` */
declare function detectMimeType(bytes: Buffer): string;

/**
 * Detect MIME type from a file path.
 *
 * Determines the MIME type based on the file extension in the provided path.
 * By default, checks if the file exists; can be disabled with checkExists parameter.
 *
 * @param filePath - The file path to detect MIME type from (e.g., 'document.pdf')
 * @param checkExists - Whether to verify the file exists (default: true)
 * @returns The detected MIME type as a string (e.g., 'application/pdf')
 *
 * @throws {Error} If MIME type cannot be determined from the file extension,
 * or if checkExists is true and the file does not exist
 *
 * @example
 * ```typescript
 * import { detectMimeTypeFromPath } from '@kreuzberg/node';
 *
 * // Detect MIME type from existing file
 * const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
 * console.log(mimeType); // 'application/pdf'
 *
 * // Detect without checking file existence
 * const mimeType2 = detectMimeTypeFromPath('document.docx', false);
 * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
 * ```
 */
declare function detectMimeTypeFromPath(filePath: string, checkExists?: boolean): string;

/** * Validate that a MIME type is supported by Kreuzberg. * * Checks if a MIME type is in the list of supported formats.
Note that any * `image/*` MIME type is automatically considered valid. * * @param mimeType - The MIME type to validate (string) * @returns The validated MIME type (may be normalized) * * @throws {Error} If the MIME type is not supported * * @example * ```typescript * import { validateMimeType } from '@kreuzberg/node'; * * // Validate supported type * const validated = validateMimeType('application/pdf'); * console.log(validated); // 'application/pdf' * * // Validate custom image type * const validated2 = validateMimeType('image/custom-format'); * console.log(validated2); // 'image/custom-format' (any image/* is valid) * * // Validate unsupported type (throws error) * try { * validateMimeType('video/mp4'); * } catch (err) { * console.error(err); // Error: Unsupported format: video/mp4 * } * ``` */
declare function validateMimeType(mimeType: string): string;

/**
 * Get file extensions for a given MIME type.
 *
 * Returns an array of file extensions commonly associated with the specified
 * MIME type. For example, 'application/pdf' returns ['pdf'].
 *
 * @param mimeType - The MIME type to look up (e.g., 'application/pdf', 'image/jpeg')
 * @returns Array of file extensions (without leading dots)
 *
 * @throws {Error} If the MIME type is not recognized or supported
 *
 * @example
 * ```typescript
 * import { getExtensionsForMime } from '@kreuzberg/node';
 *
 * // Get extensions for PDF
 * const pdfExts = getExtensionsForMime('application/pdf');
 * console.log(pdfExts); // ['pdf']
 *
 * // Get extensions for JPEG
 * const jpegExts = getExtensionsForMime('image/jpeg');
 * console.log(jpegExts); // ['jpg', 'jpeg']
 * ```
 */
declare function getExtensionsForMime(mimeType: string): string[];

/** * Embedding preset configuration. * * Contains all settings for a specific embedding model preset.
*/
interface EmbeddingPreset {
    /** Name of the preset (e.g., "fast", "balanced", "quality", "multilingual") */
    name: string;
    /** Recommended chunk size in characters */
    chunkSize: number;
    /** Recommended overlap in characters */
    overlap: number;
    /** Model identifier (e.g., "AllMiniLML6V2Q", "BGEBaseENV15") */
    modelName: string;
    /** Embedding vector dimensions */
    dimensions: number;
    /** Human-readable description of the preset */
    description: string;
}

/**
 * List the names of all available embedding presets.
 *
 * Returns an array of names of all available embedding model presets.
 * Only the names are returned; pass a name to {@link getEmbeddingPreset}
 * to obtain the full preset configuration.
 *
 * @returns Array of preset names (e.g., ["fast", "balanced", "quality", "multilingual"])
 *
 * @example
 * ```typescript
 * import { listEmbeddingPresets } from '@kreuzberg/node';
 *
 * const presets = listEmbeddingPresets();
 * console.log('Available presets:', presets);
 * ```
 */
declare function listEmbeddingPresets(): string[];

/**
 * Get embedding preset configuration by name.
 *
 * Retrieves the configuration for a specific embedding model preset.
 * Returns null if the preset doesn't exist.
 *
 * @param name - Name of the preset (e.g., "balanced", "fast", "quality")
 * @returns EmbeddingPreset configuration if found, null otherwise
 *
 * @example
 * ```typescript
 * import { getEmbeddingPreset } from '@kreuzberg/node';
 *
 * const preset = getEmbeddingPreset('balanced');
 * if (preset) {
 *   console.log(`Model: ${preset.modelName}, Dims: ${preset.dimensions}`);
 *   // Model: BGEBaseENV15, Dims: 768
 * }
 * ```
 */
declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;

/** * Generate vector embeddings for a list of texts (synchronous). * * Requires the `embeddings` feature to be enabled (ONNX Runtime must be available). * Returns one float32 array per input text. An empty input returns an empty array.
* * @param texts - Array of strings to embed * @param config - Optional embedding configuration (model preset, batch size, normalization) * @returns Array of float32 arrays (one embedding vector per input text) * * @throws {Error} If ONNX Runtime is not available or the model cannot be loaded * * @example * ```typescript * import { embedSync } from '@kreuzberg/node'; * * const embeddings = embedSync(['Hello, world!'], { model: { type: 'preset', name: 'balanced' } }); * console.log(embeddings.length); // 1 * console.log(embeddings[0].length); // 768 * ``` */
declare function embedSync(texts: string[], config?: EmbeddingConfig): number[][];

/**
 * Generate vector embeddings for a list of texts (asynchronous).
 *
 * Requires the `embeddings` feature to be enabled (ONNX Runtime must be available).
 * Returns one float32 array per input text. An empty input returns an empty array.
 *
 * @param texts - Array of strings to embed
 * @param config - Optional embedding configuration (model preset, batch size, normalization)
 * @returns Promise resolving to an array of float32 arrays (one embedding vector per input text),
 *   typed as `number[][]` to match the synchronous {@link embedSync}
 *
 * @throws {Error} If ONNX Runtime is not available or the model cannot be loaded
 *
 * @example
 * ```typescript
 * import { embed } from '@kreuzberg/node';
 *
 * const embeddings = await embed(['Hello, world!'], { model: { type: 'preset', name: 'balanced' } });
 * console.log(embeddings.length); // 1
 * console.log(embeddings[0].length); // 768
 * ```
 */
declare function embed(texts: string[], config?: EmbeddingConfig): Promise<number[][]>;

/**
 * @internal Allows tests to provide a mocked native binding.
 *
 * @param mock - Replacement binding; typed `unknown`, so no shape is enforced by this declaration
 */
declare function __setBindingForTests(mock: unknown): void;

/**
 * @internal Resets the cached native binding for tests.
 */
declare function __resetBindingForTests(): void;

/** * Kreuzberg - Multi-language document intelligence framework. * * This is a TypeScript SDK around a high-performance Rust core.
* All extraction logic, chunking, quality processing, and language detection * are implemented in Rust for maximum performance. * * ## Module Organization * * The SDK is organized into logical domains: * - **Extraction**: Single and batch document extraction with worker pool support * - **Types**: Core type definitions and interfaces * - **Errors**: Error classes and diagnostic utilities * - **Plugins**: Custom post-processors, validators, and OCR backends * - **Registry**: Plugin and document extractor management * - **Config**: Configuration loading and management * - **MIME**: MIME type detection and validation * - **Embeddings**: Embedding model presets * * ## API Usage Recommendations * * **For processing multiple documents**, prefer batch APIs: * - Use `batchExtractFiles()` / `batchExtractFilesSync()` for multiple files * - Use `batchExtractBytes()` / `batchExtractBytesSync()` for multiple byte arrays * - Use worker pool APIs for high-concurrency scenarios * * **Batch APIs provide**: * - Better performance (parallel processing in Rust) * - More reliable memory management * - Recommended for all multi-document workflows * * **Single extraction APIs** (`extractFile`, `extractBytes`) are suitable for: * - One-off document processing * - Interactive applications processing documents on-demand * - Avoid calling these in tight loops - use batch APIs instead * * ## Supported Formats * * - **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT * - **Text**: Markdown, Plain Text, XML * - **Web**: HTML (converted to Markdown) * - **Data**: JSON, YAML, TOML * - **Email**: EML, MSG * - **Images**: PNG, JPEG, TIFF (with OCR support) * * @example * ```typescript * import { extractFile, batchExtractFiles } from '@kreuzberg/node'; * * // Single file extraction * const result = await extractFile('document.pdf'); * console.log(result.content); * * // Multiple files (recommended approach) * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx']; * const results = await 
batchExtractFiles(files); * results.forEach(r => console.log(r.content)); * ``` * * @module @kreuzberg/node */ declare const __version__ = "4.9.9"; export { EmbeddingConfig, type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PdfPageIterator, type PdfPageResult, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, embed, embedSync, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, iteratePdfPages, iteratePdfPagesSync, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, pdfPageCount, registerOcrBackend, registerPostProcessor, registerValidator, renderPdfPage, renderPdfPageSync, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };