import { ErrorClassification, ExtractionConfig, ExtractionResult, WorkerPool, WorkerPoolStats, PostProcessorProtocol, ValidatorProtocol, OcrBackendProtocol, EmbeddingConfig } from './types.js';
export { Chunk, ChunkingConfig, EmbeddingModelType, ExtractedImage, HtmlConversionOptions, HtmlOutputConfig, HtmlPreprocessingOptions, ImageExtractionConfig, KeywordConfig, LanguageDetectionConfig, OcrConfig, PageContent, PageExtractionConfig, PdfConfig, PostProcessorConfig, Table, TesseractConfig, TokenReductionConfig } from './types.js';
import { PanicContext } from './errors.js';
export { CacheError, EmbeddingError, ErrorCode, ImageProcessingError, KreuzbergError, MissingDependencyError, OcrError, ParsingError, PluginError, ValidationError } from './errors.js';

/**
 * Get the error code for the last FFI error.
 *
 * Returns the FFI error code as an integer. This is useful for programmatic error handling
 * and distinguishing between different types of failures in native code.
 *
 * Error codes:
 * - 0: Success (no error)
 * - 1: GenericError
 * - 2: Panic
 * - 3: InvalidArgument
 * - 4: IoError
 * - 5: ParsingError
 * - 6: OcrError
 * - 7: MissingDependency
 * - 8: Embedding
 *
 * NOTE(review): this numbering differs from the category numbering documented for
 * `getErrorCodeName()` and `classifyError()` in this file (where 0 is "validation" and
 * 2 is "ocr"). Presumably these are two separate code spaces; confirm before passing a
 * value returned here to those functions.
 *
 * @returns The integer error code of the last FFI error (0 when there was no error)
 *
 * @example
 * ```typescript
 * import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
 *
 * try {
 *   await extractFile('document.pdf');
 * } catch (error) {
 *   // Inspect the native error code right after the failed call.
 *   const code = getLastErrorCode();
 *   if (code === ErrorCode.Panic) {
 *     console.error('Native code panic detected');
 *   }
 * }
 * ```
 */
declare function getLastErrorCode(): number;
/**
 * Get panic context information if the last error was a panic.
 *
 * Returns detailed information about a panic in native code, or null if the last error was not a panic.
 * This provides debugging information when native code panics.
 *
 * The result is nullable, so always check it before reading any field (see the example).
 *
 * @returns A `PanicContext` object with file, line, function, message, and timestamp_secs, or `null` if no panic context is available
 *
 * @example
 * ```typescript
 * import { extractFile, getLastPanicContext } from '@kreuzberg/node';
 *
 * try {
 *   await extractFile('document.pdf');
 * } catch (error) {
 *   const context = getLastPanicContext();
 *   if (context) {
 *     console.error(`Panic at ${context.file}:${context.line}`);
 *     console.error(`In function: ${context.function}`);
 *     console.error(`Message: ${context.message}`);
 *   }
 * }
 * ```
 */
declare function getLastPanicContext(): PanicContext | null;
/**
 * Returns the human-readable name for an error code.
 *
 * Maps numeric error codes to their string names, providing a consistent way
 * to get error code names across all platforms. Codes outside the known range
 * map to "unknown" (see the example) rather than throwing.
 *
 * NOTE(review): the examples below use the category numbering (0 = "validation",
 * 2 = "ocr"), which differs from the list documented on `getLastErrorCode()`
 * (0 = Success, 2 = Panic). Confirm which numbering a given code belongs to.
 *
 * @param code - The numeric error code (0-8)
 * @returns The error code name as a string (e.g., "validation", "ocr", "embedding")
 *
 * @example
 * ```typescript
 * import { getErrorCodeName } from '@kreuzberg/node';
 *
 * const validationName = getErrorCodeName(0);  // returns "validation"
 * const ocrName = getErrorCodeName(2);         // returns "ocr"
 * const unknownName = getErrorCodeName(99);    // returns "unknown"
 * ```
 */
declare function getErrorCodeName(code: number): string;
/**
 * Returns the description for an error code.
 *
 * Retrieves user-friendly descriptions of error types from the FFI layer.
 * Codes outside the known range yield "Unknown error code" (see the example).
 *
 * @param code - The numeric error code (0-8)
 * @returns A brief description of the error type
 *
 * @example
 * ```typescript
 * import { getErrorCodeDescription } from '@kreuzberg/node';
 *
 * const validationDesc = getErrorCodeDescription(0);  // returns "Input validation error"
 * const ioDesc = getErrorCodeDescription(4);          // returns "File system I/O error"
 * const unknownDesc = getErrorCodeDescription(99);    // returns "Unknown error code"
 * ```
 */
declare function getErrorCodeDescription(code: number): string;
/**
 * Classifies an error message string into an error code category.
 *
 * This function analyzes the error message content and returns the most likely
 * error code (0-8) based on keyword patterns. Used to programmatically classify
 * errors for handling purposes.
 *
 * The classification is based on keyword matching:
 * - **Validation (0)**: Keywords like "invalid", "validation", "schema", "required"
 * - **Parsing (1)**: Keywords like "parsing", "corrupted", "malformed"
 * - **Ocr (2)**: Keywords like "ocr", "tesseract", "language", "model"
 * - **MissingDependency (3)**: Keywords like "not found", "missing", "dependency"
 * - **Io (4)**: Keywords like "file", "disk", "read", "write", "permission"
 * - **Plugin (5)**: Keywords like "plugin", "register", "extension"
 * - **UnsupportedFormat (6)**: Keywords like "unsupported", "format", "mime"
 * - **Internal (7)**: Keywords like "internal", "bug", "panic"
 * - **Embedding (8)**: Keywords like "embed", "embedding", "vector", "inference"
 *
 * A message can contain keywords from several categories (the second example contains
 * both "tesseract" and "not found"); the result reports the single most likely category.
 *
 * @param errorMessage - The error message string to classify
 * @returns An object with the classification details
 *
 * @example
 * ```typescript
 * import { classifyError } from '@kreuzberg/node';
 *
 * const parsing = classifyError("PDF file is corrupted");
 * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
 *
 * const missingDep = classifyError("Tesseract not found");
 * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
 * ```
 */
declare function classifyError(errorMessage: string): ErrorClassification;

/**
 * Batch extraction APIs for processing multiple documents.
 *
 * This module provides synchronous and asynchronous functions for extracting content
 * from multiple files or byte arrays in parallel. Batch operations offer better
 * performance and memory management compared to calling single extraction functions
 * in a loop.
 *
 * **Benefits of Batch Processing**:
 * - Parallel processing in Rust for maximum performance
 * - Optimized memory usage across all extractions
 * - More reliable for large-scale document processing
 *
 * @internal This module is part of Layer 2 (extraction APIs).
 */

/**
 * Extract content from multiple files in parallel (synchronous).
 *
 * **Recommended for**: Processing multiple documents efficiently with better
 * performance and memory management compared to individual `extractFileSync()` calls.
 *
 * **Benefits**:
 * - Parallel processing in Rust for maximum performance
 * - Optimized memory usage across all extractions
 * - More reliable for batch document processing
 *
 * Results are returned directly (not as a Promise); `batchExtractFiles()` is the
 * asynchronous counterpart with the same parameters.
 *
 * @param paths - List of file paths to extract (absolute or relative paths)
 * @param config - Optional extraction configuration object. If null, uses default extraction settings.
 * @returns Array of ExtractionResults (one per file, in same order as input), so `results[i]` belongs to `paths[i]`
 * @throws {Error} If any file cannot be read or parsed
 * @throws {ParsingError} When any document format is invalid or corrupted
 * @throws {OcrError} When OCR processing fails (if OCR is enabled)
 * @throws {ValidationError} When any extraction result fails validation (if validators registered)
 * @throws {KreuzbergError} For other extraction-related failures
 *
 * @example
 * ```typescript
 * import { batchExtractFilesSync } from '@kreuzberg/node';
 *
 * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
 * const results = batchExtractFilesSync(files);
 *
 * // Results line up with the input order.
 * results.forEach((result, i) => {
 *   console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
 * });
 * ```
 */
declare function batchExtractFilesSync(paths: string[], config?: ExtractionConfig | null): ExtractionResult[];
/**
 * Extract content from multiple files in parallel (asynchronous).
 *
 * **Recommended for**: Processing multiple documents efficiently with better
 * performance and memory management compared to individual `extractFile()` calls.
 *
 * **Benefits**:
 * - Parallel processing in Rust for maximum performance
 * - Optimized memory usage across all extractions
 * - More reliable for batch document processing
 *
 * @param paths - List of file paths to extract (absolute or relative paths)
 * @param config - Optional extraction configuration object. If null, uses default extraction settings.
 * @returns Promise resolving to array of ExtractionResults (one per file, in same order as input)
 * @throws {Error} If any file cannot be read or parsed
 * @throws {ParsingError} When any document format is invalid or corrupted
 * @throws {OcrError} When OCR processing fails (if OCR is enabled)
 * @throws {ValidationError} When any extraction result fails validation (if validators registered)
 * @throws {KreuzbergError} For other extraction-related failures
 *
 * @example
 * ```typescript
 * import { batchExtractFiles } from '@kreuzberg/node';
 *
 * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
 * const results = await batchExtractFiles(files, {
 *   ocr: { backend: 'tesseract', language: 'eng' }
 * });
 *
 * // Process all results.
 * // `extractAmount` is an application-defined helper; it is not imported from this package here.
 * const totalAmount = results
 *   .map(r => extractAmount(r.content))
 *   .reduce((a, b) => a + b, 0);
 * ```
 */
declare function batchExtractFiles(paths: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
/**
 * Extract content from multiple byte arrays in parallel (synchronous).
 *
 * **Recommended for**: Processing multiple documents from memory efficiently with better
 * performance and memory management compared to individual `extractBytesSync()` calls.
 *
 * **Benefits**:
 * - Parallel processing in Rust for maximum performance
 * - Optimized memory usage across all extractions
 * - More reliable for batch document processing
 *
 * `dataList` and `mimeTypes` are parallel arrays: `mimeTypes[i]` describes `dataList[i]`.
 *
 * @param dataList - List of file contents as Uint8Arrays (must be same length as mimeTypes)
 * @param mimeTypes - List of MIME types (one per data item, required for accurate format detection)
 * @param config - Optional extraction configuration object. If null, uses default extraction settings.
 * @returns Array of ExtractionResults (one per data item, in same order as input)
 * @throws {TypeError} When dataList contains non-Uint8Array items or length mismatch with mimeTypes
 * @throws {Error} If any data cannot be read or parsed
 * @throws {ParsingError} When any document format is invalid or corrupted
 * @throws {OcrError} When OCR processing fails (if OCR is enabled)
 * @throws {ValidationError} When any extraction result fails validation (if validators registered)
 * @throws {KreuzbergError} For other extraction-related failures
 *
 * @example
 * ```typescript
 * import { batchExtractBytesSync } from '@kreuzberg/node';
 * import { readFileSync } from 'fs';
 *
 * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
 * const dataList = files.map(f => readFileSync(f));
 * // One MIME type per entry, in the same order as `dataList`.
 * const mimeTypes = ['application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'];
 *
 * const results = batchExtractBytesSync(dataList, mimeTypes);
 * results.forEach((result, i) => {
 *   console.log(`File ${files[i]}: ${result.content.substring(0, 100)}...`);
 * });
 * ```
 */
declare function batchExtractBytesSync(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): ExtractionResult[];
/**
 * Extract content from multiple byte arrays in parallel (asynchronous).
 *
 * **Recommended for**: Processing multiple documents from memory efficiently with better
 * performance and memory management compared to individual `extractBytes()` calls.
 *
 * **Benefits**:
 * - Parallel processing in Rust for maximum performance
 * - Optimized memory usage across all extractions
 * - More reliable for batch document processing
 *
 * `dataList` and `mimeTypes` are parallel arrays: `mimeTypes[i]` describes `dataList[i]`.
 *
 * @param dataList - List of file contents as Uint8Arrays (must be same length as mimeTypes)
 * @param mimeTypes - List of MIME types (one per data item, required for accurate format detection)
 * @param config - Optional extraction configuration object. If null, uses default extraction settings.
 * @returns Promise resolving to array of ExtractionResults (one per data item, in same order as input)
 * @throws {TypeError} When dataList contains non-Uint8Array items or length mismatch with mimeTypes
 * @throws {Error} If any data cannot be read or parsed
 * @throws {ParsingError} When any document format is invalid or corrupted
 * @throws {OcrError} When OCR processing fails (if OCR is enabled)
 * @throws {ValidationError} When any extraction result fails validation (if validators registered)
 * @throws {KreuzbergError} For other extraction-related failures
 *
 * @example
 * ```typescript
 * import { batchExtractBytes } from '@kreuzberg/node';
 * import { readFile } from 'fs/promises';
 *
 * const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
 * const dataList = await Promise.all(files.map(f => readFile(f)));
 * const mimeTypes = files.map(() => 'application/pdf');
 *
 * const results = await batchExtractBytes(dataList, mimeTypes, {
 *   ocr: { backend: 'tesseract', language: 'eng' }
 * });
 *
 * // Process all results.
 * // `extractAmount` is an application-defined helper; it is not imported from this package here.
 * const totalAmount = results
 *   .map(r => extractAmount(r.content))
 *   .reduce((a, b) => a + b, 0);
 * ```
 */
declare function batchExtractBytes(dataList: Uint8Array[], mimeTypes: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;

/**
 * PDF page rendering functions.
 *
 * Render individual PDF pages or iterate over all pages as PNG images.
 */
/**
 * Render a single PDF page to a PNG buffer (synchronous).
 *
 * The image is returned directly rather than as a Promise; `renderPdfPage()` is the
 * asynchronous counterpart with the same parameters.
 *
 * @param filePath - Path to the PDF file
 * @param pageIndex - Zero-based page index (the first page is 0)
 * @param options - Optional settings; may be omitted
 * @param options.dpi - DPI for rendering (default 150)
 * @returns Buffer containing PNG image data
 */
declare function renderPdfPageSync(filePath: string, pageIndex: number, options?: {
    dpi?: number;
}): Buffer;
/**
 * Render a single PDF page to a PNG buffer (asynchronous).
 *
 * `renderPdfPageSync()` is the synchronous counterpart with the same parameters.
 *
 * @param filePath - Path to the PDF file
 * @param pageIndex - Zero-based page index (the first page is 0)
 * @param options - Optional settings; may be omitted
 * @param options.dpi - DPI for rendering (default 150)
 * @returns Promise resolving to a Buffer containing PNG image data
 */
declare function renderPdfPage(filePath: string, pageIndex: number, options?: {
    dpi?: number;
}): Promise<Buffer>;
/**
 * A rendered PDF page with its index and PNG data.
 *
 * This is the element type returned by `iteratePdfPages()` / `iteratePdfPagesSync()`
 * and by `PdfPageIterator.next()`.
 */
interface PdfPageResult {
    /** Index of the page within the PDF. NOTE(review): presumably zero-based like the `pageIndex` parameter of `renderPdfPage` — confirm. */
    pageIndex: number;
    /** PNG image data for the rendered page. */
    data: Buffer;
}
/**
 * Collect all PDF pages as PNG images (synchronous).
 *
 * Every page is returned at once in a single array. To render one page at a time
 * instead, use the lazy `PdfPageIterator` class.
 *
 * @param filePath - Path to the PDF file
 * @param options - Optional settings; may be omitted
 * @param options.dpi - DPI for rendering (default 150)
 * @returns Array of PdfPageResult objects
 */
declare function iteratePdfPagesSync(filePath: string, options?: {
    dpi?: number;
}): PdfPageResult[];
/**
 * Collect all PDF pages as PNG images (asynchronous).
 *
 * The Promise resolves with every page at once in a single array. To render one page
 * at a time instead, use the lazy `PdfPageIterator` class.
 *
 * @param filePath - Path to the PDF file
 * @param options - Optional settings; may be omitted
 * @param options.dpi - DPI for rendering (default 150)
 * @returns Promise resolving to an array of PdfPageResult objects
 */
declare function iteratePdfPages(filePath: string, options?: {
    dpi?: number;
}): Promise<PdfPageResult[]>;
/**
 * Get the number of pages in a PDF file.
 *
 * Synchronous: the count is returned directly, not as a Promise.
 *
 * @param filePath - Path to the PDF file
 * @returns Number of pages
 */
declare function pdfPageCount(filePath: string): number;
/**
 * Lazy PDF page iterator. Renders one page at a time via `.next()`.
 * Call `.close()` when done to free native resources.
 *
 * `next()` signals exhaustion by returning `null`; it does not use the JavaScript
 * iterator protocol's `{ value, done }` shape, and no `[Symbol.iterator]` is declared
 * on this class, so drive it with an explicit loop as shown below.
 *
 * @example
 * ```typescript
 * const iter = new PdfPageIterator("doc.pdf", { dpi: 150 });
 * try {
 *     let result;
 *     while ((result = iter.next()) !== null) {
 *         const { pageIndex, data } = result;
 *         // process page...
 *     }
 * } finally {
 *     // Release native resources even if page processing throws.
 *     iter.close();
 * }
 * ```
 */
declare class PdfPageIterator {
    /** Opaque internal handle. NOTE(review): presumably wraps the native iterator — confirm. */
    private inner;
    /**
     * Open an iterator over the pages of a PDF file.
     *
     * @param filePath - Path to the PDF file
     * @param options - Optional settings; may be omitted
     * @param options.dpi - DPI for rendering. NOTE(review): presumably defaults to 150 like `renderPdfPage` — confirm.
     */
    constructor(filePath: string, options?: {
        dpi?: number;
    });
    /** Advance and return the next page, or null when exhausted. */
    next(): PdfPageResult | null;
    /** Total number of pages in the PDF. */
    pageCount(): number;
    /** Free native resources. Safe to call multiple times. */
    close(): void;
}

/**
 * Single-document extraction APIs.
 *
 * This module provides synchronous and asynchronous functions for extracting content
 * from a single file or byte array. These are convenience wrappers around the native
 * binding that handle config normalization and result conversion.
 *
 * **Usage Note**: For processing multiple files, prefer batch extraction functions
 * (`batchExtractFiles`, `batchExtractFilesSync`) which provide better performance
 * and memory management.
 *
 * @internal This module is part of Layer 2 (extraction APIs).
 */

/**
 * Extract content from a single file (synchronous).
 *
 * **Usage Note**: For processing multiple files, prefer `batchExtractFilesSync()` which
 * provides better performance and memory management.
 *
 * Accepted call shapes:
 * - `extractFileSync(filePath)`
 * - `extractFileSync(filePath, mimeType)` or `extractFileSync(filePath, mimeType, config)`
 * - `extractFileSync(filePath, config)`
 * - `extractFileSync(filePath, null, config)` (auto-detected MIME type plus configuration)
 *
 * @param filePath - Path to the file to extract (string). Can be absolute or relative.
 * @param mimeTypeOrConfig - Optional MIME type hint or extraction configuration.
 *   If a string, treated as MIME type. If an object, treated as ExtractionConfig.
 *   If null, MIME type is auto-detected from file extension or content.
 * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
 *   Only used if second parameter is a MIME type string.
 * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
 * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
 * @throws {ParsingError} When document format is invalid or corrupted
 * @throws {OcrError} When OCR processing fails (if OCR is enabled)
 * @throws {ValidationError} When extraction result fails validation (if validators registered)
 * @throws {KreuzbergError} For other extraction-related failures
 *
 * @example
 * ```typescript
 * import { extractFileSync } from '@kreuzberg/node';
 *
 * // Basic usage
 * const result = extractFileSync('document.pdf');
 * console.log(result.content);
 *
 * // With explicit MIME type
 * const result2 = extractFileSync('document.pdf', 'application/pdf');
 *
 * // With configuration passed directly as the second argument
 * const result3 = extractFileSync('document.pdf', {
 *   chunking: {
 *     maxChars: 1000,
 *     maxOverlap: 200,
 *   },
 * });
 * ```
 */
declare function extractFileSync(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): ExtractionResult;
/**
 * Extract content from a single file (asynchronous).
 *
 * **Usage Note**: For processing multiple files, prefer `batchExtractFiles()` which
 * provides better performance and memory management.
 *
 * Accepted call shapes:
 * - `extractFile(filePath)`
 * - `extractFile(filePath, mimeType)` or `extractFile(filePath, mimeType, config)`
 * - `extractFile(filePath, config)`
 * - `extractFile(filePath, null, config)` (auto-detected MIME type plus configuration)
 *
 * @param filePath - Path to the file to extract (string). Can be absolute or relative.
 * @param mimeTypeOrConfig - Optional MIME type hint or extraction configuration.
 *   If a string, treated as MIME type. If an object, treated as ExtractionConfig.
 *   If null, MIME type is auto-detected from file extension or content.
 * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
 *   Only used if second parameter is a MIME type string.
 * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
 * @throws {Error} If file doesn't exist, cannot be accessed, or cannot be read
 * @throws {ParsingError} When document format is invalid or corrupted
 * @throws {OcrError} When OCR processing fails (if OCR is enabled)
 * @throws {ValidationError} When extraction result fails validation (if validators registered)
 * @throws {KreuzbergError} For other extraction-related failures
 *
 * @example
 * ```typescript
 * import { extractFile } from '@kreuzberg/node';
 *
 * // Basic usage
 * const result = await extractFile('document.pdf');
 * console.log(result.content);
 *
 * // With chunking enabled; `null` keeps MIME type auto-detection
 * const config = {
 *   chunking: {
 *     maxChars: 1000,
 *     maxOverlap: 200,
 *   },
 * };
 * const result2 = await extractFile('long_document.pdf', null, config);
 * console.log(result2.chunks); // Array of text chunks
 * ```
 */
declare function extractFile(filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): Promise<ExtractionResult>;
/**
 * Extract content from raw bytes (synchronous).
 *
 * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytesSync()`
 * which provides better performance and memory management.
 *
 * @param dataOrPath - File content as Uint8Array (Buffer will be converted). The signature
 *   also accepts a string. NOTE(review): presumably a file path whose contents are read
 *   before extraction — confirm against the implementation.
 * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
 * @param config - Optional extraction configuration object. If null, uses default extraction settings.
 * @returns ExtractionResult containing extracted content, metadata, tables, and optional chunks/images
 * @throws {TypeError} When the data is not a valid Uint8Array (NOTE(review): confirm how string input is validated)
 * @throws {Error} When file cannot be read or parsed
 * @throws {ParsingError} When document format is invalid or corrupted
 * @throws {OcrError} When OCR processing fails (if OCR is enabled)
 * @throws {ValidationError} When extraction result fails validation (if validators registered)
 * @throws {KreuzbergError} For other extraction-related failures
 *
 * @example
 * ```typescript
 * import { extractBytesSync } from '@kreuzberg/node';
 * import { readFileSync } from 'fs';
 *
 * const data = readFileSync('document.pdf');
 * const result = extractBytesSync(data, 'application/pdf');
 * console.log(result.content);
 * ```
 */
declare function extractBytesSync(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig | null): ExtractionResult;
/**
 * Extract content from raw bytes (asynchronous).
 *
 * **Usage Note**: For processing multiple byte arrays, prefer `batchExtractBytes()`
 * which provides better performance and memory management.
 *
 * @param dataOrPath - File content as Uint8Array (Buffer will be converted). The signature
 *   also accepts a string. NOTE(review): presumably a file path whose contents are read
 *   before extraction — confirm against the implementation.
 * @param mimeType - MIME type of the data (required for accurate format detection). Must be a valid MIME type string.
 * @param config - Optional extraction configuration object. If null, uses default extraction settings.
 * @returns Promise<ExtractionResult> containing extracted content, metadata, tables, and optional chunks/images
 * @throws {TypeError} When the data is not a valid Uint8Array (NOTE(review): confirm how string input is validated)
 * @throws {Error} When file cannot be read or parsed
 * @throws {ParsingError} When document format is invalid or corrupted
 * @throws {OcrError} When OCR processing fails (if OCR is enabled)
 * @throws {ValidationError} When extraction result fails validation (if validators registered)
 * @throws {KreuzbergError} For other extraction-related failures
 *
 * @example
 * ```typescript
 * import { extractBytes } from '@kreuzberg/node';
 * import { readFile } from 'fs/promises';
 *
 * const data = await readFile('document.pdf');
 * const result = await extractBytes(data, 'application/pdf');
 * console.log(result.content);
 * ```
 */
declare function extractBytes(dataOrPath: Uint8Array | string, mimeType: string, config?: ExtractionConfig | null): Promise<ExtractionResult>;

/**
 * Worker pool management for concurrent document extraction.
 *
 * This module provides utilities for creating and managing worker pools that enable
 * concurrent extraction of documents using Node.js worker threads. Worker pools allow
 * multiple extraction operations to run in parallel with configurable pool sizes.
 *
 * **Usage Pattern**:
 * 1. Create a pool with `createWorkerPool(size)`
 * 2. Submit tasks with `extractFileInWorker()` or `batchExtractFilesInWorker()`
 * 3. Close the pool with `closeWorkerPool()` when done
 *
 * @internal This module is part of Layer 2 (extraction APIs).
 */

/**
 * Create a new worker pool for concurrent extraction operations.
 *
 * Creates a pool of worker threads that can process extraction tasks concurrently.
 * The pool manages a queue of pending tasks and distributes them across available workers.
 *
 * Pass the returned pool to `extractFileInWorker()` / `batchExtractFilesInWorker()`, and
 * release it with `closeWorkerPool()` once it is no longer needed.
 *
 * @param size - Optional number of workers in the pool. If not specified, defaults to the number of CPU cores.
 * @returns WorkerPool instance that can be used with extraction functions
 *
 * @example
 * ```typescript
 * import { createWorkerPool } from '@kreuzberg/node';
 *
 * // Create pool with default size (number of CPU cores)
 * const pool = createWorkerPool();
 *
 * // Create pool with 4 workers
 * const pool4 = createWorkerPool(4);
 * ```
 */
declare function createWorkerPool(size?: number): WorkerPool;
/**
 * Get statistics about a worker pool.
 *
 * Returns information about the pool's current state, including the number of active workers,
 * queued tasks, and total processed tasks.
 *
 * @param pool - The worker pool instance (as returned by `createWorkerPool()`)
 * @returns WorkerPoolStats with pool information
 *
 * @example
 * ```typescript
 * import { createWorkerPool, getWorkerPoolStats, closeWorkerPool } from '@kreuzberg/node';
 *
 * const pool = createWorkerPool(4);
 * try {
 *   const stats = getWorkerPoolStats(pool);
 *
 *   console.log(`Pool size: ${stats.size}`);
 *   console.log(`Active workers: ${stats.activeWorkers}`);
 *   console.log(`Queued tasks: ${stats.queuedTasks}`);
 * } finally {
 *   await closeWorkerPool(pool);
 * }
 * ```
 */
declare function getWorkerPoolStats(pool: WorkerPool): WorkerPoolStats;
/**
 * Extract content from a single file using a worker pool (asynchronous).
 *
 * Submits an extraction task to the worker pool. The task is executed by one of the
 * available workers in the background, allowing other tasks to be processed concurrently.
 *
 * Apart from the leading `pool` argument, the parameters mirror those of `extractFile()`.
 *
 * @param pool - The worker pool instance
 * @param filePath - Path to the file to extract
 * @param mimeTypeOrConfig - Optional MIME type or extraction configuration.
 *   If a string, treated as MIME type. If an object, treated as ExtractionConfig.
 *   If null, MIME type is auto-detected from file extension or content.
 * @param maybeConfig - Extraction configuration object. If null, uses default extraction settings.
 *   Only used if second parameter is a MIME type string.
 * @returns Promise<ExtractionResult> containing extracted content and metadata
 *
 * @throws {Error} If the file cannot be read or extraction fails
 *
 * @example
 * ```typescript
 * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
 *
 * const pool = createWorkerPool(4);
 *
 * try {
 *   const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
 *   // Submit every file up front so the pool can work on them concurrently.
 *   const results = await Promise.all(
 *     files.map(f => extractFileInWorker(pool, f))
 *   );
 *
 *   results.forEach((r, i) => {
 *     console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
 *   });
 * } finally {
 *   await closeWorkerPool(pool);
 * }
 * ```
 */
declare function extractFileInWorker(pool: WorkerPool, filePath: string, mimeTypeOrConfig?: string | null | ExtractionConfig, maybeConfig?: ExtractionConfig | null): Promise<ExtractionResult>;
/**
 * Extract content from multiple files in parallel using a worker pool (asynchronous).
 *
 * Submits multiple extraction tasks to the worker pool for concurrent processing.
 * This is more efficient than using `extractFileInWorker` multiple times sequentially.
 *
 * @param pool - The worker pool instance
 * @param paths - Array of file paths to extract
 * @param config - Optional extraction configuration object (applies to all files). If null, uses default extraction settings.
 * @returns Promise<ExtractionResult[]> array of results (one per file, in same order)
 *
 * @throws {Error} If any file cannot be read or extraction fails
 *
 * @example
 * ```typescript
 * import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from '@kreuzberg/node';
 *
 * const pool = createWorkerPool(4);
 *
 * try {
 *   const files = ['invoice1.pdf', 'invoice2.pdf', 'invoice3.pdf'];
 *   const results = await batchExtractFilesInWorker(pool, files, {
 *     ocr: { backend: 'tesseract', language: 'eng' }
 *   });
 *
 *   // `extractAmount` is an application-defined helper; it is not imported from this package here.
 *   const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);
 *   console.log(`Total: $${total}`);
 * } finally {
 *   await closeWorkerPool(pool);
 * }
 * ```
 */
declare function batchExtractFilesInWorker(pool: WorkerPool, paths: string[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
/**
 * Close a worker pool and shut down all worker threads.
 *
 * Should be called when the pool is no longer needed to clean up resources
 * and gracefully shut down worker threads. Any pending tasks will be cancelled.
 *
 * Because pending tasks are cancelled, await outstanding extraction promises before
 * closing if their results are needed; a `finally` block (as in the example) ensures the
 * pool is closed even when extraction throws.
 *
 * @param pool - The worker pool instance to close
 * @returns Promise that resolves when the pool is fully closed
 *
 * @throws {Error} If pool shutdown fails
 *
 * @example
 * ```typescript
 * import { createWorkerPool, extractFileInWorker, closeWorkerPool } from '@kreuzberg/node';
 *
 * const pool = createWorkerPool(4);
 *
 * try {
 *   const result = await extractFileInWorker(pool, 'document.pdf');
 *   console.log(result.content);
 * } finally {
 *   // Clean up the pool
 *   await closeWorkerPool(pool);
 * }
 * ```
 */
declare function closeWorkerPool(pool: WorkerPool): Promise<void>;

/**
 * Register a custom post-processor.
 *
 * Post-processors allow you to hook into the extraction pipeline and transform
 * the extraction results. They run after the core extraction is complete.
 *
 * Post-processors are async and can modify extraction results before they are
 * returned to the caller.
 *
 * @param processor - Post-processor implementing PostProcessorProtocol
 *
 * @example
 * ```typescript
 * import { registerPostProcessor, extractFile } from '@kreuzberg/node';
 *
 * class CustomProcessor {
 *   name() {
 *     return 'custom_processor';
 *   }
 *   processingStage() {
 *     return 'post';
 *   }
 *   async process(result) {
 *     // Add custom metadata
 *     result.metadata.customField = 'custom_value';
 *     return result;
 *   }
 * }
 *
 * // Register the processor before running any extraction that should use it
 * registerPostProcessor(new CustomProcessor());
 *
 * // Use async extraction (required for custom processors)
 * const result = await extractFile('document.pdf');
 * console.log(result.metadata.customField); // 'custom_value'
 * ```
 */
declare function registerPostProcessor(processor: PostProcessorProtocol): void;
/**
 * Unregister a post-processor by name.
 *
 * Removes a previously registered post-processor from the registry.
 * If the processor doesn't exist, this is a no-op (does not throw).
 *
 * @param name - Name of the processor to unregister (case-sensitive)
 *
 * @example
 * ```typescript
 * import { unregisterPostProcessor } from '@kreuzberg/node';
 *
 * unregisterPostProcessor('my_processor');
 * ```
 */
declare function unregisterPostProcessor(name: string): void;
/**
 * Clear all registered post-processors.
 *
 * Removes all post-processors from the registry. Useful for test cleanup or resetting state.
 * If no post-processors are registered, this is a no-op.
 *
 * @example
 * ```typescript
 * import { clearPostProcessors } from '@kreuzberg/node';
 *
 * clearPostProcessors();
 * ```
 */
declare function clearPostProcessors(): void;
/**
 * List all registered post-processors.
 *
 * Returns the names of all currently registered post-processors (both built-in and custom).
 * These are the names accepted by `unregisterPostProcessor()`.
 *
 * @returns Array of post-processor names (empty array if none registered)
 *
 * @example
 * ```typescript
 * import { listPostProcessors } from '@kreuzberg/node';
 *
 * const names = listPostProcessors();
 * console.log('Registered post-processors:', names);
 * ```
 */
declare function listPostProcessors(): string[];

/**
 * Register a custom validator.
 *
 * Validators check extraction results for quality, completeness, or correctness.
 * Unlike post-processors, validator errors **fail fast** - if a validator throws an error,
 * the extraction fails immediately.
 *
 * Validators are async and run after post-processors in the extraction pipeline.
 *
 * @param validator - Validator implementing ValidatorProtocol
 *
 * @example
 * ```typescript
 * import { registerValidator, extractFile } from '@kreuzberg/node';
 *
 * class MinLengthValidator {
 *   name() {
 *     return 'min_length_validator';
 *   }
 *
 *   priority() {
 *     return 100;
 *   }
 *
 *   async validate(result) {
 *     if (result.content.length < 10) {
 *       throw new Error('Content too short');
 *     }
 *   }
 * }
 *
 * registerValidator(new MinLengthValidator());
 *
 * try {
 *   const result = await extractFile('document.pdf');
 *   console.log(result.content);
 * } catch (error) {
 *   // A throwing validator makes the whole extraction fail
 *   console.error('Extraction rejected:', error);
 * }
 * ```
 */
declare function registerValidator(validator: ValidatorProtocol): void;
/**
 * Unregister a validator by name.
 *
 * Removes a previously registered validator from the global registry.
 * If the validator doesn't exist, this is a no-op (does not throw).
 *
 * @param name - Validator name to unregister (case-sensitive); use `listValidators()` to see the registered names
 *
 * @example
 * ```typescript
 * import { unregisterValidator } from '@kreuzberg/node';
 *
 * unregisterValidator('min_length_validator');
 * ```
 */
declare function unregisterValidator(name: string): void;
/**
 * Clear all registered validators.
 *
 * Removes all validators from the global registry. Useful for test cleanup
 * or resetting state.
 *
 * NOTE(review): it is not stated here whether built-in validators are removed
 * as well as custom ones - confirm against the native implementation.
 *
 * @see {@link listValidators}
 *
 * @example
 * ```typescript
 * import { clearValidators } from '@kreuzberg/node';
 *
 * clearValidators();
 * ```
 */
declare function clearValidators(): void;
/**
 * List the names of all registered validators.
 *
 * Returns the names of all currently registered validators (both built-in and custom).
 *
 * @returns Array of validator names (empty array if none registered)
 *
 * @see {@link registerValidator}
 * @see {@link unregisterValidator}
 * @see {@link clearValidators}
 *
 * @example
 * ```typescript
 * import { listValidators } from '@kreuzberg/node';
 *
 * const names = listValidators();
 * console.log('Registered validators:', names);
 * ```
 */
declare function listValidators(): string[];

/**
 * Register a custom OCR backend.
 *
 * This function registers a JavaScript OCR backend that will be used by Kreuzberg's
 * extraction pipeline when OCR is enabled. The backend must implement the
 * {@link OcrBackendProtocol} interface.
 *
 * ## Usage
 *
 * 1. Create a class implementing {@link OcrBackendProtocol}
 * 2. Call `initialize()` on your backend instance (if needed)
 * 3. Register the backend with `registerOcrBackend()`
 * 4. Use the backend name in extraction config
 *
 * ## Thread Safety
 *
 * The registered backend must be thread-safe as it may be called concurrently
 * from multiple Rust async tasks. Ensure your implementation handles concurrent
 * calls properly.
 *
 * @param backend - OcrBackendProtocol implementation with name(), supportedLanguages(), and processImage()
 * @throws {Error} If backend is missing required methods (name, supportedLanguages, or processImage)
 * @throws {Error} If backend name is empty string or contains invalid characters
 * @throws {Error} If a backend with the same name is already registered
 * @throws {Error} If registration fails due to FFI issues
 *
 * @see {@link listOcrBackends}
 * @see {@link unregisterOcrBackend}
 * @see {@link clearOcrBackends}
 *
 * @example
 * Using a built-in backend. No call to `registerOcrBackend()` is needed in this case;
 * the example is shown for contrast with the custom backend below.
 * ```typescript
 * import { extractFile } from '@kreuzberg/node';
 *
 * // PaddleOCR is built into the native Rust core - just use the backend name
 * const result = await extractFile('scanned.pdf', null, {
 *   ocr: { backend: 'paddle-ocr', language: 'en' }
 * });
 * console.log(result.content);
 * ```
 *
 * @example
 * Registering a custom backend.
 * ```typescript
 * import { registerOcrBackend } from '@kreuzberg/node';
 *
 * class MyOcrBackend {
 *   name() {
 *     return 'my-ocr';
 *   }
 *
 *   supportedLanguages(): string[] {
 *     return ['en', 'de', 'fr'];
 *   }
 *
 *   async processImage(imageBytes: Uint8Array, language: string) {
 *     const text = await myCustomOcrEngine(imageBytes, language);
 *     return {
 *       content: text,
 *       mime_type: 'text/plain',
 *       metadata: { confidence: 0.95, language },
 *       tables: []
 *     };
 *   }
 * }
 *
 * registerOcrBackend(new MyOcrBackend());
 * ```
 */
declare function registerOcrBackend(backend: OcrBackendProtocol): void;
/**
 * List the names of all registered OCR backends.
 *
 * Returns an array of names of all currently registered OCR backends,
 * including built-in backends like "tesseract".
 *
 * @returns Array of OCR backend names (empty array if none registered)
 *
 * @see {@link registerOcrBackend}
 * @see {@link unregisterOcrBackend}
 * @see {@link clearOcrBackends}
 *
 * @example
 * ```typescript
 * import { listOcrBackends } from '@kreuzberg/node';
 *
 * const backends = listOcrBackends();
 * console.log(backends); // ['tesseract', 'my-custom-backend', ...]
 * ```
 */
declare function listOcrBackends(): string[];
/**
 * Unregister an OCR backend by name.
 *
 * Removes the specified OCR backend from the registry. If the backend doesn't exist,
 * this operation is a no-op (does not throw an error).
 *
 * @param name - Name of the OCR backend to unregister
 *
 * @see {@link registerOcrBackend}
 * @see {@link listOcrBackends}
 *
 * @example
 * ```typescript
 * import { unregisterOcrBackend } from '@kreuzberg/node';
 *
 * // Unregister a custom backend
 * unregisterOcrBackend('my-custom-ocr');
 * ```
 */
declare function unregisterOcrBackend(name: string): void;
/**
 * Clear all registered OCR backends.
 *
 * Removes all OCR backends from the registry, including built-in backends.
 * Use with caution as this will make OCR functionality unavailable until
 * backends are re-registered. If no backends are registered, this is a no-op.
 *
 * @see {@link registerOcrBackend} to register a backend again afterwards
 * @see {@link listOcrBackends}
 *
 * @example
 * ```typescript
 * import { clearOcrBackends } from '@kreuzberg/node';
 *
 * clearOcrBackends();
 * ```
 */
declare function clearOcrBackends(): void;

/**
 * List the names of all registered document extractors.
 *
 * Returns an array of names of all currently registered document extractors,
 * including built-in extractors for PDF, Office documents, images, etc.
 *
 * @returns Array of document extractor names (empty array if none registered)
 *
 * @see {@link unregisterDocumentExtractor}
 * @see {@link clearDocumentExtractors}
 *
 * @example
 * ```typescript
 * import { listDocumentExtractors } from '@kreuzberg/node';
 *
 * const extractors = listDocumentExtractors();
 * console.log(extractors); // ['pdf', 'docx', 'xlsx', 'custom-extractor', ...]
 * ```
 */
declare function listDocumentExtractors(): string[];
/**
 * Unregister a document extractor by name.
 *
 * Removes the specified document extractor from the registry. If the extractor
 * doesn't exist, this operation is a no-op (does not throw an error).
 *
 * @param name - Name of the document extractor to unregister
 *
 * @see {@link listDocumentExtractors} to look up the registered names
 *
 * @example
 * ```typescript
 * import { unregisterDocumentExtractor } from '@kreuzberg/node';
 *
 * // Unregister a custom extractor
 * unregisterDocumentExtractor('MyCustomExtractor');
 * ```
 */
declare function unregisterDocumentExtractor(name: string): void;
/**
 * Clear all registered document extractors.
 *
 * Removes all document extractors from the registry, including built-in extractors.
 * Use with caution as this will make document extraction unavailable until
 * extractors are re-registered.
 *
 * NOTE(review): this module's export list contains no function for registering a
 * document extractor, so it is unclear how extractors can be re-registered from
 * JavaScript after this call - confirm before relying on it outside of tests.
 *
 * @see {@link listDocumentExtractors}
 *
 * @example
 * ```typescript
 * import { clearDocumentExtractors } from '@kreuzberg/node';
 *
 * clearDocumentExtractors();
 * ```
 */
declare function clearDocumentExtractors(): void;

/**
 * Load extraction configuration from a file.
 *
 * The declared return type is always an `ExtractionConfig`; unlike
 * `loadConfigFromPath`, it does not include `null`.
 *
 * NOTE(review): the supported file formats and the behavior for a missing or
 * invalid file are not documented here - confirm against the implementation.
 *
 * @param filePath - Path to the configuration file
 * @returns ExtractionConfig object loaded from the file
 *
 * @deprecated Use ExtractionConfig.fromFile() instead
 */
declare function loadConfigFile(filePath: string): ExtractionConfig;
/**
 * Load extraction configuration from a specified path.
 *
 * @param path - Path to the configuration file or directory
 * @returns ExtractionConfig object, or null (presumably when no configuration is
 * found at the given path - TODO confirm against the implementation)
 *
 * @deprecated Use ExtractionConfig.fromFile() or ExtractionConfig.discover() instead
 */
declare function loadConfigFromPath(path: string): ExtractionConfig | null;

/**
 * Detect MIME type from raw bytes.
 *
 * Uses content inspection (magic bytes) to determine MIME type.
 * This is more accurate than extension-based detection but requires
 * reading the file content.
 *
 * @param bytes - Raw file content as Buffer
 * @returns The detected MIME type string
 *
 * @throws {Error} If MIME type cannot be determined from content
 *
 * @see {@link detectMimeTypeFromPath} for detection from a file path instead of content
 *
 * @example
 * ```typescript
 * import { detectMimeType } from '@kreuzberg/node';
 * import * as fs from 'fs';
 *
 * // Read file content
 * const content = fs.readFileSync('document.pdf');
 *
 * // Detect MIME type from bytes
 * const mimeType = detectMimeType(content);
 * console.log(mimeType); // 'application/pdf'
 * ```
 */
declare function detectMimeType(bytes: Buffer): string;
/**
 * Detect MIME type from a file path.
 *
 * Determines the MIME type based on the file extension in the provided path.
 * By default, checks if the file exists; can be disabled with checkExists parameter.
 *
 * @param filePath - The file path to detect MIME type from (e.g., 'document.pdf')
 * @param checkExists - Whether to verify the file exists (default: true)
 * @returns The detected MIME type as a string (e.g., 'application/pdf')
 *
 * @throws {Error} If MIME type cannot be determined from the file extension,
 * or if checkExists is true and the file does not exist
 *
 * @see {@link detectMimeType} for detection from file content (magic bytes)
 *
 * @example
 * ```typescript
 * import { detectMimeTypeFromPath } from '@kreuzberg/node';
 *
 * // Detect MIME type from existing file
 * const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
 * console.log(mimeType); // 'application/pdf'
 *
 * // Detect without checking file existence
 * const mimeType2 = detectMimeTypeFromPath('document.docx', false);
 * console.log(mimeType2); // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
 * ```
 */
declare function detectMimeTypeFromPath(filePath: string, checkExists?: boolean): string;
/**
 * Validate that a MIME type is supported by Kreuzberg.
 *
 * Checks if a MIME type is in the list of supported formats. Note that any
 * `image/*` MIME type is automatically considered valid.
 *
 * @param mimeType - The MIME type to validate (string)
 * @returns The validated MIME type (may be normalized)
 *
 * @throws {Error} If the MIME type is not supported
 *
 * @see {@link getExtensionsForMime} to look up file extensions for a MIME type
 *
 * @example
 * ```typescript
 * import { validateMimeType } from '@kreuzberg/node';
 *
 * // Validate supported type
 * const validated = validateMimeType('application/pdf');
 * console.log(validated); // 'application/pdf'
 *
 * // Validate custom image type
 * const validated2 = validateMimeType('image/custom-format');
 * console.log(validated2); // 'image/custom-format' (any image/* is valid)
 *
 * // Validate unsupported type (throws error)
 * try {
 *   validateMimeType('video/mp4');
 * } catch (err) {
 *   console.error(err); // Error: Unsupported format: video/mp4
 * }
 * ```
 */
declare function validateMimeType(mimeType: string): string;
/**
 * Get file extensions for a given MIME type.
 *
 * Returns an array of file extensions commonly associated with the specified
 * MIME type. For example, 'application/pdf' returns ['pdf'].
 *
 * @param mimeType - The MIME type to look up (e.g., 'application/pdf', 'image/jpeg')
 * @returns Array of file extensions (without leading dots)
 *
 * @throws {Error} If the MIME type is not recognized or supported
 *
 * @see {@link validateMimeType} to check whether a MIME type is supported
 *
 * @example
 * ```typescript
 * import { getExtensionsForMime } from '@kreuzberg/node';
 *
 * // Get extensions for PDF
 * const pdfExts = getExtensionsForMime('application/pdf');
 * console.log(pdfExts); // ['pdf']
 *
 * // Get extensions for JPEG
 * const jpegExts = getExtensionsForMime('image/jpeg');
 * console.log(jpegExts); // ['jpg', 'jpeg']
 * ```
 */
declare function getExtensionsForMime(mimeType: string): string[];

/**
 * Embedding preset configuration.
 *
 * Contains all settings for a specific embedding model preset.
 *
 * @see {@link getEmbeddingPreset} which returns an object of this shape (or null)
 * @see {@link listEmbeddingPresets} for the available preset names
 */
interface EmbeddingPreset {
    /** Name of the preset (e.g., "fast", "balanced", "quality", "multilingual") */
    name: string;
    /** Recommended chunk size in characters */
    chunkSize: number;
    /** Recommended overlap in characters */
    overlap: number;
    /** Model identifier (e.g., "AllMiniLML6V2Q", "BGEBaseENV15") */
    modelName: string;
    /** Embedding vector dimensions */
    dimensions: number;
    /** Human-readable description of the preset */
    description: string;
}
/**
 * List the names of all available embedding presets.
 *
 * Returns an array of names of all available embedding model presets.
 * Only the names are returned, not the preset configurations themselves.
 *
 * @returns Array of preset names (e.g., ["fast", "balanced", "quality", "multilingual"])
 *
 * @see {@link getEmbeddingPreset} to fetch the configuration for a given name
 *
 * @example
 * ```typescript
 * import { listEmbeddingPresets } from '@kreuzberg/node';
 *
 * const presets = listEmbeddingPresets();
 * console.log('Available presets:', presets);
 * ```
 */
declare function listEmbeddingPresets(): string[];
/**
 * Get embedding preset configuration by name.
 *
 * Retrieves the configuration for a specific embedding model preset.
 * Returns null if the preset doesn't exist.
 *
 * @param name - Name of the preset (e.g., "balanced", "fast", "quality")
 * @returns EmbeddingPreset configuration if found, null otherwise
 *
 * @see {@link listEmbeddingPresets} for the available preset names
 *
 * @example
 * ```typescript
 * import { getEmbeddingPreset } from '@kreuzberg/node';
 *
 * const preset = getEmbeddingPreset('balanced');
 * if (preset) {
 *   console.log(`Model: ${preset.modelName}, Dims: ${preset.dimensions}`);
 *   // Model: BGEBaseENV15, Dims: 768
 * }
 * ```
 */
declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;
/**
 * Generate vector embeddings for a list of texts (synchronous).
 *
 * Requires the `embeddings` feature to be enabled (ONNX Runtime must be available).
 * Returns one embedding vector per input text. An empty input returns an empty array.
 *
 * @param texts - Array of strings to embed
 * @param config - Optional embedding configuration (model preset, batch size, normalization)
 * @returns Array of embedding vectors (one per input text). Each vector is typed as a
 * plain `number[]` holding float32 values; it is not a `Float32Array`
 *
 * @throws {Error} If ONNX Runtime is not available or the model cannot be loaded
 *
 * @see {@link embed} for the Promise-based variant
 *
 * @example
 * ```typescript
 * import { embedSync } from '@kreuzberg/node';
 *
 * const embeddings = embedSync(['Hello, world!'], { model: { type: 'preset', name: 'balanced' } });
 * console.log(embeddings.length); // 1
 * console.log(embeddings[0].length); // 768
 * ```
 */
declare function embedSync(texts: string[], config?: EmbeddingConfig): number[][];
/**
 * Generate vector embeddings for a list of texts (asynchronous).
 *
 * Requires the `embeddings` feature to be enabled (ONNX Runtime must be available).
 * Resolves to one embedding vector per input text. An empty input returns an empty array.
 *
 * @param texts - Array of strings to embed
 * @param config - Optional embedding configuration (model preset, batch size, normalization)
 * @returns Promise resolving to an array of embedding vectors (one per input text). Each
 * vector is typed as a plain `number[]` holding float32 values; it is not a `Float32Array`
 *
 * @throws {Error} If ONNX Runtime is not available or the model cannot be loaded
 *
 * @see {@link embedSync} for the synchronous variant
 *
 * @example
 * ```typescript
 * import { embed } from '@kreuzberg/node';
 *
 * const embeddings = await embed(['Hello, world!'], { model: { type: 'preset', name: 'balanced' } });
 * console.log(embeddings.length); // 1
 * console.log(embeddings[0].length); // 768
 * ```
 */
declare function embed(texts: string[], config?: EmbeddingConfig): Promise<number[][]>;

/**
 * @internal Allows tests to provide a mocked native binding.
 *
 * Not part of the supported public API; intended for test code only.
 *
 * @param mock - Replacement binding object. It is typed as `unknown`, so the
 * declaration itself places no constraint on its shape
 *
 * @see {@link __resetBindingForTests} to reset the cached binding afterwards
 */
declare function __setBindingForTests(mock: unknown): void;
/**
 * @internal Resets the cached native binding for tests.
 *
 * Not part of the supported public API; intended for test code only.
 *
 * @see {@link __setBindingForTests} to install a mocked binding
 */
declare function __resetBindingForTests(): void;

/**
 * Kreuzberg - Multi-language document intelligence framework.
 *
 * This is a TypeScript SDK around a high-performance Rust core.
 * All extraction logic, chunking, quality processing, and language detection
 * are implemented in Rust for maximum performance.
 *
 * ## Module Organization
 *
 * The SDK is organized into logical domains:
 * - **Extraction**: Single and batch document extraction with worker pool support
 * - **Types**: Core type definitions and interfaces
 * - **Errors**: Error classes and diagnostic utilities
 * - **Plugins**: Custom post-processors, validators, and OCR backends
 * - **Registry**: Plugin and document extractor management
 * - **Config**: Configuration loading and management
 * - **MIME**: MIME type detection and validation
 * - **Embeddings**: Embedding model presets and embedding generation (`embed`, `embedSync`)
 *
 * ## API Usage Recommendations
 *
 * **For processing multiple documents**, prefer batch APIs:
 * - Use `batchExtractFiles()` / `batchExtractFilesSync()` for multiple files
 * - Use `batchExtractBytes()` / `batchExtractBytesSync()` for multiple byte arrays
 * - Use worker pool APIs for high-concurrency scenarios
 *
 * **Batch APIs provide**:
 * - Better performance (parallel processing in Rust)
 * - More reliable memory management
 * - Recommended for all multi-document workflows
 *
 * **Single extraction APIs** (`extractFile`, `extractBytes`) are suitable for:
 * - One-off document processing
 * - Interactive applications processing documents on-demand
 * - Avoid calling these in tight loops - use batch APIs instead
 *
 * ## Supported Formats
 *
 * - **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT
 * - **Text**: Markdown, Plain Text, XML
 * - **Web**: HTML (converted to Markdown)
 * - **Data**: JSON, YAML, TOML
 * - **Email**: EML, MSG
 * - **Images**: PNG, JPEG, TIFF (with OCR support)
 *
 * @example
 * ```typescript
 * import { extractFile, batchExtractFiles } from '@kreuzberg/node';
 *
 * // Single file extraction
 * const result = await extractFile('document.pdf');
 * console.log(result.content);
 *
 * // Multiple files (recommended approach)
 * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
 * const results = await batchExtractFiles(files);
 * results.forEach(r => console.log(r.content));
 * ```
 *
 * @module @kreuzberg/node
 */

/**
 * Version string constant, declared with the literal value "4.9.9"
 * (presumably the version of the `@kreuzberg/node` package - confirm against package.json).
 */
declare const __version__ = "4.9.9";

export { EmbeddingConfig, type EmbeddingPreset, ErrorClassification, ExtractionConfig, ExtractionResult, OcrBackendProtocol, PanicContext, PdfPageIterator, type PdfPageResult, PostProcessorProtocol, ValidatorProtocol, WorkerPool, WorkerPoolStats, __resetBindingForTests, __setBindingForTests, __version__, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, batchExtractFilesInWorker, batchExtractFilesSync, classifyError, clearDocumentExtractors, clearOcrBackends, clearPostProcessors, clearValidators, closeWorkerPool, createWorkerPool, detectMimeType, detectMimeTypeFromPath, embed, embedSync, extractBytes, extractBytesSync, extractFile, extractFileInWorker, extractFileSync, getEmbeddingPreset, getErrorCodeDescription, getErrorCodeName, getExtensionsForMime, getLastErrorCode, getLastPanicContext, getWorkerPoolStats, iteratePdfPages, iteratePdfPagesSync, listDocumentExtractors, listEmbeddingPresets, listOcrBackends, listPostProcessors, listValidators, loadConfigFile, loadConfigFromPath, pdfPageCount, registerOcrBackend, registerPostProcessor, registerValidator, renderPdfPage, renderPdfPageSync, unregisterDocumentExtractor, unregisterOcrBackend, unregisterPostProcessor, unregisterValidator, validateMimeType };