/* auto-generated by NAPI-RS */
/* eslint-disable */
/**
 * Lazy PDF page iterator. A more memory-efficient alternative to
 * `iteratePdfPagesSync`/`iteratePdfPages` when memory is a concern or when
 * pages should be processed as they are rendered (e.g., sending each page to
 * a vision model for OCR).
 *
 * Renders one page at a time via the `.next()` method. Callers must call
 * `.close()` when done to free native resources.
 *
 * # Example
 *
 * ```javascript
 * const iter = new JsPdfPageIterator("doc.pdf", 150);
 * try {
 *   let result;
 *   while ((result = iter.next()) !== null) {
 *     const { pageIndex, data } = result;
 *     // process page...
 *   }
 * } finally {
 *   // Release native resources even if page processing throws.
 *   iter.close();
 * }
 * ```
 */
export declare class JsPdfPageIterator {
  /**
   * Create a new PDF page iterator.
   *
   * # Parameters
   *
   * * `filePath` - Path to the PDF file
   * * `dpi` - Optional DPI (default 150)
   */
  constructor(filePath: string, dpi?: number | undefined | null)
  /**
   * Advance the iterator and return the next page.
   *
   * Returns `{ pageIndex, data }` or `null` when exhausted.
   */
  next(): PdfPageResult | null
  /** Total number of pages in the PDF. */
  pageCount(): number
  /** Free native resources. Safe to call multiple times. */
  close(): void
}
/** Opaque handle to a worker pool */
export declare class JsWorkerPool { }
/**
 * Batch extract from multiple byte arrays (asynchronous).
 *
 * Asynchronously processes multiple in-memory buffers in parallel. Non-blocking
 * alternative to `batchExtractBytesSync`.
 *
 * # Parameters
 *
 * * `dataList` - Array of buffers to extract
 * * `mimeTypes` - Array of MIME types (must match dataList length)
 * * `config` - Optional extraction configuration
 * * `fileConfigs` - Optional per-item extraction configs (must match dataList length if provided)
 *
 * # Returns
 *
 * Promise resolving to array of `ExtractionResult`.
*
 * # Example
 *
 * ```typescript
 * import { batchExtractBytes } from '@kreuzberg/node';
 *
 * const responses = await Promise.all([
 *   fetch('https://example.com/doc1.pdf'),
 *   fetch('https://example.com/doc2.pdf')
 * ]);
 * const buffers = await Promise.all(
 *   responses.map(r => r.arrayBuffer().then(b => Buffer.from(b)))
 * );
 * const results = await batchExtractBytes(
 *   buffers,
 *   ['application/pdf', 'application/pdf'],
 *   null
 * );
 * ```
 *
 * NOTE(review): the element type of `fileConfigs` is assumed to be
 * `JsExtractionConfig` based on its description ("per-item extraction
 * configs") — confirm against the native binding.
 */
export declare function batchExtractBytes(dataList: Array<Buffer>, mimeTypes: Array<string>, config?: JsExtractionConfig | undefined | null, fileConfigs?: Array<JsExtractionConfig> | undefined | null): Promise<Array<JsExtractionResult>>
/**
 * Batch extract from multiple byte arrays (synchronous).
 *
 * Synchronously processes multiple in-memory buffers in parallel. Requires
 * corresponding MIME types for each buffer.
 *
 * # Parameters
 *
 * * `dataList` - Array of buffers to extract
 * * `mimeTypes` - Array of MIME types (must match dataList length)
 * * `config` - Optional extraction configuration
 * * `fileConfigs` - Optional per-item extraction configs (must match dataList length if provided)
 *
 * # Returns
 *
 * Array of `ExtractionResult` in the same order as inputs.
 *
 * # Errors
 *
 * Throws if dataList and mimeTypes lengths don't match.
 *
 * # Example
 *
 * ```typescript
 * import { batchExtractBytesSync } from '@kreuzberg/node';
 *
 * const buffers = [buffer1, buffer2, buffer3];
 * const mimeTypes = ['application/pdf', 'image/png', 'text/plain'];
 * const results = batchExtractBytesSync(buffers, mimeTypes, null);
 * ```
 *
 * NOTE(review): the element type of `fileConfigs` is assumed to be
 * `JsExtractionConfig` based on its description — confirm against the
 * native binding.
 */
export declare function batchExtractBytesSync(dataList: Array<Buffer>, mimeTypes: Array<string>, config?: JsExtractionConfig | undefined | null, fileConfigs?: Array<JsExtractionConfig> | undefined | null): Array<JsExtractionResult>
/**
 * Batch extract from multiple files (asynchronous).
 *
 * Asynchronously processes multiple files in parallel. Non-blocking alternative
 * to `batchExtractFilesSync` with same performance benefits.
*
 * # Parameters
 *
 * * `paths` - Array of file paths to extract
 * * `config` - Optional extraction configuration (applied to all files)
 * * `fileConfigs` - Optional per-file extraction configs (must match paths length if provided)
 *
 * # Returns
 *
 * Promise resolving to array of `ExtractionResult`.
 *
 * # Example
 *
 * ```typescript
 * import { batchExtractFiles } from '@kreuzberg/node';
 *
 * const files = ['report1.pdf', 'report2.pdf', 'report3.pdf'];
 * const results = await batchExtractFiles(files, null);
 * console.log(`Processed ${results.length} files`);
 * ```
 *
 * NOTE(review): the element type of `fileConfigs` is assumed to be
 * `JsExtractionConfig` based on its description ("per-file extraction
 * configs") — confirm against the native binding.
 */
export declare function batchExtractFiles(paths: Array<string>, config?: JsExtractionConfig | undefined | null, fileConfigs?: Array<JsExtractionConfig> | undefined | null): Promise<Array<JsExtractionResult>>
/**
 * Extract multiple files using worker threads from the pool.
 *
 * Submits multiple file extraction tasks to the worker pool for concurrent
 * processing. Files are processed in parallel up to the pool size limit.
 *
 * # Parameters
 *
 * * `pool` - Worker pool handle
 * * `filePaths` - Array of file paths to extract
 * * `config` - Optional extraction configuration applied to all files
 *
 * # Returns
 *
 * Promise resolving to array of extraction results in the same order as input paths.
*
 * # Example
 *
 * ```typescript
 * import { createWorkerPool, batchExtractFilesInWorker } from '@kreuzberg/node';
 *
 * const pool = createWorkerPool(4);
 * const files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
 * const results = await batchExtractFilesInWorker(pool, files, {
 *   useCache: true
 * });
 *
 * results.forEach((result, i) => {
 *   console.log(`File ${i + 1}: ${result.content.length} chars`);
 * });
 * ```
 */
export declare function batchExtractFilesInWorker(pool: JsWorkerPool, filePaths: Array<string>, config?: JsExtractionConfig | undefined | null): Promise<Array<JsExtractionResult>>
/**
 * Batch extract from multiple files (synchronous).
 *
 * Blocking counterpart of `batchExtractFiles`; the parameters mirror that
 * function's signature.
 *
 * # Parameters
 *
 * * `paths` - Array of file paths to extract
 * * `config` - Optional extraction configuration
 * * `fileConfigs` - Optional per-file extraction configs
 *
 * # Returns
 *
 * Array of extraction results.
 *
 * NOTE(review): the element type of `fileConfigs` is assumed to be
 * `JsExtractionConfig` — confirm against the native binding.
 */
export declare function batchExtractFilesSync(paths: Array<string>, config?: JsExtractionConfig | undefined | null, fileConfigs?: Array<JsExtractionConfig> | undefined | null): Array<JsExtractionResult>
/**
 * Classify an error message string into an error code category.
 *
 * # Parameters
 *
 * * `errorMessage` - The error message string to classify
 *
 * # Returns
 *
 * An `ErrorClassification` carrying `code`, `name`, `description` and
 * `confidence`.
 */
export declare function classifyError(errorMessage: string): ErrorClassification
/**
 * Clear all registered document extractors.
 *
 * Removes all document extractors from the registry, including built-in extractors.
 * Use with caution as this will make document extraction unavailable until
 * extractors are re-registered.
 *
 * # Example
 *
 * ```typescript
 * import { clearDocumentExtractors } from '@kreuzberg/node';
 *
 * clearDocumentExtractors();
 * ```
 */
export declare function clearDocumentExtractors(): void
/** Clear all registered OCR backends */
export declare function clearOcrBackends(): void
/** Clear all registered postprocessors */
export declare function clearPostProcessors(): void
/** Clear all registered validators */
export declare function clearValidators(): void
/**
 * Close and shutdown a worker pool gracefully.
 *
 * Waits for all in-flight extraction tasks to complete before shutting down
 * the pool. After calling this function, the pool handle becomes invalid.
 *
 * # Parameters
 *
 * * `pool` - Worker pool handle
 *
 * # Returns
 *
 * Promise that resolves when all workers have completed and pool is closed.
 *
 * # Example
 *
 * ```typescript
 * import { createWorkerPool, closeWorkerPool } from '@kreuzberg/node';
 *
 * const pool = createWorkerPool(4);
 * // ... use pool for extractions ...
 * await closeWorkerPool(pool); // Wait for completion and cleanup
 * ```
 */
export declare function closeWorkerPool(pool: JsWorkerPool): Promise<void>
/**
 * Get a specific field from config (represented as JSON string) by name via FFI.
 *
 * Retrieves a configuration field by path, supporting nested access with
 * dot notation (e.g., "ocr.backend"). Returns the field value as a JSON string.
 *
 * # Arguments
 *
 * * `jsonStr` - A JSON string representation of the configuration
 * * `fieldName` - The field path to retrieve (e.g., "useCache", "ocr.backend")
 *
 * # Returns
 *
 * The field value as a JSON string, or null if not found
 */
export declare function configGetFieldInternal(jsonStr: string, fieldName: string): string | null
/**
 * Merge two configs (override takes precedence over base) via FFI.
 *
 * Performs a shallow merge where fields from the override config take
 * precedence over fields in the base config.
 *
 * # Arguments
 *
 * * `baseJson` - A JSON string representation of the base ExtractionConfig
 * * `overrideJson` - A JSON string representation of the override ExtractionConfig
 *
 * # Returns
 *
 * The merged configuration as a JSON string, or error
 */
export declare function configMergeInternal(baseJson: string, overrideJson: string): string
/**
 * Validate and normalize an ExtractionConfig JSON string via FFI.
 *
 * This validates the JSON and returns a normalized version, using the shared
 * FFI layer to ensure consistent validation across all language bindings.
 *
 * # Arguments
 *
 * * `jsonStr` - A JSON string containing the configuration
 *
 * # Returns
 *
 * The normalized JSON string representation of the config, or error
 */
export declare function configValidateAndNormalize(jsonStr: string): string
/**
 * Create a new worker pool for concurrent extraction operations.
 *
 * Creates a pool of worker threads for CPU-bound document extraction.
 * Tasks submitted to the pool will be executed concurrently up to the pool size.
*
 * # Parameters
 *
 * * `size` - Number of concurrent workers (defaults to CPU count)
 *
 * # Returns
 *
 * Worker pool handle that can be used with extraction functions.
 *
 * # Example
 *
 * ```typescript
 * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
 *
 * const pool = createWorkerPool(4); // 4 concurrent workers
 * // The handle is opaque (no declared members); read its size via getWorkerPoolStats.
 * console.log(`Pool created with ${getWorkerPoolStats(pool).size} workers`);
 * ```
 */
export declare function createWorkerPool(size?: number | undefined | null): JsWorkerPool
/**
 * Detect MIME type from raw bytes.
 *
 * Uses content inspection (magic bytes) to determine MIME type.
 * This is more accurate than extension-based detection but requires
 * reading the file content.
 *
 * # Parameters
 *
 * * `bytes` - Raw file content as Buffer
 *
 * # Returns
 *
 * The detected MIME type string.
 *
 * # Errors
 *
 * Throws an error if MIME type cannot be determined from content.
 *
 * # Example
 *
 * ```typescript
 * import { detectMimeTypeFromBytes } from '@kreuzberg/node';
 * import * as fs from 'fs';
 *
 * // Read file content
 * const content = fs.readFileSync('document.pdf');
 *
 * // Detect MIME type from bytes
 * const mimeType = detectMimeTypeFromBytes(content);
 * ```
 */
export declare function detectMimeTypeFromBytes(bytes: Buffer): string
/**
 * Detect MIME type from a file path.
 *
 * Determines the MIME type based on the file extension in the provided path.
 * By default, checks if the file exists; can be disabled with the `checkExists` parameter.
 *
 * # Parameters
 *
 * * `path` - The file path to detect MIME type from (e.g., 'document.pdf')
 * * `checkExists` - Whether to verify the file exists (default: true)
 *
 * # Returns
 *
 * The detected MIME type as a string (e.g., 'application/pdf').
 *
 * # Errors
 *
 * Throws an error if MIME type cannot be determined from the file extension,
 * or if `checkExists` is true and the file does not exist.
*
 * # Example
 *
 * ```typescript
 * import { detectMimeTypeFromPath } from '@kreuzberg/node';
 *
 * // Detect MIME type from existing file
 * const mimeType = detectMimeTypeFromPath('/path/to/document.pdf');
 *
 * // Detect without checking file existence
 * const mimeType2 = detectMimeTypeFromPath('document.docx', false);
 * ```
 */
export declare function detectMimeTypeFromPath(path: string, checkExists?: boolean | undefined | null): string
/**
 * Discover extraction configuration file in current directory or parent directories.
 *
 * Searches for configuration files in the following order:
 * 1. `kreuzberg.toml`
 * 2. `kreuzberg.yaml` / `kreuzberg.yml`
 * 3. `kreuzberg.json`
 * 4. Searches parent directories up to the filesystem root
 *
 * Loads the first configuration file found.
 *
 * # Returns
 *
 * `JsExtractionConfig` object with discovered configuration. The declared
 * return type also allows `null`.
 *
 * # Errors
 *
 * Documented as throwing an error if no configuration file is found.
 * NOTE(review): this conflicts with the nullable return type, which suggests
 * `null` may be returned in that case instead — confirm against the native
 * implementation and handle both outcomes until then.
 *
 * # Example
 *
 * ```typescript
 * import { discoverExtractionConfig, extractFile } from '@kreuzberg/node';
 *
 * // Automatically finds kreuzberg.toml or kreuzberg.yaml in current or parent directories
 * const config = discoverExtractionConfig();
 * const result = await extractFile('document.pdf', null, config);
 * ```
 */
export declare function discoverExtractionConfig(): JsExtractionConfig | null
/**
 * Generate embeddings from a list of text strings (asynchronous).
*
 * # Arguments
 *
 * * `texts` - List of strings to embed
 * * `config` - Optional embedding configuration (model, batch size, normalization)
 *
 * # Returns
 *
 * `Promise<number[][]>` — one embedding vector per input text
 *
 * # Example
 *
 * ```typescript
 * import { embed } from '@kreuzberg/node';
 *
 * const embeddings = await embed(['Hello, world!'], { model: { modelType: 'preset', value: 'balanced' } });
 * console.log(embeddings.length); // 1
 * ```
 */
export declare function embed(texts: Array<string>, config?: JsEmbeddingConfig | undefined | null): Promise<Array<Array<number>>>
/**
 * Embedding preset configuration for TypeScript bindings.
 *
 * Contains all settings for a specific embedding model preset.
 */
export interface EmbeddingPreset {
  /** Name of the preset (e.g., "fast", "balanced", "quality", "multilingual") */
  name: string
  /** Recommended chunk size in characters */
  chunkSize: number
  /** Recommended overlap in characters */
  overlap: number
  /** Model identifier (e.g., "AllMiniLML6V2Q", "BGEBaseENV15") */
  modelName: string
  /** Embedding vector dimensions */
  dimensions: number
  /** Human-readable description of the preset */
  description: string
}
/**
 * Generate embeddings from a list of text strings (synchronous).
*
 * # Arguments
 *
 * * `texts` - List of strings to embed
 * * `config` - Optional embedding configuration (model, batch size, normalization)
 *
 * # Returns
 *
 * `number[][]` — one embedding vector per input text
 *
 * # Example
 *
 * ```typescript
 * import { embedSync } from '@kreuzberg/node';
 *
 * const embeddings = embedSync(['Hello, world!'], { model: { modelType: 'preset', value: 'balanced' } });
 * console.log(embeddings.length); // 1
 * ```
 */
export declare function embedSync(texts: Array<string>, config?: JsEmbeddingConfig | undefined | null): Array<Array<number>>
/**
 * Result of classifying an error message string into an error code category
 * (the shape returned by `classifyError`).
 *
 * The classifier analyzes the error message content and returns the most likely
 * error code (0-8) based on keyword patterns. Used to programmatically classify
 * errors for handling purposes.
 *
 * # Arguments (of `classifyError`)
 *
 * * `errorMessage` - The error message string to classify
 *
 * # Fields
 *
 * - `code`: The numeric error code (0-8)
 * - `name`: The error code name string
 * - `description`: Brief description of the error type
 * - `confidence`: Confidence score (0.0-1.0) of the classification
 *
 * # Classification Rules
 *
 * - **Validation (0)**: Keywords: invalid, validation, invalid_argument, schema, required, unexpected field
 * - **Parsing (1)**: Keywords: parsing, parse_error, corrupted, malformed, invalid format, decode, encoding
 * - **Ocr (2)**: Keywords: ocr, optical, character, recognition, tesseract, language, model
 * - **MissingDependency (3)**: Keywords: not found, not installed, missing, dependency, require, unavailable
 * - **Io (4)**: Keywords: io, file, disk, read, write, permission, access, path
 * - **Plugin (5)**: Keywords: plugin, register, extension, handler, processor
 * - **UnsupportedFormat (6)**: Keywords: unsupported, format, mime, type, codec
 * - **Internal (7)**: Keywords: internal, bug, panic, unexpected, invariant
 * - **Embedding (8)**: Keywords: embed, embedding, vector, inference, model
 *
 * # Examples
 *
 * ```typescript
 * const parsing = classifyError("PDF file is corrupted");
 * // Returns: { code: 1, name: "parsing", confidence: 0.95 }
 *
 * const missing = classifyError("Tesseract not found");
 * // Returns: { code: 3, name: "missing_dependency", confidence: 0.9 }
 * ```
 */
export interface ErrorClassification {
  /** Numeric error code. */
  code: number
  /** Error code name string. */
  name: string
  /** Brief description of the error type. */
  description: string
  /** Confidence score (0.0-1.0) of the classification. */
  confidence: number
}
/**
 * Extract content from bytes (asynchronous).
 *
 * Asynchronously extracts content from a byte buffer. Non-blocking alternative
 * to `extractBytesSync` for processing in-memory data.
 *
 * # Parameters
 *
 * * `data` - Buffer containing the document bytes
 * * `mimeType` - MIME type of the data
 * * `config` - Optional extraction configuration
 *
 * # Returns
 *
 * Promise resolving to `ExtractionResult`.
 *
 * # Example
 *
 * ```typescript
 * import { extractBytes } from '@kreuzberg/node';
 *
 * const response = await fetch('https://example.com/document.pdf');
 * const buffer = Buffer.from(await response.arrayBuffer());
 * const result = await extractBytes(buffer, 'application/pdf', null);
 * ```
 */
export declare function extractBytes(data: Buffer, mimeType: string, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
/**
 * Extract content from bytes (synchronous).
 *
 * Synchronously extracts content from a byte buffer without requiring a file path.
 * Useful for processing in-memory data, network streams, or database BLOBs.
 *
 * # Parameters
 *
 * * `data` - Buffer containing the document bytes
 * * `mimeType` - MIME type of the data (e.g., "application/pdf", "image/png")
 * * `config` - Optional extraction configuration
 *
 * # Returns
 *
 * `ExtractionResult` with extracted content and metadata.
 *
 * # Errors
 *
 * Throws an error if data is malformed or MIME type is unsupported.
*
 * # Example
 *
 * ```typescript
 * import { extractBytesSync } from '@kreuzberg/node';
 * import fs from 'fs';
 *
 * const buffer = fs.readFileSync('document.pdf');
 * const result = extractBytesSync(buffer, 'application/pdf', null);
 * console.log(result.content);
 * ```
 */
export declare function extractBytesSync(data: Buffer, mimeType: string, config?: JsExtractionConfig | undefined | null): JsExtractionResult
/**
 * Extract content from a file (asynchronous).
 *
 * Asynchronously extracts text, tables, images, and metadata from a document file.
 * Non-blocking alternative to `extractFileSync` for use in async/await contexts.
 *
 * # Parameters
 *
 * * `filePath` - Path to the file to extract (absolute or relative)
 * * `mimeType` - Optional MIME type hint (auto-detected if omitted)
 * * `config` - Optional extraction configuration (OCR, chunking, etc.)
 *
 * # Returns
 *
 * Promise resolving to `ExtractionResult` with extracted content and metadata.
 *
 * # Errors
 *
 * Rejects if file processing fails (see `extractFileSync` for error conditions).
 *
 * # Example
 *
 * ```typescript
 * import { extractFile } from '@kreuzberg/node';
 *
 * // Async/await usage
 * const result = await extractFile('document.pdf', null, null);
 * console.log(result.content);
 *
 * // Promise usage
 * extractFile('report.docx', null, null)
 *   .then(result => console.log(result.content))
 *   .catch(err => console.error(err));
 * ```
 */
export declare function extractFile(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
/**
 * Extract a file using a worker thread from the pool.
 *
 * Submits a file extraction task to the worker pool. The task will execute
 * when a worker thread becomes available. This is useful for CPU-bound
 * extraction operations that need to be run concurrently.
*
 * # Parameters
 *
 * * `pool` - Worker pool handle
 * * `filePath` - Path to the file to extract
 * * `mimeType` - Optional MIME type of the file
 *   (NOTE(review): this slot was previously documented as `password`; the
 *   declared parameter is `mimeType` — confirm the native behavior)
 * * `config` - Optional extraction configuration
 *
 * # Returns
 *
 * Promise resolving to extraction result.
 *
 * # Example
 *
 * ```typescript
 * import { createWorkerPool, extractFileInWorker } from '@kreuzberg/node';
 *
 * const pool = createWorkerPool(4);
 * const result = await extractFileInWorker(pool, 'document.pdf', null, {
 *   useCache: true
 * });
 * console.log(result.content);
 * ```
 */
export declare function extractFileInWorker(pool: JsWorkerPool, filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): Promise<JsExtractionResult>
/**
 * Extract content from a file (synchronous).
 *
 * Blocking counterpart of `extractFile`; the parameters mirror that
 * function's signature.
 *
 * # Parameters
 *
 * * `filePath` - Path to the file to extract
 * * `mimeType` - Optional MIME type of the file
 * * `config` - Optional extraction configuration
 *
 * # Returns
 *
 * The extraction result.
 */
export declare function extractFileSync(filePath: string, mimeType?: string | undefined | null, config?: JsExtractionConfig | undefined | null): JsExtractionResult
/**
 * Get a specific embedding preset by name.
 *
 * Returns a preset configuration object, or null if the preset name is not found.
 *
 * # Arguments
 *
 * * `name` - The preset name (case-sensitive)
 *
 * # Returns
 *
 * An `EmbeddingPreset` object with the following properties:
 * - `name`: string - Preset name
 * - `chunkSize`: number - Recommended chunk size in characters
 * - `overlap`: number - Recommended overlap in characters
 * - `modelName`: string - Model identifier
 * - `dimensions`: number - Embedding vector dimensions
 * - `description`: string - Human-readable description
 *
 * Returns `null` if preset name is not found.
 *
 * # Example
 *
 * ```typescript
 * import { getEmbeddingPreset } from '@kreuzberg/node';
 *
 * const preset = getEmbeddingPreset('balanced');
 * if (preset) {
 *   console.log(`Model: ${preset.modelName}, Dims: ${preset.dimensions}`);
 *   // Model: BGEBaseENV15, Dims: 768
 * }
 * ```
 */
export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null
/**
 * Returns the description for an error code.
 *
 * Maps to FFI function kreuzberg_error_code_description().
 *
 * # Arguments
 *
 * * `code` - Numeric error code (0-7)
 *
 * # Returns
 *
 * A string containing a brief description of the error
 *
 * # Examples
 *
 * ```typescript
 * const desc = getErrorCodeDescription(0); // returns "Input validation error"
 * const desc = getErrorCodeDescription(4); // returns "File system I/O error"
 * const desc = getErrorCodeDescription(99); // returns "Unknown error code"
 * ```
 */
export declare function getErrorCodeDescription(code: number): string
/**
 * Returns the human-readable name for an error code.
 *
 * Maps to FFI function kreuzberg_error_code_name().
*
 * # Arguments
 *
 * * `code` - Numeric error code (0-7)
 *
 * # Returns
 *
 * A string containing the error code name (e.g., "validation", "ocr", "unknown")
 *
 * # Examples
 *
 * ```typescript
 * const name = getErrorCodeName(0); // returns "validation"
 * const name = getErrorCodeName(2); // returns "ocr"
 * const name = getErrorCodeName(99); // returns "unknown"
 * ```
 */
export declare function getErrorCodeName(code: number): string
/**
 * Get file extensions for a given MIME type.
 *
 * Returns an array of file extensions commonly associated with the specified
 * MIME type. For example, 'application/pdf' returns ['pdf'].
 *
 * # Parameters
 *
 * * `mimeType` - The MIME type to look up (e.g., 'application/pdf', 'image/jpeg')
 *
 * # Returns
 *
 * Array of file extensions (without leading dots).
 *
 * # Errors
 *
 * Throws an error if the MIME type is not recognized or supported.
 *
 * # Example
 *
 * ```typescript
 * import { getExtensionsForMime } from '@kreuzberg/node';
 *
 * // Get extensions for PDF
 * const pdfExts = getExtensionsForMime('application/pdf');
 * console.log(pdfExts); // ['pdf']
 *
 * // Get extensions for JPEG
 * const jpegExts = getExtensionsForMime('image/jpeg');
 * console.log(jpegExts); // ['jpg', 'jpeg']
 * ```
 */
export declare function getExtensionsForMime(mimeType: string): Array<string>
/**
 * Get the error code for the last FFI error.
 *
 * Returns the FFI error code as an integer. Error codes are:
 * - 0: Success (no error)
 * - 1: GenericError
 * - 2: Panic
 * - 3: InvalidArgument
 * - 4: IoError
 * - 5: ParsingError
 * - 6: OcrError
 * - 7: MissingDependency
 *
 * This is useful for programmatic error handling and distinguishing
 * between different types of failures in native code.
 *
 * # Returns
 *
 * The integer error code.
*
 * # Example
 *
 * ```typescript
 * import { extractFile, getLastErrorCode, ErrorCode } from '@kreuzberg/node';
 *
 * try {
 *   const result = await extractFile('document.pdf');
 * } catch (error) {
 *   const code = getLastErrorCode();
 *   if (code === ErrorCode.Panic) {
 *     console.error('Native code panic detected');
 *   }
 * }
 * ```
 */
export declare function getLastErrorCode(): number
/**
 * Get panic context information if the last error was a panic.
 *
 * Returns detailed information about a panic in native code, or null
 * if the last error was not a panic.
 *
 * # Returns
 *
 * A `PanicContext` object with:
 * - `file`: string - Source file where panic occurred
 * - `line`: number - Line number
 * - `function`: string - Function name
 * - `message`: string - Panic message
 * - `timestamp_secs`: number - Unix timestamp (seconds since epoch)
 *
 * Returns `null` if no panic context is available.
 *
 * # Example
 *
 * ```typescript
 * import { extractFile, getLastPanicContext } from '@kreuzberg/node';
 *
 * try {
 *   const result = await extractFile('document.pdf');
 * } catch (error) {
 *   const context = getLastPanicContext();
 *   if (context) {
 *     console.error(`Panic at ${context.file}:${context.line}`);
 *     console.error(`In function: ${context.function}`);
 *     console.error(`Message: ${context.message}`);
 *   }
 * }
 * ```
 */
export declare function getLastPanicContext(): any | null
/**
 * Get valid binarization methods.
 *
 * Returns a list of all valid binarization method values.
 *
 * # Returns
 *
 * Array of valid binarization methods: ["otsu", "adaptive", "sauvola"]
 *
 * # Example
 *
 * ```typescript
 * import { getValidBinarizationMethods } from '@kreuzberg/node';
 *
 * const methods = getValidBinarizationMethods();
 * console.log(methods); // ['otsu', 'adaptive', 'sauvola']
 * ```
 */
export declare function getValidBinarizationMethods(): Array<string>
/**
 * Get valid language codes.
 *
 * Returns a list of all valid language codes in ISO 639-1 and 639-3 formats.
*
 * # Returns
 *
 * Array of valid language codes (both 2-letter and 3-letter codes)
 *
 * # Example
 *
 * ```typescript
 * import { getValidLanguageCodes } from '@kreuzberg/node';
 *
 * const codes = getValidLanguageCodes();
 * console.log(codes); // ['en', 'de', 'fr', ..., 'eng', 'deu', 'fra', ...]
 * ```
 */
export declare function getValidLanguageCodes(): Array<string>
/**
 * Get valid OCR backends.
 *
 * Returns a list of all valid OCR backend values.
 *
 * # Returns
 *
 * Array of valid OCR backends: ["tesseract", "easyocr", "paddleocr"]
 *
 * # Example
 *
 * ```typescript
 * import { getValidOcrBackends } from '@kreuzberg/node';
 *
 * const backends = getValidOcrBackends();
 * console.log(backends); // ['tesseract', 'easyocr', 'paddleocr']
 * ```
 */
export declare function getValidOcrBackends(): Array<string>
/**
 * Get valid token reduction levels.
 *
 * Returns a list of all valid token reduction level values.
 *
 * # Returns
 *
 * Array of valid levels: ["off", "light", "moderate", "aggressive", "maximum"]
 *
 * # Example
 *
 * ```typescript
 * import { getValidTokenReductionLevels } from '@kreuzberg/node';
 *
 * const levels = getValidTokenReductionLevels();
 * console.log(levels); // ['off', 'light', 'moderate', 'aggressive', 'maximum']
 * ```
 */
export declare function getValidTokenReductionLevels(): Array<string>
/**
 * Get worker pool statistics.
 *
 * Returns current statistics about the worker pool including size,
 * active workers, and queued tasks.
 *
 * # Parameters
 *
 * * `pool` - Worker pool handle
 *
 * # Returns
 *
 * Pool statistics object with size, activeWorkers, and queuedTasks fields.
 *
 * # Example
 *
 * ```typescript
 * import { createWorkerPool, getWorkerPoolStats } from '@kreuzberg/node';
 *
 * const pool = createWorkerPool(4);
 * const stats = getWorkerPoolStats(pool);
 * console.log(`Active: ${stats.activeWorkers}/${stats.size}`);
 * ```
 */
export declare function getWorkerPoolStats(pool: JsWorkerPool): WorkerPoolStats
/**
 * Create a PDF page iterator and collect all pages (asynchronous).
*
 * Non-blocking variant of `iteratePdfPagesSync`. Rendering is offloaded
 * to the worker pool.
 *
 * Note: Pages are collected eagerly into an array. For true lazy iteration,
 * use `new JsPdfPageIterator(filePath, dpi)` which exposes a `.next()` method
 * that renders one page at a time.
 *
 * # Parameters
 *
 * * `filePath` - Path to the PDF file
 * * `dpi` - Optional DPI (default 150)
 *
 * # Returns
 *
 * Promise resolving to an array of `PdfPageResult` objects.
 */
export declare function iteratePdfPages(filePath: string, dpi?: number | undefined | null): Promise<Array<PdfPageResult>>
/**
 * Create a PDF page iterator and collect all pages (synchronous).
 *
 * Opens the PDF once and renders pages one at a time, returning an array of
 * `{ pageIndex, data }` objects, so only one raw image is in memory at a time.
 *
 * Note: Pages are collected eagerly into an array. For true lazy iteration,
 * use `new JsPdfPageIterator(filePath, dpi)` which exposes a `.next()` method
 * that renders one page at a time.
 *
 * # Parameters
 *
 * * `filePath` - Path to the PDF file
 * * `dpi` - Optional DPI (default 150)
 *
 * # Returns
 *
 * Array of `PdfPageResult` objects.
 */
export declare function iteratePdfPagesSync(filePath: string, dpi?: number | undefined | null): Array<PdfPageResult>
/**
 * Hardware acceleration configuration for ONNX Runtime inference.
 *
 * Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
 * for layout detection and embedding generation.
 */
export interface JsAccelerationConfig {
  /** Execution provider: "auto" (default), "cpu", "coreml", "cuda", "tensorrt". */
  provider?: string
  /** GPU device ID for CUDA/TensorRT. Ignored for CPU/CoreML/Auto. */
  deviceId?: number
}
/** One entry of an archive: its path, MIME type and nested extraction result. */
export interface JsArchiveEntry {
  path: string
  mimeType: string
  result: JsExtractionResult
}
/**
 * Rectangle given by the coordinate pairs (`x0`, `y0`) and (`x1`, `y1`).
 * NOTE(review): presumably opposite corners; units and origin are not
 * documented here — confirm.
 */
export interface JsBoundingBox {
  x0: number
  y0: number
  x1: number
  y1: number
}
/** A chunk of content with its type, optional embedding vector and metadata. */
export interface JsChunk {
  content: string
  chunkType: string
  embedding?: number[] | undefined
  metadata: JsChunkMetadata
}
/** Chunking configuration. */
export interface JsChunkingConfig {
  maxChars?: number
  maxOverlap?: number
  /** Optional embedding configuration for generating embeddings */
  embedding?: JsEmbeddingConfig
  /** Optional preset name for chunking parameters */
  preset?: string
  /**
   * Chunker type: "text" (default), "markdown", "yaml", or "semantic".
   * Set to "semantic" for topic-aware chunking that works out of the box
   * with sensible defaults. No other parameters needed.
   */
  chunkerType?: string
  /** Sizing type: "characters" (default) or "tokenizer" */
  sizingType?: string
  /** HuggingFace model ID for tokenizer sizing (e.g., "Xenova/gpt-4o") */
  sizingModel?: string
  /** Optional cache directory for tokenizer files */
  sizingCacheDir?: string
  /** Prepend heading context to each chunk when using markdown chunker */
  prependHeadingContext?: boolean
  /**
   * Cosine similarity threshold for semantic topic detection (0.0-1.0).
   * Optional, defaults to 0.75. Rarely needs tuning.
   */
  topicThreshold?: number
}
/** Metadata attached to a `JsChunk`. */
export interface JsChunkMetadata {
  byteStart: number
  byteEnd: number
  tokenCount?: number
  chunkIndex: number
  totalChunks: number
  firstPage?: number
  lastPage?: number
  headingContext?: JsHeadingContext
}
/** Concurrency configuration for Node.js bindings. */
export interface JsConcurrencyConfig {
  /** Maximum number of threads for all internal thread pools. */
  maxThreads?: number
}
/**
 * Content filtering configuration for Node.js bindings.
 *
 * Controls whether "furniture" content (headers, footers, watermarks,
 * repeating text) is included in or stripped from extraction results.
 */
export interface JsContentFilterConfig {
  /** Include running headers in extraction output. Default: false. */
  includeHeaders?: boolean
  /** Include running footers in extraction output. Default: false. */
  includeFooters?: boolean
  /** Enable cross-page repeating text detection and removal. Default: true. */
  stripRepeatingText?: boolean
  /** Include watermark text in extraction output. Default: false. */
  includeWatermarks?: boolean
}
/** A document element with its identifier, type, text and metadata. */
export interface JsElement {
  elementId: string
  elementType: string
  text: string
  metadata: JsElementMetadata
}
/** Metadata attached to a `JsElement`. */
export interface JsElementMetadata {
  pageNumber?: number
  filename?: string
  coordinates?: JsBoundingBox
  elementIndex?: number
  /**
   * Additional key/value metadata.
   * NOTE(review): the value type is not visible in this file and is typed as
   * `unknown` — confirm against the native binding.
   */
  additional?: Record<string, unknown> | undefined
}
/** Email extraction configuration for Node.js bindings. */
export interface JsEmailConfig {
  /**
   * Windows codepage number for MSG files with no codepage property.
   * Common values: 1250 (Central European), 1251 (Cyrillic), 1252 (Western, default),
   * 1253 (Greek), 1254 (Turkish), 932 (Japanese), 936 (Simplified Chinese).
   */
  msgFallbackCodepage?: number
}
/** Embedding generation configuration for Node.js bindings. */
export interface JsEmbeddingConfig {
  /** Embedding model configuration */
  model?: JsEmbeddingModelType
  /** Whether to normalize embeddings (L2 normalization) */
  normalize?: boolean
  /** Batch size for embedding generation */
  batchSize?: number
  /** Whether to show download progress for models */
  showDownloadProgress?: boolean
  /** Custom cache directory for model storage */
  cacheDir?: string
  /** Hardware acceleration configuration for ONNX Runtime inference */
  acceleration?: JsAccelerationConfig
}
/**
 * Embedding model type configuration for Node.js bindings.
* * This struct represents different embedding model sources: * - `preset`: Use a named preset (e.g., "balanced", "fast", "quality", "multilingual") * - `custom`: Use a custom ONNX model from HuggingFace */ export interface JsEmbeddingModelType { /** Type of model: "preset" or "custom" */ modelType: string /** For preset: preset name; for custom: HuggingFace model ID */ value: string /** Number of dimensions (only for custom) */ dimensions?: number } export interface JsExtractedImage { data: Buffer format: string imageIndex: number pageNumber?: number width?: number height?: number colorspace?: string bitsPerComponent?: number isMask: boolean description?: string ocrResult?: JsExtractionResult | undefined boundingBox?: JsBoundingBox sourcePath?: string } export interface JsExtractedKeyword { text: string score: number algorithm: string positions?: Array } export interface JsExtractionConfig { useCache?: boolean enableQualityProcessing?: boolean ocr?: JsOcrConfig forceOcr?: boolean /** Disable OCR entirely — image files return empty content instead of errors */ disableOcr?: boolean /** List of 1-indexed page numbers to force OCR on (None = use force_ocr setting) */ forceOcrPages?: Array chunking?: JsChunkingConfig images?: JsImageExtractionConfig pdfOptions?: JsPdfConfig tokenReduction?: JsTokenReductionConfig languageDetection?: JsLanguageDetectionConfig postprocessor?: JsPostProcessorConfig keywords?: JsKeywordConfig htmlOptions?: JsHtmlOptions maxConcurrentExtractions?: number pages?: JsPageConfig /** Output text format: "plain" | "markdown" | "djot" | "html" */ outputFormat?: string /** Result structure format: "unified" | "element_based" */ resultFormat?: string /** Include document structure in extraction result */ includeDocumentStructure?: boolean /** Layout detection configuration (None = layout detection disabled) */ layout?: JsLayoutDetectionConfig /** Email extraction configuration */ email?: JsEmailConfig /** Hardware acceleration configuration for ONNX 
Runtime inference */ acceleration?: JsAccelerationConfig /** Security limits to guard against DoS attacks */ securityLimits?: JsSecurityLimits /** Concurrency configuration for thread pool control */ concurrency?: JsConcurrencyConfig /** Cache namespace for tenant isolation */ cacheNamespace?: string /** Per-request cache TTL in seconds (0 = skip cache) */ cacheTtlSecs?: number /** Maximum recursion depth for archive extraction (default: 3) */ maxArchiveDepth?: number /** Default per-file extraction timeout in seconds */ extractionTimeoutSecs?: number /** Tree-sitter language pack configuration for code analysis */ treeSitter?: JsTreeSitterConfig /** Structured extraction configuration for LLM-based data extraction */ structuredExtraction?: JsStructuredExtractionConfig /** Content filtering configuration for headers/footers/watermarks */ contentFilter?: JsContentFilterConfig /** HTML output configuration for styled HTML rendering */ htmlOutput?: JsHtmlOutputConfig } export interface JsExtractionResult { content: string mimeType: string metadata: Metadata tables: Array detectedLanguages: Array chunks: Array images: Array pages: Array elements: Array document: DocumentStructure | null djotContent: DjotContent | null ocrElements: OcrElement[] | null extractedKeywords: Array qualityScore?: number processingWarnings: Array llmUsage: Array annotations: Array children: Array uris: Array /** Code intelligence results from tree-sitter processing. */ codeIntelligence: CodeProcessResult | null /** Structured extraction output conforming to the provided JSON schema. 
*/ structuredOutput: Record | null } export interface JsFileExtractionConfig { enableQualityProcessing?: boolean ocr?: JsOcrConfig forceOcr?: boolean /** Disable OCR entirely — image files return empty content instead of errors */ disableOcr?: boolean /** List of 1-indexed page numbers to force OCR on (None = use force_ocr setting) */ forceOcrPages?: Array chunking?: JsChunkingConfig images?: JsImageExtractionConfig pdfOptions?: JsPdfConfig tokenReduction?: JsTokenReductionConfig languageDetection?: JsLanguageDetectionConfig postprocessor?: JsPostProcessorConfig keywords?: JsKeywordConfig htmlOptions?: JsHtmlOptions pages?: JsPageConfig /** Output text format: "plain" | "markdown" | "djot" | "html" */ outputFormat?: string /** Result structure format: "unified" | "element_based" */ resultFormat?: string /** Include document structure in extraction result */ includeDocumentStructure?: boolean /** Layout detection configuration (None = layout detection disabled) */ layout?: JsLayoutDetectionConfig /** Per-file extraction timeout in seconds */ timeoutSecs?: number /** Tree-sitter language pack configuration for code analysis */ treeSitter?: JsTreeSitterConfig /** Structured extraction configuration for LLM-based data extraction */ structuredExtraction?: JsStructuredExtractionConfig /** Content filtering configuration for headers/footers/watermarks */ contentFilter?: JsContentFilterConfig } export interface JsHeadingContext { headings: Array } export interface JsHeadingLevel { level: number text: string } export interface JsHierarchicalBlock { text: string fontSize: number level: string bbox?: [number, number, number, number] | undefined } export interface JsHierarchyConfig { enabled?: boolean kClusters?: number includeBbox?: boolean ocrCoverageThreshold?: number } export interface JsHtmlOptions { headingStyle?: string listIndentType?: string listIndentWidth?: number bullets?: string strongEmSymbol?: string escapeAsterisks?: boolean escapeUnderscores?: boolean 
escapeMisc?: boolean escapeAscii?: boolean codeLanguage?: string autolinks?: boolean defaultTitle?: boolean brInTables?: boolean highlightStyle?: string extractMetadata?: boolean whitespaceMode?: string stripNewlines?: boolean wrap?: boolean wrapWidth?: number convertAsInline?: boolean subSymbol?: string supSymbol?: string newlineStyle?: string codeBlockStyle?: string keepInlineImagesIn?: Array encoding?: string debug?: boolean stripTags?: Array preserveTags?: Array preprocessing?: JsHtmlPreprocessingOptions } /** * HTML output configuration for styled HTML rendering. * * Controls how `outputFormat: "html"` renders documents when `htmlOutput` * is set on the extraction config. */ export interface JsHtmlOutputConfig { /** Inline CSS string injected after the theme stylesheet. */ css?: string /** Path to a CSS file loaded at renderer construction time. */ cssFile?: string /** Built-in theme: "default", "github", "dark", "light", "unstyled". Default: "unstyled". */ theme?: string /** CSS class prefix for emitted class names. Default: "kb-". */ classPrefix?: string /** Embed resolved CSS in a `