/**
 * Type definitions for Kreuzberg extraction results.
 *
 * These types mirror the strongly-typed Rust metadata structures,
 * providing type safety for TypeScript users.
 */

/**
 * Tesseract OCR engine configuration options.
 *
 * @example
 * ```typescript
 * const config: TesseractConfig = {
 *   psm: 6,
 *   enableTableDetection: true,
 *   tesseditCharWhitelist: '0123456789'
 * };
 * ```
 */
interface TesseractConfig {
  /**
   * Page Segmentation Mode (0-13). Controls how Tesseract segments and recognizes text.
   * Common values: 3 (auto), 6 (single uniform block), 11 (sparse text).
   * Default: 3 (auto layout analysis).
   */
  psm?: number;
  /**
   * Enable table detection during OCR processing.
   * When true, Tesseract attempts to preserve table structure in the output.
   * Default: false.
   */
  enableTableDetection?: boolean;
  /**
   * Whitelist of characters Tesseract should recognize.
   * Only these characters will be returned by the OCR engine.
   * Use empty string to allow all characters. Useful for constraining output to digits,
   * specific alphabets, or other character sets.
   * Default: null (recognize all).
   */
  tesseditCharWhitelist?: string;
}

/**
 * OCR element hierarchy level.
 *
 * Defines the granularity of OCR element extraction.
 */
type OcrElementLevel = "word" | "line" | "block" | "page";

/**
 * Bounding geometry for OCR elements using rectangle coordinates.
 *
 * Represents rectangular coordinates with position and dimensions.
 */
interface OcrBoundingGeometryRectangle {
  /** Discriminant tag selecting the rectangle variant of `OcrBoundingGeometry`. */
  type: "rectangle";
  /** Horizontal position of the rectangle. */
  left: number;
  /** Vertical position of the rectangle. */
  top: number;
  /** Horizontal dimension of the rectangle. */
  width: number;
  /** Vertical dimension of the rectangle. */
  height: number;
}

/**
 * Bounding geometry for OCR elements using quadrilateral points.
 *
 * Represents irregular quadrilateral shapes with four corner points.
 */
interface OcrBoundingGeometryQuadrilateral {
  /** Discriminant tag selecting the quadrilateral variant of `OcrBoundingGeometry`. */
  type: "quadrilateral";
  /** Corner points of the quadrilateral, one coordinate array per corner. */
  points: number[][];
}

/**
 * Bounding geometry for OCR elements.
 *
 * Can be either rectangular or quadrilateral based on the OCR engine's detection capability.
 * Narrow on the `type` discriminant before reading variant-specific fields.
 */
type OcrBoundingGeometry = OcrBoundingGeometryRectangle | OcrBoundingGeometryQuadrilateral;

/**
 * Confidence scores for OCR operations.
 *
 * Tracks confidence levels for different aspects of OCR processing.
 */
interface OcrConfidence {
  /** Confidence score (0.0-1.0) for text detection. */
  detection?: number;
  /** Confidence score (0.0-1.0) for text recognition. */
  recognition?: number;
}

/**
 * Rotation information for OCR elements.
 *
 * Tracks detected text rotation and associated confidence.
 */
interface OcrRotation {
  /** Angle of rotation in degrees. */
  angleDegrees?: number;
  /** Confidence score (0.0-1.0) for rotation detection. */
  confidence?: number;
}

/**
 * Individual OCR element (word, line, block, or page).
 *
 * Represents a granular unit of text extracted by OCR with geometric and confidence information.
 */
interface OcrElement {
  /** Extracted text content */
  text: string;
  /** Bounding geometry of the element in the image */
  geometry?: OcrBoundingGeometry;
  /** Confidence scores for detection and recognition */
  confidence?: OcrConfidence;
  /** Hierarchy level of this element */
  level?: OcrElementLevel;
  /** Rotation information if text is rotated */
  rotation?: OcrRotation;
  /** Page number where this element was found (1-indexed) */
  pageNumber?: number;
  /** Parent element ID for hierarchical relationships */
  parentId?: string;
  /**
   * Backend-specific metadata that doesn't fit standard fields.
   * Values are `unknown` because their shape depends on the backend; narrow before use.
   */
  backendMetadata?: Record<string, unknown>;
}

/**
 * Configuration for OCR element extraction.
 *
 * Controls how granular OCR elements are extracted and organized.
 */
interface OcrElementConfig {
  /** Enable extraction of granular OCR elements. Default: false. */
  includeElements?: boolean;
  /** Minimum hierarchy level to extract. Default: 'word'. */
  minLevel?: OcrElementLevel;
  /** Minimum confidence threshold (0.0-1.0) for including elements. Default: 0.0. */
  minConfidence?: number;
  /** Build hierarchical relationships between elements. Default: false. */
  buildHierarchy?: boolean;
}

/**
 * PaddleOCR engine configuration options.
 *
 * Specific configuration for the PaddleOCR backend.
 */
interface PaddleOcrConfig {
  /** Language code(s) for OCR (e.g., 'en', 'zh', 'multi'). */
  language?: string;
  /** Directory to cache downloaded OCR models. */
  cacheDir?: string;
  /** Enable angle classification for rotated text detection. Default: false. */
  useAngleCls?: boolean;
  /** Enable table structure detection. Default: false. */
  enableTableDetection?: boolean;
  /** Database threshold for text detection (0.0-1.0). Default: 0.3. */
  detDbThresh?: number;
  /** Box threshold for text detection (0.0-1.0). Default: 0.5. */
  detDbBoxThresh?: number;
  /** Unclip ratio for expanding detected text regions. Default: 1.5. */
  detDbUnclipRatio?: number;
  /** Maximum side length for detection preprocessing. Default: 960. */
  detLimitSideLen?: number;
  /** Batch size for text recognition. Default: 30. */
  recBatchNum?: number;
}

/**
 * OCR (Optical Character Recognition) configuration.
 *
 * Controls which OCR engine to use and how it processes images.
 */
interface OcrConfig {
  /** Whether OCR is enabled. Setting to false disables OCR entirely (equivalent to disable_ocr: true on ExtractionConfig). Default: true. */
  enabled?: boolean;
  /** OCR backend name (e.g., 'tesseract', 'paddleocr', 'easyocr'). Required. */
  backend: string;
  /** ISO 639-1/3 language code(s) for OCR (e.g., 'eng', 'fra', 'deu'). Default: 'eng'. */
  language?: string;
  /** Tesseract engine-specific configuration options. Only used when backend is 'tesseract'. */
  tesseractConfig?: TesseractConfig;
  /** PaddleOCR engine-specific configuration options. Only used when backend is 'paddleocr'. */
  paddleOcrConfig?: PaddleOcrConfig;
  /** OCR element extraction configuration. */
  elementConfig?: OcrElementConfig;
}

/**
 * Embedding model type selector.
 *
 * Use `modelType: "preset"` with a named preset (recommended), `modelType: "custom"` with
 * a HuggingFace model ID and the expected vector dimensions, or `modelType: "llm"` with
 * a model identifier.
 */
interface EmbeddingModelType {
  /** Selector: `"preset"`, `"custom"`, or `"llm"` */
  modelType: "preset" | "custom" | "llm";
  /** For preset: preset name (e.g., "fast", "balanced", "quality", "multilingual"); for custom: HuggingFace model ID; for llm: model identifier */
  value: string;
  /** Expected embedding dimensions — required for `modelType: "custom"` */
  dimensions?: number;
}

/**
 * Configuration for the standalone embedding API and for chunk-level embeddings.
 */
interface EmbeddingConfig {
  /** Embedding model selection (defaults to the "balanced" preset). */
  model?: EmbeddingModelType;
  /** Normalize embedding vectors to unit length (recommended for cosine similarity). Default: true. */
  normalize?: boolean;
  /** Number of texts processed per inference batch. Default: 32. */
  batchSize?: number;
  /** Show download progress when the model is downloaded for the first time. Default: false. */
  showDownloadProgress?: boolean;
  /** Custom directory for caching downloaded model files. */
  cacheDir?: string;
}

/**
 * Document chunking configuration for splitting large documents.
 *
 * Breaks large documents into smaller, manageable chunks while preserving context.
 * Useful for RAG (Retrieval Augmented Generation) and vector database indexing.
 */
interface ChunkingConfig {
  /** Maximum characters per chunk. Default: 4096. */
  maxChars?: number;
  /** Maximum overlapping characters between consecutive chunks for context preservation. Default: 512. */
  maxOverlap?: number;
  /**
   * Alternative to maxChars: chunk size using different unit.
   * Mutually exclusive with maxChars.
   */
  chunkSize?: number;
  /**
   * Alternative to maxOverlap: overlap amount using different unit.
   * Mutually exclusive with maxOverlap.
   */
  chunkOverlap?: number;
  /**
   * Named preset configuration (e.g., 'default', 'aggressive', 'minimal').
   * Uses preset values if neither maxChars nor chunkSize is specified.
   */
  preset?: string;
  /** Embedding configuration for generating vector embeddings for each chunk. */
  embedding?: EmbeddingConfig;
  /** Enable or disable chunking. Default: true when chunking config is provided. */
  enabled?: boolean;
  /** Sizing type: "characters" (default) or "tokenizer". */
  sizingType?: "characters" | "tokenizer";
  /** HuggingFace model ID for tokenizer sizing (e.g., "Xenova/gpt-4o"). */
  sizingModel?: string;
  /** Optional cache directory for tokenizer files. */
  sizingCacheDir?: string;
}

/**
 * Language detection configuration.
 *
 * Automatically detects the language(s) of extracted content.
 */
interface LanguageDetectionConfig {
  /** Enable automatic language detection. Default: true. */
  enabled?: boolean;
  /** Minimum confidence score (0.0-1.0) for language detection. Default: 0.5. */
  minConfidence?: number;
  /** Detect multiple languages in the same document. Default: false. */
  detectMultiple?: boolean;
}

/**
 * Token reduction configuration for optimizing token usage.
 *
 * Reduces the number of tokens in extracted content while preserving meaning.
 * Useful for reducing costs in LLM pipelines.
 */
interface TokenReductionConfig {
  /** Reduction mode: 'aggressive' or 'conservative'. Default: 'conservative'. */
  mode?: string;
  /** Preserve tokens for semantically important words even in aggressive mode. Default: true. */
  preserveImportantWords?: boolean;
}

/**
 * Hierarchy extraction configuration.
 *
 * Controls document hierarchy detection based on font size clustering.
 */
interface HierarchyConfig {
  /** Enable hierarchy extraction. Default: true. */
  enabled?: boolean;
  /** Number of font size clusters (2-10). Default: 6. */
  kClusters?: number;
  /** Include bounding box information. Default: true. */
  includeBbox?: boolean;
  /** OCR coverage threshold (0.0-1.0). Default: null. */
  ocrCoverageThreshold?: number | null;
}

/**
 * PDF-specific extraction configuration.
 *
 * Controls how PDF documents are processed.
 */
interface PdfConfig {
  /** Extract images from PDF pages. Default: true. */
  extractImages?: boolean;
  /** List of passwords to try for password-protected PDFs. */
  passwords?: string[];
  /** Extract document metadata (title, author, creation date, etc.). Default: true. */
  extractMetadata?: boolean;
  /** Hierarchy extraction configuration. */
  hierarchy?: HierarchyConfig;
  /** Extract annotations from PDF pages. Default: false. */
  extractAnnotations?: boolean;
  /** Top margin fraction (0.0-0.5) for filtering header content. */
  topMarginFraction?: number;
  /** Bottom margin fraction (0.0-0.5) for filtering footer content. */
  bottomMarginFraction?: number;
}

/**
 * Image extraction and processing configuration.
 *
 * Controls how images are extracted and optimized from documents.
 */
interface ImageExtractionConfig {
  /** Enable image extraction from documents. Default: true. */
  extractImages?: boolean;
  /** Target DPI (dots per inch) for extracted images. Higher DPI = better quality but larger files. Default: 150. */
  targetDpi?: number;
  /** Maximum image dimension (width or height) in pixels. Images larger than this are downscaled. Default: 2000. */
  maxImageDimension?: number;
  /** Automatically adjust DPI based on image content and quality. Default: true. */
  autoAdjustDpi?: boolean;
  /** Minimum DPI to maintain for image quality. Default: 72. */
  minDpi?: number;
  /** Maximum DPI to avoid excessive file sizes. Default: 300. */
  maxDpi?: number;
}

/**
 * Post-processor configuration for modifying extracted content.
 *
 * Post-processors allow customization and cleanup of extraction results
 * without failing the extraction if they encounter errors.
 */
interface PostProcessorConfig {
  /** Enable or disable post-processing entirely. Default: true. */
  enabled?: boolean;
  /** List of processor names to enable (allowlist). When set, only these are used. */
  enabledProcessors?: string[];
  /** List of processor names to disable (denylist). These are skipped. */
  disabledProcessors?: string[];
}

/**
 * HTML preprocessing options.
 *
 * Cleans HTML content before conversion to Markdown.
 */
interface HtmlPreprocessingOptions {
  /** Enable HTML preprocessing. Default: true. */
  enabled?: boolean;
  /** Preset cleanup level: 'minimal' (light), 'standard' (balanced), 'aggressive' (heavy). Default: 'standard'. */
  preset?: "minimal" | "standard" | "aggressive";
  /** Remove navigation menus and headers. Default: true. */
  removeNavigation?: boolean;
  /** Remove form elements. Default: true. */
  removeForms?: boolean;
}

/**
 * HTML to Markdown conversion configuration options.
 *
 * Controls how HTML content is converted to Markdown format, including formatting,
 * escaping, and special handling for various HTML elements.
 */
interface HtmlConversionOptions {
  /** Heading style conversion: "atx" (# style), "underlined" (underline style), or "atx_closed" (# style closed). Default: "atx". */
  headingStyle?: "atx" | "underlined" | "atx_closed";
  /** List indentation type: "spaces" or "tabs". Default: "spaces". */
  listIndentType?: "spaces" | "tabs";
  /** Number of spaces/tabs per list indent level. Default: 4. */
  listIndentWidth?: number;
  /** Bullet characters for unordered lists (e.g., '*', '-', '+'). Default: '*'. */
  bullets?: string;
  /** Markdown symbol for strong/bold emphasis: '**' or '__'. Default: '**'. */
  strongEmSymbol?: string;
  /** Escape asterisks (*) in text to prevent accidental formatting. Default: false. */
  escapeAsterisks?: boolean;
  /** Escape underscores (_) in text to prevent accidental formatting. Default: false. */
  escapeUnderscores?: boolean;
  /** Escape miscellaneous special characters. Default: false. */
  escapeMisc?: boolean;
  /** Escape ASCII control characters. Default: false. */
  escapeAscii?: boolean;
  /** Default code language for syntax highlighting in code blocks (e.g., 'javascript'). Default: null. */
  codeLanguage?: string;
  /** Convert HTML links to Markdown autolinks format ([text](url)). Default: true. */
  autolinks?: boolean;
  /** Use the HTML title element as default for links when no text is available. Default: false. */
  defaultTitle?: boolean;
  /** Insert `<br>` tags in Markdown tables. Default: false. */
  brInTables?: boolean;
  /** Use HOCR spatial table format for better table structure preservation. Default: false. */
  hocrSpatialTables?: boolean;
  /** Highlight style for marked/highlighted text: "double_equal" (==text==), "html" (`<mark>`), "bold" (**text**), or "none". Default: "none". */
  highlightStyle?: "double_equal" | "html" | "bold" | "none";
  /** Extract metadata from HTML (title, meta tags, etc.). Default: false. */
  extractMetadata?: boolean;
  /** Whitespace handling: "normalized" (collapse whitespace) or "strict" (preserve all whitespace). Default: "normalized". */
  whitespaceMode?: "normalized" | "strict";
  /** Remove newlines from output (convert to single line). Default: false. */
  stripNewlines?: boolean;
  /** Enable line wrapping at specified width. Default: true. */
  wrap?: boolean;
  /** Maximum line width when wrapping is enabled. Default: 80. */
  wrapWidth?: number;
  /** Convert as inline Markdown instead of block elements. Default: false. */
  convertAsInline?: boolean;
  /** Markdown symbol for subscript text (e.g., '~' for ~text~). Default: '~'. */
  subSymbol?: string;
  /** Markdown symbol for superscript text (e.g., '^' for ^text^). Default: '^'. */
  supSymbol?: string;
  /** Newline style in output: "spaces" (two spaces + newline) or "backslash" (backslash + newline). Default: "spaces". */
  newlineStyle?: "spaces" | "backslash";
  /** Code block style: "indented" (4-space indent), "backticks" (```), or "tildes" (~~~). Default: "backticks". */
  codeBlockStyle?: "indented" | "backticks" | "tildes";
  /** List of HTML tag names to keep as inline images (don't convert). Default: []. */
  keepInlineImagesIn?: string[];
  /** Character encoding for output (e.g., 'utf-8', 'ascii'). Default: 'utf-8'. */
  encoding?: string;
  /** Enable debug mode for detailed conversion logging. Default: false. */
  debug?: boolean;
  /** List of HTML tag names to remove entirely from output. Default: []. */
  stripTags?: string[];
  /** List of HTML tag names to preserve in output (don't convert to Markdown). Default: []. */
  preserveTags?: string[];
  /** HTML preprocessing options for cleaning HTML before conversion. */
  preprocessing?: HtmlPreprocessingOptions;
}

/** Keyword extraction algorithm type. */
type KeywordAlgorithm = "yake" | "rake";

/**
 * YAKE (Yet Another Keyword Extractor) algorithm configuration.
 *
 * YAKE is an unsupervised keyword extraction method that doesn't require training data.
 */
interface YakeParams {
  /** Window size for co-occurrence analysis (number of words to consider). Default: 3. */
  windowSize?: number;
}

/**
 * RAKE (Rapid Automatic Keyword Extraction) algorithm configuration.
 *
 * RAKE extracts keywords based on word co-occurrence and statistical measures.
 */
interface RakeParams {
  /** Minimum word length to consider as keyword. Default: 3. */
  minWordLength?: number;
  /** Maximum number of words per keyword phrase. Default: 3. */
  maxWordsPerPhrase?: number;
}

/**
 * Keyword extraction configuration.
 *
 * Extracts important keywords/phrases from document content using YAKE or RAKE algorithms.
 */
interface KeywordConfig {
  /** Extraction algorithm: "yake" or "rake". Default: "yake". */
  algorithm?: KeywordAlgorithm;
  /** Maximum number of keywords to extract. Default: 10. */
  maxKeywords?: number;
  /** Minimum relevance score (0.0-1.0) for keywords. Keywords below this are filtered out. Default: 0.1. */
  minScore?: number;
  /** N-gram range: [min_length, max_length] for phrase keywords (e.g., [1, 3] for 1-3 word phrases). Default: [1, 3]. */
  ngramRange?: [number, number];
  /** Language for keyword extraction (e.g., 'en', 'de', 'fr'). Default: 'en'. */
  language?: string;
  /** YAKE algorithm-specific parameters. Only used when algorithm is "yake". */
  yakeParams?: YakeParams;
  /** RAKE algorithm-specific parameters. Only used when algorithm is "rake". */
  rakeParams?: RakeParams;
}

/**
 * Extracted keyword with relevance metadata.
 *
 * Represents a single keyword extracted from text along with its relevance score,
 * the algorithm that extracted it, and optional position information.
 */
interface ExtractedKeyword {
  /** The keyword text */
  text: string;
  /** Relevance score (higher is better, algorithm-specific range) */
  score: number;
  /** Algorithm that extracted this keyword */
  algorithm: KeywordAlgorithm;
  /** Optional positions where keyword appears in text (character offsets) */
  positions?: number[];
}

/**
 * Warning from a post-processor during extraction.
 */
interface ProcessingWarning {
  /** Name of the post-processor that produced the warning */
  source: string;
  /** Warning message */
  message: string;
}

/**
 * LLM usage statistics for an extraction operation.
 *
 * Tracks token usage and cost information when LLM-based processing is applied.
 */
interface LlmUsage {
  /** LLM model identifier (e.g., 'gpt-4', 'claude-3-opus') */
  model: string;
  /** Source of this LLM usage (e.g., 'extraction', 'post-processing') */
  source: string;
  /** Number of input tokens consumed (if available) */
  inputTokens?: number | null;
  /** Number of output tokens generated (if available) */
  outputTokens?: number | null;
  /** Total tokens used (sum of input + output tokens, if available) */
  totalTokens?: number | null;
  /** Estimated cost in USD (if available) */
  estimatedCost?: number | null;
  /** Finish reason or status (e.g., 'stop', 'max_tokens', 'tool_use') */
  finishReason?: string | null;
}

/**
 * Page tracking and extraction configuration.
 *
 * Controls how pages/slides/sheets are extracted and tracked in the document.
 * Page range information in chunk metadata (first_page/last_page) is automatically
 * enabled when page boundaries are available and chunking is configured.
 */
interface PageExtractionConfig {
  /** Extract pages as separate array (ExtractionResult.pages) */
  extractPages?: boolean;
  /** Insert page markers in main content string */
  insertPageMarkers?: boolean;
  /** Page marker format (use {page_num} placeholder) */
  markerFormat?: string;
}

/**
 * Layout detection configuration for PDF extraction.
 *
 * Controls layout detection using ONNX-based document layout models (YOLO or RT-DETR)
 * to detect document structure elements like tables, figures, headers, and code blocks.
 * Requires the `layout-detection` feature to be compiled.
 */
interface LayoutDetectionConfig {
  /** Model preset: "fast" (YOLO, 11 classes) or "accurate" (RT-DETR, 17 classes). Default: "fast". */
  preset?: string;
  /** Override the model's default confidence threshold for detections. Default: null (use model default). */
  confidenceThreshold?: number;
  /** Apply postprocessing heuristics to improve detection quality. Default: true. */
  applyHeuristics?: boolean;
  /**
   * Table structure recognition model. Controls which model is used for table cell detection.
   * Options: "tatr" (default), "slanet_wired", "slanet_wireless", "slanet_plus", "slanet_auto".
   */
  tableModel?: string;
}

/**
 * HTML output configuration for styled HTML rendering.
 *
 * Controls how `outputFormat: "html"` renders documents when styled output
 * is desired. Supports built-in themes and custom CSS injection.
 */
interface HtmlOutputConfig {
  /** Inline CSS string injected after the theme stylesheet. */
  css?: string;
  /** Path to a CSS file loaded at renderer construction time. */
  cssFile?: string;
  /**
   * Built-in colour/typography theme.
   * Default: "unstyled".
   */
  theme?: "default" | "github" | "dark" | "light" | "unstyled";
  /**
   * CSS class prefix applied to every emitted class name.
   * Default: "kb-".
   */
  classPrefix?: string;
  /**
   * When true, embed resolved CSS in a `