/** * Chunking strategy options for PDF document processing. * * Choose the strategy based on your document characteristics: * * - **fixed-pages**: Legacy approach, splits by fixed page count. * - Pros: Fast, simple, predictable chunk sizes * - Cons: May exceed token limits for dense documents * - Best for: Uniform density documents, simple text * * - **token-based**: Splits based on token count to respect model limits. * - Pros: Respects model token limits, handles variable density * - Cons: Slower analysis, variable chunk sizes * - Best for: Variable density documents, technical content * * - **hybrid**: RECOMMENDED - Balances token count and page limits. * - Pros: Best of both worlds, reliable, flexible * - Cons: Slightly more complex configuration * - Best for: Most documents, general-purpose processing */ export type ChunkingStrategy = 'fixed-pages' | 'token-based' | 'hybrid'; /** * Processing mode for chunked documents. * * - **sequential**: Process chunks one at a time (cost-optimized) * - **parallel**: Process multiple chunks simultaneously (speed-optimized) */ export type ProcessingMode = 'sequential' | 'parallel'; /** * Aggregation strategy for combining chunk results. * * - **majority-vote**: Select most frequent classification across chunks * - **weighted-vote**: Weight early chunks higher than later chunks * - **first-chunk**: Use classification from first chunk only */ export type AggregationStrategy = 'majority-vote' | 'weighted-vote' | 'first-chunk'; /** * Configuration for fixed-pages chunking strategy. * Splits documents by fixed page count (legacy approach). */ export interface FixedPagesConfig { /** * Threshold for triggering chunking based on page count. * Documents with pages > threshold will be chunked. * @default 100 */ readonly pageThreshold?: number; /** * Number of pages per chunk. * @default 50 */ readonly chunkSize?: number; /** * Number of overlapping pages between consecutive chunks. * Must be less than chunkSize. * @default 5 */ readonly overlapPages?: number; } /** * Configuration for token-based chunking strategy. * Splits documents based on estimated token count to respect model limits. */ export interface TokenBasedConfig { /** * Threshold for triggering chunking based on token count. * Documents with tokens > threshold will be chunked. * @default 150000 */ readonly tokenThreshold?: number; /** * Maximum tokens per chunk. * Ensures no chunk exceeds model token limits. * @default 100000 */ readonly maxTokensPerChunk?: number; /** * Number of overlapping tokens between consecutive chunks. * Provides context continuity across chunks. * @default 5000 */ readonly overlapTokens?: number; } /** * Configuration for hybrid chunking strategy (RECOMMENDED). * Balances token count and page limits for optimal chunking. */ export interface HybridConfig { /** * Hard limit on pages per chunk. * Prevents very large chunks even if token count is low. * Note: Bedrock has a hard limit of 100 pages per PDF, so we default to 99 * to provide a safety margin. * @default 99 */ readonly maxPagesPerChunk?: number; /** * Soft target for tokens per chunk. * Chunks aim for this token count but respect maxPagesPerChunk. * @default 80000 */ readonly targetTokensPerChunk?: number; /** * Threshold for triggering chunking based on page count. * Documents with pages > threshold will be chunked. * @default 100 */ readonly pageThreshold?: number; /** * Threshold for triggering chunking based on token count. * Documents with tokens > threshold will be chunked. * @default 150000 */ readonly tokenThreshold?: number; /** * Number of overlapping tokens between consecutive chunks. * Provides context continuity across chunks. * @default 5000 */ readonly overlapTokens?: number; } /** * Comprehensive configuration for PDF chunking behavior. * * This interface provides fine-grained control over how large PDF documents are * split into manageable chunks for processing. The chunking system supports three * strategies, each optimized for different document types and use cases. * * ## Chunking Strategies * * ### 1. Hybrid Strategy (RECOMMENDED) * Balances both token count and page limits for optimal chunking. Best for most * documents as it respects model token limits while preventing excessively large chunks. * * ### 2. Token-Based Strategy * Splits documents based on estimated token count. Best for documents with variable * content density (e.g., mixed text and images, tables, charts). * * ### 3. Fixed-Pages Strategy (Legacy) * Simple page-based splitting. Fast but may exceed token limits for dense documents. * Use only for documents with uniform content density. * * ## Processing Modes * * - **parallel**: Process multiple chunks simultaneously (faster, higher cost) * - **sequential**: Process chunks one at a time (slower, lower cost) * * ## Aggregation Strategies * * - **majority-vote**: Most frequent classification wins (recommended) * - **weighted-vote**: Early chunks weighted higher * - **first-chunk**: Use first chunk's classification only * * ## Default Values * * | Parameter | Default | Description | * |-----------|---------|-------------| * | strategy | 'hybrid' | Chunking strategy | * | pageThreshold | 100 | Pages to trigger chunking | * | tokenThreshold | 150000 | Tokens to trigger chunking | * | chunkSize | 50 | Pages per chunk (fixed-pages) | * | overlapPages | 5 | Overlap pages (fixed-pages) | * | maxTokensPerChunk | 100000 | Max tokens per chunk (token-based) | * | overlapTokens | 5000 | Overlap tokens (token-based, hybrid) | * | targetTokensPerChunk | 80000 | Target tokens per chunk (hybrid) | * | maxPagesPerChunk | 99 | Max pages per chunk (hybrid) | * | processingMode | 'parallel' | Processing mode | * | maxConcurrency | 10 | Max parallel chunks | * | aggregationStrategy | 'majority-vote' | Result aggregation | * | minSuccessThreshold | 0.5 | Min success rate for valid result | */ export interface ChunkingConfig { /** * Chunking strategy to use. * * - **hybrid** (RECOMMENDED): Balances token count and page limits * - **token-based**: Respects model token limits, good for variable density * - **fixed-pages**: Simple page-based splitting (legacy, not recommended) * * @default 'hybrid' */ readonly strategy?: ChunkingStrategy; /** * Threshold for triggering chunking based on page count (fixed-pages strategy). * @default 100 */ readonly pageThreshold?: number; /** * Number of pages per chunk (fixed-pages strategy). * @default 50 */ readonly chunkSize?: number; /** * Number of overlapping pages between chunks (fixed-pages strategy). * @default 5 */ readonly overlapPages?: number; /** * Threshold for triggering chunking based on token count (token-based strategy). * @default 150000 */ readonly tokenThreshold?: number; /** * Maximum tokens per chunk (token-based strategy). * @default 100000 */ readonly maxTokensPerChunk?: number; /** * Number of overlapping tokens between chunks (token-based and hybrid strategies). * @default 5000 */ readonly overlapTokens?: number; /** * Hard limit on pages per chunk (hybrid strategy). * Note: Bedrock has a hard limit of 100 pages per PDF, so we default to 99 * to provide a safety margin. * @default 99 */ readonly maxPagesPerChunk?: number; /** * Soft target for tokens per chunk (hybrid strategy). * @default 80000 */ readonly targetTokensPerChunk?: number; /** * Processing mode for chunks. * * - **parallel**: Process multiple chunks simultaneously (faster, higher cost) * - **sequential**: Process chunks one at a time (slower, lower cost) * * @default 'parallel' */ readonly processingMode?: ProcessingMode; /** * Maximum number of chunks to process concurrently (parallel mode only). * Higher values increase speed but also cost. * * @default 10 */ readonly maxConcurrency?: number; /** * Strategy for aggregating results from multiple chunks. * * - **majority-vote**: Most frequent classification wins * - **weighted-vote**: Early chunks weighted higher * - **first-chunk**: Use first chunk's classification * * @default 'majority-vote' */ readonly aggregationStrategy?: AggregationStrategy; /** * Minimum percentage of chunks that must succeed for aggregation. * If fewer chunks succeed, the result is marked as partial failure. * * @default 0.5 (50%) */ readonly minSuccessThreshold?: number; } /** * Metadata about a single chunk of a document. * Contains information about the chunk's position, size, and S3 location. */ export interface ChunkMetadata { /** * Unique identifier for this chunk. * Format: {documentId}_chunk_{index} */ readonly chunkId: string; /** * Zero-based index of this chunk in the document. */ readonly chunkIndex: number; /** * Total number of chunks in the document. */ readonly totalChunks: number; /** * Starting page number (zero-based) of this chunk. */ readonly startPage: number; /** * Ending page number (zero-based, inclusive) of this chunk. */ readonly endPage: number; /** * Number of pages in this chunk. */ readonly pageCount: number; /** * Estimated token count for this chunk. * Based on word-count heuristic (1.3 tokens per word). */ readonly estimatedTokens: number; /** * S3 bucket containing the chunk file. */ readonly bucket: string; /** * S3 key for the chunk file. * Typically in chunks/ prefix. */ readonly key: string; } /** * Document content location information. */ export interface DocumentContent { /** * Storage location type (e.g., 's3'). */ readonly location: string; /** * S3 bucket containing the document. */ readonly bucket: string; /** * S3 key for the document. */ readonly key: string; /** * Original filename of the document. */ readonly filename: string; } /** * Request payload for PDF analysis and chunking Lambda. * Contains document information and chunking configuration. */ export interface ChunkingRequest { /** * Unique identifier for the document. */ readonly documentId: string; /** * Content type of the document. * Typically 'file' for S3-based documents. */ readonly contentType: string; /** * Document content location information. */ readonly content: DocumentContent; /** * Optional chunking configuration. * If not provided, uses default configuration. */ readonly config?: ChunkingConfig; } /** * Token analysis results from PDF analysis. * Provides information about document size and token distribution. */ export interface TokenAnalysis { /** * Total estimated tokens in the document. */ readonly totalTokens: number; /** * Total number of pages in the document. */ readonly totalPages: number; /** * Average tokens per page across the document. */ readonly avgTokensPerPage: number; /** * Optional detailed token count for each page. * Used for token-based and hybrid chunking strategies. */ readonly tokensPerPage?: number[]; } /** * Response when chunking is NOT required. * Document is below thresholds and will be processed without chunking. */ export interface NoChunkingResponse { /** * Document identifier. */ readonly documentId: string; /** * Indicates chunking is not required. */ readonly requiresChunking: false; /** * Token analysis results. */ readonly tokenAnalysis: TokenAnalysis; /** * Human-readable reason why chunking was not applied. * Example: "Document has 50 pages, below threshold of 100" */ readonly reason: string; } /** * Chunking configuration used for processing. * Includes both user-provided and default values. */ export interface ChunkingConfigUsed { readonly strategy: ChunkingStrategy; readonly totalPages: number; readonly totalTokens: number; readonly chunkSize?: number; readonly overlapPages?: number; readonly maxTokensPerChunk?: number; readonly overlapTokens?: number; readonly targetTokensPerChunk?: number; readonly maxPagesPerChunk?: number; readonly processingMode?: string; } /** * Response when chunking IS required. * Document exceeds thresholds and has been split into chunks. */ export interface ChunkingResponse { /** * Document identifier. */ readonly documentId: string; /** * Indicates chunking is required. */ readonly requiresChunking: true; /** * Token analysis results with detailed per-page information. */ readonly tokenAnalysis: TokenAnalysis; /** * Strategy used for chunking. */ readonly strategy: ChunkingStrategy; /** * Array of chunk metadata for all created chunks. */ readonly chunks: ChunkMetadata[]; /** * Configuration used for chunking. * Includes both user-provided and default values. */ readonly config: ChunkingConfigUsed; } /** * Union type for chunking Lambda response. * Either chunking is required or not. */ export type ChunkingLambdaResponse = NoChunkingResponse | ChunkingResponse; /** * Classification result for a chunk. */ export interface ChunkClassificationResult { readonly documentClassification: string; readonly confidence?: number; } /** * Processing result for a chunk. */ export interface ChunkProcessingResult { readonly entities: Entity[]; } /** * Result from processing a single chunk. * Contains classification and extraction results, or error information. */ export interface ChunkResult { /** * Chunk identifier. */ readonly chunkId: string; /** * Zero-based chunk index. */ readonly chunkIndex: number; /** * Optional classification result for this chunk. */ readonly classificationResult?: ChunkClassificationResult; /** * Optional extraction result for this chunk. */ readonly processingResult?: ChunkProcessingResult; /** * Error message if chunk processing failed. */ readonly error?: string; } /** * Extracted entity from document processing. */ export interface Entity { /** * Type of entity (e.g., 'NAME', 'DATE', 'AMOUNT', 'ADDRESS'). */ readonly type: string; /** * Value of the entity. */ readonly value: string; /** * Optional page number where entity was found. * Entities with page numbers are preserved even if duplicated. */ readonly page?: number; /** * Optional chunk index where entity was found. */ readonly chunkIndex?: number; } /** * Request payload for aggregation Lambda. * Contains results from all processed chunks. */ export interface AggregationRequest { /** * Document identifier. */ readonly documentId: string; /** * Results from all processed chunks. */ readonly chunkResults: ChunkResult[]; /** * Strategy to use for aggregation. * @default 'majority-vote' */ readonly aggregationStrategy?: AggregationStrategy; } /** * Summary of chunk processing results. */ export interface ChunksSummary { /** * Total number of chunks created. */ readonly totalChunks: number; /** * Number of chunks that processed successfully. */ readonly successfulChunks: number; /** * Number of chunks that failed processing. */ readonly failedChunks: number; /** * Optional total tokens processed across all chunks. */ readonly totalTokensProcessed?: number; } /** * Aggregated result from processing all chunks. * Combines classification and extraction results into final output. */ export interface AggregatedResult { /** * Document identifier. */ readonly documentId: string; /** * Final document classification (from majority vote or other strategy). */ readonly classification: string; /** * Confidence score for the classification (0-1). * For majority vote: (count of majority / total chunks) */ readonly classificationConfidence: number; /** * Deduplicated entities from all chunks. * Entities without page numbers are deduplicated by (type, value). * Entities with page numbers are preserved even if duplicated. */ readonly entities: Entity[]; /** * Summary of chunk processing results. */ readonly chunksSummary: ChunksSummary; /** * Indicates if result is partial due to chunk failures. * True if fewer than minSuccessThreshold chunks succeeded. */ readonly partialResult: boolean; } /** * Request payload for cleanup Lambda. * Contains information about chunks to delete. */ export interface CleanupRequest { /** * Document identifier. */ readonly documentId: string; /** * Array of chunk metadata for chunks to delete. */ readonly chunks: ChunkMetadata[]; } /** * Response from cleanup Lambda. * Reports success and any errors encountered. */ export interface CleanupResponse { /** * Document identifier. */ readonly documentId: string; /** * Number of chunks successfully deleted. */ readonly deletedChunks: number; /** * Array of error messages for failed deletions. * Empty if all deletions succeeded. */ readonly errors: string[]; }