import { Duration } from 'aws-cdk-lib';
import { Role } from 'aws-cdk-lib/aws-iam';
import { Function } from 'aws-cdk-lib/aws-lambda';
import { IChainable } from 'aws-cdk-lib/aws-stepfunctions';
import { DynamoAttributeValue } from 'aws-cdk-lib/aws-stepfunctions-tasks';
import { Construct } from 'constructs';
import { BaseDocumentProcessing, BaseDocumentProcessingProps, DocumentProcessingStepType } from './base-document-processing';
import { ChunkingConfig } from './chunking-config';
import { BedrockModelProps } from '../framework/bedrock';

/**
 * Configuration properties for BedrockDocumentProcessing construct.
 * Extends BaseDocumentProcessingProps with Bedrock-specific options.
 */
export interface BedrockDocumentProcessingProps extends BaseDocumentProcessingProps {
  /**
   * Bedrock foundation model for document classification step.
   */
  readonly classificationBedrockModel?: BedrockModelProps;

  /**
   * Bedrock foundation model for document extraction step.
   */
  readonly processingBedrockModel?: BedrockModelProps;

  /**
   * Custom prompt template for document classification.
   * Must include placeholder for document content.
   * @default DEFAULT_CLASSIFICATION_PROMPT
   */
  readonly classificationPrompt?: string;

  /**
   * Custom prompt template for document extraction.
   * Must include placeholder for document content and classification result.
   * @default DEFAULT_PROCESSING_PROMPT
   */
  readonly processingPrompt?: string;

  /**
   * Optional Lambda function for document enrichment step.
   * If provided, will be invoked after extraction with workflow state.
   */
  readonly enrichmentLambdaFunction?: Function;

  /**
   * Optional Lambda function for post-processing step.
   * If provided, will be invoked after enrichment with workflow state.
   */
  readonly postProcessingLambdaFunction?: Function;

  /**
   * Timeout for individual Step Functions tasks (classification, extraction, etc.).
   * @default Duration.minutes(5)
   */
  readonly stepTimeouts?: Duration;

  /**
   * Custom prompt template for aggregating results from multiple chunks.
   * Used when chunking is enabled to merge processing results from all chunks
   * into a single coherent result.
   *
   * The prompt receives the concatenated processing results from all chunks
   * and should instruct the model to synthesize them into a unified output.
   *
   * @default DEFAULT_AGGREGATION_PROMPT
   */
  readonly aggregationPrompt?: string;

  /**
   * Enable PDF chunking for large documents.
   *
   * When enabled, documents exceeding configured thresholds will be automatically
   * split into chunks, processed in parallel or sequentially, and results aggregated.
   *
   * This feature is useful for:
   * - Processing large PDFs (>100 pages)
   * - Handling documents that exceed Bedrock token limits (~200K tokens)
   * - Improving processing reliability for complex documents
   * - Processing documents with variable content density
   *
   * The chunking workflow:
   * 1. Analyzes PDF to determine page count and estimate token count
   * 2. Decides if chunking is needed based on configured thresholds
   * 3. If chunking is needed, splits PDF into chunks and uploads to S3
   * 4. Processes each chunk through classification and extraction
   * 5. Aggregates results using majority voting for classification
   * 6. Deduplicates entities across chunks
   * 7. Cleans up temporary chunk files from S3
   *
   * @default false
   */
  readonly enableChunking?: boolean;

  /**
   * Configuration for PDF chunking behavior.
   *
   * Only applies when `enableChunking` is true. Allows customization of:
   * - **Chunking strategy**: How documents are split (fixed-pages, token-based, or hybrid)
   * - **Thresholds**: When to trigger chunking based on page count or token count
   * - **Chunk size and overlap**: Control chunk boundaries and context preservation
   * - **Processing mode**: Parallel (faster) or sequential (cost-optimized)
   * - **Aggregation strategy**: How to combine results from multiple chunks
   *
   * ## Default Configuration
   *
   * If not provided, uses sensible defaults optimized for most use cases:
   * - Strategy: `'hybrid'` (recommended - balances token and page limits)
   * - Page threshold: 100 pages
   * - Token threshold: 150,000 tokens
   * - Processing mode: `'parallel'`
   * - Max concurrency: 10
   * - Aggregation strategy: `'majority-vote'`
   *
   * ## Strategy Comparison
   *
   * | Strategy | Best For | Pros | Cons |
   * |----------|----------|------|------|
   * | `hybrid` | Most documents | Balances token/page limits | Slightly more complex |
   * | `token-based` | Variable density | Respects model limits | Slower analysis |
   * | `fixed-pages` | Uniform density | Simple, fast | May exceed token limits |
   *
   * @default undefined (uses default configuration when enableChunking is true)
   *
   * @see {@link ChunkingConfig} for detailed configuration options
   */
  readonly chunkingConfig?: ChunkingConfig;
}

/**
 * Document processing workflow powered by Amazon Bedrock foundation models.
 *
 * Extends BaseDocumentProcessing to provide AI-powered document classification and extraction
 * using Amazon Bedrock foundation models. This implementation offers:
 *
 * ## Key Features
 * - **AI-Powered Classification**: Uses Claude 3.7 Sonnet (configurable) to classify document types
 * - **Intelligent Extraction**: Extracts structured data from documents using foundation models
 * - **Cross-Region Inference**: Optional support for improved availability via inference profiles
 * - **Flexible Processing**: Optional enrichment and post-processing Lambda functions
 * - **Cost Optimized**: Configurable timeouts and model selection for cost control
 *
 * ## Processing Workflow
 * S3 Upload → Classification (Bedrock) → Extraction (Bedrock) → [Enrichment] → [Post-Processing] → Results
 *
 * ## Default Models
 * - Classification: Claude 3.7 Sonnet (anthropic.claude-3-7-sonnet-20250219-v1:0)
 * - Extraction: Claude 3.7 Sonnet (anthropic.claude-3-7-sonnet-20250219-v1:0)
 *
 * ## Prompt Templates
 * The construct uses default prompts that can be customized:
 * - **Classification**: Analyzes document and returns JSON with documentClassification field
 * - **Extraction**: Uses classification result to extract entities in structured JSON format
 *
 * ## Cross-Region Inference
 * When enabled, uses Bedrock inference profiles for improved availability:
 * - US prefix: Routes to US-based regions for lower latency
 * - EU prefix: Routes to EU-based regions for data residency compliance
 */
export declare class BedrockDocumentProcessing extends BaseDocumentProcessing {
  /**
   * Default prompt for the classification step; used when `classificationPrompt` is not supplied.
   *
   * NOTE(review): the value slot after `documentClassification:` in the JSON skeleton is empty
   * in this declaration — possibly a stripped `<...>` placeholder. Confirm against the implementation.
   */
  protected static readonly DEFAULT_CLASSIFICATION_PROMPT = "\n Analyze the document below, and classify the type of document it is (eg. INVOICE, IDENTITY_DOCUMENT, RECEIPT, etc). The result should be in JSON and should follow the following structure (only respond in JSON with the following structure and do not use markdown to indicate the json, just output plain old json with nothing else):\n\n {\n documentClassification: \n }\n\n Attached document is as follows:\n\n ";

  /**
   * Default prompt for the extraction (processing) step; used when `processingPrompt` is not supplied.
   * Contains the `[ACTUAL_CLASSIFICATION]` token where the classification result is expected.
   *
   * NOTE(review): the value slots after `documentClassification:`, `type:` and `value:` are empty
   * in this declaration — possibly stripped `<...>` placeholders. Confirm against the implementation.
   */
  protected static readonly DEFAULT_PROCESSING_PROMPT = "\n The document below has been classified as [ACTUAL_CLASSIFICATION]. Extract important entities from the document and return the result as JSON following the structure below (only respond in JSON with the following structure and do not use markdown to indicate the json, just output plain old json with nothing else):\n\n {\n documentClassification: ,\n result: {\n entities: [\n {\n type: \n value: \n },\n ...\n ]\n }\n }\n\n Attached document is as follows:\n\n ";

  /**
   * Default prompt for aggregating chunk results; used when `aggregationPrompt` is not supplied.
   *
   * NOTE(review): the value slot after `"result":` in the JSON skeleton is empty in this
   * declaration — possibly a stripped `<...>` placeholder. Confirm against the implementation.
   */
  protected static readonly DEFAULT_AGGREGATION_PROMPT = "\n You are given the processing results from multiple chunks of a large document that was split for processing.\n Your task is to synthesize these chunk results into a single, coherent final result.\n\n Instructions:\n 1. Review all the chunk results provided below\n 2. Merge and deduplicate any overlapping information (chunks may have overlapping pages)\n 3. Synthesize the information into a unified, coherent result\n 4. Maintain the same output format as the individual chunk results\n 5. If the chunks contain summaries, create a comprehensive summary that covers all sections\n 6. If the chunks contain entities, deduplicate and consolidate them\n 7. Preserve important details from each chunk while avoiding redundancy\n\n Return the result as JSON (only respond in JSON without markdown formatting):\n\n {\n \"result\": \n }\n\n The chunk results to aggregate are as follows:\n\n ";

  /** Configuration properties specific to Bedrock document processing */
  protected readonly bedrockDocumentProcessingProps: BedrockDocumentProcessingProps;

  /** The Step Functions state machine that orchestrates the document processing workflow */
  readonly stateMachine: import("aws-cdk-lib/aws-stepfunctions").StateMachine;

  /** Cached classification Lambda function to avoid duplicate resource creation */
  private _classificationFunction?;

  /** Cached processing Lambda function to avoid duplicate resource creation */
  private _processingFunction?;

  /** Counter for generating unique classification step IDs */
  private _classificationStepCounter;

  /** Counter for generating unique processing step IDs */
  private _processingStepCounter;

  /** Cached aggregation Lambda function to avoid duplicate resource creation */
  private _aggregationFunction?;

  /** Counter for generating unique aggregation step IDs */
  private _aggregationStepCounter;

  /** Cached cleanup Lambda function to avoid duplicate resource creation */
  private _cleanupFunction?;

  /** Counter for generating unique enrichment step IDs */
  private _enrichmentStepCounter;

  /** Counter for generating unique post-processing step IDs */
  private _postProcessingStepCounter;

  /**
   * Creates a new BedrockDocumentProcessing construct.
   *
   * Initializes the Bedrock-powered document processing pipeline with AI classification
   * and extraction capabilities. Creates Lambda functions with appropriate IAM roles
   * for Bedrock model invocation and S3 access.
   *
   * @param scope - The scope in which to define this construct
   * @param id - The scoped construct ID. Must be unique within the scope.
   * @param props - Configuration properties for the Bedrock document processing pipeline
   * @throws Error if chunking configuration is invalid
   */
  constructor(scope: Construct, id: string, props: BedrockDocumentProcessingProps);

  /**
   * Validates the chunking configuration parameters.
   *
   * Ensures that:
   * - Chunk size is greater than 0
   * - Overlap is non-negative and less than chunk size
   * - Thresholds are greater than 0
   * - Max concurrency is greater than 0
   * - Min success threshold is between 0 and 1
   *
   * @param config - The chunking configuration to validate
   * @throws Error if any configuration parameter is invalid
   */
  private validateChunkingConfig;

  /**
   * Implements the document classification step using Amazon Bedrock.
   *
   * Creates a Lambda function that invokes the configured Bedrock model to classify
   * the document type. The function reads the document from S3 and sends it to
   * Bedrock with the classification prompt.
   *
   * This method caches the Lambda function to avoid creating duplicate resources,
   * but creates a new LambdaInvoke task each time to allow proper state chaining.
   *
   * @returns LambdaInvoke task configured for document classification
   */
  protected classificationStep(): DocumentProcessingStepType;

  /**
   * Implements the document extraction step using Amazon Bedrock.
   *
   * Creates a Lambda function that invokes the configured Bedrock model to extract
   * structured data from the document. Uses the classification result from the
   * previous step to provide context for more accurate extraction.
   *
   * This method caches the Lambda function to avoid creating duplicate resources,
   * but creates a new LambdaInvoke task each time to allow proper state chaining.
   *
   * @returns LambdaInvoke task configured for document extraction
   */
  protected processingStep(): DocumentProcessingStepType;

  /**
   * Produces the IAM role for a Lambda function that invokes Bedrock.
   *
   * NOTE(review): the implementation is not visible in this declaration; the exact
   * policies attached (and how `model` scopes them) should be confirmed there.
   *
   * @param id - Identifier for the role (presumably the construct ID — confirm)
   * @param model - Optional Bedrock model configuration the role is generated for
   * @returns The IAM role
   */
  protected generateLambdaRoleForBedrock(id: string, model?: BedrockModelProps): Role;

  /**
   * Resolves the entry for the Bedrock invoke Lambda.
   *
   * NOTE(review): presumably a path to the Lambda handler source — confirm against
   * the implementation, which is not visible in this declaration.
   *
   * @returns The resolved entry as a string
   */
  protected resolveBedrockInvokeEntry(): string;

  /**
   * Implements the optional document enrichment step.
   *
   * If an enrichment Lambda function is provided in the props, creates a LambdaInvoke
   * task to perform additional processing on the extracted data. This step is useful
   * for data validation, transformation, or integration with external systems.
   *
   * @returns LambdaInvoke task for enrichment, or undefined to skip this step
   */
  protected enrichmentStep(): DocumentProcessingStepType | undefined;

  /**
   * Implements the optional post-processing step.
   *
   * If a post-processing Lambda function is provided in the props, creates a LambdaInvoke
   * task to perform final processing on the workflow results. This step is useful for
   * data formatting, notifications, or integration with downstream systems.
   *
   * @returns LambdaInvoke task for post-processing, or undefined to skip this step
   */
  protected postProcessingStep(): DocumentProcessingStepType | undefined;

  /**
   * Implements the optional preprocessing step for PDF chunking.
   *
   * When chunking is enabled, creates a Lambda function that analyzes PDFs and
   * splits large documents into manageable chunks. The function:
   * 1. Analyzes the PDF to determine page count and token estimates
   * 2. Decides if chunking is needed based on configured thresholds
   * 3. If chunking is needed, splits the PDF and uploads chunks to S3
   *
   * @returns LambdaInvoke task for PDF analysis and chunking, or undefined if chunking is disabled
   */
  protected preprocessingStep(): DocumentProcessingStepType | undefined;

  /**
   * Provides additional metadata fields for chunking to be stored in DynamoDB.
   *
   * When chunking is enabled, adds fields for:
   * - ChunkingEnabled: string representation of boolean flag
   * - ChunkingStrategy: strategy used (fixed-pages, token-based, hybrid)
   * - TokenAnalysis: JSON string with token analysis results
   * - ChunkMetadata: JSON string array with chunk information
   *
   * @returns Record of DynamoDB attribute values for chunking metadata, keyed by field name
   */
  protected preprocessingMetadata(): Record<string, DynamoAttributeValue>;

  /**
   * Creates the processing workflow with conditional branching for chunked documents.
   *
   * When chunking is enabled, creates a Choice State that:
   * - Routes to chunked processing flow if document was chunked
   * - Routes to standard processing flow if document was not chunked
   *
   * When chunking is disabled, returns the standard processing workflow.
   *
   * @returns Step Functions chain for processing the document
   */
  protected createProcessingWorkflow(): IChainable;

  /**
   * Creates the chunked processing flow for large documents.
   *
   * This flow:
   * 1. Uses a Map State to process each chunk in parallel (or sequentially)
   * 2. Each chunk goes through classification and processing
   * 3. Results are aggregated using the aggregation Lambda
   * 4. DynamoDB is updated with the aggregated result
   * 5. Temporary chunks are cleaned up from S3
   *
   * @returns Step Functions chain for chunked document processing
   */
  private createChunkedProcessingFlow;

  /**
   * Creates the aggregation step for combining chunk results using Bedrock.
   *
   * Uses the same Bedrock invoke Lambda pattern as the processing step but with
   * a different prompt designed for aggregating multiple chunk results.
   * The chunk processing results are passed as text data to the model.
   *
   * @returns LambdaInvoke task for result aggregation
   */
  private createAggregationStep;

  /**
   * Creates the DynamoDB update step for storing aggregated results.
   *
   * Updates the document record with:
   * - AggregatedResult: JSON string with classification, entities, and summary
   * - WorkflowStatus: 'complete'
   *
   * @returns DynamoUpdateItem task for storing aggregated results
   */
  private createUpdateAggregatedResultStep;

  /**
   * Creates the cleanup Lambda step for removing temporary chunk files.
   *
   * The cleanup Lambda:
   * - Deletes all chunk files from S3 chunks/ prefix
   * - Uses batch delete for efficiency (up to 1000 objects per request)
   * - Logs errors but doesn't fail the workflow
   *
   * @returns LambdaInvoke task for chunk cleanup
   */
  private createCleanupStep;

  /**
   * Creates the error handler for aggregation failures.
   *
   * When aggregation fails:
   * - Updates DynamoDB with 'aggregation-failure' status
   * - Moves document to failed/ prefix
   *
   * @returns Step Functions chain for handling aggregation errors
   */
  private createAggregationErrorHandler;
}