import type { PdfPageContent } from '../models/PdfPageContent.js';
/**
 * Composes the raw elements of PDF pages into paragraphs while preserving images.
 *
 * Text processing is organised as a three-stage pipeline:
 *   1. OverlappingText   - spatial merging of text fragments
 *   2. OrderComposites   - reading-order detection
 *   3. ComputeTextTypes  - text type classification
 *
 * Only `composeElements` is public. The private members below are listed by
 * name only: this declaration file carries no parameter or return types for
 * them, so their exact contracts must be checked against the implementation.
 */
export declare class PdfElementComposer {
    /**
     * Compose elements by grouping text elements into paragraphs while preserving images.
     *
     * @param pages Array of PDF page content with raw elements
     * @returns Array of PDF page content with composed elements (paragraphs + images)
     */
    static composeElements(pages: PdfPageContent[]): PdfPageContent[];
    /**
     * Compose the elements of a single page by running the text processing pipeline.
     */
    private static composePageElements;
    /**
     * Convert PdfElements to the internal Composite format used for processing.
     */
    private static convertToComposites;
    /**
     * Convert Composites back to PdfElements.
     */
    private static convertToElements;
    /**
     * Merge drop caps (large initial letters) with their following paragraphs.
     */
    private static mergeDropCaps;
    /**
     * Check whether an element is a drop cap (large single letter/word).
     */
    private static isDropCap;
    /**
     * Check whether two elements are vertically close.
     */
    private static areVerticallyClose;
    /**
     * Merge a drop cap with its following paragraph.
     */
    private static mergeDropCapWithParagraph;
    /**
     * Combine the formatting of a drop cap and its paragraph.
     */
    private static combineDropCapFormatting;
    /**
     * Stage 1 - Overlapping Text Algorithm: OverlappingTextAlgorithm (Priority 30).
     * Spatial merging with 10% font tolerance and dynamic expansion.
     *
     * IMPORTANT: column structure is detected FIRST to prevent cross-column merging.
     */
    private static runOverlappingTextAlgorithm;
    /**
     * Detect column boundaries using multi-column layout detection.
     * Returns an array of column boundaries: [{left, right}, ...]
     *
     * Criteria for multi-column detection (font-size aware):
     *   1. There must be at least 2 distinct left-position clusters with a significant gap.
     *   2. Each cluster must have at least 2 elements (real column content).
     *   3. Clusters must overlap vertically (side-by-side columns).
     *   4. Column widths can vary (TOC vs paragraph columns).
     *   5. The gap threshold is dynamic, based on font size and typical column width.
     *
     * These criteria prevent false positives from:
     *   - Centered headers with different left positions
     *   - Mixed alignments (left, center, right)
     *   - Normal text flow variations
     *   - Partial text fragments from PDF extraction
     */
    private static detectColumnBoundaries;
    /**
     * Assign a composite to a column based on its left position.
     */
    private static assignToColumn;
    /**
     * Stage 2 - Spatial Order Algorithm: OrderCompositesAlgorithm (Priority 40).
     * Reading-order detection with beam scanning for multi-column layouts.
     */
    private static runOrderCompositesAlgorithm;
    /**
     * Advanced beam scanning: detect the column layout using horizontal density analysis.
     *
     * Algorithm for multi-column detection (2, 3, or more columns):
     *   1. Exclude spanning elements from the density calculation.
     *   2. Build a horizontal density histogram to find vertical gaps.
     *   3. Use consistent gap detection across multiple vertical positions.
     *   4. Validate columns by checking element distribution consistency.
     */
    private static detectColumnsWithBeamScanning;
    /**
     * Stage 3 - Text Type Classification: ComputeTextTypesAlgorithm (Priority 50).
     * Text type classification based on font size analysis.
     */
    private static runComputeTextTypesAlgorithm;
    /**
     * Calculate page statistics from composites (optimized approach).
     */
    private static calculatePageStatisticsFromComposites;
    /**
     * Merging criteria for composites.
     *
     * Prevents merging elements from different columns by checking the
     * horizontal distance more strictly when the elements are on the same line.
     */
    private static shouldMergeComposites;
    /**
     * Calculate the vertical overlap between two bounding boxes.
     */
    private static getVerticalOverlap;
    /**
     * Calculate the horizontal overlap between two bounding boxes.
     * Returns the amount of horizontal overlap (0 if there is no overlap).
     */
    private static getHorizontalOverlap;
    /**
     * Calculate the horizontal gap between two bounding boxes.
     * Returns 0 if the boxes overlap horizontally.
     */
    private static getHorizontalGap;
    /**
     * Check whether two boxes are vertically close (for stacked elements).
     */
    private static intersectsVerticallyWithExpansion;
    /**
     * Create a merged composite from a cluster of composites.
     */
    private static createMergedComposite;
    /**
     * NOTE(review): this member has no description in the declaration output.
     * Presumably the element-based counterpart of
     * `calculatePageStatisticsFromComposites` - confirm against the implementation.
     */
    private static calculatePageStatistics;
    /**
     * Group text elements using dynamic clustering with spatial analysis.
     */
    private static groupWithDynamicClustering;
    /**
     * Overlap detection algorithm that finds clusters of overlapping elements.
     */
    private static findOverlappingClusters;
    /**
     * Merging criteria for elements, using dynamic thresholds.
     *
     * Prevents merging elements from different columns by checking the
     * horizontal distance strictly when the elements are on the same line.
     */
    private static shouldMergeElements;
    /**
     * Calculate the vertical overlap for elements.
     */
    private static getVerticalOverlapForElements;
    /**
     * Calculate the horizontal gap for elements.
     */
    private static getHorizontalGapForElements;
    /**
     * Check whether two bounding boxes intersect once expansion is applied.
     */
    private static intersectsWithExpansion;
    /**
     * Generic title detection that does not rely on hard-coded content.
     */
    private static looksLikeGenericTitle;
    /**
     * Generic detection of the start of a new section.
     */
    private static looksLikeNewSection;
    /**
     * Create a composed paragraph element from multiple text elements.
     */
    private static createComposedParagraph;
    /**
     * Calculate the bounding box that encompasses all elements in a paragraph.
     */
    private static calculateParagraphBounds;
    /**
     * Check whether text content is meaningful (filters out empty,
     * whitespace-only, or control-character text).
     */
    private static isMeaningfulText;
    /**
     * Combine the formatted HTML text of multiple elements.
     */
    private static combineFormattedText;
    /**
     * Optimize formatted HTML by merging spans with similar or compatible styling.
     */
    private static optimizeFormattedHtml;
    /**
     * Check whether all spans can be merged, based on their styling compatibility.
     */
    private static canMergeAllSpans;
    /**
     * Merge the styles of multiple spans, prioritizing the most complete styling.
     */
    private static getMergedStyles;
    /**
     * Clean up formatted HTML by removing empty spans and consolidating redundant elements.
     */
    private static cleanupFormattedHtml;
    /**
     * Check whether text contains block-level HTML tags.
     */
    private static hasBlockLevelTags;
}
//# sourceMappingURL=PdfElementComposer.d.ts.map