/**
 * Text reconstruction from positioned text fragments.
 *
 * Assembles raw text fragments extracted from PDF content streams into
 * coherent, human-readable text with proper reading order, line breaks,
 * and paragraph detection.
 *
 * Challenges addressed:
 * - PDF text has no semantic structure (only "draw char at (x,y)")
 * - Text fragments may be out of order
 * - Word and line boundaries must be inferred from positions
 * - Columns and tables need proper handling
 * - Different fonts/sizes affect spacing thresholds
 * - Multi-column layouts need column detection
 * - RTL (Arabic, Hebrew) text needs right-to-left sorting
 * - Vertical CJK text needs column-based grouping
 *
 * The functions below are ambient (`declare`) signatures only; no function
 * bodies appear alongside these declarations.
 *
 * @see PDF Reference 1.7, Chapter 5 - Text
 */
import type { TextFragment } from "./content-interpreter.js";

/**
 * Reconstruct readable text from positioned text fragments.
 *
 * Use this when only the final string is needed; `reconstructTextLines`
 * takes the same input and returns per-line position data instead.
 *
 * @param fragments - Raw text fragments with positions from content stream
 * @returns Reconstructed text with proper line breaks and spacing
 */
export declare function reconstructText(fragments: TextFragment[]): string;

/**
 * Detailed text extraction result preserving position information.
 *
 * Each value describes one line of text: its content, where the line is
 * located, and the font size of its first fragment.
 */
export interface TextLine {
  /** The text content of this line */
  text: string;
  /** Y position (PDF coordinate, origin = bottom-left) */
  y: number;
  /**
   * X position of the start of the line.
   * NOTE(review): presumably in the same PDF coordinate space as `y` — confirm.
   */
  x: number;
  /** Font size of the first fragment */
  fontSize: number;
}

/**
 * Extract text as structured lines.
 *
 * Takes the same kind of input as `reconstructText`, but returns one
 * `TextLine` record per line instead of a single string.
 * NOTE(review): the ordering of the returned lines is not specified by this
 * declaration — confirm against the implementation before relying on it.
 *
 * @param fragments - Positioned text fragments to assemble into lines
 * @returns The extracted lines, each with its text, position, and font size
 */
export declare function reconstructTextLines(fragments: TextFragment[]): TextLine[];