/**
 * Content stream interpreter used for PDF text extraction.
 *
 * Runs a full PDF graphics state machine over content stream operators and
 * produces positioned text fragments. The text reconstruction module later
 * assembles those fragments into readable text.
 *
 * Supported operator categories:
 * - Text state: Tf, Tc, Tw, Tz, TL, Ts, Tr
 * - Text positioning: Td, TD, Tm, T*
 * - Text showing: Tj, TJ, ', "
 * - Text objects: BT, ET
 * - Graphics state: q, Q, cm, gs, i, M, ri, W, W*
 * - Color: CS, cs, SC, sc, SCN, scn
 * - Marked content: BDC, BMC, EMC, MP, DP
 * - Type3 glyph: d0, d1
 * - Shading: sh
 * - Inline images: BI/ID/EI
 * - XObject invocation: Do (for form XObjects containing text)
 *
 * @see PDF Reference 1.7, Chapter 5 - Text
 * @see PDF Reference 1.7, Chapter 4 - Graphics
 */
import type { PdfDocument } from "./pdf-document.js";
import type { PdfDictValue } from "./pdf-parser.js";

/**
 * One piece of text pulled out of a PDF page, together with where it sits on
 * the page (in page coordinates) and the text attributes recorded for it.
 */
export interface TextFragment {
  /** The extracted text string. */
  text: string;

  /** Horizontal position in page coordinates (points, origin = bottom-left). */
  x: number;

  /** Vertical position in page coordinates. */
  y: number;

  /** Size of the font, in points. */
  fontSize: number;

  /** Name of the font. */
  fontName: string;

  /** Width of the text, in points. */
  width: number;

  /** Character spacing. */
  charSpacing: number;

  /** Word spacing. */
  wordSpacing: number;

  /** Horizontal scaling factor; a value of 100 means normal (unscaled). */
  horizontalScaling: number;

  /** True when the text is vertical (WMode=1). */
  isVertical: boolean;

  /** True when the text is right-to-left (Arabic, Hebrew, etc.). */
  isRtl: boolean;
}

/**
 * Extracts the text fragments found in a page's content stream(s).
 *
 * @param pageDict - Dictionary value of the page whose content stream(s) are read.
 * @param doc - The PDF document supplied alongside the page.
 *   NOTE(review): presumably used to resolve objects the page refers to —
 *   confirm against the implementation, which is not part of this declaration.
 * @returns The text fragments extracted from the page.
 */
export declare function extractTextFromPage(
  pageDict: PdfDictValue,
  doc: PdfDocument,
): TextFragment[];