/**
 * PDF table extraction driven by text fragment positions.
 *
 * Tables are detected from the spatial layout of a page's text fragments.
 * PDF content streams usually render a table as positioned text, with or
 * without drawn grid lines, so detection here is a text-only heuristic:
 *
 * 1. Fragments are grouped into lines by Y proximity.
 * 2. Column boundaries are derived from consistent clusters of X positions.
 * 3. Contiguous blocks of multi-column lines are identified as tables.
 * 4. Fragments are assigned to cells by column/line membership.
 *
 * @see content-interpreter.ts for TextFragment extraction
 * @see text-reconstruction.ts for line grouping logic
 */
import type { TextFragment } from "./content-interpreter.js";

/**
 * One cell of an extracted PDF table.
 */
export interface PdfTableCell {
  /** The cell's text content. */
  text: string;

  /** X position in page coordinates, in points. */
  x: number;

  /** Y position in page coordinates, in points. */
  y: number;

  /** Cell width, in points. */
  width: number;

  /** Cell height, in points. */
  height: number;

  /** How many rows the cell spans; treated as 1 when omitted. */
  rowSpan?: number;

  /** How many columns the cell spans; treated as 1 when omitted. */
  colSpan?: number;
}

/**
 * One row of an extracted PDF table.
 */
export interface PdfTableRow {
  /** The row's cells, in left-to-right order. */
  cells: PdfTableCell[];
}

/**
 * A table detected on a PDF page.
 */
export interface PdfTable {
  /** The table's rows, in top-to-bottom order. */
  rows: PdfTableRow[];

  /** X position of the table's left edge, in page coordinates. */
  x: number;

  /** Y position of the table's top edge, in page coordinates. */
  y: number;

  /** Table width, in points. */
  width: number;

  /** Table height, in points. */
  height: number;
}

/**
 * Detect and extract tables from the text fragments of a single page.
 *
 * Detection relies on text positioning heuristics only; it does not depend
 * on drawn lines or grid paths.
 *
 * @param fragments - Text fragments produced by `extractTextFromPage`
 * @param pageWidth - Width of the page, in points
 * @param pageHeight - Height of the page, in points
 * @returns The tables detected on the page
 */
export declare function extractTables(
  fragments: TextFragment[],
  pageWidth: number,
  pageHeight: number,
): PdfTable[];