/**
 * PDF reader — public API.
 *
 * Provides a high-level, zero-dependency interface for reading PDF files.
 * Supports:
 * - Text extraction with multilingual support (WinAnsi, MacRoman, CJK via
 *   ToUnicode CMap, Identity-H/V, Symbol, ZapfDingbats)
 * - Image extraction (JPEG, JPEG2000, raw/Flate, CCITT, JBIG2)
 * - Annotation extraction (links, comments, highlights, stamps, etc.)
 * - Form field extraction (AcroForm: text inputs, checkboxes, radio buttons, dropdowns)
 * - Metadata reading (Info dictionary + XMP)
 * - Encrypted PDFs:
 *   - RC4 (40-bit and 128-bit) — tested via roundtrip
 *   - AES-128 (V=4, R=4) — implemented, requires external test fixtures
 *   - AES-256 (V=5, R=5) — implemented, requires external test fixtures
 * - Cross-reference tables and streams (PDF 1.5+)
 * - Incremental updates and xref recovery
 *
 * @example Text extraction:
 * ```typescript
 * import { readPdf } from "excelts/pdf";
 *
 * const pdf = await readPdf(pdfBytes);
 * console.log(pdf.text); // All text from all pages
 * console.log(pdf.pages[0].text); // Text from page 1
 * ```
 *
 * @example Image extraction:
 * ```typescript
 * const pdf = await readPdf(pdfBytes);
 * for (const image of pdf.pages[0].images) {
 *   console.log(image.format, image.width, image.height);
 *   fs.writeFileSync(`image.${image.format}`, image.data);
 * }
 * ```
 *
 * @example Metadata:
 * ```typescript
 * const pdf = await readPdf(pdfBytes);
 * console.log(pdf.metadata.title);
 * console.log(pdf.metadata.author);
 * console.log(pdf.metadata.pageCount);
 * ```
 *
 * @example Encrypted PDF:
 * ```typescript
 * const pdf = await readPdf(pdfBytes, { password: "secret" });
 * ```
 */
import type { PdfAnnotation } from "./annotation-extractor.js";
import type { PdfBookmark } from "./bookmark-extractor.js";
import type { TextFragment } from "./content-interpreter.js";
import type { PdfFormField } from "./form-extractor.js";
import type { ExtractedImage } from "./image-extractor.js";
import type { PdfMetadata } from "./metadata-reader.js";
import type { PdfTable } from "./table-extractor.js";
import type { TextLine } from "./text-reconstruction.js";

/**
 * Options for reading a PDF.
 *
 * Every field is optional; the documented default applies when a field is
 * omitted.
 */
export interface ReadPdfOptions {
  /**
   * Password for encrypted PDFs.
   * Can be either the user password or owner password.
   * @default ""
   */
  password?: string;
  /**
   * Which pages to extract (1-based).
   * If omitted, all pages are extracted.
   * @example [1, 3, 5] — extract pages 1, 3, and 5
   */
  pages?: number[];
  /**
   * Whether to extract text.
   * @default true
   */
  extractText?: boolean;
  /**
   * Whether to extract images.
   * @default true
   */
  extractImages?: boolean;
  /**
   * Whether to extract metadata.
   * @default true
   */
  extractMetadata?: boolean;
  /**
   * Whether to extract annotations (links, comments, highlights, etc.).
   * @default true
   */
  extractAnnotations?: boolean;
  /**
   * Whether to extract form fields (AcroForm: text inputs, checkboxes, dropdowns, etc.).
   * @default true
   */
  extractFormFields?: boolean;
  /**
   * Whether to extract bookmarks (document outline / table of contents).
   * @default true
   */
  extractBookmarks?: boolean;
  /**
   * Whether to extract tables from pages using text positioning heuristics.
   * Opt-in since table detection is heavier than plain text extraction.
   * @default false
   */
  extractTables?: boolean;
}

/**
 * A single page from a read PDF.
 */
export interface ReadPdfPage {
  /** 1-based page number */
  pageNumber: number;
  /** Extracted text content */
  text: string;
  /** Structured text lines with position information */
  textLines: TextLine[];
  /** Raw text fragments with exact positions */
  textFragments: TextFragment[];
  /** Extracted images */
  images: ExtractedImage[];
  /** Extracted annotations (links, comments, highlights, etc.) */
  annotations: PdfAnnotation[];
  /** Tables detected from text fragment positioning (opt-in via extractTables) */
  tables: PdfTable[];
  /** Page width in points */
  width: number;
  /** Page height in points */
  height: number;
  /** Warnings encountered during extraction (non-fatal errors) */
  warnings: string[];
}

/**
 * Result of reading a PDF.
 */
export interface ReadPdfResult {
  /** All text from all pages concatenated */
  text: string;
  /** Per-page results */
  pages: ReadPdfPage[];
  /** Document metadata */
  metadata: PdfMetadata;
  /** Form fields extracted from AcroForm (document-level) */
  formFields: PdfFormField[];
  /** Bookmarks (document outline) extracted from the outline tree */
  bookmarks: PdfBookmark[];
}

/**
 * Read a PDF file and extract its content: text, images, metadata,
 * annotations, form fields, bookmarks and (opt-in) tables, as selected by
 * the `extract*` flags in {@link ReadPdfOptions}.
 * Yields to the event loop between pages to avoid blocking.
 *
 * @param data - Raw PDF file bytes
 * @param options - Extraction options; see {@link ReadPdfOptions} for defaults
 * @returns Promise resolving to the extracted content as a {@link ReadPdfResult}
 * @throws {PdfStructureError} If the PDF structure is invalid
 * @throws {PdfError} If decryption fails (wrong password)
 */
export declare function readPdf(
  data: Uint8Array,
  options?: ReadPdfOptions
): Promise<ReadPdfResult>;