/**
 * PDF document parser.
 *
 * Handles the high-level PDF file structure:
 * - Locating startxref
 * - Parsing cross-reference tables (traditional and stream-based)
 * - Reading trailer dictionaries
 * - Resolving indirect object references
 * - Handling incremental updates
 *
 * @see PDF Reference 1.7, §3.4 - File Structure
 */
import type { PdfObject, PdfDictValue, PdfRef, PdfStream } from "./pdf-parser.js";

/** Result of resolving an object, paired with the object/generation numbers needed for decryption. */
interface ResolvedObject {
  /** The resolved PDF object, or null when nothing was resolved. */
  obj: PdfObject | null;
  /** The object number. */
  objNum: number;
  /** The generation number. */
  gen: number;
}

/**
 * Parsed PDF document with lazy object resolution.
 *
 * Reads the cross-reference table and trailer on construction,
 * then resolves individual objects on demand with caching.
 */
export declare class PdfDocument {
  // Private implementation state. Declaration output elides the types of
  // private members, so only their names are visible here.
  private tokenizer;
  private xref;
  private cache;

  /** The trailer dictionary of the document. */
  readonly trailer: PdfDictValue;

  /**
   * Encryption handler (set externally after decryption is initialized).
   * Receives raw bytes plus the owning object's number and generation and
   * returns the transformed bytes; null when no handler has been installed.
   */
  decryptFn: ((data: Uint8Array, objNum: number, gen: number) => Uint8Array) | null;

  /**
   * @param data - The raw bytes of the PDF file
   */
  constructor(data: Uint8Array);

  /** Get the underlying raw data. */
  get data(): Uint8Array;

  private parseFileStructure;

  /**
   * Find the startxref offset by scanning backward from EOF.
   */
  private findStartxref;

  /**
   * Parse the xref chain starting at the given offset.
   * Follows /Prev links for incremental updates.
   * Returns the merged trailer dictionary.
   */
  private parseXrefChain;

  /**
   * Parse a traditional xref table and its trailer.
   */
  private parseTraditionalXref;

  /**
   * Parse a cross-reference stream (PDF 1.5+).
   */
  private parseXrefStream;

  /**
   * Reconstruct the xref table by scanning the entire file for `N N obj` patterns.
   * This is a fallback for corrupted or broken PDFs where the normal xref parsing fails.
   *
   * @returns A synthetic trailer dictionary
   */
  private reconstructXref;

  /**
   * Merge trailer entries from an older trailer into the current one.
   * Only adds keys that don't already exist.
   */
  private mergeTrailer;

  /**
   * Resolve a PDF object by its object number and generation.
   *
   * @param objNum - The object number to resolve
   * @param gen - The generation number (optional)
   * @returns The resolved object, or null if the object doesn't exist
   */
  resolve(objNum: number, gen?: number): PdfObject | null;

  /**
   * Resolve a PDF object and return it along with its object/generation numbers.
   * Useful for tracking which object a value came from (for decryption).
   *
   * @param objNum - The object number to resolve
   * @param gen - The generation number (default 0)
   * @returns The resolved object with its objNum and gen for decryption context
   */
  resolveWithObjNum(objNum: number, gen?: number): ResolvedObject;

  /**
   * Dereference a PdfRef to its actual object value.
   * If the input is not a PdfRef, returns it as-is.
   */
  deref(obj: PdfObject | null | undefined): PdfObject | null;

  /**
   * Dereference a PdfRef and assert it's a dictionary.
   */
  derefDict(obj: PdfObject | null | undefined): PdfDictValue | null;

  /**
   * Dereference a PdfRef and get the stream.
   *
   * This variant returns only the stream itself. When the objNum/gen are
   * needed for correct per-object decryption, use
   * {@link derefStreamWithObjNum} instead.
   */
  derefStream(obj: PdfObject | null | undefined): PdfStream | null;

  /**
   * Dereference a PdfRef and get the stream with its object number and generation.
   * Returns null if the object is not a stream.
   * The objNum/gen are needed for correct per-object decryption (V1-V4).
   */
  derefStreamWithObjNum(obj: PdfObject | null | undefined): {
    stream: PdfStream;
    objNum: number;
    gen: number;
  } | null;

  /**
   * Get decoded stream data from a stream object.
   * Applies filter chain decoding and decryption.
   *
   * When objNum/gen are not provided (default 0), decryption may not
   * produce correct results. Use {@link resolveWithObjNum} to obtain
   * the correct objNum/gen for the stream's containing object.
   *
   * @param stream - The stream object to decode
   * @param objNum - Object number of the containing object (default 0)
   * @param gen - Generation number of the containing object (default 0)
   */
  getStreamData(stream: PdfStream, objNum?: number, gen?: number): Uint8Array;

  /**
   * Decrypt a string value (bytes) if encryption is active.
   */
  decryptString(bytes: Uint8Array, objNum: number, gen: number): Uint8Array;

  /**
   * Decode a PDF string to a JS string, with optional decryption.
   */
  decodeString(bytes: Uint8Array, objNum?: number, gen?: number): string;

  /**
   * Recursively decrypt all string values (Uint8Array) within a parsed PDF object.
   * PDF spec requires all strings in an encrypted document to be decrypted using
   * the per-object key derived from the containing object's objNum/gen.
   * Streams are NOT decrypted here — they are decrypted in getStreamData().
   */
  private decryptObjectStrings;

  /**
   * Get the catalog dictionary (the root of the document structure).
   */
  getCatalog(): PdfDictValue;

  /**
   * Get the pages array from the page tree.
   * Returns an array of page dictionaries in order.
   */
  getPages(): PdfDictValue[];

  /**
   * Get pages with their object numbers (needed for correct decryption of
   * inline streams within page objects).
   */
  getPagesWithObjInfo(): Array<{
    dict: PdfDictValue;
    objNum: number;
    gen: number;
  }>;

  /**
   * Recursively collect page dictionaries from the page tree.
   * Uses a visited set to prevent infinite recursion on cyclic page trees.
   */
  private collectPages;

  /**
   * Get the object number for a given object reference.
   * Useful for tracking which object a value came from (for decryption).
   */
  getObjNumForRef(ref: PdfRef): number;

  /**
   * Parse an object definition at the given byte offset.
   */
  private parseObjectAt;

  /**
   * Parse a compressed object from an object stream.
   * @param objStmNum - The object number of the object stream
   * @param index - The index of the object within the stream
   */
  private parseCompressedObject;

  /**
   * Parse all objects from an object stream.
   * @returns Map of object number → object value
   */
  private parseObjectStream;

  /**
   * Resolve a page's bounding box (MediaBox/CropBox) with indirect ref resolution
   * and parent inheritance. Returns `{ width, height }` or null if no box found.
   *
   * This is a shared helper so callers don't duplicate box resolution logic.
   *
   * @param pageDict - The page dictionary to resolve the box for
   * @param visited - Optional set; presumably tracks entries already seen while
   *   walking parents — confirm against the implementation.
   *   NOTE(review): the element type was missing from this declaration (a bare
   *   `Set` does not type-check). `unknown` accepts any caller-supplied set;
   *   tighten it once the implementation's element type is confirmed.
   */
  resolvePageBox(pageDict: PdfDictValue, visited?: Set<unknown>): {
    width: number;
    height: number;
  } | null;

  /**
   * Resolve a page's Resources dictionary, inheriting from parent pages if needed.
   * Protected against cyclic parent chains.
   *
   * @param pageDict - The page dictionary to resolve resources for
   * @param visited - Optional set; presumably the bookkeeping behind the cycle
   *   protection — confirm against the implementation.
   *   NOTE(review): the element type was missing from this declaration (a bare
   *   `Set` does not type-check). `unknown` accepts any caller-supplied set;
   *   tighten it once the implementation's element type is confirmed.
   */
  resolvePageResources(pageDict: PdfDictValue, visited?: Set<unknown>): PdfDictValue;
}

export {};