/**
 * F5 CLI - PDF Document Processor
 * Extract content from PDF files with optional OCR
 *
 * @module @f5/cli/core/pdf-processor
 * @version 1.0.0
 */

/**
 * Options accepted by the {@link PdfProcessor} constructor.
 *
 * Every field is optional. Defaults are applied by the implementation and are
 * not visible in this declaration.
 */
export interface PdfProcessorOptions {
  /** Toggle for OCR processing (see the "optional OCR" note in the module header). */
  enableOcr?: boolean;
  /** Language setting for OCR; the expected format is not shown here (TODO confirm against implementation). */
  ocrLanguage?: string;
  /** Page limit; presumably caps how many pages are processed (TODO confirm). */
  maxPages?: number;
  /** Toggle for image extraction. */
  extractImages?: boolean;
  /** Output directory for images; presumably only used when `extractImages` is set (TODO confirm). */
  imagesOutputDir?: string;
}

/** One structured piece of content in a parsed document. */
export interface PdfSection {
  /** Category of the section. */
  type: 'text' | 'table' | 'heading';
  /** Textual content of the section. */
  content: string;
  /** Page the section belongs to; numbering base is not shown here (TODO confirm). */
  page: number;
  /** Optional confidence score; scale and origin are not shown here (presumably OCR-related, TODO confirm). */
  confidence?: number;
}

/** Content of a single page. */
export interface PdfPage {
  /** Page number; numbering base is not shown here (TODO confirm). */
  pageNumber: number;
  /** Text of the page. */
  text: string;
  /** Whether the page has images. */
  hasImages: boolean;
}

/** Result shape for a parsed PDF; accepted by {@link PdfProcessor.toMarkdown}. */
export interface PdfParseResult {
  /** Name of the source file. */
  fileName: string;
  /** Document title, or `null` when there is none. */
  title: string | null;
  /** Structured sections of the document. */
  sections: PdfSection[];
  /** Per-page content. */
  pages: PdfPage[];
  /** Document-level metadata. */
  metadata: {
    pageCount: number;
    wordCount: number;
    author?: string;
    creationDate?: Date;
    modificationDate?: Date;
    /** Whether the document is a scanned PDF. */
    isScanned: boolean;
    /** Whether OCR was applied. */
    ocrApplied: boolean;
  };
}

/** PDF processor: parses a file and converts a parse result to Markdown. */
export declare class PdfProcessor {
  private options;

  /**
   * @param options - Optional processor configuration.
   */
  constructor(options?: PdfProcessorOptions);

  /**
   * Parse PDF document using pdf-parse v2 API
   *
   * NOTE(review): the original declaration had a bare `Promise` with no type
   * argument, which does not compile. `PdfParseResult` is the only result type
   * declared in this module and is what `toMarkdown` consumes, so it is used
   * here; confirm against the implementation.
   *
   * @param filePath - Path of the PDF file to parse.
   * @returns A promise for the parse result.
   */
  parseFile(filePath: string): Promise<PdfParseResult>;

  /**
   * Build pages array from getText result or by splitting text
   */
  private buildPagesArray;

  /**
   * Parse PDF date format (D:YYYYMMDDHHmmSS)
   */
  private parsePdfDate;

  /**
   * Apply OCR to scanned PDF
   */
  private applyOcr;

  /**
   * Parse text into structured sections
   */
  private parseTextToSections;

  /**
   * Split text into pages (heuristic)
   */
  private splitIntoPages;

  /**
   * Extract title from text
   */
  private extractTitle;

  /**
   * Convert to Markdown
   *
   * @param result - A parse result.
   * @returns The Markdown text.
   */
  toMarkdown(result: PdfParseResult): string;
}