/**
 * PDF Clean Composer
 *
 * Filters PDF content to include only main content area elements,
 * excluding headers, footers, page numbers, and cleaning up text elements.
 *
 * Main responsibilities:
 * - Detect content area boundaries for each page
 * - Filter elements within content area
 * - Clean text elements (fix spacing, validate characters)
 * - Remove non-content elements (headers, footers, page numbers)
 * - Validate and clean image elements
 */
import type { PdfPageContent } from '../models/PdfPageContent.js';
import type { PdfDocument } from './PdfDocument.js';
import type { PdfPageRenderer } from '../types/renderer.types.js';

/**
 * Configuration for {@link PdfCleanComposer.cleanPages}.
 *
 * Every field is optional; the documented default applies when a field is omitted.
 */
export interface PdfCleanComposerOptions {
  /**
   * Margin from top to exclude headers (as percentage of page height)
   * Default: 0.1 (10%)
   */
  topMarginPercent?: number;

  /**
   * Margin from bottom to exclude footers (as percentage of page height)
   * Default: 0.1 (10%)
   */
  bottomMarginPercent?: number;

  /**
   * Margin from left and right to exclude side elements (as percentage of page width)
   * Default: 0.05 (5%)
   */
  sideMarginPercent?: number;

  /**
   * Minimum height for text elements (in points)
   * Elements smaller than this will be filtered out
   * Default: 8
   */
  minTextHeight?: number;

  /**
   * Minimum width for text elements (in points)
   * Elements smaller than this will be filtered out
   * Default: 10
   */
  minTextWidth?: number;

  /**
   * Maximum allowed spacing between words (as ratio of font size)
   * Text with excessive spacing will be cleaned
   * Default: 3.0
   */
  maxWordSpacingRatio?: number;

  /**
   * Remove elements with non-printable or control characters
   * Default: true
   */
  removeControlCharacters?: boolean;

  /**
   * Minimum meaningful text length
   * Text shorter than this will be filtered out
   * Default: 3
   */
  minTextLength?: number;

  /**
   * Remove isolated single characters or symbols
   * Default: true
   */
  removeIsolatedCharacters?: boolean;

  /**
   * Minimum width for image elements (in points/pixels)
   * Images smaller than this will be filtered out as decorative elements
   * Default: 50
   */
  minImageWidth?: number;

  /**
   * Minimum height for image elements (in points/pixels)
   * Images smaller than this will be filtered out as decorative elements
   * Default: 50
   */
  minImageHeight?: number;

  /**
   * Minimum area for image elements (width × height)
   * Images with smaller area will be filtered out
   * Default: 2500 (50×50)
   */
  minImageArea?: number;

  /**
   * Enable cover page detection and screenshot generation
   * If the first page is detected as a cover (full-page image), generate a screenshot instead
   * Default: true
   */
  coverPageDetection?: boolean;

  /**
   * Cover page threshold (percentage of page area that an image must cover)
   * Used to determine if a page is a cover page
   * Default: 0.8 (80% of page area)
   */
  coverPageThreshold?: number;

  /**
   * Screenshot quality for cover pages (1-100)
   * Default: 95
   */
  coverPageScreenshotQuality?: number;

  /**
   * Output directory path used for image file cleanup
   * If provided, removed image files will be deleted from disk
   */
  outputDir?: string;

  /**
   * Pluggable page renderer. When set (e.g. PuppeteerRenderer), cleanComposer
   * rasterizes cover/page screenshots through it (Chromium) instead of
   * node-canvas. When null/undefined, the node-canvas PageRenderer path is used.
   * Mirrors how PdfDecomposer.screenshot() picks its rasterization path.
   */
  renderer?: PdfPageRenderer | null;

  /**
   * Target width (px) for the page/cover screenshot when rendering via `renderer`.
   * Default: 1024. Ignored by the node-canvas fallback (which uses scale 1.0).
   */
  coverPageScreenshotWidth?: number;
}

/**
 * Static-only utility that cleans decomposed PDF pages.
 *
 * `cleanPages` is the single public entry point; every other member is a
 * private implementation detail and is therefore declared without a signature.
 */
export declare class PdfCleanComposer {
  /**
   * Clean and filter PDF pages to include only main content
   *
   * @param pages Array of PDF page content to clean
   * @param options Cleaning configuration options
   * @param pdfDocument Optional PDF document for cover page detection and screenshot
   * @returns Promise resolving to the cleaned array of PDF page content
   */
  static cleanPages(
    pages: PdfPageContent[],
    options?: PdfCleanComposerOptions,
    pdfDocument?: PdfDocument,
  ): Promise<PdfPageContent[]>;

  /**
   * Clean a single PDF page
   * @param skipScreenshot If true, skip screenshot conversion (used when limit reached)
   */
  private static cleanPage;

  /**
   * Calculate content area boundaries for a page
   */
  private static calculateContentArea;

  /**
   * Clean and filter elements based on content area and quality
   */
  private static cleanElements;

  /**
   * Check if element is within content area boundaries
   */
  private static isElementInContentArea;

  /**
   * Clean individual element based on its type
   */
  private static cleanElement;

  /**
   * Clean text element content and validate dimensions
   */
  private static cleanTextElement;

  /**
   * Clean image element and validate dimensions
   */
  private static cleanImageElement;

  /**
   * Remove control characters and non-printable characters
   */
  private static removeControlCharacters;

  /**
   * Fix excessive spacing in text
   */
  private static fixTextSpacing;

  /**
   * Check if text is an isolated character or symbol
   */
  private static isIsolatedCharacter;

  /**
   * Validate text element dimensions
   */
  private static validateTextElementDimensions;

  /**
   * Check if element is a text element
   */
  private static isTextElement;

  /**
   * Check if element is an image element
   */
  private static isImageElement;

  /**
   * Normalize bounding box to consistent format
   */
  private static normalizeBoundingBox;

  /**
   * Check if element was modified during cleaning
   */
  private static isElementModified;

  /**
   * Remove image file from output directory when element is filtered out
   * Only works in Node.js environment - gracefully degrades in browser
   */
  private static removeImageFile;

  /**
   * Detect if the page is a cover page and process it as screenshot
   * Cover page is detected by having a large image that covers most of the page area
   * OR multiple images that collectively cover most of the page (for tiled cover pages)
   */
  private static detectAndProcessCoverPage;

  /**
   * Determine if a page should be converted to screenshot based on its content
   */
  private static shouldConvertToScreenshot;

  /**
   * Calculate how well images are distributed across the page
   * Returns distribution score (0-1) indicating coverage of page dimensions
   */
  private static calculateImageDistribution;

  /**
   * Generate screenshot for cover page
   * Note: Uses scale 1.0 for memory efficiency on large documents
   */
  private static generatePageScreenshot;
}
//# sourceMappingURL=PdfCleanComposer.d.ts.map