/**
* Comprehensive types for PDF Decomposer system
* Used across all components to ensure type safety and consistency
*/
/// <reference types="node" />
import type { PdfDocument } from '../core/PdfDocument.js';
/**
 * Subset of the PDF.js page object that the decomposer relies on.
 *
 * Each asynchronous method spells out its resolved type: a bare `Promise`
 * does not compile because `Promise<T>` has a required type argument.
 *
 * NOTE(review): the resolved types below were reconstructed from the sibling
 * `PdfJs*` interfaces declared in this file — confirm against the page
 * implementation.
 */
export interface PdfJsPage {
    /** Returns the viewport for the requested `scale`. */
    getViewport(params: {
        scale: number;
    }): PdfJsViewport;
    /** Resolves with the text content of the page. */
    getTextContent(): Promise<PdfJsTextContent>;
    /** Resolves with the extracted text elements (presumably color-aware — TODO confirm). */
    extractText(): Promise<PdfJsColorAwareElement[]>;
    /** Resolves with the annotations found on the page. */
    getAnnotations(): Promise<PdfJsAnnotation[]>;
    /** Resolves with the image items found on the page. */
    extractImages(): Promise<PdfJsImageItem[]>;
}
/**
 * Shape of the viewport object returned by `PdfJsPage.getViewport`.
 */
export interface PdfJsViewport {
    /** Viewport height. */
    height: number;
    /** Viewport width. */
    width: number;
    /**
     * Maps a rectangle, given as an array of numbers, to viewport space and
     * returns the result as another array of numbers.
     */
    convertToViewportRectangle(rect: Array<number>): Array<number>;
}
/**
 * Container for the text items of a single page, as produced by
 * `PdfJsPage.getTextContent`.
 */
export interface PdfJsTextContent {
    /** Text items of the page. */
    items: Array<PdfJsTextItem>;
}
/**
 * One run of text as reported in `PdfJsTextContent.items`.
 */
export interface PdfJsTextItem {
    /** The text of the run. */
    str: string;
    /** Name of the font used by the run. */
    fontName: string;
    /** Width of the run. */
    width: number;
    /** Height of the run; may be absent. */
    height?: number;
    /**
     * Transform of the run as an array of numbers.
     * NOTE(review): presumably the PDF.js text matrix — the layout of the
     * array is not specified in this file.
     */
    transform: Array<number>;
}
/**
 * Text element carrying colour and font information, as produced by
 * PdfTextEvaluator. Every field is optional.
 */
export interface PdfJsColorAwareElement {
    /** Text of the element. */
    text?: string;
    /** Position and size of the element. */
    boundingBox?: PdfDecomposerBoundingBox;
    /** Font family name. */
    fontFamily?: string;
    /** Font size. */
    fontSize?: number;
    /** Font weight as a free-form string. */
    fontWeight?: string;
    /** Font style as a free-form string. */
    fontStyle?: string;
    /** Text colour as a string (format not specified here). */
    textColor?: string;
}
/**
 * Annotation record as returned by `PdfJsPage.getAnnotations`.
 */
export interface PdfJsAnnotation {
    /** Identifier of the annotation. */
    id: string;
    /** Annotation subtype. */
    subtype: string;
    /** Annotation rectangle as an array of numbers. */
    rect: Array<number>;
    /** Target URL, when the annotation has one. */
    url?: string;
    /** Destination payload; typed `any` because its shape is not declared in this file. */
    dest?: any;
    /** Textual contents of the annotation, when present. */
    contents?: string;
}
/**
 * Image record as returned by `PdfJsPage.extractImages`.
 */
export interface PdfJsImageItem {
    /** Identifier of the image object. */
    objectId: string;
    /** Content type of the image data, as a string. */
    contentType: string;
    /** Raw image bytes. */
    data: Uint8Array | Buffer;
    /** Position and size of the image. */
    boundingBox: PdfDecomposerBoundingBox;
}
/**
 * Minimal document handle used by PdfDecomposerPage: the loaded document
 * together with the package that collects its pages.
 */
export interface PdfDecomposerPageData {
    /** The loaded PDF document. */
    pdfDoc: PdfDocument;
    /** Package the page results belong to. */
    pkg: {
        /** Pages of the package; element shape is not declared here. */
        pages: Array<any>;
        /** Output directory handle, when one is attached. */
        pkgDir?: PdfDecomposerDirectory;
    };
}
/**
 * Output package of a decomposition run: the decomposed pages plus optional
 * metadata about the run and its output location.
 */
export interface PdfDecomposerPackage {
    /** Decomposed pages; element shape is not declared here. */
    pages: Array<any>;
    /** Fingerprint string of the document, when known. */
    fingerprint?: string;
    /** Thumbnail payload; shape is not declared here. */
    thumbnail?: any;
    /** Progress state of the run. */
    state?: PdfDecomposerState;
    /** Output directory handle. */
    pkgDir?: PdfDecomposerDirectory;
    /** Output directory path. */
    dir?: string;
}
/**
 * Handle on the output directory of a package; covers the file-system
 * operations needed for output.
 *
 * The resolved types are spelled out because a bare `Promise` does not
 * compile (`Promise<T>` has a required type argument).
 *
 * NOTE(review): `void` / `boolean` were reconstructed from the method names —
 * confirm against the implementing class.
 */
export interface PdfDecomposerDirectory {
    /** Directory path, when known. */
    dir?: string;
    /** Creates the directory. */
    create(): Promise<void>;
    /** Optional check for whether the directory exists. */
    exists?(): Promise<boolean>;
}
/**
 * Snapshot of decomposition progress and status.
 */
export interface PdfDecomposerState {
    /** Whether processing is currently under way. */
    processing: boolean;
    /** Numeric progress value (scale not specified in this declaration). */
    progress: number;
    /** Status message. */
    message: string;
}
/**
 * Error report produced while processing a page.
 */
export interface PdfDecomposerError {
    /** Index of the page the error refers to. */
    pageIndex: number;
    /** Description of the error. */
    message: string;
}
/**
 * Switches and settings that steer a decomposition run. Every option is
 * optional; defaults are applied by the consumer and are not visible in this
 * declaration.
 */
export interface PdfDecomposerOptions {
    /** First page of the range to process (indexing base not specified here). */
    startPage?: number;
    /** Last page of the range to process (indexing base not specified here). */
    endPage?: number;
    /** Output directory path. */
    outputDir?: string;
    /** Toggles image extraction. */
    extractImages?: boolean;
    /** Toggles link extraction. */
    extractLinks?: boolean;
    /** Toggles the element composer. */
    elementComposer?: boolean;
    /** Toggles the page composer. */
    pageComposer?: boolean;
    /** Toggles the clean composer. */
    cleanComposer?: boolean;
    /** Settings for the clean composer. */
    cleanComposerOptions?: PdfCleanComposerOptions;
    /** Toggles minified output. */
    minify?: boolean;
    /** Settings for minified output. */
    minifyOptions?: {
        /** Whether element attributes are included. */
        elementAttributes?: boolean;
        /** Output format of the minified data. */
        format?: 'html' | 'plain';
    };
}
/**
 * Tuning knobs for the cleanComposer feature, which filters and cleans the
 * decomposed content. All options are optional; the documented defaults
 * apply when an option is omitted.
 */
export interface PdfCleanComposerOptions {
    /**
     * Band at the top of the page that is treated as header and excluded,
     * expressed as a fraction of the page height.
     * @defaultValue 0.1 (10%)
     */
    topMarginPercent?: number;
    /**
     * Band at the bottom of the page that is treated as footer and excluded,
     * expressed as a fraction of the page height.
     * @defaultValue 0.1 (10%)
     */
    bottomMarginPercent?: number;
    /**
     * Band on the left and right edges whose elements are excluded,
     * expressed as a fraction of the page width.
     * @defaultValue 0.05 (5%)
     */
    sideMarginPercent?: number;
    /**
     * Smallest accepted text-element height, in points. Shorter elements are
     * filtered out.
     * @defaultValue 8
     */
    minTextHeight?: number;
    /**
     * Smallest accepted text-element width, in points. Narrower elements are
     * filtered out.
     * @defaultValue 10
     */
    minTextWidth?: number;
    /**
     * Largest tolerated gap between words, as a ratio of the font size.
     * Text with wider gaps is cleaned.
     * @defaultValue 3.0
     */
    maxWordSpacingRatio?: number;
    /**
     * When enabled, elements containing non-printable or control characters
     * are removed.
     * @defaultValue true
     */
    removeControlCharacters?: boolean;
    /**
     * Shortest text length that counts as meaningful. Shorter text is
     * filtered out.
     * @defaultValue 3
     */
    minTextLength?: number;
    /**
     * When enabled, isolated single characters or symbols are removed.
     * @defaultValue true
     */
    removeIsolatedCharacters?: boolean;
    /**
     * Smallest accepted image width, in points/pixels. Narrower images are
     * treated as decorative and filtered out.
     * @defaultValue 50
     */
    minImageWidth?: number;
    /**
     * Smallest accepted image height, in points/pixels. Shorter images are
     * treated as decorative and filtered out.
     * @defaultValue 50
     */
    minImageHeight?: number;
    /**
     * Smallest accepted image area (width × height). Images with a smaller
     * area are filtered out.
     * @defaultValue 2500 (50×50)
     */
    minImageArea?: number;
    /**
     * Turns on cover-page detection: when the first page is recognised as a
     * cover (a full-page image), a screenshot is generated in its place.
     * @defaultValue true
     */
    coverPageDetection?: boolean;
    /**
     * Share of the page area an image must cover for the page to count as a
     * cover page, expressed as a fraction.
     * @defaultValue 0.8 (80% of page area)
     */
    coverPageThreshold?: number;
    /**
     * Quality of the cover-page screenshot, in the range 1-100.
     * @defaultValue 95
     */
    coverPageScreenshotQuality?: number;
    /**
     * Output directory used when cleaning image files. When provided, the
     * files of removed images are deleted from disk.
     */
    outputDir?: string;
}
/**
 * Rendering parameters for screenshot generation. Every option is optional.
 */
export interface PdfDecomposerScreenshotOptions {
    /** Page numbers to render. */
    pages?: Array<number>;
    /** Width of the generated image. */
    imageWidth?: number;
    /** Height of the generated image. */
    imageHeight?: number;
    /** Image format of the output. */
    format?: 'jpeg' | 'png';
    /** Image quality value (range not specified in this declaration). */
    quality?: number;
    /** Output directory path. */
    outputDir?: string;
}
/**
 * Outcome of a screenshot operation: the generated screenshots together
 * with page counters.
 */
export interface PdfDecomposerScreenshotResult {
    /** Total number of pages. */
    totalPages: number;
    /** Number of pages that were processed. */
    processedPages: number;
    /** The generated screenshots. */
    screenshots: Array<PdfDecomposerScreenshot>;
}
/**
 * A single rendered page image.
 */
export interface PdfDecomposerScreenshot {
    /** Number of the page the image shows. */
    pageNumber: number;
    /** Image width. */
    width: number;
    /** Image height. */
    height: number;
    /** Encoded image bytes. */
    buffer: Buffer;
    /** Name of the file, when one is associated with the image. */
    filename?: string;
}
/**
 * Base shape shared by all PDF elements. The specialised element interfaces
 * extend it and narrow `type`, `data` and `attributes`.
 */
export interface PdfDecomposerElement {
    /** Identifier of the element. */
    id: string;
    /** Index of the page the element belongs to. */
    pageIndex: number;
    /** Discriminant naming the kind of element. */
    type: 'text' | 'image' | 'link' | 'annotation';
    /** Position and size of the element. */
    boundingBox: PdfDecomposerBoundingBox;
    /** Element payload; narrowed by the specialised interfaces. */
    data: any;
    /**
     * Free-form attribute bag.
     *
     * A bare `Record` does not compile because `Record<K, T>` has two
     * required type arguments. The value type has to stay `any`: the
     * extending interfaces narrow this property to interface types without
     * an index signature, which are assignable to `Record<string, any>` but
     * not to `Record<string, unknown>`.
     */
    attributes?: Record<string, any>;
}
/**
 * Element specialised for text: `data` holds the text as a string and
 * `attributes` is required.
 */
export interface PdfDecomposerTextElement extends PdfDecomposerElement {
    /** Fixed discriminant for text elements. */
    type: 'text';
    /** Font and colour attributes of the text. */
    attributes: PdfDecomposerTextAttributes;
    /** The text. */
    data: string;
    /** Formatted variant of the text, when available. */
    formattedData?: string;
}
/**
 * Element specialised for images: `data` is either a string or a `Buffer`
 * and `attributes` is required.
 */
export interface PdfDecomposerImageElement extends PdfDecomposerElement {
    /** Fixed discriminant for image elements. */
    type: 'image';
    /** Metadata of the image. */
    attributes: PdfDecomposerImageAttributes;
    /** Image payload. */
    data: Buffer | string;
}
/**
 * Element specialised for links: `data` is a string and `attributes` is
 * required.
 */
export interface PdfDecomposerLinkElement extends PdfDecomposerElement {
    /** Fixed discriminant for link elements. */
    type: 'link';
    /** Metadata of the link. */
    attributes: PdfDecomposerLinkAttributes;
    /** Link payload. */
    data: string;
}
/**
 * Font and colour attributes of a text element. Every field is optional.
 */
export interface PdfDecomposerTextAttributes {
    /** Font family name. */
    fontFamily?: string;
    /** Font size. */
    fontSize?: number;
    /** Font weight. */
    fontWeight?: 'bold' | 'normal';
    /** Font style. */
    fontStyle?: 'italic' | 'normal';
    /** Text colour as a string (format not specified here). */
    textColor?: string;
}
/**
 * Metadata attached to an image element.
 */
export interface PdfDecomposerImageAttributes {
    /** Kind of image source. */
    type: 'legacy' | 'embedded';
    /** Image width. */
    width: number;
    /** Image height. */
    height: number;
    /** Image format, when known. */
    format?: string;
    /** Label of the extraction method, as a free-form string. */
    extraction?: string;
    /** Identifier of the original image object. */
    originalId?: string;
    /** Whether the image was scaled. */
    scaled?: boolean;
    /** Scale factor that was applied. */
    scaleFactor?: number;
}
/**
 * Metadata attached to a link element.
 */
export interface PdfDecomposerLinkAttributes {
    /** Kind of link. */
    linkType: 'url' | 'email' | 'internal' | 'annotation';
    /** Identifier of the annotation related to the link, when there is one. */
    annotationId?: string;
    /** Destination payload; typed `any` because its shape is not declared in this file. */
    dest?: any;
    /** Text associated with the link. */
    text?: string;
}
/**
 * Rectangle describing where an element sits: its four edges plus its
 * width and height. Units and coordinate origin are not specified in this
 * declaration.
 */
export interface PdfDecomposerBoundingBox {
    /** Left edge. */
    left: number;
    /** Top edge. */
    top: number;
    /** Right edge. */
    right: number;
    /** Bottom edge. */
    bottom: number;
    /** Width of the rectangle. */
    width: number;
    /** Height of the rectangle. */
    height: number;
}
/**
 * Fully typed text element as produced by extraction. Unlike
 * `PdfDecomposerTextElement`, `formattedData` is required here.
 */
export interface PdfDecomposerExtractedTextElement {
    /** Fixed discriminant for text elements. */
    type: 'text';
    /** Identifier of the element. */
    id: string;
    /** Index of the page the element belongs to. */
    pageIndex: number;
    /** Position and size of the element. */
    boundingBox: PdfDecomposerBoundingBox;
    /** The text. */
    data: string;
    /** Formatted variant of the text. */
    formattedData: string;
    /** Text attributes, extended with the original font name. */
    attributes: PdfDecomposerTextAttributes & {
        /** Name of the font as originally reported. */
        originalFont?: string;
    };
}
/**
 * Fully typed image element as produced by extraction. `data` is always a
 * string here, and `extraction`, when set, is the literal `'universal'`.
 */
export interface PdfDecomposerExtractedImageElement {
    /** Fixed discriminant for image elements. */
    type: 'image';
    /** Identifier of the element. */
    id: string;
    /** Index of the page the element belongs to. */
    pageIndex: number;
    /** Position and size of the element. */
    boundingBox: PdfDecomposerBoundingBox;
    /** Image payload as a string. */
    data: string;
    /** Metadata of the image. */
    attributes: {
        /** Kind of image source. */
        type: 'legacy' | 'embedded';
        /** Image width. */
        width: number;
        /** Image height. */
        height: number;
        /** Image format, when known. */
        format?: string;
        /** Extraction method marker. */
        extraction?: 'universal';
        /** Identifier of the original image object. */
        originalId?: string;
        /** Whether the image was scaled. */
        scaled?: boolean;
        /** Scale factor that was applied. */
        scaleFactor?: number;
    };
}
/**
 * Fully typed link element as produced by extraction. `linkType` is limited
 * to `'url'`, `'email'` and `'internal'`, and `extraction`, when set, is the
 * literal `'text-pattern'`.
 */
export interface PdfDecomposerExtractedLinkElement {
    /** Fixed discriminant for link elements. */
    type: 'link';
    /** Identifier of the element. */
    id: string;
    /** Index of the page the element belongs to. */
    pageIndex: number;
    /** Position and size of the element. */
    boundingBox: PdfDecomposerBoundingBox;
    /** Link payload. */
    data: string;
    /** Metadata of the link. */
    attributes: {
        /** Kind of link. */
        linkType: 'internal' | 'email' | 'url';
        /** Text associated with the link. */
        text?: string;
        /** Identifier of the annotation related to the link, when there is one. */
        annotationId?: string;
        /** Destination payload; shape is not declared in this file. */
        dest?: any;
        /** Extraction method marker. */
        extraction?: 'text-pattern';
    };
}
/**
 * Any element produced by extraction. The `type` property discriminates
 * between the text, image and link variants.
 */
export type PdfDecomposerExtractedElement =
    | PdfDecomposerExtractedTextElement
    | PdfDecomposerExtractedImageElement
    | PdfDecomposerExtractedLinkElement;
/**
 * Decomposer-side typing of a colour-aware text element produced by
 * PdfTextEvaluator. Every field is optional.
 */
export interface PdfDecomposerColorAwareElement {
    /** Text of the element. */
    text?: string;
    /** Position and size of the element. */
    boundingBox?: PdfDecomposerBoundingBox;
    /** Font family name. */
    fontFamily?: string;
    /** Font size. */
    fontSize?: number;
    /** Font weight as a free-form string. */
    fontWeight?: string;
    /** Font style as a free-form string. */
    fontStyle?: string;
    /** Text colour as a string (format not specified here). */
    textColor?: string;
}
/**
 * Memory usage figures. Units are not specified in this declaration.
 */
export interface PdfDecomposerMemoryStats {
    /** Total amount of memory. */
    total: number;
    /** Amount of memory in use. */
    used: number;
    /** Usage expressed as a percentage. */
    percentage: number;
}
/**
 * Settings for memory monitoring. All three fields are required.
 */
export interface PdfDecomposerMemoryOptions {
    /** Whether cleanup should be aggressive. */
    aggressiveCleanup: boolean;
    /** Garbage-collection threshold, in megabytes. */
    gcThresholdMB: number;
    /** Memory ceiling, in megabytes. */
    maxMemoryMB: number;
}
/**
 * Callable invoked with progress information; its return value is ignored.
 */
export interface PdfDecomposerProgressCallback {
    /**
     * @param progress - Amount `loaded` so far and the `total` amount.
     */
    (progress: { loaded: number; total: number }): void;
}
/**
 * Flags accepted when creating PDF decomposer instances through the
 * factory. Every flag is optional.
 */
export interface PdfDecomposerFactoryOptions {
    /** Toggles skipping of the decompose step. */
    skipDecompose?: boolean;
    /** Toggles the element composer. */
    elementComposer?: boolean;
    /** Toggles the page composer. */
    pageComposer?: boolean;
    /** Toggles image extraction. */
    extractImages?: boolean;
}
/**
 * Options that apply when a PDF is loaded. Every option is optional.
 *
 * NOTE(review): the field names match PDF.js `getDocument` parameters, so
 * they are presumably forwarded to it — confirm against the loader.
 */
export interface PdfDecomposerLoadingOptions {
    /** URL of the CMap files. */
    cMapUrl?: string;
    /** Whether the CMap files are packed. */
    cMapPacked?: boolean;
    /** URL of the standard font data. */
    standardFontDataUrl?: string;
    /** Disables range requests. */
    disableRange?: boolean;
    /** Disables streaming. */
    disableStream?: boolean;
    /** Whether `eval` is supported. */
    isEvalSupported?: boolean;
    /** Verbosity level. */
    verbosity?: number;
}
//# sourceMappingURL=decomposer.types.d.ts.map