/** @module Web entrypoint for ppu-pdf browser support.
 *
 * Provides `PdfReaderLegacyWeb`, a browser-compatible PDF reader based on pdfjs-dist.
 * Supports text extraction, line grouping, compact lines, TOON format, scanned detection,
 * page rendering to HTMLCanvasElement, and scanned PDF OCR via ppu-paddle-ocr/web.
 *
 * @example
 * ```ts
 * import { PdfReaderLegacyWeb } from "ppu-pdf/web";
 *
 * const reader = new PdfReaderLegacyWeb({ verbose: false });
 * const response = await fetch("my-document.pdf");
 * const buffer = await response.arrayBuffer();
 *
 * const pdf = await reader.open(buffer);
 * const texts = await reader.getTexts(pdf);
 * console.log(texts.get(1)?.fullText);
 * await reader.destroy(pdf);
 * ```
 */
import * as pdfjs from "pdfjs-dist";
import { BasePdfReaderCommon } from "../core/base-pdf-reader-common.js";
import { type CompactPageLines, type PageLines, type PageTexts, type PageToonLines, type PdfCompactLineAlgorithm, type PdfReaderOptions, type PdfScannedThreshold } from "../pdf.interface.js";
/**
 * Canvas map type for web — uses HTMLCanvasElement instead of Node.js native Canvas.
 *
 * Keys are page numbers and values are the canvases associated with those pages;
 * this is the type returned by `renderAll` and accepted by `getTextsScanned`.
 */
export type WebCanvasMap = Map<number, HTMLCanvasElement>;
/**
 * Browser-compatible PDF reader based on pdfjs-dist.
 *
 * Supports all digital PDF features: text extraction, line grouping,
 * compact lines, TOON format, and scanned detection.
 * Also supports page rendering to HTMLCanvasElement and scanned PDF OCR
 * when combined with ppu-paddle-ocr/web.
 *
 * This is an ambient declaration: only the public surface is described here,
 * and private members appear by name only.
 */
export declare class PdfReaderLegacyWeb extends BasePdfReaderCommon {
    // Reader configuration. Its type is erased in this declaration; presumably
    // derived from the constructor's `options` argument — confirm in the implementation.
    private options;
    // NOTE(review): presumably the page number of the first page, i.e. the lowest
    // key used in the returned maps — confirm in the implementation.
    readonly startIndex = 1;
    /**
     * Creates a new reader.
     * @param options - Optional subset of `PdfReaderOptions`; every field may be omitted.
     */
    constructor(options?: Partial<PdfReaderOptions>);
    /**
     * Opens a PDF document from an ArrayBuffer.
     * @param data - The ArrayBuffer containing the PDF data.
     * @returns The opened PDFDocumentProxy instance.
     */
    open(data: ArrayBuffer): Promise<pdfjs.PDFDocumentProxy>;
    /**
     * Renders all pages of a PDF document into HTMLCanvasElements.
     * @param doc - The PDFDocumentProxy to render.
     * @returns A map of page numbers to HTMLCanvasElement instances.
     */
    renderAll(doc: pdfjs.PDFDocumentProxy): Promise<WebCanvasMap>;
    /**
     * Extracts text from scanned PDF pages using an OCR service.
     * Compatible with ppu-paddle-ocr/web's PaddleOcrService.
     *
     * NOTE(review): the result of `recognize` is typed `any`, so the OCR result
     * shape is not checked at this boundary — confirm the expected shape against
     * the implementation.
     * @param ocrService - Any OCR service with initialize() and recognize(canvas) methods.
     * @param canvasMap - A map of page numbers to HTMLCanvasElement instances.
     * @returns A map of page numbers to extracted text data with OCR results.
     */
    getTextsScanned(ocrService: {
        initialize(): Promise<void>;
        recognize(canvas: HTMLCanvasElement): Promise<any>;
    }, canvasMap: WebCanvasMap): Promise<PageTexts>;
    // Private helper; signature is erased in this declaration.
    private getCanvas;
    /**
     * Extracts text from all pages of a PDF document.
     * @param pdf - The PDFDocumentProxy to extract text from.
     * @returns A map of page numbers to extracted text data.
     */
    getTexts(pdf: pdfjs.PDFDocumentProxy): Promise<PageTexts>;
    // Private helpers; signatures are erased in this declaration, so their
    // exact roles must be read from the implementation.
    private extractTexts;
    private extractOcrTexts;
    private convertOcrToPdfWords;
    private mapTokenToPdfWord;
    private mergeTextContent;
    private filterTextContent;
    /**
     * Converts extracted text into structured lines.
     * @param pageTexts - The extracted text data from a PDF.
     * @returns A map of page numbers to structured lines.
     */
    getLinesFromTexts(pageTexts: PageTexts): PageLines;
    /**
     * Converts extracted text into TOON format for LLM-friendly input.
     * @param pageTexts - The extracted text data from a PDF.
     * @returns The TOON-formatted output as a `PageToonLines` value
     *          (its exact shape is declared in `pdf.interface`).
     */
    getLinesFromTextsInToon(pageTexts: PageTexts): PageToonLines;
    /**
     * Converts extracted text into compact structured lines using a specified algorithm.
     * @param pageTexts - The extracted text data from a PDF.
     * @param algorithm - The algorithm for compacting lines (default: "middleY").
     * @returns A map of page numbers to compact structured lines.
     */
    getCompactLinesFromTexts(pageTexts: PageTexts, algorithm?: PdfCompactLineAlgorithm): CompactPageLines;
    /**
     * Determines if the PDF document is scanned based on text thresholds.
     * @param pageTexts - The extracted text data from a PDF.
     * @param options - The threshold options for scanned detection.
     * @returns True if the document is likely scanned, false otherwise.
     */
    isScanned(pageTexts: PageTexts, options?: PdfScannedThreshold): boolean;
    /**
     * Determines if an individual PDF page is scanned (as opposed to digital)
     * based on text thresholds.
     * @param pageText - The extracted page text.
     * @param options - The threshold options for scanned detection.
     * @returns True if the page is likely scanned, false otherwise.
     */
    isPageScanned(pageText: string, options?: PdfScannedThreshold): boolean;
    /**
     * Destroys the PDF document instance to free memory.
     * @param pdf - The PDFDocumentProxy instance to destroy.
     */
    destroy(pdf: pdfjs.PDFDocumentProxy): Promise<void>;
}