import type { Document as MupdfDocument } from 'mupdf';
import { AppError } from '../utils/errors.js';
import type { EmbedderInterface } from './pdf-filter.js';

/**
 * File extensions supported by the parser module (parseFile + parsePdf).
 * Exported so other modules (e.g. list_files) stay in sync automatically
 * when new formats are added here.
 */
export declare const SUPPORTED_EXTENSIONS: Set<string>;

/**
 * Result from parsing a document, containing both content and extracted title.
 * Title is display-only metadata (NOT used for search scoring).
 */
export interface ParseResult {
    content: string;
    title: string;
}

/**
 * DocumentParser configuration.
 *
 * Accepts either a single `baseDir` (legacy single-root shape — preserved for
 * backward compatibility with downstream callers that have not yet migrated
 * to the multi-root model) or a `baseDirs` array (multi-root shape produced
 * by `resolveBaseDirs`). Exactly one of the two MUST be supplied; supplying
 * both is rejected by the constructor so misconfiguration cannot silently
 * pick one source over the other.
 *
 * Behavior under a single allowed root (`{ baseDir }` or
 * `{ baseDirs: [oneRoot] }`) is byte-identical to the previous single-root
 * implementation — see `validateFilePath` for the iteration contract under
 * multiple roots.
 */
export type ParserConfig = {
    /** Security: single allowed base directory (legacy shape). */
    baseDir: string;
    baseDirs?: undefined;
    /** Maximum file size (100MB). */
    maxFileSize: number;
} | {
    /** Security: one or more allowed base directories (multi-root shape). */
    baseDirs: readonly string[];
    baseDir?: undefined;
    /** Maximum file size (100MB). */
    maxFileSize: number;
};

/**
 * Validation error (equivalent to 400)
 */
export declare class ValidationError extends AppError {
    constructor(message: string, cause?: Error);
}

/**
 * File operation error (equivalent to 500)
 */
export declare class FileOperationError extends AppError {
    constructor(message: string, cause?: Error);
}

/**
 * Document parser class (PDF/DOCX/TXT/MD support)
 *
 * Responsibilities:
 * - File path validation (path traversal prevention)
 * - File size validation (100MB limit)
 * - Parse 4 formats (PDF/DOCX/TXT/MD)
 */
export declare class DocumentParser {
    private readonly config;
    /** Raw allowed roots in input order (pre-realpath). Always non-empty. */
    private readonly rawBaseDirs;
    /**
     * Lazily cached realpath-normalized allowed roots, each with a trailing
     * path separator so the `startsWith` check is sibling-prefix safe (e.g.
     * `/foo/bar/` must not match `/foo/barista/x.txt`). Order is preserved
     * from `rawBaseDirs` so the legacy single-root rejection message keeps
     * referencing the user-configured first root. Assumes the allowed roots
     * are stable for the process lifetime.
     */
    private resolvedBaseDirs;
    constructor(config: ParserConfig);
    /**
     * File path validation (Absolute path requirement + Path traversal prevention).
     *
     * This is THE place realpath is used (with the base-dir resolver): the
     * security/containment boundary. Following symlinks here makes prefix
     * containment unforgeable. Stored/scanned/looked-up paths elsewhere use
     * resolve() — see {@link BaseDirsConfig} for the path policy.
     *
     * Multi-root semantics: a file is accepted iff its realpath (or, for a
     * non-symlink path that does not yet exist, its `resolve()`-normalized
     * absolute path) is under ANY realpath-normalized allowed root using a
     * trailing-separator prefix check. Broken symlinks are still rejected
     * outright — the lstat-based detection mirrors the previous single-root
     * behavior.
     *
     * Under a single allowed root the behavior is identical to the previous
     * single-root implementation.
     *
     * @param filePath - File path to validate (must be absolute)
     * @returns Promise that resolves with no value once the path is accepted
     * @throws ValidationError - When path is not absolute or outside all allowed roots
     */
    validateFilePath(filePath: string): Promise<void>;
    /**
     * File size validation (100MB limit)
     *
     * @param filePath - File path to validate
     * @throws ValidationError - When file size exceeds limit
     * @throws FileOperationError - When file read fails
     */
    validateFileSize(filePath: string): void;
    /**
     * File parsing (auto format detection)
     *
     * @param filePath - File path to parse
     * @returns ParseResult with content and extracted title
     * @throws ValidationError - Path traversal, size exceeded, unsupported format
     * @throws FileOperationError - File read failed, parse failed
     */
    parseFile(filePath: string): Promise<ParseResult>;
    /**
     * PDF parsing with header/footer filtering
     *
     * Features:
     * - Extracts text with position information (x, y, fontSize)
     * - Semantic header/footer detection using embedding similarity
     * - Uses hasEOL for proper line break handling
     * - Extracts document title from PDF metadata and first page font heuristic
     *
     * @param filePath - PDF file path
     * @param embedder - Embedder for semantic header/footer detection
     * @returns ParseResult with content and extracted title
     * @throws FileOperationError - File read failed, parse failed
     */
    parsePdf(filePath: string, embedder: EmbedderInterface): Promise<ParseResult>;
    /**
     * Per-page PDF parsing for the visual-enrichment path.
     *
     * Opens a mupdf `Document`, delegates per-page extraction to the shared
     * `extractPdfPages` helper with the `'preserve-whitespace,preserve-images'`
     * stext option string so mupdf emits `block.type === 'image'` blocks for
     * the downstream visual-candidate detector.
     *
     * Returns the open `Document` handle alongside the per-page records and
     * title-resolution materials so the caller can:
     * - run the renderer (`page.toPixmap()`) on the same handle,
     * - feed `metadataTitle` + `pages[0].page1FontHint` into `extractPdfTitle`
     *   after `buildChunksAndEmbeddings` returns.
     *
     * Disposal contract (asymmetric — read carefully):
     * - SUCCESS path: this method returns the open `doc` handle. The caller
     *   owns disposal and MUST wrap the call site in
     *   `try { ... } finally { doc.destroy() }`.
     * - ERROR path: when this method throws, `doc` has already been destroyed
     *   internally before the exception propagates (so the caller never
     *   receives a handle it would not know to clean up). Callers MUST NOT
     *   call `doc.destroy()` on an error from this method.
     *
     * This method does NOT compute the final title and does NOT decide visual
     * candidates — those are the dispatch site's and `pdf-visual/detector`'s
     * responsibilities, respectively.
     *
     * @param filePath - PDF file path (validated against the allowed base directories and size limit)
     * @param embedder - Embedder for semantic header/footer detection
     * @returns Open mupdf `Document`, `metadataTitle`, and per-page records.
     *   `page1FontHint` (largest-font line on page 1) is present only on `pages[0]`.
     * @throws ValidationError - Path traversal, size exceeded
     * @throws FileOperationError - File read or parse failed (after destroying `doc` internally)
     */
    parsePdfPages(filePath: string, embedder: EmbedderInterface): Promise<{
        doc: MupdfDocument;
        metadataTitle: string | undefined;
        pages: Array<{
            pageNum: number;
            text: string;
            stextJson: unknown;
            page1FontHint?: {
                text: string;
                fontSize: number;
            };
        }>;
    }>;
    /**
     * DOCX parsing (using mammoth)
     *
     * Uses extractRawText for content and convertToHtml additionally for title detection.
     *
     * @param filePath - DOCX file path
     * @returns ParseResult with content and extracted title
     * @throws FileOperationError - File read failed, parse failed
     */
    private parseDocx;
    /**
     * TXT parsing (using fs.readFile)
     *
     * @param filePath - TXT file path
     * @returns ParseResult with content and extracted title
     * @throws FileOperationError - File read failed
     */
    private parseTxt;
    /**
     * MD parsing (using fs.readFile)
     *
     * @param filePath - MD file path
     * @returns ParseResult with content and extracted title
     * @throws FileOperationError - File read failed
     */
    private parseMd;
}
//# sourceMappingURL=index.d.ts.map