/**
 * File Reference Registry
 *
 * Central registry for managing file references in on-demand processing mode.
 * Files are registered with lightweight metadata and previews. Full content
 * is processed on-demand when the LLM requests it via tools.
 *
 * This module is the core of the file reference architecture, replacing
 * the previous "load everything upfront" pattern for files that exceed
 * the tiny/small size tiers.
 *
 * @module files/fileReferenceRegistry
 */
import type {
  FileSource,
  FileExtractionParams,
  FileExtractionResult,
  FileReadResult,
  FileReference,
  FileRegistrationOptions,
  FileRegistryOptions,
  FileSearchResult,
  SizeTier,
} from "../types/index.js";

/**
 * Registry for managing file references with on-demand processing.
 *
 * Design decisions:
 * - One instance per NeuroLink SDK instance (not global singleton)
 * - File buffers persisted to temp dir for later streaming access
 * - LRU eviction when maxFiles exceeded
 * - Thread-safe via sequential async operations (Node.js single-threaded)
 *
 * @example
 * ```typescript
 * const registry = new FileReferenceRegistry();
 * // Argument order is (buffer, source, options); options is the THIRD argument.
 * const ref = await registry.register(buffer, 'buffer', {
 *   filename: 'report.xlsx',
 * });
 * console.log(ref.sizeTier); // 'medium'
 * console.log(ref.preview); // First 2000 chars of processed content
 * console.log(ref.estimatedTokens); // Type-aware estimate
 *
 * // Later, LLM requests specific section
 * const section = await registry.readSection(ref.id, 1, 50, 5000);
 * ```
 */
export declare class FileReferenceRegistry {
  // Private members appear without types because declaration output erases
  // them; consult the implementation file for their actual types.
  private files;
  private tempDir;
  private maxFiles;
  private maxTempBytes;
  private defaultPreviewChars;
  private currentTempBytes;
  private tempDirCreated;

  constructor(options?: FileRegistryOptions);

  /**
   * Register a file from a Buffer.
   *
   * This is the primary registration method. It performs lightweight analysis:
   * 1. Detect file type from magic bytes (first 1KB)
   * 2. Determine size tier
   * 3. Extract preview (first N chars of text, or metadata for binary)
   * 4. Persist buffer to temp directory for later streaming access
   *
   * Total time: ~1-5ms for most files (no full processing).
   *
   * @param buffer - File content as Buffer
   * @param source - How the file was provided ('buffer', 'url', 'path', 'datauri')
   * @param options - Registration options
   * @returns FileReference with metadata and preview
   */
  register(
    buffer: Buffer,
    source?: FileSource,
    options?: FileRegistrationOptions,
  ): Promise<FileReference>;

  /**
   * Register a file from a file path on disk.
   *
   * Does NOT read the entire file — only reads the first 1KB for type detection
   * and preview. The file path is stored for later streaming access.
   *
   * @param filePath - Absolute path to the file
   * @param options - Registration options
   * @returns FileReference with metadata and preview
   */
  registerFromPath(
    filePath: string,
    options?: FileRegistrationOptions,
  ): Promise<FileReference>;

  /**
   * Get a file reference by ID.
   * Updates lastAccessedAt for LRU tracking.
   *
   * @param id - File reference ID
   * @returns File reference if found, undefined otherwise
   */
  get(id: string): FileReference | undefined;

  /**
   * Get a file reference by ID or filename.
   * Tries ID lookup first, then falls back to filename match.
   * This handles the common case where an LLM uses the filename
   * instead of the UUID when calling file tools.
   *
   * @param idOrName - UUID or filename to search for
   * @returns File reference if found, undefined otherwise
   */
  getByIdOrFilename(idOrName: string): FileReference | undefined;

  /**
   * Ensure a file has been processed (binary content extracted to text).
   *
   * For text files this is a no-op. For binary files (PDF, XLSX, video, etc.)
   * this triggers on-demand processing if it hasn't happened yet. After this
   * call, ref.processedContent and ref.preview contain extracted text.
   *
   * Used by file tools (get_file_preview) to ensure the preview contains
   * real content instead of placeholder metadata strings.
   *
   * NOTE(review): the resolved type is not documented here; `void` is assumed
   * because the contract is described purely as a side effect on the stored
   * reference — confirm against the implementation.
   *
   * @param fileId - File reference ID
   */
  ensureProcessed(fileId: string): Promise<void>;

  /**
   * Extract targeted content from a registered file.
   *
   * This is the core dispatch method for the `extract_file_content` tool.
   * Routes extraction to the appropriate processor based on file type and
   * the parameters provided.
   *
   * @param params - Extraction parameters (file_id + type-specific options)
   * @returns Extraction result with text and/or images
   */
  extractContent(params: FileExtractionParams): Promise<FileExtractionResult>;

  // Per-type targeted extraction helpers; presumably the targets that
  // extractContent() dispatches to (signatures are erased in this declaration).
  private extractVideoTargeted;
  private extractPdfTargeted;
  private extractExcelTargeted;
  private extractPptxTargeted;
  private extractArchiveTargeted;
  private extractAudioTargeted;
  private extractTextTargeted;

  /**
   * List all registered files.
   * Returns a lightweight summary suitable for the LLM.
   */
  list(): FileReference[];

  /**
   * Generate a formatted table of all registered files for the LLM.
   */
  listFormatted(): string;

  /**
   * Read a section of a registered file.
   *
   * Uses StreamingReader for memory-efficient access.
   *
   * @param fileId - File reference ID
   * @param startLine - Starting line (1-indexed)
   * @param endLine - Ending line (1-indexed)
   * @param tokenBudget - Maximum tokens to return
   * @param provider - Provider name for token estimation
   * @returns FileReadResult
   */
  readSection(
    fileId: string,
    startLine?: number,
    endLine?: number,
    tokenBudget?: number,
    provider?: string,
  ): Promise<FileReadResult>;

  /**
   * Search within a registered file.
   *
   * @param fileId - File reference ID
   * @param pattern - Search pattern (string or regex)
   * @param maxMatches - Maximum matches to return
   * @returns FileSearchResult
   */
  search(
    fileId: string,
    pattern: string,
    maxMatches?: number,
  ): Promise<FileSearchResult>;

  /**
   * Search within in-memory content (for tiny files without temp paths).
   */
  private static searchInMemory;

  /**
   * Store a summary for a file reference.
   *
   * @param fileId - File reference ID
   * @param summary - Summary text to attach to the reference
   */
  setSummary(fileId: string, summary: string): void;

  /**
   * Remove a file reference and clean up its temp file.
   *
   * NOTE(review): the resolved type is not documented here. It is typed as
   * `unknown` so the declaration stays truthful whether the implementation
   * resolves to nothing or to a "was removed" flag — tighten once confirmed.
   *
   * @param fileId - File reference ID
   */
  remove(fileId: string): Promise<unknown>;

  /**
   * Clear all file references and clean up temp directory.
   *
   * NOTE(review): resolved type assumed to be `void` — confirm against the
   * implementation.
   */
  clear(): Promise<void>;

  /**
   * Get the number of registered files.
   */
  get size(): number;

  /**
   * Generate the preview text for the initial prompt.
   *
   * Returns a compact summary of all registered files that uses ~50-100 tokens
   * per file instead of full content. The LLM can use file tools to access
   * more content as needed.
   *
   * @returns Formatted string for prompt injection
   */
  generatePromptPreview(): Promise<string>;

  /**
   * Get type-specific extraction hints for the LLM prompt.
   * Tells the LLM what parameters it can use with extract_file_content.
   *
   * @param type - File type to produce a hint for
   * @param sizeStr - Size string to include in the hint (presumably the
   *   human-readable form — confirm against callers)
   * @returns Hint text, or null (presumably when the type has no hint)
   */
  static getExtractionHint(type: string, sizeStr: string): string | null;

  /**
   * Classify a file into a size tier based on byte size.
   *
   * @param sizeBytes - File size in bytes
   * @returns The size tier for that byte size
   */
  static classifySizeTier(sizeBytes: number): SizeTier;

  /**
   * Process a binary file on-demand, extracting text content via the
   * appropriate processor. This bridges the gap between the lazy registration
   * path (which stores raw binary) and the LLM read tools (which need text).
   *
   * Called lazily on first readSection() or search() for non-text files.
   * Results are cached in ref.processedContent for subsequent reads.
   */
  private processFileOnDemand;

  /**
   * Extract text from a PDF buffer using pdf-parse v2 (pdfjs-dist under the hood).
   *
   * Handles compressed streams (FlateDecode), CMap-encoded text, modern PDFs,
   * and most text-based PDF formats. For scanned/image-only PDFs where no text
   * can be extracted, falls back to a descriptive message.
   */
  private extractPdfText;

  /**
   * Extract text content from an Excel file using ExcelProcessor.
   */
  private extractExcelText;

  /**
   * Extract text content from a Word document using WordProcessor.
   */
  private extractWordText;

  /**
   * Extract text from a PowerPoint file using PptxProcessor.
   */
  private extractPptxText;

  /**
   * Extract metadata and content from a video file using VideoProcessor.
   */
  private extractVideoContent;

  /**
   * Extract metadata and content from an audio file using AudioProcessor.
   */
  private extractAudioContent;

  /**
   * Extract file listing from an archive using ArchiveProcessor.
   */
  private extractArchiveContent;

  /**
   * Extract a preview from a buffer.
   * For text: first N characters.
   * For binary: type-specific metadata.
   */
  private extractPreview;

  /**
   * Detect file type from buffer magic bytes and extension.
   */
  private detectType;

  /**
   * Detect file type from extension alone.
   */
  private detectTypeFromExtension;

  /**
   * Whether a file type contains readable text content.
   * For "unknown" types, optionally checks the buffer for valid UTF-8 text.
   */
  private isTextType;

  /**
   * Heuristic check: does a buffer look like valid text content?
   * Checks the first 512 bytes for mostly printable ASCII/UTF-8 characters.
   * Returns true if >90% of bytes are printable (ASCII 0x20-0x7E, tab, newline, CR).
   */
  private static looksLikeText;

  /**
   * Guess MIME type from file type and extension.
   */
  private guessMimeType;

  /**
   * Guess file extension from magic bytes.
   */
  private guessExtension;

  /**
   * Persist a buffer to the temp directory.
   */
  private persistToTemp;

  /**
   * Evict the least recently used file reference.
   */
  private evictLRU;

  /**
   * Format byte size as human-readable string.
   */
  private formatSize;
}