/**
 * Read tool: read file contents from the local filesystem.
 *
 * Returns file content with line numbers in `cat -n` format.
 * Handles text files, images (base64 ImageContent), PDFs (text
 * extraction via the shared PDF extractor), and detects binary files.
 *
 * Reference: docs/cortex/tools/read.md
 */

import * as crypto from 'node:crypto';
import * as fs from 'node:fs';
import * as path from 'node:path';
import { Type, type Static } from 'typebox';
import type { ReadRegistry } from './shared/read-registry.js';
import type { ToolContentDetails } from '../types.js';
import type { CortexToolRuntime } from './runtime.js';
import { attachRuntimeAwareTool } from './runtime.js';
import { estimateTokens } from '../token-estimator.js';
import { extractPdfText } from './shared/pdf-extractor.js';

// ---------------------------------------------------------------------------
// Schema
// ---------------------------------------------------------------------------

export const ReadParams = Type.Object({
  file_path: Type.String({ description: 'Absolute path to the file to read' }),
  offset: Type.Optional(
    Type.Number({ description: 'Line number to start reading from (1-based). Only provide if the file is too large to read at once.' }),
  ),
  limit: Type.Optional(
    Type.Number({ description: 'Maximum number of lines to read. Only provide if the file is too large to read at once.' }),
  ),
  pages: Type.Optional(
    Type.String({ description: 'Page range for PDF files (e.g., "1-5", "3", "10-20"). Only applicable to PDF files. Max 20 pages per request.' }),
  ),
});

// NOTE(review): the type argument was lost in the mangled source; `typeof ReadParams`
// is the only schema in this file it can refer to. Confirm against version control.
export type ReadParamsType = Static<typeof ReadParams>;

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

const DEFAULT_LIMIT = 2000;
const MAX_LINE_LENGTH = 2000;

/** Pre-read gate for full reads (no offset/limit provided). */
const MAX_FULL_READ_BYTES = 256 * 1024; // 256 KB

/** Hard ceiling even with offset/limit. Beyond this, use Bash. */
const MAX_READABLE_BYTES = 10 * 1024 * 1024; // 10 MB

/** Post-read token ceiling on formatted output. */
const MAX_OUTPUT_TOKENS = 25_000;

/** Number of leading bytes scanned for NUL when sniffing binary content. */
const BINARY_SNIFF_BYTES = 8192;

const IMAGE_EXTENSIONS = new Set(['.png', '.jpg', '.jpeg', '.gif', '.webp']);

// NOTE(review): type arguments reconstructed as <string, string> (lookup is by
// arbitrary lower-cased extension, values are MIME strings).
const IMAGE_MIME_TYPES: Record<string, string> = {
  '.png': 'image/png',
  '.jpg': 'image/jpeg',
  '.jpeg': 'image/jpeg',
  '.gif': 'image/gif',
  '.webp': 'image/webp',
};

/**
 * Device files that would hang the process: infinite output or blocking input.
 * Checked by path only (no I/O).
 */
const BLOCKED_DEVICE_PATHS = new Set([
  // Infinite output
  '/dev/zero',
  '/dev/random',
  '/dev/urandom',
  '/dev/full',
  // Blocks waiting for input
  '/dev/stdin',
  '/dev/tty',
  '/dev/console',
  // Nonsensical to read
  '/dev/stdout',
  '/dev/stderr',
  // fd aliases for stdin/stdout/stderr
  '/dev/fd/0',
  '/dev/fd/1',
  '/dev/fd/2',
]);

/**
 * Path-only check for device files that must never be read.
 *
 * @param filePath Resolved path to test.
 * @returns True when the path is a known blocking/infinite device or a
 *   `/proc/.../fd/{0,1,2}` stdio alias.
 */
function isBlockedDevicePath(filePath: string): boolean {
  if (BLOCKED_DEVICE_PATHS.has(filePath)) return true;
  // /proc/self/fd/0-2 and /proc/<pid>/fd/0-2 are Linux aliases for stdio
  if (
    filePath.startsWith('/proc/') &&
    (filePath.endsWith('/fd/0') || filePath.endsWith('/fd/1') || filePath.endsWith('/fd/2'))
  ) {
    return true;
  }
  return false;
}

// ---------------------------------------------------------------------------
// Details type
// ---------------------------------------------------------------------------

export interface ReadDetails {
  filePath: string;
  totalLines: number;
  byteSize: number;
  truncated: boolean;
  truncatedLines: boolean;
  truncatedChars: boolean;
  /** Starting line number (1-based) for the content returned. */
  startLine: number;
  /** True when the read was rejected by a size/token gate (content is an error message, not file data). */
  rejected?: boolean;
}

// ---------------------------------------------------------------------------
// Config
// ---------------------------------------------------------------------------

export interface ReadToolConfig {
  runtime?: CortexToolRuntime | undefined;
  readRegistry?: ReadRegistry | undefined;
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Detect a UTF-16 byte-order mark at the start of a buffer.
 *
 * `FF FE 00 00` is deliberately NOT reported as UTF-16 LE: that byte sequence
 * is the UTF-32 LE BOM, which this tool does not decode.
 *
 * @returns `'le'` / `'be'` for a UTF-16 BOM, otherwise `undefined`.
 */
function detectUtf16Bom(buffer: Buffer): 'le' | 'be' | undefined {
  if (buffer.length < 2) return undefined;
  if (buffer[0] === 0xfe && buffer[1] === 0xff) return 'be';
  if (buffer[0] === 0xff && buffer[1] === 0xfe) {
    const looksLikeUtf32 = buffer.length >= 4 && buffer[2] === 0 && buffer[3] === 0;
    return looksLikeUtf32 ? undefined : 'le';
  }
  return undefined;
}

/** Drop a leading U+FEFF (decoded byte-order mark) if present. */
function stripBom(text: string): string {
  return text.charCodeAt(0) === 0xfeff ? text.slice(1) : text;
}

/**
 * Detect if a buffer contains binary content.
 * A file is considered binary if it contains null bytes in the first 8KB.
 *
 * Buffers that start with a UTF-16 BOM are exempt: UTF-16 text encodes every
 * ASCII character with a 0x00 byte, so the NUL heuristic would otherwise
 * classify all BOM-marked UTF-16 text as binary and the UTF-16 path in
 * `decodeFileContent` could never run.
 */
function isBinaryBuffer(buffer: Buffer): boolean {
  if (detectUtf16Bom(buffer) !== undefined) return false;
  return buffer.subarray(0, BINARY_SNIFF_BYTES).includes(0);
}

/**
 * Try to detect and decode file content with common encodings.
 * Handles UTF-8, UTF-16 LE/BE (via BOM), and falls back to Latin-1.
 * A leading byte-order mark is never part of the returned text.
 */
function decodeFileContent(buffer: Buffer): string {
  const utf16 = detectUtf16Bom(buffer);
  if (utf16 === 'le') {
    return stripBom(buffer.toString('utf16le'));
  }
  if (utf16 === 'be') {
    // Node has no UTF-16 BE decoder: byte-swap a copy and decode as LE.
    // swap16 requires an even length, so a dangling trailing byte is dropped.
    const evenLength = buffer.length - (buffer.length % 2);
    const swapped = Buffer.from(buffer.subarray(0, evenLength));
    swapped.swap16();
    return stripBom(swapped.toString('utf16le'));
  }

  // UTF-8 with BOM: trust the marker and skip the Latin-1 heuristic.
  if (buffer.length >= 3 && buffer[0] === 0xef && buffer[1] === 0xbb && buffer[2] === 0xbf) {
    return stripBom(buffer.toString('utf8'));
  }

  // Try UTF-8 first (most common)
  const utf8 = buffer.toString('utf8');

  // Replacement characters suggest the bytes were not valid UTF-8.
  // Only fall back to Latin-1 if more than 1% of the byte length was replaced.
  const replacementCount = (utf8.match(/\ufffd/g) ?? []).length;
  if (replacementCount > 0 && replacementCount > buffer.length * 0.01) {
    return buffer.toString('latin1');
  }

  return utf8;
}

/**
 * Format lines with `cat -n` style line numbers.
 * Format: spaces + line_number + tab + content
 *
 * Lines longer than MAX_LINE_LENGTH are cut and suffixed with a marker.
 *
 * @param lines Lines to render, in order.
 * @param startLine 1-based line number assigned to `lines[0]`.
 */
function formatWithLineNumbers(
  lines: string[],
  startLine: number,
): string {
  const maxLineNum = startLine + lines.length - 1;
  const width = String(maxLineNum).length;

  return lines
    .map((line, i) => {
      const paddedNum = String(startLine + i).padStart(width + 2);
      const shown = line.length > MAX_LINE_LENGTH
        ? line.slice(0, MAX_LINE_LENGTH) + '... [truncated]'
        : line;
      return `${paddedNum}\t${shown}`;
    })
    .join('\n');
}

/**
 * Format byte count as a human-readable string (KB or MB).
 */
function formatBytes(bytes: number): string {
  if (bytes >= 1024 * 1024) {
    return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
  }
  return `${Math.round(bytes / 1024)} KB`;
}

/**
 * Coerce a caller-supplied line offset/limit to an integer >= 1.
 *
 * The schema only guarantees "number", so fractions, zero and negatives can
 * arrive here; left untouched they produce fractional line numbers and a
 * `startLine` that disagrees with the content actually returned.
 *
 * @param value Raw parameter value, if provided.
 * @param fallback Value used when `value` is missing or not finite.
 */
function normalizeLineCount(value: number | undefined, fallback: number): number {
  if (value === undefined || !Number.isFinite(value)) return fallback;
  return Math.max(1, Math.floor(value));
}

/** Details payload for results that carry no file lines (messages, images). */
function emptyDetails(filePath: string, byteSize: number): ReadDetails {
  return {
    filePath,
    totalLines: 0,
    byteSize,
    truncated: false,
    truncatedLines: false,
    truncatedChars: false,
    startLine: 1,
  };
}

/**
 * Build a plain informational/error result (not flagged as rejected).
 *
 * NOTE(review): `ToolContentDetails<ReadDetails>` is reconstructed; the type
 * argument was lost in the mangled source. Confirm against `../types.js`.
 */
function makeNotice(
  filePath: string,
  byteSize: number,
  message: string,
): ToolContentDetails<ReadDetails> {
  return {
    content: [{ type: 'text', text: message }],
    details: emptyDetails(filePath, byteSize),
  };
}

/**
 * Build a rejection result for size/token gate failures.
 * Returns an error message as tool content with `rejected: true` in details.
 */
function makeRejection(
  filePath: string,
  byteSize: number,
  message: string,
): ToolContentDetails<ReadDetails> {
  return {
    content: [{ type: 'text', text: message }],
    details: { ...emptyDetails(filePath, byteSize), rejected: true },
  };
}

// ---------------------------------------------------------------------------
// Tool factory
// ---------------------------------------------------------------------------

/**
 * Create the `Read` tool.
 *
 * @param config Supplies the read registry, either directly or via `runtime`
 *   (the runtime's registry wins when both are given).
 * @returns The tool object, passed through `attachRuntimeAwareTool` so it can
 *   be re-created for a different runtime.
 * @throws Error when neither `runtime.readRegistry` nor `readRegistry` is set.
 *   `execute` rethrows `stat` errors other than ENOENT/EACCES.
 */
export function createReadTool(config: ReadToolConfig): {
  name: string;
  description: string;
  parameters: typeof ReadParams;
  execute: (params: ReadParamsType) => Promise<ToolContentDetails<ReadDetails>>;
} {
  const readRegistry = config.runtime?.readRegistry ?? config.readRegistry;
  if (!readRegistry) {
    throw new Error('createReadTool requires either runtime or readRegistry');
  }

  const tool = {
    name: 'Read',
    description: [
      'Read file contents from the local filesystem.',
      'Returns content with line numbers in cat -n format.',
      '',
      'Size limits:',
      `- Files up to ${formatBytes(MAX_FULL_READ_BYTES)}: read in full (no offset/limit needed)`,
      `- Files ${formatBytes(MAX_FULL_READ_BYTES)} to ${formatBytes(MAX_READABLE_BYTES)}: must provide offset and limit`,
      `- Files over ${formatBytes(MAX_READABLE_BYTES)}: use Bash (head, tail, sed) instead`,
      `- Output capped at ~${MAX_OUTPUT_TOKENS.toLocaleString()} tokens; reduce limit if exceeded`,
      '',
      'For searching file contents, use Grep instead of reading the whole file.',
    ].join('\n'),
    parameters: ReadParams,

    async execute(params: ReadParamsType): Promise<ToolContentDetails<ReadDetails>> {
      const filePath = path.resolve(params.file_path);
      // Normalized once so slicing, numbering, dedup and the registry all
      // see the same integer range.
      const offset = normalizeLineCount(params.offset, 1);
      const limit = normalizeLineCount(params.limit, DEFAULT_LIMIT);

      // Block device paths that would hang (infinite output or blocking input)
      if (isBlockedDevicePath(filePath)) {
        return makeNotice(
          filePath,
          0,
          `Cannot read '${params.file_path}': this device file would block or produce infinite output.`,
        );
      }

      // Check if path exists
      let stat: fs.Stats;
      try {
        stat = await fs.promises.stat(filePath);
      } catch (err: unknown) {
        const code = (err as NodeJS.ErrnoException).code;
        if (code === 'ENOENT') {
          return makeNotice(filePath, 0, `File does not exist: ${filePath}`);
        }
        if (code === 'EACCES') {
          return makeNotice(filePath, 0, `Permission denied: ${filePath}`);
        }
        throw err;
      }

      // Cannot read directories
      if (stat.isDirectory()) {
        return makeNotice(filePath, 0, 'Cannot read a directory. Use `ls` via Bash.');
      }

      // The path-based device block above cannot see through symlinks, and
      // FIFOs/sockets/devices report size 0 so they pass every size gate.
      // stat() follows symlinks, so anything that is not a regular file here
      // could block or stream forever in readFile.
      if (!stat.isFile()) {
        return makeNotice(
          filePath,
          0,
          `Cannot read '${params.file_path}': not a regular file (device, FIFO, or socket).`,
        );
      }

      // Gate 1: Absolute size ceiling - reject files > 10 MB entirely
      if (stat.size > MAX_READABLE_BYTES) {
        return makeRejection(
          filePath,
          stat.size,
          `File is too large to read (${formatBytes(stat.size)}, limit ${formatBytes(MAX_READABLE_BYTES)}). Use Bash with head, tail, or sed to extract specific sections.`,
        );
      }

      const ext = path.extname(filePath).toLowerCase();

      // Handle image files
      if (IMAGE_EXTENSIONS.has(ext)) {
        const imageBuffer = await fs.promises.readFile(filePath);
        const mimeType = IMAGE_MIME_TYPES[ext] ?? 'application/octet-stream';
        const base64 = imageBuffer.toString('base64');

        // No offset/limit recorded: image reads never hit the dedup stub below.
        readRegistry.markRead(filePath, { timestamp: stat.mtimeMs });

        return {
          content: [{ type: 'image', data: base64, mimeType }],
          details: emptyDetails(filePath, stat.size),
        };
      }

      // Handle PDF files. The extractor (shared/pdf-extractor.ts) wraps
      // unpdf and returns a structured result: the Read tool's only
      // responsibility is to decide how to surface each outcome.
      if (ext === '.pdf') {
        const pdfBuffer = await fs.promises.readFile(filePath);
        const extraction = await extractPdfText({
          data: pdfBuffer,
          pagesSpec: params.pages,
        });

        if (
          extraction.kind === 'error' ||
          extraction.kind === 'invalid-range' ||
          extraction.kind === 'empty'
        ) {
          // All three are read failures from the caller's perspective:
          // there is no usable content to hand to the model. Flag as
          // rejected so consumers can surface them uniformly and the
          // model can retry (with a different pages spec, OCR, etc.).
          return makeRejection(filePath, stat.size, extraction.message);
        }

        // Line-number the rendered output to match the cat -n style
        // used for text files. Line numbers reset to 1 per call; PDFs
        // don't map cleanly to the file-wide offset/limit model.
        const renderedLines = extraction.rendered.split('\n');
        const pdfFormatted = formatWithLineNumbers(renderedLines, 1);

        // Gate 3: token ceiling on the formatted output.
        const pdfTokenCount = estimateTokens(pdfFormatted);
        if (pdfTokenCount > MAX_OUTPUT_TOKENS) {
          const requestedPages = extraction.lastPage - extraction.firstPage + 1;
          const suggestedPages = Math.max(
            1,
            Math.floor(requestedPages * MAX_OUTPUT_TOKENS / pdfTokenCount),
          );
          return makeRejection(
            filePath,
            stat.size,
            `PDF extraction too large (estimated ~${pdfTokenCount.toLocaleString()} tokens, limit ${MAX_OUTPUT_TOKENS.toLocaleString()}). ` +
              `Narrow the \`pages\` range (try ~${suggestedPages} page${suggestedPages === 1 ? '' : 's'} per call).`,
          );
        }

        readRegistry.markRead(filePath, { timestamp: stat.mtimeMs });

        return {
          content: [{ type: 'text', text: pdfFormatted }],
          details: {
            filePath,
            totalLines: renderedLines.length,
            byteSize: stat.size,
            truncated: false,
            truncatedLines: false,
            truncatedChars: false,
            startLine: 1,
          },
        };
      }

      // Gate 2: Full-read size gate - reject full reads of files > 256 KB
      const hasExplicitRange = params.offset !== undefined || params.limit !== undefined;
      if (!hasExplicitRange && stat.size > MAX_FULL_READ_BYTES) {
        return makeRejection(
          filePath,
          stat.size,
          `File is too large to read in full (${formatBytes(stat.size)}, limit ${formatBytes(MAX_FULL_READ_BYTES)}). Provide offset and limit to read a specific range, or use Grep to search for specific content.`,
        );
      }

      // File-unchanged dedup: if we already read this exact range and the
      // file hasn't changed on disk, return a stub. The earlier Read result
      // is still in context, so re-sending wastes tokens.
      const existingState = readRegistry.getState(filePath);
      if (existingState && existingState.offset !== undefined) {
        const rangeMatch = existingState.offset === offset && existingState.limit === limit;
        if (rangeMatch && stat.mtimeMs === existingState.timestamp) {
          return makeNotice(filePath, stat.size, `[File unchanged since last read: ${filePath}]`);
        }
      }

      // Read the raw buffer
      const buffer = await fs.promises.readFile(filePath);

      // Binary detection (not image, not PDF)
      if (isBinaryBuffer(buffer)) {
        return makeNotice(filePath, stat.size, 'Binary file detected. Cannot display as text.');
      }

      // Decode and split into lines
      const content = decodeFileContent(buffer);
      const allLines = content.split('\n');
      const totalLines = allLines.length;

      // Handle empty file (split never yields zero elements, so '' is the
      // only empty case).
      if (content === '') {
        readRegistry.markRead(filePath, { timestamp: stat.mtimeMs, offset, limit });
        return makeNotice(filePath, stat.size, `[File is empty: ${filePath}]`);
      }

      // Apply offset and limit
      const startIdx = offset - 1; // Convert 1-based to 0-based (offset >= 1)
      const endIdx = Math.min(totalLines, startIdx + limit);
      const selectedLines = allLines.slice(startIdx, endIdx);

      const truncatedLines = endIdx < totalLines;
      const truncatedChars = selectedLines.some((line) => line.length > MAX_LINE_LENGTH);

      // Format with line numbers
      const formatted = formatWithLineNumbers(selectedLines, startIdx + 1);

      // Gate 3: Post-read token estimation
      const estimatedTokenCount = estimateTokens(formatted);
      if (estimatedTokenCount > MAX_OUTPUT_TOKENS) {
        // Scale the number of lines actually rendered, not the requested
        // limit: with the default limit on a shorter file the latter would
        // suggest a limit that still overflows.
        const suggestedLimit = Math.max(
          1,
          Math.floor(selectedLines.length * MAX_OUTPUT_TOKENS / estimatedTokenCount),
        );
        return makeRejection(
          filePath,
          stat.size,
          `Read result too large (estimated ~${estimatedTokenCount.toLocaleString()} tokens, limit ${MAX_OUTPUT_TOKENS.toLocaleString()}). ` +
            `The file has ${totalLines} lines. Use a smaller limit (try limit: ${suggestedLimit}) ` +
            `or use Grep to find the specific content you need.`,
        );
      }

      // Only mark as read after passing all gates, so rejected reads
      // can be retried without hitting the dedup stub.
      // For full, non-truncated reads, record a content hash so the
      // Edit/Write tools can distinguish real file modifications from
      // mtime-only changes (formatters, cloud sync, antivirus, etc.).
      const isFullRead = !hasExplicitRange && !truncatedLines;
      const contentHash = isFullRead
        ? crypto.createHash('sha256').update(buffer).digest('hex')
        : undefined;
      readRegistry.markRead(filePath, {
        timestamp: stat.mtimeMs,
        offset,
        limit,
        ...(contentHash !== undefined ? { contentHash } : {}),
      });

      let text = formatted;
      if (truncatedLines) {
        text += `\n\n[Showing lines ${startIdx + 1}-${endIdx} of ${totalLines} total. Use offset/limit to read more.]`;
      }

      return {
        content: [{ type: 'text', text }],
        details: {
          filePath,
          totalLines,
          byteSize: stat.size,
          truncated: truncatedLines || truncatedChars,
          truncatedLines,
          truncatedChars,
          // Report the line the content really starts on.
          startLine: startIdx + 1,
        },
      };
    },
  };

  return attachRuntimeAwareTool(tool, {
    toolKind: 'Read',
    cloneForRuntime: (runtime) => createReadTool({
      ...config,
      runtime,
      readRegistry: runtime.readRegistry,
    }),
  });
}