/**
 * Text Chunking Utility
 *
 * Splits large texts into chunks suitable for embedding generation.
 * Handles token limits, word boundaries, and overlap for context preservation.
 */

/** Heuristic used throughout this module: ~4 characters per token for English text. */
const CHARS_PER_TOKEN = 4;

/** How far back from the hard character limit to search for a whitespace break. */
const BOUNDARY_SEARCH_CHARS = 200;

/**
 * Estimate token count for text.
 *
 * Uses a simple heuristic: ~4 characters per token for English text, applied
 * to the text after leading/trailing whitespace has been trimmed. The result
 * is approximate and rounded up.
 *
 * @param text - Text to estimate tokens for
 * @returns Estimated token count (0 for empty or whitespace-only text)
 */
export function estimateTokens(text: string): number {
  if (!text) return 0;

  // Surrounding whitespace is not counted; a blank string yields ceil(0 / 4) = 0.
  return Math.ceil(text.trim().length / CHARS_PER_TOKEN);
}

/**
 * Report whether cutting `text` at `index` would separate the two halves of a
 * UTF-16 surrogate pair (high surrogate just before the cut, low surrogate
 * just after it).
 */
function splitsSurrogatePair(text: string, index: number): boolean {
  if (index <= 0 || index >= text.length) return false;

  const before = text.charCodeAt(index - 1);
  const after = text.charCodeAt(index);
  return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
}

/**
 * Choose where a chunk that starts at `start` should end, given the hard
 * character limit `hardEnd` (with `start < hardEnd < text.length`).
 *
 * Looks at the last `BOUNDARY_SEARCH_CHARS` characters before `hardEnd` and
 * breaks just after the last newline there, or failing that just after the
 * last space. If the window holds no whitespace the chunk is cut at `hardEnd`,
 * nudged so that a surrogate pair is never split.
 *
 * The returned index is always greater than `start`.
 */
function findBreakIndex(text: string, start: number, hardEnd: number): number {
  const windowStart = Math.max(start, hardEnd - BOUNDARY_SEARCH_CHARS);
  const window = text.slice(windowStart, hardEnd);

  // A newline anywhere in the window wins over a space closer to the end.
  const lastNewline = window.lastIndexOf('\n');
  const lastBreak = lastNewline !== -1 ? lastNewline : window.lastIndexOf(' ');
  if (lastBreak !== -1) {
    // windowStart >= start and lastBreak >= 0, so this is strictly past `start`.
    return windowStart + lastBreak + 1;
  }

  if (splitsSurrogatePair(text, hardEnd)) {
    // Prefer a slightly shorter chunk; only when that would leave the chunk
    // empty, take the whole pair instead.
    return hardEnd - 1 > start ? hardEnd - 1 : hardEnd + 1;
  }
  return hardEnd;
}

/**
 * Split text into chunks that fit within max tokens.
 *
 * The input is trimmed first. Chunks are cut after a newline or, failing
 * that, after a space found within the last 200 characters of the character
 * budget; otherwise the text is cut at the budget itself. Optional overlap
 * repeats the tail of one chunk at the head of the next to preserve context.
 * The overlap is capped at half the chunk size, and is skipped for a chunk
 * whenever applying it would not move past the end of the previous chunk.
 *
 * There is no limit on the number of chunks: all of the trimmed text is
 * always covered by the returned chunks.
 *
 * @param text - Text to chunk
 * @param maxTokens - Maximum tokens per chunk (default: 8000 for OpenRouter)
 * @param overlapTokens - Number of tokens to overlap between chunks (default: 0)
 * @returns Array of text chunks; `['']` for empty or whitespace-only input
 * @throws RangeError if the text has to be split and `maxTokens` is not a
 *   positive number
 */
export function chunkText(
  text: string,
  maxTokens: number = 8000,
  overlapTokens: number = 0
): string[] {
  // Handle empty or whitespace-only text
  if (!text || text.trim().length === 0) {
    return [''];
  }

  const trimmedText = text.trim();

  // If text is under the limit, return as-is
  if (estimateTokens(trimmedText) <= maxTokens) {
    return [trimmedText];
  }

  // From here on the text must be split, which is impossible without a
  // positive budget (this also rejects NaN).
  if (!(maxTokens > 0)) {
    throw new RangeError(`maxTokens must be a positive number, got ${maxTokens}`);
  }

  // Work in whole characters so slice indices are always integers, and never
  // let the budget drop below one character.
  const maxChars = Math.max(1, Math.floor(maxTokens * CHARS_PER_TOKEN));
  const requestedOverlap = overlapTokens > 0 ? Math.floor(overlapTokens * CHARS_PER_TOKEN) : 0;
  const overlapChars = Math.min(requestedOverlap, Math.floor(maxChars / 2));

  const chunks: string[] = [];
  let startIndex = 0;
  let previousEndIndex = 0;

  // Invariant: previousEndIndex >= startIndex at the top of every iteration,
  // and the next start is always > startIndex, so the loop terminates.
  while (startIndex < trimmedText.length) {
    let endIndex = Math.min(startIndex + maxChars, trimmedText.length);

    // If not the last chunk, try to break at a word boundary
    if (endIndex < trimmedText.length) {
      endIndex = findBreakIndex(trimmedText, startIndex, endIndex);
    }

    chunks.push(trimmedText.slice(startIndex, endIndex));

    // Move to the next chunk, stepping back by the overlap when that still
    // lands past the end of the previous chunk.
    let nextStart = endIndex;
    if (overlapChars > 0 && endIndex < trimmedText.length) {
      let overlapStart = endIndex - overlapChars;
      if (splitsSurrogatePair(trimmedText, overlapStart)) {
        // Include the whole code point rather than starting on its second half.
        overlapStart -= 1;
      }
      if (overlapStart > previousEndIndex) {
        nextStart = overlapStart;
      }
    }

    previousEndIndex = endIndex;
    startIndex = nextStart;
  }

  return chunks;
}

/**
 * Chunk text specifically for embedding generation.
 *
 * Uses 8000 token limit (OpenRouter's limit for text-embedding-3-small)
 * with a 100 token overlap between chunks for context.
 *
 * @param text - Text to chunk
 * @returns Array of text chunks suitable for embeddings
 */
export function chunkForEmbedding(text: string): string[] {
  return chunkText(text, 8000, 100); // 100 token overlap for context
}