import { TextChunk, ChunkingOptions } from '../types/audio.js';
import { v4 as uuidv4 } from 'uuid';

/** A sentence together with its position relative to the enclosing paragraph. */
interface SentenceUnit {
  /** Trimmed sentence text, including its terminal punctuation (if any). */
  text: string;
  /** True when this is the final sentence of its paragraph. */
  endsParagraph: boolean;
}

/** Separator that `preprocessText` leaves between paragraphs. */
const PARAGRAPH_BREAK = '\n\n';

/**
 * Patterns that mark a candidate sentence as ending in a non-terminal period.
 * Every pattern is anchored to the end of the candidate so only the token
 * directly before the boundary is inspected.
 */
const ABBREVIATION_PATTERNS: readonly RegExp[] = [
  // Titles and common Latin / corporate abbreviations.
  /\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|i\.e|e\.g|cf|Inc|Ltd|Corp)\.$/i,
  // Single capital-letter initials such as "J." or the "S." of "U.S.".
  /\b[A-Z]\.$/,
  // List enumerators such as "1." when they open the sentence.
  /^\s*\d+\.$/,
];

/** Patterns whose presence marks a chunk as emphasized. */
const EMPHASIS_PATTERNS: readonly RegExp[] = [
  /\*\*.*?\*\*/, // Bold markdown
  /\*.*?\*/, // Italic markdown
  /[A-Z]{2,}/, // ALL CAPS
  /!!+/, // Multiple exclamation marks
  /\b(IMPORTANT|NOTE|WARNING|ATTENTION)\b/i,
];

/** Loose heuristic for code: backticks or a few common keywords. */
const CODE_PATTERN = /```|`.*?`|\bfunction\b|\bclass\b|\bimport\b|\bexport\b/;

/**
 * Splits free-form text into ordered `TextChunk`s of a few sentences each.
 *
 * Each chunk carries a pause hint, an emphasis flag and metadata (chunk type,
 * word count, last-chunk marker).
 */
export class TextChunker {
  // NOTE(review): `preserveFormatting` is declared here but nothing in this
  // class reads it; confirm whether another consumer depends on it.
  private defaultOptions: ChunkingOptions = {
    maxSentencesPerChunk: 3,
    respectParagraphs: true,
    detectCodeBlocks: true,
    preserveFormatting: true
  };

  /**
   * Chunks `text` into groups of sentences.
   *
   * @param text - Raw input text; an empty or whitespace-only input yields `[]`.
   * @param options - Overrides merged on top of the defaults.
   * @returns Chunks in reading order; exactly one chunk (the final one) has
   *   `metadata.isLastChunk === true`.
   */
  chunkText(text: string, options?: Partial<ChunkingOptions>): TextChunk[] {
    const opts: ChunkingOptions = { ...this.defaultOptions, ...options };

    const preprocessed = this.preprocessText(text);
    if (!preprocessed) {
      return [];
    }

    // Text that looks like code is emitted as one unsplit chunk, but only
    // when the caller has not disabled code detection.
    if (opts.detectCodeBlocks && this.isCodeBlock(preprocessed)) {
      return this.chunkCodeBlock(preprocessed);
    }

    const units = this.collectSentences(preprocessed);
    const chunks: TextChunk[] = [];

    let currentChunk = '';
    let sentenceCount = 0;
    let previousEndedParagraph = false;

    for (const [index, unit] of units.entries()) {
      if (currentChunk) {
        // Keep the paragraph break when a chunk is allowed to span paragraphs
        // (respectParagraphs === false); otherwise sentences join with a space.
        currentChunk += previousEndedParagraph ? PARAGRAPH_BREAK : ' ';
      }
      currentChunk += unit.text;
      sentenceCount++;
      previousEndedParagraph = unit.endsParagraph;

      const isLast = index === units.length - 1;
      const shouldChunk =
        isLast ||
        this.shouldCreateChunk(
          unit.text,
          sentenceCount,
          units[index + 1]?.text,
          opts,
          unit.endsParagraph
        );

      if (shouldChunk) {
        chunks.push({
          id: uuidv4(),
          text: currentChunk,
          order: chunks.length,
          // The pause is derived from the sentence that closes the chunk.
          pauseAfter: this.calculatePause(unit.text),
          emphasis: this.detectEmphasis(currentChunk),
          metadata: {
            isLastChunk: isLast,
            chunkType: this.detectChunkType(currentChunk),
            wordCount: currentChunk.split(/\s+/).length
          }
        });
        currentChunk = '';
        sentenceCount = 0;
      }
    }

    return chunks;
  }

  /**
   * Normalizes whitespace while keeping paragraph structure.
   *
   * After this step the text contains only single spaces and `"\n\n"`
   * paragraph separators: runs of horizontal whitespace collapse to one
   * space, blank lines collapse to one paragraph break, and single line
   * breaks become spaces.
   */
  private preprocessText(text: string): string {
    return text
      .replace(/\r\n/g, '\n') // Normalize line endings
      .replace(/[^\S\n]+/g, ' ') // Collapse horizontal whitespace only
      .replace(/ ?\n ?/g, '\n') // Drop spaces hugging a line break
      .replace(/\n{2,}/g, PARAGRAPH_BREAK) // Blank line(s) -> one paragraph break
      .replace(/([^\n])\n(?=[^\n])/g, '$1 ') // Lone line break -> space
      .trim();
  }

  /**
   * Splits preprocessed text into sentences, paragraph by paragraph, and
   * records which sentence closes each paragraph.
   */
  private collectSentences(text: string): SentenceUnit[] {
    const units: SentenceUnit[] = [];
    for (const paragraph of text.split(PARAGRAPH_BREAK)) {
      const sentences = this.splitSentences(paragraph);
      sentences.forEach((sentence, position) => {
        units.push({
          text: sentence,
          endsParagraph: position === sentences.length - 1
        });
      });
    }
    return units;
  }

  /**
   * Splits a single paragraph into trimmed, non-empty sentences.
   *
   * A run of `.`, `!` or `?` ends a sentence only when it is followed by
   * whitespace or the end of the text, so decimals ("3.14"), domains and
   * dotted abbreviations ("e.g.") are not cut in the middle. A boundary
   * whose preceding token looks like an abbreviation is skipped and the
   * sentence keeps growing until a real boundary is reached.
   */
  private splitSentences(text: string): string[] {
    const sentences: string[] = [];
    // Created per call: a global regex keeps scan state in `lastIndex`.
    const boundary = /[.!?]+(?=\s|$)/g;
    let start = 0;

    while (boundary.exec(text) !== null) {
      const end = boundary.lastIndex;
      const candidate = text.slice(start, end);
      if (this.isAbbreviation(candidate)) {
        continue; // Not a real sentence end; keep accumulating.
      }
      const sentence = candidate.trim();
      if (sentence) {
        sentences.push(sentence);
      }
      start = end;
    }

    // Whatever remains has no terminal punctuation or ended in an abbreviation.
    const tail = text.slice(start).trim();
    if (tail) {
      sentences.push(tail);
    }
    return sentences;
  }

  /**
   * Decides whether the chunk being built should be closed after `sentence`.
   *
   * @param sentence - The sentence just appended to the chunk.
   * @param sentenceCount - Number of sentences in the chunk so far.
   * @param nextSentence - The following sentence, if any.
   * @param options - Effective chunking options.
   * @param endsParagraph - Whether `sentence` is the last one of its paragraph.
   */
  private shouldCreateChunk(
    sentence: string,
    sentenceCount: number,
    nextSentence: string | undefined,
    options: ChunkingOptions,
    endsParagraph = false
  ): boolean {
    // Always break at paragraph boundaries
    if (options.respectParagraphs && endsParagraph) {
      return true;
    }

    // Break at natural pause points
    if (sentence.endsWith(':') || sentence.includes(' - ')) {
      return true;
    }

    // Break before code blocks
    if (
      options.detectCodeBlocks &&
      nextSentence !== undefined &&
      this.isCodeBlock(nextSentence)
    ) {
      return true;
    }

    // Break after reaching max sentences
    if (sentenceCount >= options.maxSentencesPerChunk) {
      return true;
    }

    // Break at long sentences (>100 words)
    return sentence.split(/\s+/).length > 100;
  }

  /**
   * Returns a pause hint chosen from the sentence's final character.
   * NOTE(review): values are presumably milliseconds; confirm against the
   * consumer of `TextChunk.pauseAfter`.
   */
  private calculatePause(sentence: string): number {
    const trimmed = sentence.trim();

    if (trimmed.endsWith('!') || trimmed.endsWith('?')) {
      return 600; // Longer pause for questions/exclamations
    }
    if (trimmed.endsWith(':')) {
      return 500; // Medium pause for colons
    }
    if (trimmed.endsWith('.')) {
      return 400; // Standard pause for periods
    }
    if (trimmed.endsWith(',') || trimmed.endsWith(';')) {
      return 200; // Short pause for commas/semicolons
    }
    return 300; // Default pause
  }

  /**
   * Returns true when the text contains markdown bold/italic markers, a run
   * of two or more capitals, repeated exclamation marks or an attention
   * keyword.
   */
  private detectEmphasis(text: string): boolean {
    return EMPHASIS_PATTERNS.some(pattern => pattern.test(text));
  }

  /**
   * Classifies a chunk; checks run in priority order: code, paragraph
   * (contains a paragraph break), list, quote, then plain sentence.
   */
  private detectChunkType(text: string): 'sentence' | 'paragraph' | 'code' | 'list' | 'quote' {
    if (this.isCodeBlock(text)) return 'code';
    if (text.includes(PARAGRAPH_BREAK)) return 'paragraph';
    if (/^\s*[-*+]\s/.test(text) || /^\s*\d+\.\s/.test(text)) return 'list';
    if (/^[">]/.test(text.trim())) return 'quote';
    return 'sentence';
  }

  /**
   * Heuristic code detector. It is deliberately loose: any backtick span or
   * the bare words "function", "class", "import" or "export" count as code,
   * so ordinary prose using those words matches as well.
   */
  private isCodeBlock(text: string): boolean {
    return CODE_PATTERN.test(text);
  }

  /**
   * Returns true when `sentence` ends in a period that should not terminate
   * a sentence (title, initial, Latin abbreviation or leading list number).
   */
  private isAbbreviation(sentence: string): boolean {
    return ABBREVIATION_PATTERNS.some(pattern => pattern.test(sentence));
  }

  /** Wraps code-like text in a single, unsplit chunk. */
  private chunkCodeBlock(text: string): TextChunk[] {
    return [{
      id: uuidv4(),
      text: text,
      order: 0,
      pauseAfter: 600,
      emphasis: false,
      metadata: {
        isLastChunk: true,
        chunkType: 'code',
        wordCount: text.split(/\s+/).length
      }
    }];
  }
}