/** * Long Context Chunking System * * Goal: split documents that exceed embedding model context limits into smaller, * semantically coherent chunks with overlap. * * Notes: * - We use *character counts* as a conservative proxy for tokens. * - The embedder triggers this only after a provider throws a context-length error. */ // ============================================================================ // Types & Constants // ============================================================================ export interface ChunkMetadata { startIndex: number; endIndex: number; length: number; } export interface ChunkResult { chunks: string[]; metadatas: ChunkMetadata[]; totalOriginalLength: number; chunkCount: number; } export interface ChunkerConfig { /** Maximum characters per chunk. */ maxChunkSize: number; /** Overlap between chunks in characters. */ overlapSize: number; /** Minimum chunk size (except the final chunk). */ minChunkSize: number; /** Attempt to split on sentence boundaries for better semantic coherence. */ semanticSplit: boolean; /** Max lines per chunk before we try to split earlier on a line boundary. */ maxLinesPerChunk: number; } // Common embedding context limits (provider/model specific). These are typically // token limits, but we treat them as inputs to a conservative char-based heuristic. export const EMBEDDING_CONTEXT_LIMITS: Record = { // Jina v5 "jina-embeddings-v5-text-small": 8192, "jina-embeddings-v5-text-nano": 8192, // OpenAI "text-embedding-3-small": 8192, "text-embedding-3-large": 8192, // Google "text-embedding-004": 8192, "gemini-embedding-001": 2048, // Local/common "nomic-embed-text": 8192, "all-MiniLM-L6-v2": 512, "all-mpnet-base-v2": 512, }; export const DEFAULT_CHUNKER_CONFIG: ChunkerConfig = { maxChunkSize: 4000, overlapSize: 200, minChunkSize: 200, semanticSplit: true, maxLinesPerChunk: 50, }; // Sentence ending patterns (English + CJK-ish punctuation) const SENTENCE_ENDING = /[.!?。!?]/; // ============================================================================ // Helpers // ============================================================================ function clamp(n: number, lo: number, hi: number): number { return Math.max(lo, Math.min(hi, n)); } function countLines(s: string): number { // Count \n (treat CRLF as one line break) return s.split(/\r\n|\n|\r/).length; } function findLastIndexWithin(text: string, re: RegExp, start: number, end: number): number { // Find last match start index for regex within [start, end). // NOTE: `re` must NOT be global; we will scan manually. let last = -1; for (let i = end - 1; i >= start; i--) { if (re.test(text[i])) return i; } return last; } function findSplitEnd(text: string, start: number, maxEnd: number, minEnd: number, config: ChunkerConfig): number { const safeMinEnd = clamp(minEnd, start + 1, maxEnd); const safeMaxEnd = clamp(maxEnd, safeMinEnd, text.length); // Respect line limit: if we exceed maxLinesPerChunk, force earlier split at a line break. if (config.maxLinesPerChunk > 0) { const candidate = text.slice(start, safeMaxEnd); if (countLines(candidate) > config.maxLinesPerChunk) { // Find the position of the Nth line break. let breaks = 0; for (let i = start; i < safeMaxEnd; i++) { const ch = text[i]; if (ch === "\n") { breaks++; if (breaks >= config.maxLinesPerChunk) { // Split right after this newline. return Math.max(i + 1, safeMinEnd); } } } } } if (config.semanticSplit) { // Prefer a sentence boundary near the end. // Scan backward from safeMaxEnd to safeMinEnd. for (let i = safeMaxEnd - 1; i >= safeMinEnd; i--) { if (SENTENCE_ENDING.test(text[i])) { // Include trailing whitespace after punctuation. let j = i + 1; while (j < safeMaxEnd && /\s/.test(text[j])) j++; return j; } } // Next best: newline boundary. for (let i = safeMaxEnd - 1; i >= safeMinEnd; i--) { if (text[i] === "\n") return i + 1; } } // Fallback: last whitespace boundary. for (let i = safeMaxEnd - 1; i >= safeMinEnd; i--) { if (/\s/.test(text[i])) return i; } return safeMaxEnd; } function sliceTrimWithIndices(text: string, start: number, end: number): { chunk: string; meta: ChunkMetadata } { const raw = text.slice(start, end); const leading = raw.match(/^\s*/)?.[0]?.length ?? 0; const trailing = raw.match(/\s*$/)?.[0]?.length ?? 0; const chunk = raw.trim(); const trimmedStart = start + leading; const trimmedEnd = end - trailing; return { chunk, meta: { startIndex: trimmedStart, endIndex: Math.max(trimmedStart, trimmedEnd), length: chunk.length, }, }; } // ============================================================================ // Chunking Core // ============================================================================ export function chunkDocument(text: string, config: ChunkerConfig = DEFAULT_CHUNKER_CONFIG): ChunkResult { if (!text || text.trim().length === 0) { return { chunks: [], metadatas: [], totalOriginalLength: 0, chunkCount: 0 }; } const totalOriginalLength = text.length; const chunks: string[] = []; const metadatas: ChunkMetadata[] = []; let pos = 0; const maxGuard = Math.max(4, Math.ceil(text.length / Math.max(1, config.maxChunkSize - config.overlapSize)) + 5); let guard = 0; while (pos < text.length && guard < maxGuard) { guard++; const remaining = text.length - pos; if (remaining <= config.maxChunkSize) { const { chunk, meta } = sliceTrimWithIndices(text, pos, text.length); if (chunk.length > 0) { chunks.push(chunk); metadatas.push(meta); } break; } const maxEnd = Math.min(pos + config.maxChunkSize, text.length); const minEnd = Math.min(pos + config.minChunkSize, maxEnd); const end = findSplitEnd(text, pos, maxEnd, minEnd, config); const { chunk, meta } = sliceTrimWithIndices(text, pos, end); // If trimming made it too small, fall back to a hard split. if (chunk.length < config.minChunkSize) { const hardEnd = Math.min(pos + config.maxChunkSize, text.length); const hard = sliceTrimWithIndices(text, pos, hardEnd); if (hard.chunk.length > 0) { chunks.push(hard.chunk); metadatas.push(hard.meta); } if (hardEnd >= text.length) break; pos = Math.max(hardEnd - config.overlapSize, pos + 1); continue; } chunks.push(chunk); metadatas.push(meta); if (end >= text.length) break; // Move forward with overlap. const nextPos = Math.max(end - config.overlapSize, pos + 1); pos = nextPos; } return { chunks, metadatas, totalOriginalLength, chunkCount: chunks.length, }; } /** * Smart chunker that adapts to model context limits. * * We intentionally pick conservative char limits (70% of the reported limit) * since token/char ratios vary. */ export function smartChunk(text: string, embedderModel?: string): ChunkResult { const limit = embedderModel ? EMBEDDING_CONTEXT_LIMITS[embedderModel] : undefined; const base = limit ?? 8192; const config: ChunkerConfig = { maxChunkSize: Math.max(1000, Math.floor(base * 0.7)), overlapSize: Math.max(0, Math.floor(base * 0.05)), minChunkSize: Math.max(100, Math.floor(base * 0.1)), semanticSplit: true, maxLinesPerChunk: 50, }; return chunkDocument(text, config); } export default chunkDocument;