/** * Fast token estimator — TypeScript port of token_count/fast_token_estimate.py * Targets cl100k_base encoding (GPT-4, Claude, DeepSeek, GLM, MiniMax). * * Precision: ~2-7% error for most languages (tested vs tiktoken cl100k_base). * Speed: ~5ms per 100K chars (vs tiktoken ~3-10s). * * Algorithm: single-pass character classification with per-category coefficients. * No BPE encoding, no regex splitting — pure arithmetic on codepoints. */ import { readFileSync } from "fs"; import { join, dirname } from "path"; import { fileURLToPath } from "url"; // ─── CJK Lookup Table ────────────────────────────────────────────────────── // Each byte = token cost × 255 for one CJK character (U+4E00..U+9FFF). // Pre-computed from tiktoken cl100k_base actual encoding. const CJK_START = 0x4E00; const CJK_END = 0x9FFF; let _cjkTable: Uint8Array | null = null; function loadCjkTable(): Uint8Array | null { if (_cjkTable) return _cjkTable; try { // Try multiple paths for the CJK table const paths = [ join(dirname(fileURLToPath(import.meta.url)), "../../token_count/cjk_token_table.bin"), join(dirname(fileURLToPath(import.meta.url)), "../../../token_count/cjk_token_table.bin"), ]; for (const p of paths) { try { const buf = readFileSync(p); if (buf.length === CJK_END - CJK_START + 1) { _cjkTable = new Uint8Array(buf.buffer, buf.byteOffset, buf.length); return _cjkTable; } } catch { /* try next */ } } } catch { /* ignore */ } return null; } // ─── Character Classification ────────────────────────────────────────────── function isLatinLetter(cp: number): boolean { return ( (cp >= 0x41 && cp <= 0x5A) || (cp >= 0x61 && cp <= 0x7A) || (cp >= 0x00C0 && cp <= 0x00FF && cp !== 0x00D7 && cp !== 0x00F7) || (cp >= 0x0100 && cp <= 0x024F) ); } function isCjkHan(cp: number): boolean { return ( (cp >= 0x4E00 && cp <= 0x9FFF) || (cp >= 0x3400 && cp <= 0x4DBF) || (cp >= 0xF900 && cp <= 0xFAFF) ); } function isKana(cp: number): boolean { return (cp >= 0x3040 && cp <= 0x309F) || (cp >= 0x30A0 && cp <= 0x30FF); } function isHangul(cp: number): boolean { return (cp >= 0xAC00 && cp <= 0xD7AF) || (cp >= 0x1100 && cp <= 0x11FF) || (cp >= 0x3130 && cp <= 0x318F); } function isCyrillic(cp: number): boolean { return (cp >= 0x0400 && cp <= 0x04FF) || (cp >= 0x0500 && cp <= 0x052F); } function isArabic(cp: number): boolean { return ( (cp >= 0x0600 && cp <= 0x06FF) || (cp >= 0x0750 && cp <= 0x077F) || (cp >= 0x08A0 && cp <= 0x08FF) || (cp >= 0xFB50 && cp <= 0xFDFF) || (cp >= 0xFE70 && cp <= 0xFEFF) ); } function isGreek(cp: number): boolean { return (cp >= 0x0370 && cp <= 0x03FF) || (cp >= 0x1F00 && cp <= 0x1FFF); } // ─── Main Estimator ──────────────────────────────────────────────────────── /** * Estimate token count for a string without doing BPE encoding. * Targets cl100k_base (GPT-4/Claude/DeepSeek/GLM/MiniMax). * Error typically <5% for code/English, <10% for CJK/mixed. */ export function fastEstimateTokens(text: string): number { if (!text) return 0; const n = text.length; const cjkTable = loadCjkTable(); let tokens = 0.0; let i = 0; // Pre-scan: detect if text is non-English Latin (French, Spanish, etc.) let accentCount = 0; let latinCount = 0; const sampleEnd = Math.min(n, 50000); for (let s = 0; s < sampleEnd; s++) { const cp = text.charCodeAt(s); if (cp >= 0x80 && cp <= 0x024F && ((cp >= 0x00C0 && cp <= 0x00FF && cp !== 0x00D7 && cp !== 0x00F7) || (cp >= 0x0100 && cp <= 0x024F))) { accentCount++; } if ((cp >= 0x41 && cp <= 0x5A) || (cp >= 0x61 && cp <= 0x7A)) { latinCount++; } } const isNonEnglishLatin = latinCount > 100 && accentCount > latinCount * 0.005; while (i < n) { const cp = text.charCodeAt(i); // ── Latin word ── if (isLatinLetter(cp)) { let j = i + 1; while (j < n) { const c = text.charCodeAt(j); if (isLatinLetter(c)) { j++; } else if (c === 0x27 && j + 1 < n && isLatinLetter(text.charCodeAt(j + 1))) { j += 2; } else { break; } } const wl = j - i; // Check if this word contains accented characters let hasAccent = false; if (isNonEnglishLatin) { for (let k = i; k < j; k++) { if (text.charCodeAt(k) >= 0x80) { hasAccent = true; break; } } if (!hasAccent) { // Check nearby window const lo = Math.max(0, i - 100); const hi = Math.min(n, j + 100); for (let k = lo; k < hi; k++) { const cc = text.charCodeAt(k); if (cc >= 0x00C0 && cc <= 0x024F && cc !== 0x00D7 && cc !== 0x00F7) { hasAccent = true; break; } } } } if (hasAccent) { // Non-English Latin words are longer in tokens if (wl <= 3) tokens += 1.0; else if (wl <= 5) tokens += 1.35; else if (wl <= 7) tokens += 1.85; else if (wl <= 9) tokens += 2.5; else if (wl <= 12) tokens += 3.2; else tokens += 3.2 + (wl - 12) * 0.32; } else { // English word if (wl <= 4) tokens += 1.0; else if (wl <= 8) tokens += 1.1; else if (wl <= 13) tokens += 1.5; else tokens += 1.5 + (wl - 13) * 0.3; } i = j; continue; } // ── CJK Han characters ── if (isCjkHan(cp)) { let j = i + 1; let segTokens = 0.0; if (cjkTable && cp >= CJK_START && cp <= CJK_END) { segTokens += cjkTable[cp - CJK_START]; // table values are direct token counts (1-3) } else { segTokens += 1.3; } while (j < n && isCjkHan(text.charCodeAt(j))) { const cp2 = text.charCodeAt(j); if (cjkTable && cp2 >= CJK_START && cp2 <= CJK_END) { segTokens += cjkTable[cp2 - CJK_START]; } else { segTokens += 1.3; } j++; } const run = j - i; // BPE merges adjacent CJK characters. Longer segments get more merges. if (run >= 4) segTokens *= 0.94; else if (run >= 2) segTokens *= 0.97; tokens += segTokens; i = j; continue; } // ── Japanese Kana ── if (isKana(cp)) { let j = i + 1; while (j < n && isKana(text.charCodeAt(j))) j++; const run = j - i; if (run === 1) tokens += 1.0; else if (run === 2) tokens += 1.6; else if (run === 3) tokens += 2.65; else if (run === 4) tokens += 3.7; else if (run <= 6) tokens += run * 0.93; else tokens += run * 0.95; i = j; continue; } // ── Korean Hangul ── if (isHangul(cp)) { tokens += 1.4; i++; continue; } // ── Cyrillic (Russian etc.) ── if (isCyrillic(cp)) { let j = i + 1; while (j < n && isCyrillic(text.charCodeAt(j))) j++; tokens += (j - i) * 0.55; i = j; continue; } // ── Arabic ── if (isArabic(cp)) { let j = i + 1; while (j < n && isArabic(text.charCodeAt(j))) j++; tokens += (j - i) * 0.82; i = j; continue; } // ── Greek ── if (isGreek(cp)) { let j = i + 1; while (j < n && isGreek(text.charCodeAt(j))) j++; tokens += (j - i) * 0.85; i = j; continue; } // ── Digits (with commas, dots) ── if (cp >= 0x30 && cp <= 0x39) { let j = i + 1; let digits = 1; let commas = 0; let dots = 0; while (j < n) { const c = text.charCodeAt(j); if (c >= 0x30 && c <= 0x39) { digits++; j++; } else if (c === 0x2C && j + 1 < n && text.charCodeAt(j + 1) >= 0x30 && text.charCodeAt(j + 1) <= 0x39) { commas++; j += 2; digits++; } else if (c === 0x2E && j + 1 < n && text.charCodeAt(j + 1) >= 0x30 && text.charCodeAt(j + 1) <= 0x39) { dots++; j += 2; digits++; } else { break; } } if (digits <= 3 && commas === 0 && dots === 0) tokens += 1.0; else if (commas > 0) tokens += commas * 2 + 1.0; else if (dots > 0) tokens += Math.max(2.0, digits / 3.0 + dots * 1.5); else tokens += Math.max(1.0, digits / 2.5); i = j; continue; } // ── Whitespace (space, tab) ── if (cp === 0x20 || cp === 0x09) { i++; continue; } // ── Newline ── if (cp === 0x0A || cp === 0x0D) { tokens += 1.0; i++; continue; } // ── Fullwidth punctuation ── if ((cp >= 0x3000 && cp <= 0x303F) || (cp >= 0xFF00 && cp <= 0xFFEF) || cp === 0x2018 || cp === 0x2019 || cp === 0x201C || cp === 0x201D || cp === 0x2014 || cp === 0x2026 || cp === 0x2013) { tokens += 1.0; i++; continue; } // ── ASCII punctuation ── if (cp >= 0x21 && cp <= 0x7E) { tokens += 0.6; i++; continue; } // ── Other Unicode (emoji etc.) ── if (cp > 0x7F) { tokens += 2.5; i++; continue; } i++; } return Math.max(1, Math.round(tokens)); } /** * Estimate tokens for an array of messages (same as buildTiktokenContextSnapshot * but using fast estimation instead of tiktoken). */ export function fastEstimateMessages(messages: any[], jsonReplacer?: (key: string, value: unknown) => unknown): number { let total = 0; for (const msg of messages) { const str = JSON.stringify(msg, jsonReplacer as any); total += fastEstimateTokens(str); } // JSON array overhead total += Math.ceil(messages.length * 0.5); return total; }