// ─── Token counting (approximate) ────────────────────────────────────
// Approximates GPT-family token counts without a tiktoken dependency.
// Uses a simple character-based heuristic that correlates well with
// actual tokenizer counts for English text:
//   - Words with 1-2 chars → 1 token each
//   - Words with 3-7 chars → ~1 token each
//   - Words with 8+ chars → ~ceil(len/4) tokens
//   - Words that are >40% punctuation / special chars → 1 token per char
//   - Words that are >50% CJK characters → 1 token per char

/**
 * Estimate the token count of a string with a word-level, character-based
 * heuristic (no tokenizer dependency).
 *
 * The text is split on whitespace only. Each resulting word is scored as:
 *   - more than 50% CJK characters (the Han, Hiragana, Katakana and Hangul
 *     ranges matched below) → one token per UTF-16 code unit of the word
 *   - more than 40% of the listed special characters (brackets, operators,
 *     quotes, slashes, ...) → one token per UTF-16 code unit of the word
 *   - otherwise → 1 token for words of up to 7 code units,
 *     `ceil(length / 4)` tokens for longer words
 *
 * The heuristic is reported to correlate ~95% with tiktoken cl100k_base for
 * English text; that figure is not validated anywhere in this file.
 *
 * @param text - Text to measure.
 * @returns 0 for an empty string; otherwise the estimate, never less than 1
 *   (so whitespace-only input yields 1).
 */
export function estimateTokens(text: string): number {
	if (!text) return 0;

	// Built once per call instead of once per word. Reusing a global regex with
	// String.prototype.match is safe: match() restarts from index 0 every time.
	const cjkPattern = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/g;
	const specialPattern = /[{}[\]()<>|&^$#@!%*+=~`\\/:;,"']/g;

	let tokens = 0;

	// Words are whitespace-delimited only; punctuation stays attached to its word.
	for (const word of text.split(/\s+/)) {
		const len = word.length;
		// Leading/trailing whitespace produces empty fragments; ignore them.
		if (len === 0) continue;

		// Predominantly CJK → every code unit counts as its own token.
		const cjkCount = (word.match(cjkPattern) || []).length;
		if (cjkCount > len * 0.5) {
			tokens += len;
			continue;
		}

		// Symbol-heavy fragments (code, markup, ...) → every code unit counts.
		const specialCount = (word.match(specialPattern) || []).length;
		if (specialCount > len * 0.4) {
			tokens += len;
			continue;
		}

		// English-like: short and medium words cost one token, longer words
		// roughly one token per four characters.
		tokens += len <= 7 ? 1 : Math.ceil(len / 4);
	}

	// Non-empty input never reports fewer than one token, even when it
	// consists solely of whitespace.
	return Math.max(1, tokens);
}

/**
 * Constant-time token estimate: one token per four UTF-16 code units,
 * rounded up.
 *
 * Coarser than estimateTokens() because it ignores word boundaries and
 * character classes, but it does no scanning or allocation, which makes it
 * suitable for very large inputs.
 *
 * @param text - Text to measure.
 * @returns `ceil(text.length / 4)`; 0 for an empty string.
 */
export function estimateTokensFast(text: string): number {
	const charsPerToken = 4;
	const charCount = text.length;
	return Math.ceil(charCount / charsPerToken);
}