/**
 * UTF-8 Safe Truncation Engine
 *
 * Implements Section 4.7 of the Agent Telemetry Specification v1.
 * Deterministic, UTF-8 safe truncation with the exact suffix `...[truncated]`.
 */

const TRUNCATION_SUFFIX = "...[truncated]";
const SUFFIX_BYTES = Buffer.byteLength(TRUNCATION_SUFFIX); // 14

/**
 * Find the largest valid UTF-8 boundary at or before `maxBytes`.
 *
 * Walks backward from the cut point past any continuation bytes (10xxxxxx)
 * to the lead byte of the character that begins at, or straddles, the cut.
 * If that character's full sequence does not fit within `maxBytes`, the
 * whole character is excluded.
 *
 * @param bytes - UTF-8 encoded data.
 * @param maxBytes - Byte budget; callers must pass a non-negative integer,
 *   since it is used directly as an array index.
 * @returns A byte offset that never exceeds `maxBytes` (or `bytes.length`
 *   when the data already fits) at which `bytes` can be cut.
 */
function findUtf8Boundary(bytes: Uint8Array, maxBytes: number): number {
  if (maxBytes >= bytes.length) return bytes.length;

  // Step back over continuation bytes (10xxxxxx) to reach a lead byte.
  let start = maxBytes;
  while (start > 0 && ((bytes[start] ?? 0) & 0xc0) === 0x80) start--;

  // We are at index 0 and the data opens with a non-ASCII byte: the very
  // first character does not fit, so nothing can be kept.
  if (start <= 0 && ((bytes[0] ?? 0) & 0x80) !== 0) return 0;

  // Decode the sequence length from the lead byte's high bits.
  const lead = bytes[start] ?? 0;
  let seqLen = 1;
  if ((lead & 0xe0) === 0xc0) seqLen = 2;
  else if ((lead & 0xf0) === 0xe0) seqLen = 3;
  else if ((lead & 0xf8) === 0xf0) seqLen = 4;

  // Keep the character only if all of its bytes fit inside the budget.
  const end = start + seqLen;
  return end > maxBytes ? start : end;
}

/**
 * Truncate a string field to `maxBytes` UTF-8 bytes.
 *
 * If the value fits within `maxBytes`, it is returned unchanged.
 * Otherwise, the longest valid UTF-8 prefix is kept and the suffix
 * `...[truncated]` is appended, with total byte length <= maxBytes.
 * When the budget is too small to hold the suffix plus at least one byte of
 * content, only the leading `maxBytes` bytes of the suffix are returned.
 *
 * @param value - The string to truncate.
 * @param maxBytes - Maximum size of the result in UTF-8 bytes. Fractional
 *   values are rounded down; negative values and `NaN` are treated as 0.
 * @returns `value` itself when it fits, otherwise a truncated copy.
 */
export function truncateField(value: string, maxBytes: number): string {
  // Normalise the budget to a non-negative integer. A negative value would
  // otherwise make `String.prototype.slice` count from the end and return
  // most of the suffix, and a fractional value would bypass the boundary
  // scan (indexing a typed array with a non-integer yields `undefined`).
  const limit = Number.isNaN(maxBytes) ? 0 : Math.max(0, Math.floor(maxBytes));

  // Fast path: UTF-8 needs at most 3 bytes per UTF-16 code unit, so a
  // string this short is guaranteed to fit without encoding it.
  if (value.length * 3 <= limit) return value;

  // The encoded length is the authoritative check.
  const encoded = Buffer.from(value, "utf8");
  if (encoded.length <= limit) return value;

  // Not enough room for any content: emit as much of the (ASCII) suffix as
  // the budget allows.
  if (limit <= SUFFIX_BYTES) {
    return TRUNCATION_SUFFIX.slice(0, limit);
  }

  const keepBytes = limit - SUFFIX_BYTES;
  const boundary = findUtf8Boundary(encoded, keepBytes);
  return encoded.toString("utf8", 0, boundary) + TRUNCATION_SUFFIX;
}