import chardet from "chardet"

/**
 * Decodes raw text bytes into a string, detecting the encoding when none is given.
 *
 * When the encoding is detected (see {@link guessTextEncoding}) and the detected
 * label is not supported by the runtime's `TextDecoder`, decoding falls back to
 * UTF-8 instead of throwing. Malformed byte sequences never throw: the decoder
 * is created with `fatal: false`.
 *
 * @param data Bytes to decode.
 * @param options.encoding If set to a non-empty string, this encoding is used
 *   directly and no detection is performed.
 * @returns The decoded text.
 * @throws RangeError if an explicitly requested `options.encoding` is not a
 *   label `TextDecoder` supports.
 */
export function autoDecodeText(
  data: Uint8Array | Buffer,
  options?: { encoding?: string }
): string {
  // Re-wrap a Node Buffer as a plain Uint8Array view over the same memory.
  // The `typeof` guard keeps this working in runtimes without a Buffer global.
  const bytes =
    typeof Buffer !== "undefined" && data instanceof Buffer
      ? new Uint8Array(data.buffer, data.byteOffset, data.byteLength)
      : data

  const requested = options?.encoding
  if (requested) {
    // An explicit encoding is the caller's contract: let an invalid label throw.
    return new TextDecoder(requested, { fatal: false }).decode(bytes)
  }

  const guessed = guessTextEncoding(bytes)
  let decoder: TextDecoder
  try {
    decoder = new TextDecoder(guessed, { fatal: false })
  } catch (err: unknown) {
    // The detector may report a label TextDecoder rejects with a RangeError;
    // a guess should degrade to UTF-8 rather than crash.
    if (!(err instanceof RangeError)) {
      throw err
    }
    decoder = new TextDecoder("utf-8", { fatal: false })
  }
  return decoder.decode(bytes)
}

/**
 * Guesses the text encoding of `data` using chardet.
 *
 * Selection rules, in order:
 * 1. If UTF-8 is among the candidates with confidence >= 80, return "utf-8".
 * 2. If the highest candidate confidence is <= 50, return the first of
 *    "gb18030", "shift_jis" that appears among the candidates (using the name
 *    as reported by chardet).
 * 3. Otherwise return the name of the first candidate, or "utf-8" when chardet
 *    produced no candidates.
 *
 * Only the first 3000 bytes of `data` are analysed.
 *
 * @param data Bytes to inspect; not modified.
 * @returns The chosen encoding name.
 */
export function guessTextEncoding(data: Uint8Array): string {
  const sampleLimit = 3000
  const utf8TrustThreshold = 80
  const lowConfidenceCeiling = 50
  const preferredEncodings = ["gb18030", "shift_jis"]
  const fallbackEncoding = "utf-8"

  // `subarray` yields a view, so limiting the sample does not copy the bytes.
  const sample = data.length > sampleLimit ? data.subarray(0, sampleLimit) : data
  const candidates = chardet.analyse(sample)

  // Trust UTF-8 whenever it scores high enough, even if it is not ranked first.
  const utf8 = candidates.find((c) => c.name.toLowerCase() === "utf-8")
  if (utf8 && utf8.confidence >= utf8TrustThreshold) {
    return "utf-8"
  }

  // When no candidate is convincing, favour the preferred encodings in order.
  // With no candidates the maximum is -Infinity and the lookup finds nothing.
  const bestConfidence = candidates.reduce(
    (best, c) => Math.max(best, c.confidence),
    -Infinity
  )
  if (bestConfidence <= lowConfidenceCeiling) {
    for (const preferred of preferredEncodings) {
      const match = candidates.find((c) => c.name.toLowerCase() === preferred)
      if (match) {
        return match.name
      }
    }
  }

  return candidates[0]?.name ?? fallbackEncoding
}