import { concreteStream, StreamInternal } from "@effect/core/stream/Stream/operations/_internal/StreamInternal"

const emptyByteChunk = Chunk.empty<number>()
const emptyStringChunk = Chunk.empty<string>()

/**
 * Shared UTF-8 decoder. Each `decode` call is made without the `stream`
 * option, so no state is carried from one call to the next.
 *
 * `ignoreBOM: true` keeps a U+FEFF at the start of a chunk in the output
 * instead of silently dropping it.
 */
const utf8Decoder = new TextDecoder("utf-8", { ignoreBOM: true })

/**
 * Decodes a stream of bytes into a stream of strings, interpreting the bytes
 * as UTF-8 and performing no byte-order-mark handling.
 *
 * Bytes belonging to a multi-byte sequence that is still incomplete at the end
 * of a chunk are held back and prepended to the next chunk.
 */
export function utf8DecodeNoBom<R, E>(
  stream: Stream<R, E, number>
): Stream<R, E, string> {
  concreteStream(stream)
  return new StreamInternal(
    Channel.suspend(stream.channel >> readThenTransduce<E>(emptyByteChunk))
  )
}

/**
 * Channel that reads byte chunks, emits the decodable prefix of
 * `buffer + received` as a string chunk, and recurses with the undecoded
 * remainder as the new buffer.
 *
 * Upstream failures are propagated unchanged. When upstream ends, any bytes
 * still buffered are decoded and emitted as a final chunk.
 */
function readThenTransduce<E>(
  buffer: Chunk<number>
): Channel<never, E, Chunk<number>, unknown, E, Chunk<string>, unknown> {
  return Channel.readWith(
    (received: Chunk<number>) => {
      const [string, buffered] = process(buffer, received)
      return Channel.write(string).flatMap(() => readThenTransduce<E>(buffered))
    },
    (err) => Channel.fail(err),
    () => (buffer.isEmpty ? Channel.unit : Channel.write(stringChunkFrom(buffer)))
  )
}

/**
 * Concatenates the previously buffered bytes with the newly received ones and
 * splits the result at the index returned by `computeSplitIndex`.
 *
 * @returns a pair of the decoded string chunk and the bytes to carry over.
 */
function process(
  buffered: Chunk<number>,
  received: Chunk<number>
): readonly [Chunk<string>, Chunk<number>] {
  const bytes = buffered + received
  const [chunk, rest] = bytes.splitAt(computeSplitIndex(bytes))
  if (chunk.isEmpty) {
    return [emptyStringChunk, rest.materialize]
  }
  if (rest.isEmpty) {
    return [stringChunkFrom(chunk), emptyByteChunk]
  }
  return [stringChunkFrom(chunk), rest]
}

/**
 * Decodes the given bytes as UTF-8 and wraps the resulting string in a
 * single-element chunk.
 *
 * The bytes are copied into a `Uint8Array` and decoded in one call. This
 * avoids `String.fromCharCode(...bytes)`, which maps every byte to its own
 * UTF-16 code unit (so multi-byte sequences are never decoded) and passes one
 * call argument per byte.
 */
function stringChunkFrom(bytes: Chunk<number>): Chunk<string> {
  return Chunk.single(utf8Decoder.decode(Uint8Array.from(bytes)))
}

/**
 * Returns the index at which `chunk` should be split so that the prefix does
 * not end with the start of a multi-byte sequence whose remaining bytes have
 * not been received yet.
 */
function computeSplitIndex(chunk: Chunk<number>): number {
  // There are 3 bad patterns we need to check to detect an incomplete chunk:
  // - 2/3/4 byte sequences that start on the last byte
  // - 3/4 byte sequences that start on the second-to-last byte
  // - 4 byte sequences that start on the third-to-last byte
  //
  // Otherwise, we can convert the entire concatenated chunk to a string.
  const size = chunk.length
  if (size >= 1) {
    const last = chunk.unsafeGet(size - 1)
    if (is2ByteStart(last) || is3ByteStart(last) || is4ByteStart(last)) {
      return size - 1
    }
  }
  if (size >= 2) {
    const secondToLast = chunk.unsafeGet(size - 2)
    if (is3ByteStart(secondToLast) || is4ByteStart(secondToLast)) {
      return size - 2
    }
  }
  if (size >= 3 && is4ByteStart(chunk.unsafeGet(size - 3))) {
    return size - 3
  }
  return size
}

/** True when the top three bits of `byte` are `110` (lead byte of a 2-byte sequence). */
function is2ByteStart(byte: number): boolean {
  return (byte & 0xe0) === 0xc0
}

/** True when the top four bits of `byte` are `1110` (lead byte of a 3-byte sequence). */
function is3ByteStart(byte: number): boolean {
  return (byte & 0xf0) === 0xe0
}

/** True when the top five bits of `byte` are `11110` (lead byte of a 4-byte sequence). */
function is4ByteStart(byte: number): boolean {
  return (byte & 0xf8) === 0xf0
}