// file included directly /** * References and further reading: * * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. * https://arxiv.org/abs/1910.05109 * * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 * Instructions, ACM Transactions on the Web 12 (3), 2018. * https://arxiv.org/abs/1704.00605 * * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, * Request for Comments: 4648. * * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. * http://www.alfredklomp.com/programming/sse-base64/. (2014). * * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD * acceleration. https://github.com/aklomp/base64. (2014). * * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ * * Nick Kopp. 2013. Base64 Encoding on a GPU. * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). */ struct block64 { __m512i chunks[1]; }; template size_t encode_base64_impl(char *dst, const char *src, size_t srclen, base64_options options, size_t line_length = simdutf::default_line_length) { size_t offset = 0; if (line_length < 4) { line_length = 4; // We do not support line_length less than 4 } // credit: Wojciech Muła const uint8_t *input = (const uint8_t *)src; uint8_t *out = (uint8_t *)dst; static const char *lookup_tbl = base64_url ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; const __m512i shuffle_input = _mm512_setr_epi32( 0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122, 0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e); const __m512i lookup = _mm512_loadu_si512(reinterpret_cast(lookup_tbl)); const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a)); size_t size = srclen; __mmask64 input_mask = 0xffffffffffff; // (1 << 48) - 1 // We want that input == end_input means that we must stop. const uint8_t *end_input = input + (size - (size % 48)); while (input != end_input) { const __m512i v = _mm512_maskz_loadu_epi8( input_mask, reinterpret_cast(input)); const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v); const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in); const __m512i result = _mm512_permutexvar_epi8(indices, lookup); if (use_lines) { if (offset + 64 > line_length) { if (line_length >= 64) { __m512i expanded = _mm512_mask_expand_epi8( _mm512_set1_epi8('\n'), ~(1ULL << ((line_length - offset))), result); _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), expanded); __m128i last_lane = _mm512_extracti32x4_epi32(result, 3); // Lane 3 (bytes 48-63) uint8_t last_byte = static_cast(_mm_extract_epi8(last_lane, 15)); out[64] = last_byte; out += 65; offset = 64 - (line_length - offset); } else { // slow path alignas(64) uint8_t local_buffer[64]; _mm512_storeu_si512(reinterpret_cast<__m512i *>(local_buffer), result); size_t out_pos = 0; size_t local_offset = offset; for (size_t j = 0; j < 64;) { if (local_offset == line_length) { out[out_pos++] = '\n'; local_offset = 0; } out[out_pos++] = local_buffer[j++]; local_offset++; } offset = local_offset; out += out_pos; } } else { _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result); offset += 64; out += 64; } } else { _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result); out += 64; } input += 48; } size = size % 48; input_mask = ((__mmask64)1 << size) - 1; const __m512i v = _mm512_maskz_loadu_epi8( input_mask, reinterpret_cast(input)); const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v); const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in); bool padding_needed = (((options & base64_url) == 0) ^ ((options & base64_reverse_padding) == base64_reverse_padding)); size_t padding_amount = ((size % 3) > 0) ? (3 - (size % 3)) : 0; size_t output_len = ((size + 2) / 3) * 4; size_t non_padded_output_len = output_len - padding_amount; if (!padding_needed) { output_len = non_padded_output_len; } // If no output, we are done. if (output_len == 0) { return (size_t)(out - (uint8_t *)dst); } __mmask64 output_mask = 0xFFFFFFFFFFFFFFFF >> (64 - output_len); __m512i result = _mm512_mask_permutexvar_epi8( _mm512_set1_epi8('='), ((__mmask64)1 << non_padded_output_len) - 1, indices, lookup); if (use_lines) { if (offset + output_len > line_length) { if (line_length >= 64) { __m512i expanded = _mm512_mask_expand_epi8( _mm512_set1_epi8('\n'), ~(1ULL << ((line_length - offset))), result); if (output_len == 64) { _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), expanded); out += 64; _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out - 63), 1ULL << 63, result); out++; } else { output_mask = 0xFFFFFFFFFFFFFFFF >> (64 - output_len - 1); _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask, expanded); out += output_len + 1; } } else { alignas(64) uint8_t local_buffer[64]; _mm512_storeu_si512(reinterpret_cast<__m512i *>(local_buffer), result); size_t out_pos = 0; size_t local_offset = offset; for (size_t j = 0; j < output_len;) { if (local_offset == line_length) { out[out_pos++] = '\n'; local_offset = 0; } out[out_pos++] = local_buffer[j++]; local_offset++; } offset = local_offset; out += out_pos; } } else { _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask, result); out += output_len; } } else { _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask, result); out += output_len; } return (size_t)(out - (uint8_t *)dst); } template size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options options) { return encode_base64_impl(dst, src, srclen, options); } template static inline uint64_t to_base64_mask(block64 *b, uint64_t *error, uint64_t input_mask = UINT64_MAX) { __m512i input = b->chunks[0]; const __m512i ascii_space_tbl = _mm512_set_epi8( 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32); __m512i lookup0; if (default_or_url) { lookup0 = _mm512_set_epi8( -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 63, -128, 62, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1); } else if (base64_url) { lookup0 = _mm512_set_epi8( -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1); } else { lookup0 = _mm512_set_epi8( -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128); } __m512i lookup1; if (default_or_url) { lookup1 = _mm512_set_epi8( -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, 63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); } else if (base64_url) { lookup1 = _mm512_set_epi8( -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, 63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); } else { lookup1 = _mm512_set_epi8( -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); } const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1); const __m512i combined = _mm512_or_si512(translated, input); const __mmask64 mask = _mm512_movepi8_mask(combined) & input_mask; if (!ignore_garbage && mask) { const __mmask64 spaces = _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(ascii_space_tbl, input), input) & input_mask; *error = (mask ^ spaces); } b->chunks[0] = translated; return mask | (~input_mask); } static inline void copy_block(block64 *b, char *output) { _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]); } static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { uint64_t nmask = ~mask; __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]); _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c); return _mm_popcnt_u64(nmask); } // The caller of this function is responsible to ensure that there are 64 bytes // available from reading at src. The data is read into a block64 structure. static inline void load_block(block64 *b, const char *src) { b->chunks[0] = _mm512_loadu_si512(reinterpret_cast(src)); } static inline void load_block_partial(block64 *b, const char *src, __mmask64 input_mask) { b->chunks[0] = _mm512_maskz_loadu_epi8( input_mask, reinterpret_cast(src)); } // The caller of this function is responsible to ensure that there are 128 bytes // available from reading at src. The data is read into a block64 structure. static inline void load_block(block64 *b, const char16_t *src) { __m512i m1 = _mm512_loadu_si512(reinterpret_cast(src)); __m512i m2 = _mm512_loadu_si512(reinterpret_cast(src + 32)); __m512i p = _mm512_packus_epi16(m1, m2); b->chunks[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p); } static inline void load_block_partial(block64 *b, const char16_t *src, __mmask64 input_mask) { __m512i m1 = _mm512_maskz_loadu_epi16((__mmask32)input_mask, reinterpret_cast(src)); __m512i m2 = _mm512_maskz_loadu_epi16((__mmask32)(input_mask >> 32), reinterpret_cast(src + 32)); __m512i p = _mm512_packus_epi16(m1, m2); b->chunks[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p); } static inline void base64_decode(char *out, __m512i str) { const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140)); const __m512i merged = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); const __m512i pack = _mm512_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58, 52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34, 28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4, 5, 6, 0, 1, 2); const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); _mm512_mask_storeu_epi8( (__m512i *)out, 0xffffffffffff, shuffled); // mask would be 0xffffffffffff since we write 48 bytes. } // decode 64 bytes and output 48 bytes static inline void base64_decode_block(char *out, const char *src) { base64_decode(out, _mm512_loadu_si512(reinterpret_cast(src))); } static inline void base64_decode_block(char *out, block64 *b) { base64_decode(out, b->chunks[0]); } template full_result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options, last_chunk_handling_options last_chunk_options) { (void)options; const uint8_t *to_base64 = default_or_url ? tables::base64::to_base64_default_or_url_value : (base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value); auto ri = simdutf::scalar::base64::find_end(src, srclen, options); size_t equallocation = ri.equallocation; size_t padding_characters = ri.equalsigns; srclen = ri.srclen; size_t full_input_length = ri.full_input_length; if (srclen == 0) { if (!ignore_garbage && padding_characters > 0) { return {INVALID_BASE64_CHARACTER, equallocation, 0}; } return {SUCCESS, full_input_length, 0}; } const chartype *const srcinit = src; const char *const dstinit = dst; const chartype *const srcend = src + srclen; // figure out why block_size == 2 is sometimes best??? constexpr size_t block_size = 6; char buffer[block_size * 64]; char *bufferptr = buffer; if (srclen >= 64) { const chartype *const srcend64 = src + srclen - 64; while (src <= srcend64) { block64 b; load_block(&b, src); src += 64; uint64_t error = 0; uint64_t badcharmask = to_base64_mask(&b, &error); if (!ignore_garbage && error) { src -= 64; size_t error_offset = _tzcnt_u64(error); return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; } if (badcharmask != 0) { // optimization opportunity: check for simple masks like those made of // continuous 1s followed by continuous 0s. And masks containing a // single bad character. bufferptr += compress_block(&b, badcharmask, bufferptr); } else if (bufferptr != buffer) { copy_block(&b, bufferptr); bufferptr += 64; } else { base64_decode_block(dst, &b); dst += 48; } if (bufferptr >= (block_size - 1) * 64 + buffer) { for (size_t i = 0; i < (block_size - 1); i++) { base64_decode_block(dst, buffer + i * 64); dst += 48; } std::memcpy(buffer, buffer + (block_size - 1) * 64, 64); // 64 might be too much bufferptr -= (block_size - 1) * 64; } } } int last_block_len = (int)(srcend - src); if (last_block_len != 0) { __mmask64 input_mask = ((__mmask64)1 << last_block_len) - 1; block64 b; load_block_partial(&b, src, input_mask); uint64_t error = 0; uint64_t badcharmask = to_base64_mask(&b, &error, input_mask); if (!ignore_garbage && error) { size_t error_offset = _tzcnt_u64(error); return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; } src += last_block_len; bufferptr += compress_block(&b, badcharmask, bufferptr); } char *buffer_start = buffer; for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { base64_decode_block(dst, buffer_start); dst += 48; } if ((bufferptr - buffer_start) != 0) { // For efficiency reasons, we end up reproducing much of the code // in base64_tail_decode_impl. Better engineering would be to // refactor the code so that we can call it without a performance hit. size_t rem = (bufferptr - buffer_start); int idx = rem % 4; __mmask64 mask = ((__mmask64)1 << rem) - 1; __m512i input = _mm512_maskz_loadu_epi8(mask, buffer_start); size_t output_len = (rem / 4) * 3; __mmask64 output_mask = mask >> (rem - output_len); const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(input, _mm512_set1_epi32(0x01400140)); const __m512i merged = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); const __m512i pack = _mm512_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58, 52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34, 28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4, 5, 6, 0, 1, 2); const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); // We never should have that the number of base64 characters + the // number of padding characters is more than 4. if (!ignore_garbage && (idx + padding_characters > 4)) { return {INVALID_BASE64_CHARACTER, size_t(src - srcinit), size_t(dst - dstinit), true}; } // The idea here is that in loose mode, // if there is padding at all, it must be used // to form 4-wise chunk. However, in loose mode, // we do accept no padding at all. if (!ignore_garbage && last_chunk_options == last_chunk_handling_options::loose && (idx >= 2) && padding_characters > 0 && ((idx + padding_characters) & 3) != 0) { return {INVALID_BASE64_CHARACTER, size_t(src - srcinit), size_t(dst - dstinit), true}; } else // The idea here is that in strict mode, we do not want to accept // incomplete base64 chunks. So if the chunk was otherwise valid, we // return BASE64_INPUT_REMAINDER. if (!ignore_garbage && last_chunk_options == last_chunk_handling_options::strict && (idx >= 2) && ((idx + padding_characters) & 3) != 0) { // The partial chunk was at src - idx _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; return {BASE64_INPUT_REMAINDER, equallocation, size_t(dst - dstinit)}; } else // If there is a partial chunk with insufficient padding, with // stop_before_partial, we need to just ignore it. In "only full" mode, // skip the minute there are padding characters. if ((last_chunk_options == last_chunk_handling_options::stop_before_partial && (padding_characters + idx < 4) && (idx != 0) && (idx >= 2 || padding_characters == 0)) || (last_chunk_options == last_chunk_handling_options::only_full_chunks && (idx >= 2 || padding_characters == 0))) { _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; // we need to rewind src to before the partial chunk size_t characters_to_skip = idx; while (characters_to_skip > 0) { src--; auto c = *src; uint8_t code = to_base64[uint8_t(c)]; if (simdutf::scalar::base64::is_eight_byte(c) && code <= 63) { characters_to_skip--; } } // And then we need to skip ignored characters // See https://github.com/simdutf/simdutf/issues/824 while (src > srcinit) { auto c = *(src - 1); uint8_t code = to_base64[uint8_t(c)]; if (simdutf::scalar::base64::is_eight_byte(c) && code <= 63) { break; } src--; } return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)}; } else { if (idx == 2) { if (!ignore_garbage && last_chunk_options == last_chunk_handling_options::strict) { uint32_t triple = (uint32_t(bufferptr[-2]) << 3 * 6) + (uint32_t(bufferptr[-1]) << 2 * 6); if (triple & 0xffff) { _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; return {BASE64_EXTRA_BITS, size_t(src - srcinit), size_t(dst - dstinit)}; } } output_mask = (output_mask << 1) | 1; output_len += 1; _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; } else if (idx == 3) { if (!ignore_garbage && last_chunk_options == last_chunk_handling_options::strict) { uint32_t triple = (uint32_t(bufferptr[-3]) << 3 * 6) + (uint32_t(bufferptr[-2]) << 2 * 6) + (uint32_t(bufferptr[-1]) << 1 * 6); if (triple & 0xff) { _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; return {BASE64_EXTRA_BITS, size_t(src - srcinit), size_t(dst - dstinit)}; } } output_mask = (output_mask << 2) | 3; output_len += 2; _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; } else if (!ignore_garbage && idx == 1 && (!is_partial(last_chunk_options) || (is_partial(last_chunk_options) && padding_characters > 0))) { _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), size_t(dst - dstinit)}; } else if (!ignore_garbage && idx == 0 && padding_characters > 0) { _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; } else { _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; } } if (!ignore_garbage && !is_partial(last_chunk_options) && padding_characters > 0) { size_t output_count = size_t(dst - dstinit); if ((output_count % 3 == 0) || ((output_count % 3) + 1 + padding_characters != 4)) { return {INVALID_BASE64_CHARACTER, equallocation, output_count}; } } return {SUCCESS, full_input_length, size_t(dst - dstinit)}; } if (!ignore_garbage && padding_characters > 0) { if ((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + padding_characters != 4)) { return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; } } return {SUCCESS, srclen, size_t(dst - dstinit)}; } simdutf_warn_unused size_t icelake_binary_length_from_base64(const char *input, size_t length) { size_t count = 0; const char *ptr = input; const char *end = input + length; __m512i spaces = _mm512_set1_epi8(0x20); while (ptr + 64 <= end) { __m512i data = _mm512_loadu_si512(reinterpret_cast(ptr)); uint64_t mask = _mm512_cmpgt_epi8_mask(data, spaces); count += count_ones(mask); ptr += 64; } while (ptr < end) { count += (*ptr > 0x20) ? 1 : 0; ptr++; } size_t padding = 0; size_t pos = length; while (pos > 0 && padding < 2) { char c = input[--pos]; if (c == '=') { padding++; } else if (c > ' ') { break; } } return ((count - padding) * 3) / 4; } simdutf_warn_unused size_t icelake_binary_length_from_base64(const char16_t *input, size_t length) { size_t count = 0; const char16_t *ptr = input; const char16_t *end = input + length; __m512i spaces = _mm512_set1_epi16(0x20); while (ptr + 32 <= end) { __m512i data = _mm512_loadu_si512(reinterpret_cast(ptr)); __mmask32 mask = _mm512_cmpgt_epi16_mask(data, spaces); count += _mm_popcnt_u32(mask); ptr += 32; } while (ptr < end) { count += (*ptr > 0x20) ? 1 : 0; ptr++; } size_t padding = 0; size_t pos = length; while (pos > 0 && padding < 2) { char16_t c = input[--pos]; if (c == '=') { padding++; } else if (c > ' ') { break; } } return ((count - padding) * 3) / 4; }