/** * References and further reading: * * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. * https://arxiv.org/abs/1910.05109 * * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 * Instructions, ACM Transactions on the Web 12 (3), 2018. * https://arxiv.org/abs/1704.00605 * * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, * Request for Comments: 4648. * * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. * http://www.alfredklomp.com/programming/sse-base64/. (2014). * * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD * acceleration. https://github.com/aklomp/base64. (2014). * * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ * * Nick Kopp. 2013. Base64 Encoding on a GPU. * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). */ // --- encoding ---------------------------------------------------- template __m128i lookup_pshufb_improved(const __m128i input) { // credit: Wojciech Muła // reduce 0..51 -> 0 // 52..61 -> 1 .. 10 // 62 -> 11 // 63 -> 12 __m128i result = _mm_subs_epu8(input, _mm_set1_epi8(51)); // distinguish between ranges 0..25 and 26..51: // 0 .. 25 -> remains 0 // 26 .. 51 -> becomes 13 const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input); result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13))); __m128i shift_LUT; if (base64_url) { shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); } else { shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); } // read shift result = _mm_shuffle_epi8(shift_LUT, result); return _mm_add_epi8(result, input); } inline __m128i insert_line_feed16(__m128i input, size_t K) { static const uint8_t shuffle_masks[16][16] = { {0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, {0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, {0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, {0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, {0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, {0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, {0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14}, {0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14}, {0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80}}; // Prepare a vector with '\n' (0x0A) __m128i line_feed_vector = _mm_set1_epi8('\n'); // Load the precomputed shuffle mask for K (index K-1) __m128i mask = _mm_loadu_si128((__m128i *)shuffle_masks[K]); __m128i lf_pos = _mm_cmpeq_epi8(mask, _mm_set1_epi8(static_cast(0x80))); // Perform the shuffle to reposition the K bytes __m128i shuffled = _mm_shuffle_epi8(input, mask); // Blend with line_feed_vector to insert '\n' at the appropriate positions __m128i result = _mm_blendv_epi8(shuffled, line_feed_vector, lf_pos); return result; } template size_t encode_base64_impl(char *dst, const char *src, size_t srclen, base64_options options, size_t line_length = simdutf::default_line_length) { size_t offset = 0; if (line_length < 4) { line_length = 4; // We do not support line_length less than 4 } // credit: Wojciech Muła // SSE (lookup: pshufb improved unrolled) const uint8_t *input = (const uint8_t *)src; uint8_t *out = (uint8_t *)dst; const __m128i shuf = _mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); size_t i = 0; for (; i + 52 <= srclen; i += 48) { __m128i in0 = _mm_loadu_si128( reinterpret_cast(input + i + 4 * 3 * 0)); __m128i in1 = _mm_loadu_si128( reinterpret_cast(input + i + 4 * 3 * 1)); __m128i in2 = _mm_loadu_si128( reinterpret_cast(input + i + 4 * 3 * 2)); __m128i in3 = _mm_loadu_si128( reinterpret_cast(input + i + 4 * 3 * 3)); in0 = _mm_shuffle_epi8(in0, shuf); in1 = _mm_shuffle_epi8(in1, shuf); in2 = _mm_shuffle_epi8(in2, shuf); in3 = _mm_shuffle_epi8(in3, shuf); const __m128i t0_0 = _mm_and_si128(in0, _mm_set1_epi32(0x0fc0fc00)); const __m128i t0_1 = _mm_and_si128(in1, _mm_set1_epi32(0x0fc0fc00)); const __m128i t0_2 = _mm_and_si128(in2, _mm_set1_epi32(0x0fc0fc00)); const __m128i t0_3 = _mm_and_si128(in3, _mm_set1_epi32(0x0fc0fc00)); const __m128i t1_0 = _mm_mulhi_epu16(t0_0, _mm_set1_epi32(0x04000040)); const __m128i t1_1 = _mm_mulhi_epu16(t0_1, _mm_set1_epi32(0x04000040)); const __m128i t1_2 = _mm_mulhi_epu16(t0_2, _mm_set1_epi32(0x04000040)); const __m128i t1_3 = _mm_mulhi_epu16(t0_3, _mm_set1_epi32(0x04000040)); const __m128i t2_0 = _mm_and_si128(in0, _mm_set1_epi32(0x003f03f0)); const __m128i t2_1 = _mm_and_si128(in1, _mm_set1_epi32(0x003f03f0)); const __m128i t2_2 = _mm_and_si128(in2, _mm_set1_epi32(0x003f03f0)); const __m128i t2_3 = _mm_and_si128(in3, _mm_set1_epi32(0x003f03f0)); const __m128i t3_0 = _mm_mullo_epi16(t2_0, _mm_set1_epi32(0x01000010)); const __m128i t3_1 = _mm_mullo_epi16(t2_1, _mm_set1_epi32(0x01000010)); const __m128i t3_2 = _mm_mullo_epi16(t2_2, _mm_set1_epi32(0x01000010)); const __m128i t3_3 = _mm_mullo_epi16(t2_3, _mm_set1_epi32(0x01000010)); const __m128i input0 = _mm_or_si128(t1_0, t3_0); const __m128i input1 = _mm_or_si128(t1_1, t3_1); const __m128i input2 = _mm_or_si128(t1_2, t3_2); const __m128i input3 = _mm_or_si128(t1_3, t3_3); const __m128i t0 = lookup_pshufb_improved(input0); const __m128i t1 = lookup_pshufb_improved(input1); const __m128i t2 = lookup_pshufb_improved(input2); const __m128i t3 = lookup_pshufb_improved(input3); if (use_lines) { if (line_length >= 64) { // fast path if (offset + 64 > line_length) { size_t location_end = line_length - offset; size_t to_move = 64 - location_end; if (location_end < 16) { // We can store or extract store. See below. //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t0); _mm_storeu_si128(reinterpret_cast<__m128i *>(out), insert_line_feed16(t0, location_end)); out[16] = static_cast(_mm_extract_epi8(t0, 15)); out += 17; } else { _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t0); out += 16; } if (location_end >= 16 && location_end < 32) { // We can store or extract store. See below. //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t1); _mm_storeu_si128(reinterpret_cast<__m128i *>(out), insert_line_feed16(t1, location_end - 16)); out[16] = static_cast(_mm_extract_epi8(t1, 15)); out += 17; } else { _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t1); out += 16; } if (location_end >= 32 && location_end < 48) { // We can store or extract store. See below. //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t2); _mm_storeu_si128(reinterpret_cast<__m128i *>(out), insert_line_feed16(t2, location_end - 32)); out[16] = static_cast(_mm_extract_epi8(t2, 15)); out += 17; } else { _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t2); out += 16; } if (location_end >= 48) { // We can store or extract store. See below. //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t3); _mm_storeu_si128(reinterpret_cast<__m128i *>(out), insert_line_feed16(t3, location_end - 48)); out[16] = static_cast(_mm_extract_epi8(t3, 15)); out += 17; } else { _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t3); out += 16; } offset = to_move; } else { _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t0); _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16), t1); _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32), t2); _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48), t3); offset += 64; out += 64; } } else { // slow path // could be optimized alignas(64) uint8_t buffer[64]; _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), t0); _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + 16), t1); _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + 32), t2); _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + 48), t3); std::memcpy(out, buffer, 64); size_t out_pos = 0; size_t local_offset = offset; for (size_t j = 0; j < 64;) { if (local_offset == line_length) { out[out_pos++] = '\n'; local_offset = 0; } out[out_pos++] = buffer[j++]; local_offset++; } offset = local_offset; out += out_pos; } } else { _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t0); _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16), t1); _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32), t2); _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48), t3); out += 64; } } for (; i + 16 <= srclen; i += 12) { __m128i in = _mm_loadu_si128(reinterpret_cast(input + i)); // bytes from groups A, B and C are needed in separate 32-bit lanes // in = [DDDD|CCCC|BBBB|AAAA] // // an input triplet has layout // [????????|ccdddddd|bbbbcccc|aaaaaabb] // byte 3 byte 2 byte 1 byte 0 -- byte 3 comes from the next // triplet // // shuffling changes the order of bytes: 1, 0, 2, 1 // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] // ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^ // processed bits in = _mm_shuffle_epi8(in, shuf); // unpacking // t0 = [0000cccc|cc000000|aaaaaa00|00000000] const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00)); // t1 = [00000000|00cccccc|00000000|00aaaaaa] // (c * (1 << 10), a * (1 << 6)) >> 16 (note: an unsigned // multiplication) const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040)); // t2 = [00000000|00dddddd|000000bb|bbbb0000] const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0)); // t3 = [00dddddd|00000000|00bbbbbb|00000000]( // (d * (1 << 8), b * (1 << 4)) const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010)); // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 const __m128i indices = _mm_or_si128(t1, t3); const __m128i T0 = lookup_pshufb_improved(indices); _mm_storeu_si128(reinterpret_cast<__m128i *>(out), T0); if (use_lines) { if (line_length >= 16) { // fast path _mm_storeu_si128(reinterpret_cast<__m128i *>(out), T0); if (offset + 16 > line_length) { size_t location_end = line_length - offset; size_t to_move = 16 - location_end; std::memmove(out + location_end + 1, out + location_end, to_move); out[location_end] = '\n'; offset = to_move; out += 16 + 1; } else { offset += 16; out += 16; } } else { // slow path // could be optimized uint8_t buffer[16]; _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), T0); size_t out_pos = 0; size_t local_offset = offset; for (size_t j = 0; j < 16;) { if (local_offset == line_length) { out[out_pos++] = '\n'; local_offset = 0; } out[out_pos++] = buffer[j++]; local_offset++; } offset = local_offset; out += out_pos; } } else { _mm_storeu_si128(reinterpret_cast<__m128i *>(out), T0); out += 16; } } return ((char *)out - (char *)dst) + scalar::base64::tail_encode_base64_impl( (char *)out, src + i, srclen - i, options, line_length, offset); } template size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options options) { return encode_base64_impl(dst, src, srclen, options); } // --- decoding ----------------------------------------------- static simdutf_really_inline void compress(__m128i data, uint16_t mask, char *output) { if (mask == 0) { _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data); return; } // this particular implementation was inspired by work done by @animetosho // we do it in two steps, first 8 bytes and then second 8 bytes uint8_t mask1 = uint8_t(mask); // least significant 8 bits uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits // next line just loads the 64-bit values thintable_epi8[mask1] and // thintable_epi8[mask2] into a 128-bit register, using only // two instructions on most compilers. __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2], tables::base64::thintable_epi8[mask1]); // we increment by 0x08 the second half of the mask shufmask = _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); // this is the version "nearly pruned" __m128i pruned = _mm_shuffle_epi8(data, shufmask); // we still need to put the two halves together. // we compute the popcount of the first half: int pop1 = tables::base64::BitsSetTable256mul2[mask1]; // then load the corresponding mask, what it does is to write // only the first pop1 bytes from the first 8 bytes, and then // it fills in with the bytes from the second 8 bytes + some filling // at the end. __m128i compactmask = _mm_loadu_si128(reinterpret_cast( tables::base64::pshufb_combine_table + pop1 * 8)); __m128i answer = _mm_shuffle_epi8(pruned, compactmask); _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); } static simdutf_really_inline void base64_decode(char *out, __m128i str) { // credit: aqrit const __m128i pack_shuffle = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1); const __m128i t0 = _mm_maddubs_epi16(str, _mm_set1_epi32(0x01400140)); const __m128i t1 = _mm_madd_epi16(t0, _mm_set1_epi32(0x00011000)); const __m128i t2 = _mm_shuffle_epi8(t1, pack_shuffle); // Store the output: // this writes 16 bytes, but we only need 12. _mm_storeu_si128((__m128i *)out, t2); } // decode 64 bytes and output 48 bytes static inline void base64_decode_block(char *out, const char *src) { base64_decode(out, _mm_loadu_si128(reinterpret_cast(src))); base64_decode(out + 12, _mm_loadu_si128(reinterpret_cast(src + 16))); base64_decode(out + 24, _mm_loadu_si128(reinterpret_cast(src + 32))); base64_decode(out + 36, _mm_loadu_si128(reinterpret_cast(src + 48))); } static inline void base64_decode_block_safe(char *out, const char *src) { base64_decode(out, _mm_loadu_si128(reinterpret_cast(src))); base64_decode(out + 12, _mm_loadu_si128(reinterpret_cast(src + 16))); base64_decode(out + 24, _mm_loadu_si128(reinterpret_cast(src + 32))); char buffer[16]; base64_decode(buffer, _mm_loadu_si128(reinterpret_cast(src + 48))); std::memcpy(out + 36, buffer, 12); } // --- decoding - base64 class -------------------------------- class block64 { __m128i chunks[4]; public: // The caller of this function is responsible to ensure that there are 64 // bytes available from reading at src. simdutf_really_inline block64(const char *src) { chunks[0] = _mm_loadu_si128(reinterpret_cast(src)); chunks[1] = _mm_loadu_si128(reinterpret_cast(src + 16)); chunks[2] = _mm_loadu_si128(reinterpret_cast(src + 32)); chunks[3] = _mm_loadu_si128(reinterpret_cast(src + 48)); } public: // The caller of this function is responsible to ensure that there are 128 // bytes available from reading at src. The data is read into a block64 // structure. simdutf_really_inline block64(const char16_t *src) { const auto m1 = _mm_loadu_si128(reinterpret_cast(src)); const auto m2 = _mm_loadu_si128(reinterpret_cast(src + 8)); const auto m3 = _mm_loadu_si128(reinterpret_cast(src + 16)); const auto m4 = _mm_loadu_si128(reinterpret_cast(src + 24)); const auto m5 = _mm_loadu_si128(reinterpret_cast(src + 32)); const auto m6 = _mm_loadu_si128(reinterpret_cast(src + 40)); const auto m7 = _mm_loadu_si128(reinterpret_cast(src + 48)); const auto m8 = _mm_loadu_si128(reinterpret_cast(src + 56)); chunks[0] = _mm_packus_epi16(m1, m2); chunks[1] = _mm_packus_epi16(m3, m4); chunks[2] = _mm_packus_epi16(m5, m6); chunks[3] = _mm_packus_epi16(m7, m8); } public: simdutf_really_inline void copy_block(char *output) { _mm_storeu_si128(reinterpret_cast<__m128i *>(output), chunks[0]); _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 16), chunks[1]); _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 32), chunks[2]); _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 48), chunks[3]); } public: simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) { if (is_power_of_two(mask)) { return compress_block_single(mask, output); } uint64_t nmask = ~mask; compress(chunks[0], uint16_t(mask), output); compress(chunks[1], uint16_t(mask >> 16), output + count_ones(nmask & 0xFFFF)); compress(chunks[2], uint16_t(mask >> 32), output + count_ones(nmask & 0xFFFFFFFF)); compress(chunks[3], uint16_t(mask >> 48), output + count_ones(nmask & 0xFFFFFFFFFFFFULL)); return count_ones(nmask); } private: simdutf_really_inline size_t compress_block_single(uint64_t mask, char *output) { const size_t pos64 = trailing_zeroes(mask); const int8_t pos = pos64 & 0xf; switch (pos64 >> 4) { case 0b00: { const __m128i v0 = _mm_set1_epi8(char(pos - 1)); const __m128i v1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const __m128i v2 = _mm_cmpgt_epi8(v1, v0); const __m128i sh = _mm_sub_epi8(v1, v2); const __m128i compressed = _mm_shuffle_epi8(chunks[0], sh); _mm_storeu_si128((__m128i *)(output + 0 * 16), compressed); _mm_storeu_si128((__m128i *)(output + 1 * 16 - 1), chunks[1]); _mm_storeu_si128((__m128i *)(output + 2 * 16 - 1), chunks[2]); _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]); } break; case 0b01: { _mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]); const __m128i v0 = _mm_set1_epi8(char(pos - 1)); const __m128i v1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const __m128i v2 = _mm_cmpgt_epi8(v1, v0); const __m128i sh = _mm_sub_epi8(v1, v2); const __m128i compressed = _mm_shuffle_epi8(chunks[1], sh); _mm_storeu_si128((__m128i *)(output + 1 * 16), compressed); _mm_storeu_si128((__m128i *)(output + 2 * 16 - 1), chunks[2]); _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]); } break; case 0b10: { _mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]); _mm_storeu_si128((__m128i *)(output + 1 * 16), chunks[1]); const __m128i v0 = _mm_set1_epi8(char(pos - 1)); const __m128i v1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const __m128i v2 = _mm_cmpgt_epi8(v1, v0); const __m128i sh = _mm_sub_epi8(v1, v2); const __m128i compressed = _mm_shuffle_epi8(chunks[2], sh); _mm_storeu_si128((__m128i *)(output + 2 * 16), compressed); _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]); } break; case 0b11: { _mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]); _mm_storeu_si128((__m128i *)(output + 1 * 16), chunks[1]); _mm_storeu_si128((__m128i *)(output + 2 * 16), chunks[2]); const __m128i v0 = _mm_set1_epi8(char(pos - 1)); const __m128i v1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const __m128i v2 = _mm_cmpgt_epi8(v1, v0); const __m128i sh = _mm_sub_epi8(v1, v2); const __m128i compressed = _mm_shuffle_epi8(chunks[3], sh); _mm_storeu_si128((__m128i *)(output + 3 * 16), compressed); } break; } return 63; } public: template simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) { uint32_t err0 = 0; uint32_t err1 = 0; uint32_t err2 = 0; uint32_t err3 = 0; uint64_t m0 = to_base64_mask( &chunks[0], &err0); uint64_t m1 = to_base64_mask( &chunks[1], &err1); uint64_t m2 = to_base64_mask( &chunks[2], &err2); uint64_t m3 = to_base64_mask( &chunks[3], &err3); if (!ignore_garbage) { *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) | ((uint64_t)err3 << 48); } return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48); } private: template simdutf_really_inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) { const __m128i ascii_space_tbl = _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0); // credit: aqrit __m128i delta_asso; if (default_or_url) { delta_asso = _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16); } else if (base64_url) { delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF); } else { delta_asso = _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); } __m128i delta_values; if (default_or_url) { delta_values = _mm_setr_epi8( uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13), uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11), uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9)); } else if (base64_url) { delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); } else { delta_values = _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)); } __m128i check_asso; if (default_or_url) { check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06); } else if (base64_url) { check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6); } else { check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); } __m128i check_values; if (default_or_url) { check_values = _mm_setr_epi8( uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80)); } else if (base64_url) { check_values = _mm_setr_epi8(uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80)); } else { check_values = _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)); } const __m128i shifted = _mm_srli_epi32(*src, 3); __m128i delta_hash = _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted); if (default_or_url) { delta_hash = _mm_and_si128(delta_hash, _mm_set1_epi8(0xf)); } const __m128i check_hash = _mm_avg_epu8(_mm_shuffle_epi8(check_asso, *src), shifted); const __m128i out = _mm_adds_epi8(_mm_shuffle_epi8(delta_values, delta_hash), *src); const __m128i chk = _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src); const int mask = _mm_movemask_epi8(chk); if (!ignore_garbage && mask) { __m128i ascii_space = _mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src); *error = (mask ^ _mm_movemask_epi8(ascii_space)); } *src = out; return (uint16_t)mask; } public: simdutf_really_inline void base64_decode_block(char *out) { base64_decode(out, chunks[0]); base64_decode(out + 12, chunks[1]); base64_decode(out + 24, chunks[2]); base64_decode(out + 36, chunks[3]); } public: simdutf_really_inline void base64_decode_block_safe(char *out) { base64_decode(out, chunks[0]); base64_decode(out + 12, chunks[1]); base64_decode(out + 24, chunks[2]); char buffer[16]; base64_decode(buffer, chunks[3]); std::memcpy(out + 36, buffer, 12); } };