// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the // end of the code points. Only the least significant 12 bits of the mask // are accessed. // It returns how many bytes were consumed (up to 12). size_t convert_masked_utf8_to_utf32(const char *input, uint64_t utf8_end_of_code_point_mask, char32_t *&utf32_out) { // we use an approach where we try to process up to 12 input bytes. // Why 12 input bytes and not 16? Because we are concerned with the size of // the lookup tables. Also 12 is nicely divisible by two and three. // uint32_t *&utf32_output = reinterpret_cast(utf32_out); uint8x16_t in = vld1q_u8(reinterpret_cast(input)); const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xFFF; // // Optimization note: our main path below is load-latency dependent. Thus it // is maybe beneficial to have fast paths that depend on branch prediction but // have less latency. This results in more instructions but, potentially, also // higher speeds. // // We first try a few fast paths. if (utf8_end_of_code_point_mask == 0xfff) { // We process in chunks of 12 bytes. // use fast implementation in src/simdutf/arm64/simd.h // Ideally the compiler can keep the tables in registers. simd8 temp{vreinterpretq_s8_u8(in)}; temp.store_ascii_as_utf32_tbl(utf32_out); utf32_output += 12; // We wrote 12 32-bit characters. return 12; // We consumed 12 bytes. } if (input_utf8_end_of_code_point_mask == 0x924) { // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte // UTF-32 code units. Convert to UTF-16 uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in); // Zero extend and store via ST2 with a zero. uint16x4x2_t interleaver = {{composed_utf16, vmov_n_u16(0)}}; vst2_u16(reinterpret_cast(utf32_output), interleaver); utf32_output += 4; // We wrote 4 32-bit characters. return 12; // We consumed 12 bytes. } // 2 byte sequences occur in short bursts in languages like Greek and Russian. if (input_utf8_end_of_code_point_mask == 0xaaa) { // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte // UTF-32 code units. Convert to UTF-16 uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in); // Zero extend and store via ST2 with a zero. uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}}; vst2q_u16(reinterpret_cast(utf32_output), interleaver); utf32_output += 6; // We wrote 6 32-bit characters. return 12; // We consumed 12 bytes. } /// Either no fast path or an unimportant fast path. const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex [input_utf8_end_of_code_point_mask][0]; const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex [input_utf8_end_of_code_point_mask][1]; if (idx < 64) { // SIX (6) input code-code units // Convert to UTF-16 uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx); // Zero extend and store with ST2 and zero uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}}; vst2q_u16(reinterpret_cast(utf32_output), interleaver); utf32_output += 6; // We wrote 6 32-bit characters. return consumed; } else if (idx < 145) { // FOUR (4) input code-code units // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. uint8x16_t sh = vld1q_u8(reinterpret_cast( simdutf::tables::utf8_to_utf16::shufutf8[idx])); // Shuffle // 1 byte: 00000000 00000000 0ccccccc // 2 byte: 00000000 110bbbbb 10cccccc // 3 byte: 1110aaaa 10bbbbbb 10cccccc uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); // Split // 00000000 00000000 0ccccccc uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); // 6 or 7 bits // Note: unmasked // xxxxxxxx aaaaxxxx xxxxxxxx uint32x4_t high = vshrq_n_u32(perm, 4); // 4 bits // Use 16 bit bic instead of and. // The top bits will be corrected later in the bsl // 00000000 10bbbbbb 00000000 uint32x4_t middle = vreinterpretq_u32_u16( vbicq_u16(vreinterpretq_u16_u32(perm), vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits // Combine low and middle with shift right accumulate // 00000000 00xxbbbb bbcccccc uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2); // Insert top 4 bits from high byte with bitwise select // 00000000 aaaabbbb bbcccccc uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid); vst1q_u32(utf32_output, composed); utf32_output += 4; // We wrote 4 32-bit characters. return consumed; } else if (idx < 209) { // THREE (3) input code-code units if (input_utf8_end_of_code_point_mask == 0x888) { // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte // UTF-32 code units. This uses the same method as the fixed 3 byte // version, reversing and shift left insert. However, there is no need for // a shuffle mask now, just rev16 and rev32. // // This version does not use the LUT, but 4 byte sequences are less common // and the overhead of the extra memory access is less important than the // early branch overhead in shorter sequences, so it comes last. // Swap pairs of bytes // 10dddddd|10cccccc|10bbbbbb|11110aaa // 10cccccc 10dddddd|11110aaa 10bbbbbb uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in)); // Shift left and insert // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6); // Swap 16-bit lanes // xxxxcccc ccdddddd xxxxxxxa aabbbbbb // xxxxxxxa aabbbbbb xxxxcccc ccdddddd uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1)); // Shift insert again // xxxxxxxx xxxaaabb bbbbcccc ccdddddd uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12); // Clear the garbage // 00000000 000aaabb bbbbcccc ccdddddd uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF)); // Store vst1q_u32(utf32_output, composed); utf32_output += 3; // We wrote 3 32-bit characters. return 12; // We consumed 12 bytes. } // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit // due to surrogates no longer being involved. uint8x16_t sh = vld1q_u8(reinterpret_cast( simdutf::tables::utf8_to_utf16::shufutf8[idx])); // 1 byte: 00000000 00000000 00000000 0ddddddd // 2 byte: 00000000 00000000 110ccccc 10dddddd // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); // Ascii uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00)); // When converting the way we do, the 3 byte prefix will be interpreted as // the 18th bit being set, since the code would interpret the lead byte // (0b1110bbbb) as a continuation byte (0b10bbbbbb). To fix this, we can // either xor or do an 8 bit add of the 6th bit shifted right by 1. Since // NEON has shift right accumulate, we use that. // 4 byte 3 byte // 10bbbbbb 1110bbbb // 00000000 01000000 6th bit // 00000000 00100000 shift right // 10bbbbbb 0000bbbb add // 00bbbbbb 0000bbbb mask uint8x16_t correction = vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000))); uint32x4_t corrected = vreinterpretq_u32_u8( vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1)); // 00000000 00000000 0000cccc ccdddddd uint32x4_t cd = vsraq_n_u32(ascii, middle, 2); // Insert twice // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6), vshrq_n_u32(corrected, 4)); // 00000000 000aaabb bbbbcccc ccdddddd uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab); // Store vst1q_u32(utf32_output, composed); utf32_output += 3; // We wrote 3 32-bit characters. return consumed; } else { // here we know that there is an error but we do not handle errors return 12; } }