// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_utf32(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char32_t *&utf32_out) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xFFF;
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  // We first try a few fast paths.
  if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
    // We process in chunks of 16 bytes.
    // use fast implementation in src/simdutf/arm64/simd.h
    // Ideally the compiler can keep the tables in registers.
    __m128i zero = __lsx_vldi(0);
    __m128i in16low = __lsx_vilvl_b(zero, in);
    __m128i in16high = __lsx_vilvh_b(zero, in);
    __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
    __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
    __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
    __m128i in32_3 = __lsx_vilvh_h(zero, in16high);

    __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
    __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output), 16);
    __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output), 32);
    __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output), 48);

    utf32_output += 16; // We wrote 16 32-bit characters.
    return 16;          // We consumed 16 bytes.
  }
  __m128i zero = __lsx_vldi(0);
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
    // UTF-32 code units. Convert to UTF-16
    __m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in);
    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);

    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
    utf32_output += 4; // We wrote 4 32-bit characters.
    return 12;         // We consumed 12 bytes.
  }
  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
  if (input_utf8_end_of_code_point_mask == 0xaaa) {
    // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte
    // UTF-32 code units. Convert to UTF-16
    __m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in);

    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
    __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);

    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
    __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
    utf32_output += 6;
    return 12; // We consumed 12 bytes.
  }
  // Either no fast path or an unimportant fast path.

  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];

  if (idx < 64) {
    // SIX (6) input code-code units
    // Convert to UTF-16
    __m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
    __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);

    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
    __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
    utf32_output += 6;
    return consumed;
  } else if (idx < 145) {
    // FOUR (4) input code-code units
    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
                           0);
    // Shuffle
    // 1 byte: 00000000 00000000 0ccccccc
    // 2 byte: 00000000 110bbbbb 10cccccc
    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
    __m128i perm = __lsx_vshuf_b(zero, in, sh);
    // Split
    // 00000000 00000000 0ccccccc
    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits
    // Note: unmasked
    // xxxxxxxx aaaaxxxx xxxxxxxx
    __m128i high =
        __lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits
    // Use 16 bit bic instead of and.
    // The top bits will be corrected later in the bsl
    // 00000000 10bbbbbb 00000000
    __m128i middle =
        __lsx_vand_v(perm, lsx_splat_u32(0x0000FF00)); // 5 or 6 bits
    // Combine low and middle with shift right accumulate
    // 00000000 00xxbbbb bbcccccc
    __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2));
    // Insert top 4 bits from high byte with bitwise select
    // 00000000 aaaabbbb bbcccccc
    __m128i composed = __lsx_vbitsel_v(lowmid, high, lsx_splat_u32(0x0000F000));
    __lsx_vst(composed, utf32_output, 0);
    utf32_output += 4; // We wrote 4 32-bit characters.
    return consumed;
  } else if (idx < 209) {
    // THREE (3) input code-code units
    if (input_utf8_end_of_code_point_mask == 0x888) {
      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
      // UTF-32 code units. This uses the same method as the fixed 3 byte
      // version, reversing and shift left insert. However, there is no need for
      // a shuffle mask now, just rev16 and rev32.
      //
      // This version does not use the LUT, but 4 byte sequences are less common
      // and the overhead of the extra memory access is less important than the
      // early branch overhead in shorter sequences, so it comes last.

      // Swap pairs of bytes
      // 10dddddd|10cccccc|10bbbbbb|11110aaa
      // 10cccccc 10dddddd|11110aaa 10bbbbbb
      __m128i swap = lsx_swap_bytes(in);
      // Shift left and insert
      // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
      __m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap,
                                       __lsx_vrepli_h(0x3f /*0x003F*/));
      // Shift insert again
      // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
      __m128i merge2 =
          __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */
                          __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */
                          lsx_splat_u32(0x00000FFF));
      // Clear the garbage
      // 00000000 000aaabb bbbbcccc ccdddddd
      __m128i composed = __lsx_vand_v(merge2, lsx_splat_u32(0x1FFFFF));
      // Store
      __lsx_vst(composed, utf32_output, 0);
      utf32_output += 3; // We wrote 3 32-bit characters.
      return 12;         // We consumed 12 bytes.
    }
    // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
    // due to surrogates no longer being involved.
    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
                           0);
    // 1 byte: 00000000 00000000 00000000 0ddddddd
    // 2 byte: 00000000 00000000 110ccccc 10dddddd
    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
    __m128i perm = __lsx_vshuf_b(zero, in, sh);

    // Ascii
    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F));
    __m128i middle = __lsx_vand_v(perm, lsx_splat_u32(0x00003f00));
    // 00000000 00000000 0000cccc ccdddddd
    __m128i cd = __lsx_vor_v(__lsx_vsrli_w(middle, 2), ascii);

    __m128i correction = __lsx_vand_v(perm, lsx_splat_u32(0x00400000));
    __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1));
    // Insert twice
    // 00000000 000aaabb bbbbxxxx xxxxxxxx
    __m128i corrected_srli2 =
        __lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2);
    __m128i ab =
        __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f));
    ab = __lsx_vsrli_w(ab, 4);
    // 00000000 000aaabb bbbbcccc ccdddddd
    __m128i composed = __lsx_vbitsel_v(ab, cd, lsx_splat_u32(0x00000FFF));
    // Store
    __lsx_vst(composed, utf32_output, 0);
    utf32_output += 3; // We wrote 3 32-bit characters.
    return consumed;
  } else {
    // here we know that there is an error but we do not handle errors
    return 12;
  }
}
