Protocol Buffers - Google's data interchange format (gRPC dependency) https://developers.google.com/protocol-buffers/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

118 lines
3.8 KiB

#include <arm_neon.h>
/* This code is almost the same as the SSE implementation; please refer to
 * utf8-range-sse.inc for a detailed explanation.
 * The only difference is the range adjustment step, which is more
 * straightforward in the NEON code.
 */
/* NEON UTF-8 validator: checks [data, end) in 16-byte blocks and delegates
 * whatever the vector loop does not cover to utf8_range_ValidateUTF8Naive.
 *
 * Parameters:
 *   data_original   - reference pointer used only to compute the byte offset
 *                     (data - data_original) added to the result when
 *                     return_position is non-zero.
 *                     NOTE(review): presumably the start of the caller's
 *                     buffer, with `data` possibly already advanced past it -
 *                     confirm against the caller.
 *   data            - first byte to examine.
 *   end             - one past the last byte to examine.
 *   return_position - non-zero: stop at the first 16-byte block that fails the
 *                     range check and return an offset-based result;
 *                     zero: accumulate failures over all blocks.
 *
 * Returns:
 *   return_position != 0: (data - data_original) plus the value returned by
 *     utf8_range_ValidateUTF8Naive for the remainder, where `data` is the
 *     first block not accepted by the loop, moved back by
 *     utf8_range_CodepointSkipBackwards() applied to the last four bytes of
 *     the preceding block.
 *   return_position == 0: 0 if any processed block contained an out-of-range
 *     byte, otherwise the value returned by utf8_range_ValidateUTF8Naive for
 *     the remainder.
 */
static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(
    const char* data_original, const char* data, const char* end,
    int return_position) {
  /* Indexed by a byte's high nibble: 0 for 0x0-0xB, 1 for 0xC-0xD, 2 for 0xE,
   * 3 for 0xF (the number of continuation bytes such a lead byte announces).
   */
  const uint8x16_t lead_len_lut = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
  };
  /* Indexed by a byte's high nibble: range index 8 for 0xC-0xF, else 0. */
  const uint8x16_t lead_range_lut = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
  };
  /* Allowed [min, max] per range index:
   *   0     00..7F
   *   1-3   80..BF
   *   4     A0..BF
   *   5     80..9F
   *   6     90..BF
   *   7     80..8F
   *   8     C2..F4
   *   9-15  min FF / max 00, which no byte can satisfy
   */
  const uint8x16_t range_min_lut = {
      0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
      0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  };
  const uint8x16_t range_max_lut = {
      0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
      0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  };

  /* Range-index adjustment keyed by (previous byte - 0xE0). The array is
   * loaded with vld2q_u8, which de-interleaves it: the left column becomes
   * table entries 0-15 (previous byte E0-EF) and the right column entries
   * 16-31 (previous byte F0-FF). Non-zero entries: E0 -> 2, ED -> 3,
   * F0 -> 3, F4 -> 4.
   */
  // clang-format off
  const uint8_t range_adjust_tbl_data[] = {
      /* index -> 0~15  16~31 <- index */
      /*  E0 -> */ 2,     3, /* <- F0  */
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     4, /* <- F4  */
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     0,
                   0,     0,
      /*  ED -> */ 3,     0,
                   0,     0,
                   0,     0,
  };
  // clang-format on
  const uint8x16x2_t adjust_lut = vld2q_u8(range_adjust_tbl_data);

  const uint8x16_t one = vdupq_n_u8(1);
  const uint8x16_t two = vdupq_n_u8(2);
  const uint8x16_t lead_e0 = vdupq_n_u8(0xE0);

  /* State carried from the previous 16-byte block (all zero before the first
   * block) and the failure mask.
   */
  uint8x16_t carry_block = vdupq_n_u8(0);
  uint8x16_t carry_len = vdupq_n_u8(0);
  uint8x16_t fault = vdupq_n_u8(0);

  /* `break` leaves `data` on the failing block: the increment is skipped. */
  for (; end - data >= 16; data += 16) {
    const uint8x16_t block = vld1q_u8((const uint8_t*)data);
    const uint8x16_t nibble_hi = vshrq_n_u8(block, 4);
    const uint8x16_t lead_len = vqtbl1q_u8(lead_len_lut, nibble_hi);

    /* lead_len of the byte 1, 2 and 3 positions earlier (vextq_u8 pulls the
     * missing lanes from the previous block). The saturating subtractions
     * leave 0 unless the earlier lead byte still expects a continuation at
     * this distance.
     */
    const uint8x16_t len_back1 = vextq_u8(carry_len, lead_len, 15);
    const uint8x16_t len_back2 =
        vqsubq_u8(vextq_u8(carry_len, lead_len, 14), one);
    const uint8x16_t len_back3 =
        vqsubq_u8(vextq_u8(carry_len, lead_len, 13), two);

    uint8x16_t range_idx = vqtbl1q_u8(lead_range_lut, nibble_hi);
    range_idx = vorrq_u8(range_idx, len_back1);
    range_idx = vorrq_u8(range_idx, len_back2);
    range_idx = vorrq_u8(range_idx, len_back3);

    /* Previous byte minus 0xE0 selects the adjustment. Bytes below 0xE0 wrap
     * to an index >= 32, for which the table lookup yields 0.
     */
    const uint8x16_t adjust_idx =
        vsubq_u8(vextq_u8(carry_block, block, 15), lead_e0);
    range_idx = vaddq_u8(range_idx, vqtbl2q_u8(adjust_lut, adjust_idx));

    /* A lane is bad when its byte falls outside [min, max] for its index. */
    const uint8x16_t below_min =
        vcltq_u8(block, vqtbl1q_u8(range_min_lut, range_idx));
    const uint8x16_t above_max =
        vcgtq_u8(block, vqtbl1q_u8(range_max_lut, range_idx));
    const uint8x16_t bad_lanes = vorrq_u8(below_min, above_max);

    if (return_position) {
      /* Position mode: only this block matters; stop at the first failure. */
      fault = bad_lanes;
      if (vmaxvq_u32(vreinterpretq_u32_u8(fault))) {
        break;
      }
    } else {
      /* Yes/no mode: keep going and accumulate failures. */
      fault = vorrq_u8(fault, bad_lanes);
    }

    carry_block = block;
    carry_len = lead_len;
  }

  /* No block was accepted and `data` never moved: let the scalar routine
   * handle the whole input.
   */
  if (return_position && data == data_original) {
    return utf8_range_ValidateUTF8Naive(data, end, return_position);
  }

  /* Hand the last four bytes of the last accepted block to
   * utf8_range_CodepointSkipBackwards and step `data` back by its result, so
   * the scalar routine restarts from there.
   */
  const int32_t tail_word =
      vgetq_lane_s32(vreinterpretq_s32_u8(carry_block), 3);
  data -= utf8_range_CodepointSkipBackwards(tail_word);

  if (!return_position) {
    if (vmaxvq_u32(vreinterpretq_u32_u8(fault))) {
      return 0;
    }
    return utf8_range_ValidateUTF8Naive(data, end, return_position);
  }
  return (data - data_original) +
         utf8_range_ValidateUTF8Naive(data, end, return_position);
}