/* Protocol Buffers - Google's data interchange format (gRPC dependency)
 * https://developers.google.com/protocol-buffers/
 */
// Copyright 2023 Google LLC
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT.

/* This is a wrapper for the Google range-sse.cc algorithm which checks whether
 * a sequence of bytes is a valid UTF-8 sequence and finds the longest valid
 * prefix of the UTF-8 sequence.
 *
 * The key difference is that it checks for as many ASCII symbols as possible
 * and then falls back to the range-sse.cc algorithm. The changes to the
 * algorithm are cosmetic, mostly to trick the clang compiler into producing
 * optimal code.
 *
 * For API see the utf8_validity.h header.
 */
#include "utf8_range.h"

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* FORCE_INLINE_ATTR expands to the compiler-specific "always inline" marker:
 * the always_inline attribute when __GNUC__ is defined, __forceinline when
 * _MSC_VER is defined, and nothing otherwise. It is used together with the
 * `inline` keyword on the small helpers below.
 */
#if defined(__GNUC__)
#define FORCE_INLINE_ATTR __attribute__((always_inline))
#elif defined(_MSC_VER)
#define FORCE_INLINE_ATTR __forceinline
#else
#define FORCE_INLINE_ATTR
#endif

/* Reads sizeof(uint64_t) bytes starting at |p| and returns them as a
 * uint64_t in the host's native byte order.
 *
 * The bytes are copied with memcpy rather than read through a casted pointer,
 * so |p| does not need any particular alignment. The caller must guarantee
 * that 8 bytes are readable at |p|.
 */
static FORCE_INLINE_ATTR inline uint64_t utf8_range_UnalignedLoad64(
    const void* p) {
  uint64_t t;
  memcpy(&t, p, sizeof t);
  return t;
}

/* Returns 1 if |c| is a 7-bit ASCII byte (0x00..0x7F), otherwise 0. */
static FORCE_INLINE_ATTR inline int utf8_range_AsciiIsAscii(unsigned char c) {
  return c < 128;
}

/* Returns 1 if |c| is a UTF-8 continuation byte (0x80..0xBF), otherwise 0.
 *
 * Both operands are compared as signed 8-bit values. With the usual wrapping
 * conversion, 0x80..0xBF become -128..-65, whereas ASCII bytes (0..127) and
 * 0xC0..0xFF (-64..-1) are all greater than (int8_t)0xBF == -65. A single
 * signed comparison therefore checks both the lower and the upper bound.
 */
static FORCE_INLINE_ATTR inline int utf8_range_IsTrailByteOk(const char c) {
  return (int8_t)(c) <= (int8_t)(0xBF);
}

/* Scalar (byte-at-a-time) UTF-8 validator for the byte range [data, end).
 *
 * If return_position is 0, returns 1 if the range is a valid UTF-8 sequence
 * and 0 otherwise.
 * If return_position is 1, returns the length in bytes of the prefix of the
 * range that is all structurally valid UTF-8.
 *
 * return_position must be exactly 0 or 1: it takes part in the arithmetic of
 * the final return expression.
 */
static size_t utf8_range_ValidateUTF8Naive(const char* data, const char* end,
                                           int return_position) {
  /* We return err_pos in the loop which is always 0 if !return_position */
  size_t err_pos = 0;
  /* Size of the codepoint accepted by the previous iteration (0 initially).
   * It is consumed at the top of the next iteration, which lets every
   * accepting branch below simply set it and `continue`. This keeps the
   * indentation shallow and the per-size validity checks readable.
   */
  size_t codepoint_bytes = 0;
  while (data + codepoint_bytes < end) {
    if (return_position) {
      err_pos += codepoint_bytes;
    }
    data += codepoint_bytes;
    const size_t len = end - data;
    const unsigned char byte1 = data[0];

    /* We do not skip many ascii bytes at the same time as this function is
       used for tail checking (< 16 bytes) and for non x86 platforms. We also
       don't think that cases where non-ASCII codepoints are followed by ascii
       happen often. For small strings it also introduces some penalty. For
       purely ascii UTF8 strings (which is the overwhelming case) we call
       SkipAscii function which is multiplatform and extremely fast.
     */
    /* [00..7F] ASCII -> 1 byte */
    if (utf8_range_AsciiIsAscii(byte1)) {
      codepoint_bytes = 1;
      continue;
    }
    /* [C2..DF], [80..BF] -> 2 bytes */
    if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF &&
        utf8_range_IsTrailByteOk(data[1])) {
      codepoint_bytes = 2;
      continue;
    }
    if (len >= 3) {
      const unsigned char byte2 = data[1];
      const unsigned char byte3 = data[2];

      /* Every remaining valid form needs byte2 and byte3 to be continuation
       * bytes in [0x80, 0xBF]; reject immediately if either is not.
       */
      if (!utf8_range_IsTrailByteOk(byte2) ||
          !utf8_range_IsTrailByteOk(byte3)) {
        return err_pos;
      }

      if (/* E0, A0..BF, 80..BF */
          ((byte1 == 0xE0 && byte2 >= 0xA0) ||
           /* E1..EC, 80..BF, 80..BF */
           (byte1 >= 0xE1 && byte1 <= 0xEC) ||
           /* ED, 80..9F, 80..BF */
           (byte1 == 0xED && byte2 <= 0x9F) ||
           /* EE..EF, 80..BF, 80..BF */
           (byte1 >= 0xEE && byte1 <= 0xEF))) {
        codepoint_bytes = 3;
        continue;
      }
      if (len >= 4) {
        const unsigned char byte4 = data[3];
        /* Is byte4 between 0x80 ~ 0xBF */
        if (!utf8_range_IsTrailByteOk(byte4)) {
          return err_pos;
        }

        if (/* F0, 90..BF, 80..BF, 80..BF */
            ((byte1 == 0xF0 && byte2 >= 0x90) ||
             /* F1..F3, 80..BF, 80..BF, 80..BF */
             (byte1 >= 0xF1 && byte1 <= 0xF3) ||
             /* F4, 80..8F, 80..BF, 80..BF */
             (byte1 == 0xF4 && byte2 <= 0x8F))) {
          codepoint_bytes = 4;
          continue;
        }
      }
    }
    /* No well-formed sequence starts at |data| (or it is truncated). */
    return err_pos;
  }
  /* Account for the last accepted codepoint, which the loop did not consume. */
  if (return_position) {
    err_pos += codepoint_bytes;
  }
  /* if return_position is false, this returns 1.
   * if return_position is true, this returns err_pos.
   */
  return err_pos + (1 - return_position);
}

#if defined(__SSE4_1__) || (defined(__ARM_NEON) && defined(__ARM_64BIT_STATE))
/* Returns the number of bytes needed to skip backwards to get to the first
 * byte of a codepoint.
 *
 * The four bytes of |codepoint_word| are inspected in memory order, from the
 * last byte (index 3) towards index 1. The result is 1, 2 or 3 when the first
 * non-continuation byte found is at index 3, 2 or 1 respectively, and 0 when
 * the bytes at indices 1..3 are all continuation bytes.
 */
static inline int utf8_range_CodepointSkipBackwards(int32_t codepoint_word) {
  const int8_t* const codepoint = (const int8_t*)(&codepoint_word);
  if (!utf8_range_IsTrailByteOk(codepoint[3])) {
    return 1;
  }
  if (!utf8_range_IsTrailByteOk(codepoint[2])) {
    return 2;
  }
  if (!utf8_range_IsTrailByteOk(codepoint[1])) {
    return 3;
  }
  return 0;
}
#endif /* __SSE4_1__ || (__ARM_NEON && __ARM_64BIT_STATE) */

/* Skips over the leading run of ASCII bytes in [data, end) and returns a
 * pointer to the first non-ASCII byte, or |end| if every byte is ASCII.
 *
 * ASCII is skipped 8 bytes at a time for as long as possible. This is
 * intentional, as most strings checked for validity consist only of 1 byte
 * codepoints.
 */
static inline const char* utf8_range_SkipAscii(const char* data,
                                               const char* end) {
  /* A byte is non-ASCII exactly when its top bit is set, so one mask tests
   * eight bytes at once.
   */
  while (8 <= end - data &&
         (utf8_range_UnalignedLoad64(data) & 0x8080808080808080) == 0) {
    data += 8;
  }
  /* Finish byte by byte: either fewer than 8 bytes remain or the current
   * 8-byte word contains a non-ASCII byte somewhere.
   */
  while (data < end && utf8_range_AsciiIsAscii(*data)) {
    ++data;
  }
  return data;
}

/* Pull in the target-specific vectorized implementation, if any.
 * NOTE(review): presumably these files define utf8_range_ValidateUTF8Simd,
 * which is called under the same conditions later in this file - confirm.
 */
#if defined(__SSE4_1__)
#include "utf8_range_sse.inc"
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
#include "utf8_range_neon.inc"
#endif

/* Shared implementation behind utf8_range_IsValid and utf8_range_ValidPrefix.
 *
 * Validates the |len| bytes starting at |data|.
 * If return_position is 0, returns 1 if the bytes are valid UTF-8 and 0
 * otherwise (an empty input is valid).
 * If return_position is 1, returns the length in bytes of the longest valid
 * UTF-8 prefix (0 for an empty input).
 *
 * return_position must be exactly 0 or 1: it is used arithmetically.
 */
static FORCE_INLINE_ATTR inline size_t utf8_range_Validate(
    const char* data, size_t len, int return_position) {
  if (len == 0) return 1 - return_position;
  // Save buffer start address for later use
  const char* const data_original = data;
  const char* const end = data + len;
  data = utf8_range_SkipAscii(data, end);
  /* SIMD algorithm always outperforms the naive version for any data of
     length >=16.
   */
  if (end - data < 16) {
    /* The naive validator only sees the tail, so in prefix mode the length of
     * the ASCII run that was already skipped has to be added back.
     */
    return (return_position ? (data - data_original) : 0) +
           utf8_range_ValidateUTF8Naive(data, end, return_position);
  }
#if defined(__SSE4_1__) || (defined(__ARM_NEON) && defined(__ARM_64BIT_STATE))
  return utf8_range_ValidateUTF8Simd(
      data_original, data, end, return_position);
#else
  return (return_position ? (data - data_original) : 0) +
         utf8_range_ValidateUTF8Naive(data, end, return_position);
#endif
}

/* Returns 1 if the |len| bytes starting at |data| form a valid UTF-8
 * sequence, otherwise 0. An empty input (len == 0) is reported as valid.
 */
int utf8_range_IsValid(const char* data, size_t len) {
  return utf8_range_Validate(data, len, /*return_position=*/0) != 0;
}

/* Returns the length in bytes of the longest prefix of the |len| bytes
 * starting at |data| that is structurally valid UTF-8. Returns 0 for an empty
 * input.
 */
size_t utf8_range_ValidPrefix(const char* data, size_t len) {
  return utf8_range_Validate(data, len, /*return_position=*/1);
}