/* protobuf/third_party/utf8_range/utf8_range.c */

// Copyright 2023 Google LLC
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT.

/* This is a wrapper for the Google range-sse.cc algorithm, which checks whether
 * a sequence of bytes is a valid UTF-8 sequence and finds the longest valid
 * prefix of the UTF-8 sequence.
 *
 * The key difference is that it first checks for as many ASCII symbols as
 * possible and only then falls back to the range-sse.cc algorithm. The changes
 * to the algorithm are cosmetic, mostly to trick the clang compiler into
 * producing optimal code.
 *
 * For the API, see the utf8_range.h header.
 */
#include "utf8_range.h"

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* FORCE_INLINE_ATTR: compiler-specific "always inline" annotation placed on
 * the small hot helpers in this file.
 *  - compilers that define __GNUC__ get __attribute__((always_inline));
 *  - MSVC (_MSC_VER) gets __forceinline;
 *  - on any other compiler the macro expands to nothing, so the helpers are
 *    left as plain `static inline` functions.
 */
#if defined(__GNUC__)
#define FORCE_INLINE_ATTR __attribute__((always_inline))
#elif defined(_MSC_VER)
#define FORCE_INLINE_ATTR __forceinline
#else
#define FORCE_INLINE_ATTR
#endif

/* Reads the 8 bytes starting at |p| into a uint64_t and returns it.
 * The bytes are copied with memcpy, so |p| does not need to be 8-byte aligned.
 * No byte swapping is done: the result holds the bytes in host memory order.
 * The caller must guarantee that 8 bytes are readable at |p|.
 */
static FORCE_INLINE_ATTR inline uint64_t utf8_range_UnalignedLoad64(
    const void* p) {
  uint64_t word;
  memcpy(&word, p, sizeof(uint64_t));
  return word;
}

/* Returns 1 if |c| is a 7-bit ASCII byte (0x00..0x7F), otherwise 0. */
static FORCE_INLINE_ATTR inline int utf8_range_AsciiIsAscii(unsigned char c) {
  return c <= 0x7F;
}

/* Returns 1 if |c| is a UTF-8 continuation (trail) byte, i.e. 0x80..0xBF,
 * otherwise 0.
 *
 * Both ends of the range are covered by a single signed comparison: once the
 * byte is viewed as a signed 8-bit value (with the usual wrap-around
 * narrowing), 0x80..0xBF become -128..-65, while 0xC0..0xFF become -64..-1 and
 * ASCII stays non-negative. "<= (int8_t)0xBF" therefore accepts exactly the
 * continuation bytes.
 */
static FORCE_INLINE_ATTR inline int utf8_range_IsTrailByteOk(const char c) {
  const int8_t as_signed = (int8_t)(c);
  const int8_t last_trail = (int8_t)(0xBF);
  return as_signed <= last_trail;
}

/* Scalar (byte-at-a-time) UTF-8 validator for the range [data, end).
 *
 * return_position == 0: returns 1 if the whole range is valid UTF-8 and 0
 *   otherwise.
 * return_position == 1: returns the length in bytes of the prefix of |data|
 *   that is all structurally valid UTF-8 (equal to end - data when everything
 *   is valid).
 *
 * A codepoint is accepted only if all of its bytes lie inside the range, so a
 * sequence truncated by |end| is rejected. The per-lead-byte limits on the
 * second byte reject overlong encodings (C0/C1, E0 80..9F, F0 80..8F),
 * UTF-16 surrogates (ED A0..BF) and values above U+10FFFF (F4 90.., F5..FF).
 *
 * An empty range (data == end) counts as valid.
 */
static size_t utf8_range_ValidateUTF8Naive(const char* data, const char* end,
                                           int return_position) {
  /* Number of bytes accepted so far. It is only accumulated when
   * return_position is set, so in validity mode it stays 0 and every failure
   * exit can simply return it.
   */
  size_t valid_len = 0;
  const char* cursor = data;

  while (cursor < end) {
    const size_t avail = (size_t)(end - cursor);
    const unsigned char lead = cursor[0];
    /* Width in bytes of the codepoint starting at |cursor|; 0 = malformed. */
    size_t width = 0;

    /* ASCII is handled one byte at a time here on purpose: this function is
     * used for short tails (< 16 bytes) and on platforms without a SIMD
     * implementation, while long ASCII runs are skipped by
     * utf8_range_SkipAscii.
     */
    if (utf8_range_AsciiIsAscii(lead)) {
      /* [00..7F] -> 1 byte */
      width = 1;
    } else if (avail >= 2 && lead >= 0xC2 && lead <= 0xDF &&
               utf8_range_IsTrailByteOk(cursor[1])) {
      /* [C2..DF], [80..BF] -> 2 bytes */
      width = 2;
    } else if (avail >= 3) {
      const unsigned char second = cursor[1];
      const unsigned char third = cursor[2];

      /* Both the 3-byte and the 4-byte forms need bytes 2 and 3 to be
       * continuation bytes; utf8_range_IsTrailByteOk checks the lower (0x80)
       * and the upper (0xBF) bound at once.
       */
      if (utf8_range_IsTrailByteOk(second) &&
          utf8_range_IsTrailByteOk(third)) {
        if (/* E0, A0..BF, 80..BF */
            (lead == 0xE0 && second >= 0xA0) ||
            /* E1..EC, 80..BF, 80..BF */
            (lead >= 0xE1 && lead <= 0xEC) ||
            /* ED, 80..9F, 80..BF */
            (lead == 0xED && second <= 0x9F) ||
            /* EE..EF, 80..BF, 80..BF */
            (lead >= 0xEE && lead <= 0xEF)) {
          width = 3;
        } else if (avail >= 4) {
          const unsigned char fourth = cursor[3];
          if (utf8_range_IsTrailByteOk(fourth) &&
              (/* F0, 90..BF, 80..BF, 80..BF */
               (lead == 0xF0 && second >= 0x90) ||
               /* F1..F3, 80..BF, 80..BF, 80..BF */
               (lead >= 0xF1 && lead <= 0xF3) ||
               /* F4, 80..8F, 80..BF, 80..BF */
               (lead == 0xF4 && second <= 0x8F))) {
            width = 4;
          }
        }
      }
    }

    if (width == 0) {
      /* Malformed or truncated codepoint: report what was accepted so far
       * (always 0 when !return_position).
       */
      return valid_len;
    }
    if (return_position) {
      valid_len += width;
    }
    cursor += width;
  }

  /* if return_position is false, this returns 1.
   * if return_position is true, this returns the number of bytes consumed.
   */
  return valid_len + (1 - return_position);
}

#if defined(__SSE4_1__) || (defined(__ARM_NEON) && defined(__ARM_64BIT_STATE))
/* Returns the number of bytes needed to skip backwards to get to the first
 * byte of a codepoint, given four consecutive input bytes packed into
 * |codepoint_word|.
 *
 * The word is inspected byte by byte in host memory order, starting from the
 * highest-addressed byte (index 3) and moving down to index 1:
 *   - byte 3 is not a continuation byte -> 1
 *   - else byte 2 is not a continuation byte -> 2
 *   - else byte 1 is not a continuation byte -> 3
 *   - bytes 1..3 are all continuation bytes -> 0
 * Byte 0 is never examined.
 *
 * NOTE(review): presumably called by the SIMD implementations in the .inc
 * files with the last four bytes they processed -- confirm against the caller.
 */
static inline int utf8_range_CodepointSkipBackwards(int32_t codepoint_word) {
  const int8_t* const bytes = (const int8_t*)(&codepoint_word);
  for (int back = 1; back <= 3; ++back) {
    /* back == 1 looks at bytes[3], back == 2 at bytes[2], back == 3 at
     * bytes[1]; the first non-continuation byte found ends the search.
     */
    if (!utf8_range_IsTrailByteOk(bytes[4 - back])) {
      return back;
    }
  }
  return 0;
}
#endif  /* __SSE4_1__ || (__ARM_NEON && __ARM_64BIT_STATE) */

/* Skips the leading run of ASCII bytes in [data, end) and returns a pointer to
 * the first byte whose high bit is set, or |end| if every byte is ASCII.
 *
 * ASCII is skipped 8 bytes at a time where possible. This is intentional, as
 * most strings checked for validity consist only of 1-byte codepoints.
 */
static inline const char* utf8_range_SkipAscii(const char* data,
                                               const char* end) {
  /* High bit of each of the 8 bytes in a 64-bit word. */
  const uint64_t high_bits = 0x8080808080808080;

  /* Word-at-a-time pass: stop at the first 8-byte group that contains a
   * non-ASCII byte, or when fewer than 8 bytes remain.
   */
  for (; end - data >= 8; data += 8) {
    if ((utf8_range_UnalignedLoad64(data) & high_bits) != 0) {
      break;
    }
  }
  /* Byte-at-a-time pass: locate the exact non-ASCII byte inside the group
   * found above, or finish the tail that is shorter than 8 bytes.
   */
  for (; data < end; ++data) {
    if (!utf8_range_AsciiIsAscii(*data)) {
      break;
    }
  }
  return data;
}

#if defined(__SSE4_1__)
#include "utf8_range_sse.inc"
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
#include "utf8_range_neon.inc"
#endif

/* Shared implementation behind utf8_range_IsValid and utf8_range_ValidPrefix.
 *
 * return_position == 0: returns nonzero if the |len| bytes at |data| are valid
 *   UTF-8, 0 otherwise.
 * return_position == 1: returns the length in bytes of the valid UTF-8 prefix.
 *
 * An empty input returns immediately (1 in validity mode, 0 in prefix mode)
 * without touching |data|. Otherwise the leading ASCII run is skipped first,
 * and the remainder is handed to the SIMD validator when one is compiled in
 * and at least 16 bytes remain, or to the scalar validator in every other
 * case.
 */
static FORCE_INLINE_ATTR inline size_t utf8_range_Validate(
    const char* data, size_t len, int return_position) {
  if (len == 0) {
    return 1 - return_position;
  }

  const char* const begin = data;
  const char* const end = begin + len;
  const char* const first_non_ascii = utf8_range_SkipAscii(begin, end);

#if defined(__SSE4_1__) || (defined(__ARM_NEON) && defined(__ARM_64BIT_STATE))
  /* SIMD algorithm always outperforms the naive version for any data of
   * length >= 16. It receives the buffer start as well as the resume point.
   */
  if (end - first_non_ascii >= 16) {
    return utf8_range_ValidateUTF8Simd(begin, first_non_ascii, end,
                                       return_position);
  }
#endif

  /* Scalar path: the naive validator only sees the part after the ASCII run,
   * so in prefix mode the skipped bytes are added back to its result. In
   * validity mode nothing is added and its 0/1 answer is returned as is.
   */
  const size_t ascii_len =
      return_position ? (size_t)(first_non_ascii - begin) : 0;
  return ascii_len +
         utf8_range_ValidateUTF8Naive(first_non_ascii, end, return_position);
}

/* Returns 1 if the |len| bytes starting at |data| form a valid UTF-8
 * sequence, and 0 otherwise. An empty input (len == 0) is reported as valid.
 */
int utf8_range_IsValid(const char* data, size_t len) {
  const size_t verdict =
      utf8_range_Validate(data, len, /*return_position=*/0);
  return verdict != 0;
}

/* Returns the length in bytes of the prefix of the |len| bytes at |data| that
 * is valid UTF-8. The result equals |len| when the whole input is valid and
 * is 0 for an empty input.
 */
size_t utf8_range_ValidPrefix(const char* data, size_t len) {
  const size_t prefix_len =
      utf8_range_Validate(data, len, /*return_position=*/1);
  return prefix_len;
}