Use the "shldq" decoder for the specialized 64-bit Varint parsers, rather than using the "RotRight7" decoder.

The "shldq" technique is much faster on recent Intel and AMD CPUs when processing larger integers, especially on Zen.

PiperOrigin-RevId: 498078103
2 years ago · 0ca97a1d7d
parent b3ec9ec291
commit 0ca97a1d7d
3 changed files with 427 additions and 170 deletions
--- a/src/google/protobuf/generated_message_tctable_impl.h
+++ b/src/google/protobuf/generated_message_tctable_impl.h
@ -259,16 +259,12 @@ enum FieldType : uint16_t {
 }  // namespace field_layout

 #ifndef NDEBUG
-template <size_t align>
-void AlignFail(uintptr_t address) {
-  GOOGLE_ABSL_LOG(FATAL) << "Unaligned (" << align << ") access at " << address;
-
-  // Explicit abort to let compilers know this function does not return
-  abort();
-}
-
-extern template void AlignFail<4>(uintptr_t);
-extern template void AlignFail<8>(uintptr_t);
+PROTOBUF_EXPORT void AlignFail(std::integral_constant<size_t, 4>,
+                               std::uintptr_t address);
+PROTOBUF_EXPORT void AlignFail(std::integral_constant<size_t, 8>,
+                               std::uintptr_t address);
+inline void AlignFail(std::integral_constant<size_t, 1>,
+                      std::uintptr_t address) {}
 #endif

 // TcParser implements most of the parsing logic for tailcall tables.
@ -365,29 +361,39 @@ class PROTOBUF_EXPORT TcParser final {

  // Manually unrolled and specialized Varint parsing.
  template <typename FieldType, int data_offset, int hasbit_idx>
-  static const char* SpecializedUnrolledVImpl1(PROTOBUF_TC_PARAM_DECL);
+  static const char* FastTV32S1(PROTOBUF_TC_PARAM_DECL);
+  template <typename FieldType, int data_offset, int hasbit_idx>
+  static const char* FastTV64S1(PROTOBUF_TC_PARAM_DECL);
  template <int data_offset, int hasbit_idx>
-  static const char* SpecializedFastV8S1(PROTOBUF_TC_PARAM_DECL);
+  static const char* FastTV8S1(PROTOBUF_TC_PARAM_DECL);

  template <typename FieldType, int data_offset, int hasbit_idx>
  static constexpr TailCallParseFunc SingularVarintNoZag1() {
-    if (data_offset < 100) {
-      if (sizeof(FieldType) == 1) {
-        return &SpecializedFastV8S1<data_offset, hasbit_idx>;
+    if (sizeof(FieldType) == 1) {
+      if (data_offset < 100) {
+        return &FastTV8S1<data_offset, hasbit_idx>;
+      } else {
+        return &FastV8S1;
+      }
+    }
+    if (sizeof(FieldType) == 4) {
+      if (data_offset < 100) {
+        return &FastTV32S1<FieldType, data_offset, hasbit_idx>;
+      } else {  //
+        return &FastV32S1;
+      }
+    }
+    if (sizeof(FieldType) == 8) {
+      if (data_offset < 128) {
+        return &FastTV64S1<FieldType, data_offset, hasbit_idx>;
+      } else {
+        return &FastV64S1;
      }
-      return &SpecializedUnrolledVImpl1<FieldType, data_offset, hasbit_idx>;
-    } else if (sizeof(FieldType) == 1) {
-      return &FastV8S1;
-    } else if (sizeof(FieldType) == 4) {
-      return &FastV32S1;
-    } else if (sizeof(FieldType) == 8) {
-      return &FastV64S1;
-    } else {
-      static_assert(sizeof(FieldType) == 1 || sizeof(FieldType) == 4 ||
-                        sizeof(FieldType) == 8,
-                    "");
-      return nullptr;
    }
+    static_assert(sizeof(FieldType) == 1 || sizeof(FieldType) == 4 ||
+                      sizeof(FieldType) == 8,
+                  "");
+    std::abort();  // unreachable
  }

  // Functions referenced by generated fast tables (closed enum):
@ -482,7 +488,10 @@ class PROTOBUF_EXPORT TcParser final {
 #ifndef NDEBUG
    if (PROTOBUF_PREDICT_FALSE(
            reinterpret_cast<uintptr_t>(target) % alignof(T) != 0)) {
-      AlignFail<alignof(T)>(reinterpret_cast<uintptr_t>(target));
+      AlignFail(std::integral_constant<size_t, alignof(T)>(),
+                reinterpret_cast<uintptr_t>(target));
+      // Explicit abort to let compilers know this code-path does not return
+      abort();
    }
 #endif
    return *target;
@ -495,7 +504,10 @@ class PROTOBUF_EXPORT TcParser final {
 #ifndef NDEBUG
    if (PROTOBUF_PREDICT_FALSE(
            reinterpret_cast<uintptr_t>(target) % alignof(T) != 0)) {
-      AlignFail<alignof(T)>(reinterpret_cast<uintptr_t>(target));
+      AlignFail(std::integral_constant<size_t, alignof(T)>(),
+                reinterpret_cast<uintptr_t>(target));
+      // Explicit abort to let compilers know this code-path does not return
+      abort();
    }
 #endif
    return *target;
@ -537,7 +549,7 @@ class PROTOBUF_EXPORT TcParser final {
  };
  static TestMiniParseResult TestMiniParse(PROTOBUF_TC_PARAM_DECL);
  template <bool export_called_function>
-  static const char* MiniParseImpl(PROTOBUF_TC_PARAM_DECL);
+  static const char* MiniParse(PROTOBUF_TC_PARAM_DECL);

  template <typename TagType, bool group_coding, bool aux_is_table>
  static inline const char* SingularParseMessageAuxImpl(PROTOBUF_TC_PARAM_DECL);
@ -714,12 +726,127 @@ class PROTOBUF_EXPORT TcParser final {
  static const char* MpFallback(PROTOBUF_TC_PARAM_DECL);
 };

+// Shift "byte" left by n * 7 bits, filling vacated bits with ones.
+template <int n>
+inline PROTOBUF_ALWAYS_INLINE uint64_t
+shift_left_fill_with_ones(uint64_t byte, uint64_t ones) {
+  return (byte << (n * 7)) | (ones >> (64 - (n * 7)));
+}
+
+// Shift "byte" left by n * 7 bits, filling vacated bits with ones, and
+// put the new value in res.  Return whether the result was negative.
+template <int n>
+inline PROTOBUF_ALWAYS_INLINE bool shift_left_fill_with_ones_was_negative(
+    uint64_t byte, uint64_t ones, int64_t& res) {
+#if defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(__x86_64__)
+  // For the first two rounds (ptr[1] and ptr[2]), micro benchmarks show a
+  // substantial improvement from capturing the sign from the condition code
+  // register on x86-64.
+  bool sign_bit;
+  asm("shldq %3, %2, %1"
+      : "=@ccs"(sign_bit), "+r"(byte)
+      : "r"(ones), "i"(n * 7));
+  res = byte;
+  return sign_bit;
+#else
+  // Generic fallback:
+  res = shift_left_fill_with_ones<n>(byte, ones);
+  return static_cast<int64_t>(res) < 0;
+#endif
+}
+
+inline PROTOBUF_ALWAYS_INLINE std::pair<const char*, uint64_t>
+Parse64FallbackPair(const char* p, int64_t res1) {
+  auto ptr = reinterpret_cast<const int8_t*>(p);
+
+  // The algorithm relies on sign extension for each byte to set all high bits
+  // when the varint continues. It also relies on asserting all of the lower
+  // bits for each successive byte read. This allows the result to be aggregated
+  // using a bitwise AND. For example:
+  //
+  //          8       1          64     57 ... 24     17  16      9  8       1
+  // ptr[0] = 1aaa aaaa ; res1 = 1111 1111 ... 1111 1111  1111 1111  1aaa aaaa
+  // ptr[1] = 1bbb bbbb ; res2 = 1111 1111 ... 1111 1111  11bb bbbb  b111 1111
+  // ptr[2] = 1ccc cccc ; res3 = 0000 0000 ... 000c cccc  cc11 1111  1111 1111
+  //                             ---------------------------------------------
+  //        res1 & res2 & res3 = 0000 0000 ... 000c cccc  ccbb bbbb  baaa aaaa
+  //
+  // On x86-64, a shld from a single register filled with enough 1s in the high
+  // bits can accomplish all this in one instruction. It so happens that res1
+  // has 57 high bits of ones, which is enough for the largest shift done.
+  //
+  // Just as importantly, by keeping results in res1, res2, and res3, we take
+  // advantage of the superscalar abilities of the CPU.
+  GOOGLE_ABSL_DCHECK_EQ(res1 >> 7, -1);
+  uint64_t ones = res1;  // save the high 1 bits from res1 (input to SHLD)
+  int64_t res2, res3;    // accumulated result chunks
+
+  if (!shift_left_fill_with_ones_was_negative<1>(ptr[1], ones, res2))
+    goto done2;
+  if (!shift_left_fill_with_ones_was_negative<2>(ptr[2], ones, res3))
+    goto done3;
+
+  // For the remainder of the chunks, check the sign of the AND result.
+  res1 &= shift_left_fill_with_ones<3>(ptr[3], ones);
+  if (res1 >= 0) goto done4;
+  res2 &= shift_left_fill_with_ones<4>(ptr[4], ones);
+  if (res2 >= 0) goto done5;
+  res3 &= shift_left_fill_with_ones<5>(ptr[5], ones);
+  if (res3 >= 0) goto done6;
+  res1 &= shift_left_fill_with_ones<6>(ptr[6], ones);
+  if (res1 >= 0) goto done7;
+  res2 &= shift_left_fill_with_ones<7>(ptr[7], ones);
+  if (res2 >= 0) goto done8;
+  res3 &= shift_left_fill_with_ones<8>(ptr[8], ones);
+  if (res3 >= 0) goto done9;
+
+  // For valid 64bit varints, the 10th byte/ptr[9] should be exactly 1. In this
+  // case, the continuation bit of ptr[8] already set the top bit of res3
+  // correctly, so all we have to do is check that the expected case is true.
+  if (PROTOBUF_PREDICT_TRUE(ptr[9] == 1)) goto done10;
+
+  // A value of 0, however, represents an over-serialized varint. This case
+  // should not happen, but if it does (say, due to a nonconforming
+  // serializer), deassert the continuation bit that came from ptr[8].
+  if (ptr[9] == 0) {
+#if defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(__x86_64__)
+    // Use a small instruction since this is an uncommon code path.
+    asm("btcq $63,%0" : "+r"(res3));
+#else
+    res3 ^= static_cast<uint64_t>(1) << 63;
+#endif
+    goto done10;
+  }
+
+  // If the 10th byte/ptr[9] itself has any other value, then it is too big to
+  // fit in 64 bits. If the continue bit is set, it is an unterminated varint.
+  return {nullptr, 0};
+
+done2:
+  return {p + 2, res1 & res2};
+done3:
+  return {p + 3, res1 & res2 & res3};
+done4:
+  return {p + 4, res1 & res2 & res3};
+done5:
+  return {p + 5, res1 & res2 & res3};
+done6:
+  return {p + 6, res1 & res2 & res3};
+done7:
+  return {p + 7, res1 & res2 & res3};
+done8:
+  return {p + 8, res1 & res2 & res3};
+done9:
+  return {p + 9, res1 & res2 & res3};
+done10:
+  return {p + 10, res1 & res2 & res3};
+}
+
 // Notes:
 // 1) if data_offset is negative, it's read from data.offset()
 // 2) if hasbit_idx is negative, it's read from data.hasbit_idx()
 template <int data_offset, int hasbit_idx>
-PROTOBUF_NOINLINE const char* TcParser::SpecializedFastV8S1(
-    PROTOBUF_TC_PARAM_DECL) {
+PROTOBUF_NOINLINE const char* TcParser::FastTV8S1(PROTOBUF_TC_PARAM_DECL) {
  using TagType = uint8_t;

  // Special case for a varint bool field with a tag of 1 byte:
@ -766,8 +893,40 @@ PROTOBUF_NOINLINE const char* TcParser::SpecializedFastV8S1(
 }

 template <typename FieldType, int data_offset, int hasbit_idx>
-PROTOBUF_NOINLINE const char* TcParser::SpecializedUnrolledVImpl1(
-    PROTOBUF_TC_PARAM_DECL) {
+PROTOBUF_NOINLINE const char* TcParser::FastTV64S1(PROTOBUF_TC_PARAM_DECL) {
+  using TagType = uint8_t;
+  // super-early success test...
+  if (PROTOBUF_PREDICT_TRUE(((data.data) & 0x80FF) == 0)) {
+    ptr += sizeof(TagType);  // Consume tag
+    if (hasbit_idx < 32) {
+      hasbits |= (uint64_t{1} << hasbit_idx);
+    }
+    uint8_t value = data.data >> 8;
+    RefAt<FieldType>(msg, data_offset) = value;
+    ptr += 1;
+    PROTOBUF_MUSTTAIL return ToTagDispatch(PROTOBUF_TC_PARAM_PASS);
+  }
+  if (PROTOBUF_PREDICT_FALSE(data.coded_tag<TagType>() != 0)) {
+    PROTOBUF_MUSTTAIL return MiniParse(PROTOBUF_TC_PARAM_PASS);
+  }
+  ptr += sizeof(TagType);  // Consume tag
+  if (hasbit_idx < 32) {
+    hasbits |= (uint64_t{1} << hasbit_idx);
+  }
+
+  auto tmp = Parse64FallbackPair(ptr, static_cast<int8_t>(data.data >> 8));
+  data.data = 0;  // Indicate to the compiler that we don't need this anymore.
+  ptr = tmp.first;
+  if (PROTOBUF_PREDICT_FALSE(ptr == nullptr)) {
+    return Error(PROTOBUF_TC_PARAM_PASS);
+  }
+
+  RefAt<FieldType>(msg, data_offset) = static_cast<FieldType>(tmp.second);
+  PROTOBUF_MUSTTAIL return ToTagDispatch(PROTOBUF_TC_PARAM_PASS);
+}
+
+template <typename FieldType, int data_offset, int hasbit_idx>
+PROTOBUF_NOINLINE const char* TcParser::FastTV32S1(PROTOBUF_TC_PARAM_DECL) {
  using TagType = uint8_t;
  // super-early success test...
  if (PROTOBUF_PREDICT_TRUE(((data.data) & 0x80FF) == 0)) {
@ -800,34 +959,30 @@ PROTOBUF_NOINLINE const char* TcParser::SpecializedUnrolledVImpl1(
        if (PROTOBUF_PREDICT_FALSE(res & 0x80)) {
          res = RotRight7AndReplaceLowByte(res, ptr[4]);
          if (PROTOBUF_PREDICT_FALSE(res & 0x80)) {
-            res = RotRight7AndReplaceLowByte(res, ptr[5]);
-            if (PROTOBUF_PREDICT_FALSE(res & 0x80)) {
-              res = RotRight7AndReplaceLowByte(res, ptr[6]);
-              if (PROTOBUF_PREDICT_FALSE(res & 0x80)) {
-                res = RotRight7AndReplaceLowByte(res, ptr[7]);
-                if (PROTOBUF_PREDICT_FALSE(res & 0x80)) {
-                  res = RotRight7AndReplaceLowByte(res, ptr[8]);
-                  if (PROTOBUF_PREDICT_FALSE(res & 0x80)) {
+            if (PROTOBUF_PREDICT_FALSE(ptr[5] & 0x80)) {
+              if (PROTOBUF_PREDICT_FALSE(ptr[6] & 0x80)) {
+                if (PROTOBUF_PREDICT_FALSE(ptr[7] & 0x80)) {
+                  if (PROTOBUF_PREDICT_FALSE(ptr[8] & 0x80)) {
                    if (ptr[9] & 0xFE) return Error(PROTOBUF_TC_PARAM_PASS);
-                    res = RotateLeft(res, -7) & ~1;
-                    res += ptr[9] & 1;
-                    *out = RotateLeft(res, 63);
+                    *out = RotateLeft(res, 28);
                    ptr += 10;
-                    PROTOBUF_MUSTTAIL return ToTagDispatch(PROTOBUF_TC_PARAM_PASS);
+                    PROTOBUF_MUSTTAIL return ToTagDispatch(
+                        PROTOBUF_TC_PARAM_PASS);
                  }
-                  *out = RotateLeft(res, 56);
+                  *out = RotateLeft(res, 28);
                  ptr += 9;
-                  PROTOBUF_MUSTTAIL return ToTagDispatch(PROTOBUF_TC_PARAM_PASS);
+                  PROTOBUF_MUSTTAIL return ToTagDispatch(
+                      PROTOBUF_TC_PARAM_PASS);
                }
-                *out = RotateLeft(res, 49);
+                *out = RotateLeft(res, 28);
                ptr += 8;
                PROTOBUF_MUSTTAIL return ToTagDispatch(PROTOBUF_TC_PARAM_PASS);
              }
-              *out = RotateLeft(res, 42);
+              *out = RotateLeft(res, 28);
              ptr += 7;
              PROTOBUF_MUSTTAIL return ToTagDispatch(PROTOBUF_TC_PARAM_PASS);
            }
-            *out = RotateLeft(res, 35);
+            *out = RotateLeft(res, 28);
            ptr += 6;
            PROTOBUF_MUSTTAIL return ToTagDispatch(PROTOBUF_TC_PARAM_PASS);
          }
--- a/src/google/protobuf/generated_message_tctable_lite.cc
+++ b/src/google/protobuf/generated_message_tctable_lite.cc
@ -58,8 +58,18 @@ using FieldEntry = TcParseTableBase::FieldEntry;
 //////////////////////////////////////////////////////////////////////////////

 #ifndef NDEBUG
-template void AlignFail<4>(uintptr_t);
-template void AlignFail<8>(uintptr_t);
+void AlignFail(std::integral_constant<size_t, 4>, std::uintptr_t address) {
+  GOOGLE_ABSL_LOG(FATAL) << "Unaligned (4) access at " << address;
+
+  // Explicit abort to let compilers know this function does not return
+  abort();
+}
+void AlignFail(std::integral_constant<size_t, 8>, std::uintptr_t address) {
+  GOOGLE_ABSL_LOG(FATAL) << "Unaligned (8) access at " << address;
+
+  // Explicit abort to let compilers know this function does not return
+  abort();
+}
 #endif

 const char* TcParser::GenericFallbackLite(PROTOBUF_TC_PARAM_DECL) {
@ -257,7 +267,7 @@ absl::string_view TcParser::FieldName(const TcParseTableBase* table,
 }

 template <bool export_called_function>
-inline PROTOBUF_ALWAYS_INLINE const char* TcParser::MiniParseImpl(
+inline PROTOBUF_ALWAYS_INLINE const char* TcParser::MiniParse(
    PROTOBUF_TC_PARAM_DECL) {
  TestMiniParseResult* test_out;
  if (export_called_function) {
@ -342,13 +352,13 @@ inline PROTOBUF_ALWAYS_INLINE const char* TcParser::MiniParseImpl(
 }

 PROTOBUF_NOINLINE const char* TcParser::MiniParse(PROTOBUF_TC_PARAM_DECL) {
-  PROTOBUF_MUSTTAIL return MiniParseImpl<false>(PROTOBUF_TC_PARAM_PASS);
+  PROTOBUF_MUSTTAIL return MiniParse<false>(PROTOBUF_TC_PARAM_PASS);
 }
 PROTOBUF_NOINLINE TcParser::TestMiniParseResult TcParser::TestMiniParse(
    PROTOBUF_TC_PARAM_DECL) {
  TestMiniParseResult result = {};
  data.data = reinterpret_cast<uintptr_t>(&result);
-  result.ptr = MiniParseImpl<true>(PROTOBUF_TC_PARAM_PASS);
+  result.ptr = MiniParse<true>(PROTOBUF_TC_PARAM_PASS);
  return result;
 }

@ -678,119 +688,6 @@ PROTOBUF_NOINLINE const char* TcParser::FastF64P2(PROTOBUF_TC_PARAM_DECL) {

 namespace {

-// Shift "byte" left by n * 7 bits, filling vacated bits with ones.
-template <int n>
-inline PROTOBUF_ALWAYS_INLINE uint64_t
-shift_left_fill_with_ones(uint64_t byte, uint64_t ones) {
-  return (byte << (n * 7)) | (ones >> (64 - (n * 7)));
-}
-
-// Shift "byte" left by n * 7 bits, filling vacated bits with ones, and
-// put the new value in res.  Return whether the result was negative.
-template <int n>
-inline PROTOBUF_ALWAYS_INLINE bool shift_left_fill_with_ones_was_negative(
-    uint64_t byte, uint64_t ones, int64_t& res) {
-#if defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(__x86_64__)
-  // For the first two rounds (ptr[1] and ptr[2]), micro benchmarks show a
-  // substantial improvement from capturing the sign from the condition code
-  // register on x86-64.
-  bool sign_bit;
-  asm("shldq %3, %2, %1"
-      : "=@ccs"(sign_bit), "+r"(byte)
-      : "r"(ones), "i"(n * 7));
-  res = byte;
-  return sign_bit;
-#else
-  // Generic fallback:
-  res = (byte << (n * 7)) | (ones >> (64 - (n * 7)));
-  return static_cast<int64_t>(res) < 0;
-#endif
-}
-
-inline PROTOBUF_ALWAYS_INLINE std::pair<const char*, uint64_t>
-Parse64FallbackPair(const char* p, int64_t res1) {
-  auto ptr = reinterpret_cast<const int8_t*>(p);
-
-  // The algorithm relies on sign extension for each byte to set all high bits
-  // when the varint continues. It also relies on asserting all of the lower
-  // bits for each successive byte read. This allows the result to be aggregated
-  // using a bitwise AND. For example:
-  //
-  //          8       1          64     57 ... 24     17  16      9  8       1
-  // ptr[0] = 1aaa aaaa ; res1 = 1111 1111 ... 1111 1111  1111 1111  1aaa aaaa
-  // ptr[1] = 1bbb bbbb ; res2 = 1111 1111 ... 1111 1111  11bb bbbb  b111 1111
-  // ptr[2] = 1ccc cccc ; res3 = 0000 0000 ... 000c cccc  cc11 1111  1111 1111
-  //                             ---------------------------------------------
-  //        res1 & res2 & res3 = 0000 0000 ... 000c cccc  ccbb bbbb  baaa aaaa
-  //
-  // On x86-64, a shld from a single register filled with enough 1s in the high
-  // bits can accomplish all this in one instruction. It so happens that res1
-  // has 57 high bits of ones, which is enough for the largest shift done.
-  GOOGLE_ABSL_DCHECK_EQ(res1 >> 7, -1);
-  uint64_t ones = res1;  // save the high 1 bits from res1 (input to SHLD)
-  int64_t res2, res3;    // accumulated result chunks
-
-  if (!shift_left_fill_with_ones_was_negative<1>(ptr[1], ones, res2))
-    goto done2;
-  if (!shift_left_fill_with_ones_was_negative<2>(ptr[2], ones, res3))
-    goto done3;
-
-  // For the remainder of the chunks, check the sign of the AND result.
-  res1 &= shift_left_fill_with_ones<3>(ptr[3], ones);
-  if (res1 >= 0) goto done4;
-  res2 &= shift_left_fill_with_ones<4>(ptr[4], ones);
-  if (res2 >= 0) goto done5;
-  res3 &= shift_left_fill_with_ones<5>(ptr[5], ones);
-  if (res3 >= 0) goto done6;
-  res1 &= shift_left_fill_with_ones<6>(ptr[6], ones);
-  if (res1 >= 0) goto done7;
-  res2 &= shift_left_fill_with_ones<7>(ptr[7], ones);
-  if (res2 >= 0) goto done8;
-  res3 &= shift_left_fill_with_ones<8>(ptr[8], ones);
-  if (res3 >= 0) goto done9;
-
-  // For valid 64bit varints, the 10th byte/ptr[9] should be exactly 1. In this
-  // case, the continuation bit of ptr[8] already set the top bit of res3
-  // correctly, so all we have to do is check that the expected case is true.
-  if (PROTOBUF_PREDICT_TRUE(ptr[9] == 1)) goto done10;
-
-  // A value of 0, however, represents an over-serialized varint. This case
-  // should not happen, but if does (say, due to a nonconforming serializer),
-  // deassert the continuation bit that came from ptr[8].
-  if (ptr[9] == 0) {
-#if defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(__x86_64__)
-    // Use a small instruction since this is an uncommon code path.
-    asm("btcq $63,%0" : "+r"(res3));
-#else
-    res3 ^= static_cast<uint64_t>(1) << 63;
-#endif
-    goto done10;
-  }
-
-  // If the 10th byte/ptr[9] itself has any other value, then it is too big to
-  // fit in 64 bits. If the continue bit is set, it is an unterminated varint.
-  return {nullptr, 0};
-
-done2:
-  return {p + 2, res1 & res2};
-done3:
-  return {p + 3, res1 & res2 & res3};
-done4:
-  return {p + 4, res1 & res2 & res3};
-done5:
-  return {p + 5, res1 & res2 & res3};
-done6:
-  return {p + 6, res1 & res2 & res3};
-done7:
-  return {p + 7, res1 & res2 & res3};
-done8:
-  return {p + 8, res1 & res2 & res3};
-done9:
-  return {p + 9, res1 & res2 & res3};
-done10:
-  return {p + 10, res1 & res2 & res3};
-}
-
 template <typename Type>
 inline PROTOBUF_ALWAYS_INLINE const char* ParseVarint(const char* p,
                                                      Type* value) {
@ -969,7 +866,7 @@ PROTOBUF_NOINLINE const char* TcParser::SingularVarBigint(
 }

 PROTOBUF_NOINLINE const char* TcParser::FastV8S1(PROTOBUF_TC_PARAM_DECL) {
-  PROTOBUF_MUSTTAIL return SpecializedFastV8S1<-1, -1>(PROTOBUF_TC_PARAM_PASS);
+  PROTOBUF_MUSTTAIL return FastTV8S1<-1, -1>(PROTOBUF_TC_PARAM_PASS);
 }
 PROTOBUF_NOINLINE const char* TcParser::FastV8S2(PROTOBUF_TC_PARAM_DECL) {
  PROTOBUF_MUSTTAIL return SingularVarint<bool, uint16_t>(
--- a/src/google/protobuf/generated_message_tctable_lite_test.cc
+++ b/src/google/protobuf/generated_message_tctable_lite_test.cc
@ -33,6 +33,7 @@
 #include "google/protobuf/generated_message_tctable_impl.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/types/optional.h"
 #include "google/protobuf/wire_format_lite.h"

 namespace google {
@ -43,6 +44,210 @@ namespace {

 using ::testing::Eq;
 using ::testing::Not;
+using ::testing::Optional;
+
+// The fast parser's dispatch table Xors two bytes of incoming data with
+// the data in TcFieldData, so we reproduce that here:
+TcFieldData Xor2SerializedBytes(TcFieldData tfd, const char* ptr) {
+  uint64_t twobytes = 0xFF & ptr[0];
+  twobytes |= (0xFF & ptr[1]) << 8;
+  tfd.data ^= twobytes;
+  return tfd;
+}
+
+absl::optional<const char*> fallback_ptr_received;
+absl::optional<uint64_t> fallback_hasbits_received;
+absl::optional<uint64_t> fallback_tag_received;
+const char* FastParserGaveUp(::google::protobuf::MessageLite*, const char* ptr,
+                             ::google::protobuf::internal::ParseContext*,
+                             ::google::protobuf::internal::TcFieldData data,
+                             const ::google::protobuf::internal::TcParseTableBase*,
+                             uint64_t hasbits) {
+  fallback_ptr_received = ptr;
+  fallback_hasbits_received = hasbits;
+  fallback_tag_received = data.tag();
+  return nullptr;
+}
+
+// To test that we aren't storing too much data, we set up a fake message area
+// and fill all its bytes with kDND.
+constexpr char kDND = 0x5A;  // "Do Not Disturb"
+
+// To retrieve data and see if it matches what we expect, we have this routine
+// which simultaneously reads the data we want, and sets it back to what it was
+// before the test, that is, to kDND.  This makes it easier to test at the end
+// that all the original data is undisturbed.
+template <typename T>
+T ReadAndReset(char* p) {
+  T result;
+  memcpy(&result, p, sizeof(result));
+  memset(p, kDND, sizeof(result));
+  return result;
+}
+
+TEST(FastVarints, NameHere) {
+  constexpr uint8_t kHasBitsOffset = 4;
+  constexpr uint8_t kHasBitIndex = 0;
+  constexpr uint8_t kFieldOffset = 24;
+
+  // clang-format off
+  const TcParseTable<0, 1, 0, 0, 2> parse_table = {
+      {
+          kHasBitsOffset,  //
+          0, 0, 0,         // no _extensions_
+          1, 0,            // max_field_number, fast_idx_mask
+          offsetof(decltype(parse_table), field_lookup_table),
+          0xFFFFFFFF - 1,  // skipmap
+          offsetof(decltype(parse_table), field_entries),
+          1,                                             // num_field_entries
+          0,                                             // num_aux_entries
+          offsetof(decltype(parse_table), field_names),  // no aux_entries
+          nullptr,                                       // default instance
+          FastParserGaveUp,                              // fallback
+      },
+      // Fast Table:
+      {{
+          // optional int32 field = 1;
+          {TcParser::SingularVarintNoZag1<::uint32_t, kFieldOffset,
+                                          kHasBitIndex>(),
+           {/* coded_tag= */ 8, kHasBitIndex, /* aux_idx= */ 0, kFieldOffset}},
+      }},
+      // Field Lookup Table:
+      {{65535, 65535}},
+      // Field Entries:
+      {{
+          // This is set to kFkNone to force MiniParse to call the fallback
+          {kFieldOffset, kHasBitsOffset + 0, 0, (field_layout::kFkNone)},
+      }},
+      // no aux_entries
+      {{}},
+  };
+  // clang-format on
+  uint8_t serialize_buffer[64];
+
+  for (int size : {8, 32, 64, -8, -32, -64}) {
+    auto next_i = [](uint64_t i) {
+      // if i + 1 is a power of two, return that.
+      // (This will also match when i == -1, but for this loop we know that will
+      // not happen.)
+      if ((i & (i + 1)) == 0) return i + 1;
+      // otherwise, i is already a power of two, so advance to one less than the
+      // next power of two.
+      return i + (i - 1);
+    };
+    for (uint64_t i = 0; i + 1 != 0; i = next_i(i)) {
+      char fake_msg[64] = {
+          kDND, kDND, kDND, kDND, kDND, kDND, kDND, kDND,  //
+          kDND, kDND, kDND, kDND, kDND, kDND, kDND, kDND,  //
+          kDND, kDND, kDND, kDND, kDND, kDND, kDND, kDND,  //
+          kDND, kDND, kDND, kDND, kDND, kDND, kDND, kDND,  //
+          kDND, kDND, kDND, kDND, kDND, kDND, kDND, kDND,  //
+          kDND, kDND, kDND, kDND, kDND, kDND, kDND, kDND,  //
+          kDND, kDND, kDND, kDND, kDND, kDND, kDND, kDND,  //
+          kDND, kDND, kDND, kDND, kDND, kDND, kDND, kDND,  //
+      };
+      memset(&fake_msg[kHasBitsOffset], 0, sizeof(uint32_t));
+
+      auto serialize_ptr = WireFormatLite::WriteUInt64ToArray(
+          /* field_number= */ 1, i, serialize_buffer);
+      absl::string_view serialized{
+          reinterpret_cast<char*>(&serialize_buffer[0]),
+          static_cast<size_t>(serialize_ptr - serialize_buffer)};
+
+      const char* ptr = nullptr;
+      const char* end_ptr = nullptr;
+      ParseContext ctx(io::CodedInputStream::GetDefaultRecursionLimit(),
+                       /* aliasing= */ false, &ptr, serialized);
+#if 0  // FOR_DEBUGGING
+      GOOGLE_ABSL_LOG(ERROR) << "size=" << size << " i=" << i << " ptr points to "  //
+                      << +ptr[0] << "," << +ptr[1] << ","                    //
+                      << +ptr[2] << "," << +ptr[3] << ","                    //
+                      << +ptr[4] << "," << +ptr[5] << ","                    //
+                      << +ptr[6] << "," << +ptr[7] << ","                    //
+                      << +ptr[8] << "," << +ptr[9] << "," << +ptr[10] << "\n";
+#endif
+      TailCallParseFunc fn = nullptr;
+      switch (size) {
+        case 8:
+          fn = &TcParser::FastV8S1;
+          break;
+        case -8:
+          fn = &TcParser::FastTV8S1<kFieldOffset, kHasBitIndex>;
+          break;
+        case 32:
+          fn = &TcParser::FastV32S1;
+          break;
+        case -32:
+          fn = &TcParser::FastTV32S1<uint32_t, kFieldOffset, kHasBitIndex>;
+          break;
+        case 64:
+          fn = &TcParser::FastV64S1;
+          break;
+        case -64:
+          fn = &TcParser::FastTV64S1<uint64_t, kFieldOffset, kHasBitIndex>;
+          break;
+      }
+      fallback_ptr_received = absl::nullopt;
+      fallback_hasbits_received = absl::nullopt;
+      fallback_tag_received = absl::nullopt;
+      end_ptr = fn(reinterpret_cast<MessageLite*>(fake_msg), ptr, &ctx,
+                   Xor2SerializedBytes(parse_table.fast_entries[0].bits, ptr),
+                   &parse_table.header, /*hasbits=*/0);
+      switch (size) {
+        case -8:
+        case 8: {
+          if (end_ptr == nullptr) {
+            // If end_ptr is nullptr, that means the FastParser gave up and
+            // tried to pass control to MiniParse.... which is expected anytime
+            // we encounter something other than 0 or 1 encodings.  (Since
+            // FastV8S1 is only used for `bool` fields.)
+            EXPECT_NE(i, true);
+            EXPECT_NE(i, false);
+            EXPECT_THAT(fallback_hasbits_received, Optional(0));
+            // Like the mini-parser functions, and unlike the fast-parser
+            // functions, the fallback receives a ptr already incremented past
+            // the tag, and receives the actual tag in the `data` parameter.
+            EXPECT_THAT(fallback_ptr_received, Optional(ptr + 1));
+            EXPECT_THAT(fallback_tag_received, Optional(0x7F & *ptr));
+            continue;
+          }
+          ASSERT_EQ(end_ptr - ptr, serialized.size());
+
+          auto actual_field = ReadAndReset<uint8_t>(&fake_msg[kFieldOffset]);
+          EXPECT_EQ(actual_field, static_cast<decltype(actual_field)>(i))  //
+              << " hex: " << absl::StrCat(absl::Hex(actual_field));
+        }; break;
+        case -32:
+        case 32: {
+          ASSERT_EQ(end_ptr - ptr, serialized.size());
+
+          auto actual_field = ReadAndReset<uint32_t>(&fake_msg[kFieldOffset]);
+          EXPECT_EQ(actual_field, static_cast<decltype(actual_field)>(i))  //
+              << " hex: " << absl::StrCat(absl::Hex(actual_field));
+        }; break;
+        case -64:
+        case 64: {
+          ASSERT_EQ(end_ptr - ptr, serialized.size());
+
+          auto actual_field = ReadAndReset<uint64_t>(&fake_msg[kFieldOffset]);
+          EXPECT_EQ(actual_field, static_cast<decltype(actual_field)>(i))  //
+              << " hex: " << absl::StrCat(absl::Hex(actual_field));
+        }; break;
+      }
+      EXPECT_TRUE(!fallback_ptr_received);
+      EXPECT_TRUE(!fallback_hasbits_received);
+      EXPECT_TRUE(!fallback_tag_received);
+      auto hasbits = ReadAndReset<uint32_t>(&fake_msg[kHasBitsOffset]);
+      EXPECT_EQ(hasbits, 1 << kHasBitIndex);
+
+      int offset = 0;
+      for (char ch : fake_msg) {
+        EXPECT_EQ(ch, kDND) << " corruption of message at offset " << offset;
+        ++offset;
+      }
+    }
+  }
+}

 MATCHER_P3(IsEntryForFieldNum, table, field_num, field_numbers_table,
           absl::StrCat(negation ? "isn't " : "",