Clean up a few issues with ARM-optimized varint decoding.

1) Move the check for an invalid varint out of the path that handles a varint of >= 32 bits. This gives a modest speedup (+0.5%) on the [fleetbench protogen benchmark](https://github.com/google/fleetbench/tree/main/fleetbench/proto) by working around an LLVM issue that inserted a redundant branch.
2) Move ValueBarrier() to parse_context.h so we can delete ForceToRegister(), which does the same thing as the single-value ValueBarrier() overload; a standalone sketch of the barrier idiom follows below.
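For readers unfamiliar with the idiom, here is a minimal, self-contained sketch of the ValueBarrier() trick that change 2 consolidates. This is not the protobuf source; the main() driver and printed value are illustrative only, and the asm statement assumes a GCC/Clang-style inline-asm extension, the same one parse_context.h relies on.

#include <cstdint>
#include <cstdio>

// Falsely tell the compiler that `value` is modified here. Computations that
// depend on `value` cannot be hoisted above this point, and `value` is forced
// into a register.
template <typename T>
inline T ValueBarrier(T value) {
  asm("" : "+r"(value));
  return value;
}

// Two-argument form: additionally pretend `value1` depends on `value2`, so
// this statement is ordered after `value2` becomes available.
template <typename T, typename U>
inline T ValueBarrier(T value1, U value2) {
  asm("" : "+r"(value1) : "r"(value2));
  return value1;
}

int main() {
  // Without the barrier the compiler could fold this constant into later
  // instructions; with it, the mask is materialized in a register once.
  uint64_t mask = ValueBarrier(uint64_t{0x8080808080808080});
  std::printf("%016llx\n", static_cast<unsigned long long>(mask));
  return 0;
}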

PiperOrigin-RevId: 504883550
Branch: pull/11644/head
Authored by Protobuf Team Bot 2 years ago; committed by Copybara-Service
Parent: 75088504b0 · Commit: bbe2e68686

Changed files:
  src/google/protobuf/parse_context.cc (38 changed lines)
  src/google/protobuf/parse_context.h (20 changed lines)

src/google/protobuf/parse_context.cc

@@ -677,27 +677,12 @@ const char* UnknownFieldParse(uint32_t tag, std::string* unknown,
 // ABSL_LOG(FATAL) << "Invalid num_bits value";
 // }
 // ```
-template <typename V1Type, typename V2Type>
-PROTOBUF_ALWAYS_INLINE inline V1Type ValueBarrier(V1Type value1,
-                                                  V2Type value2) {
-  asm("" : "+r"(value1) : "r"(value2));
-  return value1;
-}
-
-// Falsely indicate that the specific value is modified at this location. This
-// prevents code which depends on this value from being scheduled earlier.
-template <typename V1Type>
-PROTOBUF_ALWAYS_INLINE inline V1Type ValueBarrier(V1Type value1) {
-  asm("" : "+r"(value1));
-  return value1;
-}
-
 PROTOBUF_ALWAYS_INLINE inline uint64_t ExtractAndMergeTwoChunks(
     uint64_t data, uint64_t first_byte) {
   ABSL_DCHECK_LE(first_byte, 6);
   uint64_t first = Ubfx7(data, first_byte * 8);
   uint64_t second = Ubfx7(data, (first_byte + 1) * 8);
-  return ForceToRegister(first | (second << 7));
+  return ValueBarrier(first | (second << 7));
 }
 
 struct SlowPathEncodedInfo {
@@ -716,9 +701,9 @@ PROTOBUF_ALWAYS_INLINE inline SlowPathEncodedInfo ComputeLengthAndUpdateP(
   SlowPathEncodedInfo result;
   // Load the last two bytes of the encoded Varint.
   std::memcpy(&result.last8, p + 2, sizeof(result.last8));
-  uint64_t mask = ForceToRegister(0x8080808080808080);
+  uint64_t mask = ValueBarrier(0x8080808080808080);
   // Only set continuation bits remain
-  result.masked_cont_bits = ForceToRegister(mask & (~result.last8));
+  result.masked_cont_bits = ValueBarrier(mask & (~result.last8));
   // The first cleared continuation bit is the most significant 1 in the
   // reversed value. Result is undefined for an input of 0 and we handle that
   // case below.
@@ -758,9 +743,11 @@ PROTOBUF_NOINLINE const char* VarintParseSlowArm64(const char* p, uint64_t* out,
                     merged_45 << kFirstResultBitChunk4;
   // This immediate ends in 14 zeroes since valid_chunk_bits is too low by 14.
   uint64_t result_mask = kResultMaskUnshifted << info.valid_chunk_bits;
-  // masked_cont_bits is 0 iff the Varint is invalid. In that case
-  info.valid_bits =
-      info.masked_cont_bits ? info.valid_bits : kValidBitsForInvalidVarint;
+  // masked_cont_bits is 0 iff the Varint is invalid.
+  if (PROTOBUF_PREDICT_FALSE(!info.masked_cont_bits)) {
+    *out = 0;
+    return nullptr;
+  }
   // Test for early exit if Varint does not exceed 6 chunks. Branching on one
   // bit is faster on ARM than via a compare and branch.
   if (PROTOBUF_PREDICT_FALSE((info.valid_bits & 0x20) != 0)) {
@@ -772,11 +759,6 @@ PROTOBUF_NOINLINE const char* VarintParseSlowArm64(const char* p, uint64_t* out,
     result |= merged_67 << kFirstResultBitChunk6;
     result |= merged_89 << kFirstResultBitChunk8;
     // Handle an invalid Varint with all 10 continuation bits set.
-    info.masked_cont_bits = ValueBarrier(info.masked_cont_bits);
-    if (PROTOBUF_PREDICT_FALSE(info.masked_cont_bits == 0)) {
-      *out = 0;
-      return nullptr;
-    }
   }
   // Mask off invalid data bytes.
   result &= ~result_mask;
@@ -795,8 +777,8 @@ PROTOBUF_NOINLINE const char* VarintParseSlowArm32(const char* p, uint32_t* out,
   uint64_t merged_34 = ExtractAndMergeTwoChunks(first8, /*first_chunk=*/3);
   first8 = ValueBarrier(first8, p);
   uint64_t result = Ubfx7(first8, /*start=*/0);
-  result = ForceToRegister(result | merged_12 << kFirstResultBitChunk1);
-  result = ForceToRegister(result | merged_34 << kFirstResultBitChunk3);
+  result = ValueBarrier(result | merged_12 << kFirstResultBitChunk1);
+  result = ValueBarrier(result | merged_34 << kFirstResultBitChunk3);
   uint64_t result_mask = kResultMaskUnshifted << info.valid_chunk_bits;
   result &= ~result_mask;
   // It is extremely unlikely that a Varint is invalid so checking that

src/google/protobuf/parse_context.h

@@ -630,17 +630,25 @@ inline const char* VarintParseSlowArm(const char* p, uint64_t* out,
   return VarintParseSlowArm64(p, out, first8);
 }
 
-// Moves data into a register and returns data. This impacts the compiler's
-// scheduling and instruction selection decisions.
-static PROTOBUF_ALWAYS_INLINE inline uint64_t ForceToRegister(uint64_t data) {
-  asm("" : "+r"(data));
-  return data;
+// Falsely indicate that the specific value is modified at this location. This
+// prevents code which depends on this value from being scheduled earlier.
+template <typename V1Type>
+PROTOBUF_ALWAYS_INLINE inline V1Type ValueBarrier(V1Type value1) {
+  asm("" : "+r"(value1));
+  return value1;
 }
+
+template <typename V1Type, typename V2Type>
+PROTOBUF_ALWAYS_INLINE inline V1Type ValueBarrier(V1Type value1,
+                                                  V2Type value2) {
+  asm("" : "+r"(value1) : "r"(value2));
+  return value1;
+}
 
 // Performs a 7 bit UBFX (Unsigned Bit Extract) starting at the indicated bit.
 static PROTOBUF_ALWAYS_INLINE inline uint64_t Ubfx7(uint64_t data,
                                                     uint64_t start) {
-  return ForceToRegister((data >> start) & 0x7f);
+  return ValueBarrier((data >> start) & 0x7f);
 }
 
 #endif  // __aarch64__
