From ecc9e43eec4ee7d0890edac80f3d555a01549015 Mon Sep 17 00:00:00 2001 From: Protobuf Team Bot Date: Thu, 3 Aug 2023 08:46:59 -0700 Subject: [PATCH] Add a new enum validation format. The format is compact for sequential and almost sequential enums. Uses Eytzinger layout for a fallback binary search. PiperOrigin-RevId: 553489682 --- src/google/protobuf/BUILD.bazel | 12 + src/google/protobuf/generated_enum_util.cc | 149 +++++++ src/google/protobuf/generated_enum_util.h | 53 +++ .../protobuf/generated_enum_util_test.cc | 377 ++++++++++++++++++ 4 files changed, 591 insertions(+) create mode 100644 src/google/protobuf/generated_enum_util_test.cc diff --git a/src/google/protobuf/BUILD.bazel b/src/google/protobuf/BUILD.bazel index 39e62950ff..ae6012349a 100644 --- a/src/google/protobuf/BUILD.bazel +++ b/src/google/protobuf/BUILD.bazel @@ -1490,6 +1490,18 @@ cc_test( ], ) +cc_test( + name = "generated_enum_util_test", + srcs = ["generated_enum_util_test.cc"], + deps = [ + ":protobuf", + "@com_google_absl//absl/container:btree", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest_main", + ], +) + cc_test( name = "retention_test", srcs = ["retention_test.cc"], diff --git a/src/google/protobuf/generated_enum_util.cc b/src/google/protobuf/generated_enum_util.cc index ab21577fff..6d26f2184d 100644 --- a/src/google/protobuf/generated_enum_util.cc +++ b/src/google/protobuf/generated_enum_util.cc @@ -31,9 +31,19 @@ #include "google/protobuf/generated_enum_util.h" #include +#include +#include +#include +#include +#include "absl/log/absl_check.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" #include "google/protobuf/generated_message_util.h" +// Must be included last. 
+#include "google/protobuf/port_def.inc" + namespace google { namespace protobuf { namespace internal { @@ -90,6 +100,145 @@ bool InitializeEnumStrings( return true; } +bool ValidateEnum(int value, const uint32_t* data) { + return ValidateEnumInlined(value, data); +} + +struct EytzingerLayoutSorter { + absl::Span input; + absl::Span output; + size_t i; + + // This is recursive, but the maximum depth is log(N), so it should be safe. + void Sort(size_t output_index = 0) { + if (output_index < input.size()) { + Sort(2 * output_index + 1); + output[output_index] = input[i++]; + Sort(2 * output_index + 2); + } + } +}; + +std::vector GenerateEnumData(absl::Span values) { + const auto sorted_and_unique = [&] { + for (size_t i = 0; i + 1 < values.size(); ++i) { + if (values[i] >= values[i + 1]) return false; + } + return true; + }; + ABSL_DCHECK(sorted_and_unique()); + std::vector fallback_values_too_large, fallback_values_after_bitmap; + std::vector bitmap_values; + constexpr size_t kBitmapBlockSize = 32; + absl::optional start_sequence; + uint32_t sequence_length = 0; + for (int32_t v : values) { + // If we don't yet have a sequence, start it. + if (!start_sequence.has_value()) { + // But only if it fits in the 16-bit sequence start. + if (static_cast(v) != v) { + fallback_values_too_large.push_back(v); + continue; + } + + start_sequence = v; + sequence_length = 1; + continue; + } + // If we can extend the sequence, do so. + if (v == static_cast(*start_sequence) + + static_cast(sequence_length) && + sequence_length < 0xFFFF) { + ++sequence_length; + continue; + } + + // We adjust the bitmap values to be relative to the end of the sequence. + const auto adjust = [&](int32_t v) -> uint32_t { + // Cast to int64_t first to avoid overflow. The result is guaranteed to be + // non-negative and fit in uint32_t. 
+ int64_t a = static_cast(v) - *start_sequence - sequence_length; + ABSL_DCHECK(a >= 0); + ABSL_DCHECK_EQ(a, static_cast(a)); + return a; + }; + const uint32_t adjusted = adjust(v); + + const auto add_bit = [&](uint32_t bit) { + bitmap_values[bit / kBitmapBlockSize] |= uint32_t{1} + << (bit % kBitmapBlockSize); + }; + + // If we can fit it on the already allocated bitmap, do so. + if (adjusted < kBitmapBlockSize * bitmap_values.size()) { + // We can fit it in the existing bitmap. + ABSL_DCHECK_EQ(fallback_values_after_bitmap.size(), 0); + add_bit(adjusted); + continue; + } + + // We can't fit in the sequence and we can't fit in the current bitmap. + // Evaluate if it is better to add to fallback, or to collapse all the + // fallback values after the bitmap into the bitmap. + const size_t cost_if_fallback = + bitmap_values.size() + (1 + fallback_values_after_bitmap.size()); + const size_t rounded_bitmap_size = + (adjusted + kBitmapBlockSize) / kBitmapBlockSize; + const size_t cost_if_collapse = rounded_bitmap_size; + + if (cost_if_collapse <= cost_if_fallback && + kBitmapBlockSize * rounded_bitmap_size < 0x10000) { + // Collapse the existing values, and add the new one. 
+ ABSL_DCHECK_GT(rounded_bitmap_size, bitmap_values.size()); + bitmap_values.resize(rounded_bitmap_size); + for (int32_t to_collapse : fallback_values_after_bitmap) { + add_bit(adjust(to_collapse)); + } + fallback_values_after_bitmap.clear(); + add_bit(adjusted); + } else { + fallback_values_after_bitmap.push_back(v); + } + } + + std::vector fallback_values; + if (fallback_values_after_bitmap.empty()) { + fallback_values = std::move(fallback_values_too_large); + } else if (fallback_values_too_large.empty()) { + fallback_values = std::move(fallback_values_after_bitmap); + } else { + fallback_values.resize(fallback_values_too_large.size() + + fallback_values_after_bitmap.size()); + std::merge(fallback_values_too_large.begin(), + fallback_values_too_large.end(), + fallback_values_after_bitmap.begin(), + fallback_values_after_bitmap.end(), &fallback_values[0]); + } + + std::vector output( + 2 /* header: (seq start | seq len), (bitmap len | ordered len) */ + + bitmap_values.size() + fallback_values.size()); + uint32_t* p = output.data(); + + ABSL_DCHECK_EQ(sequence_length, static_cast(sequence_length)); + *p++ = uint32_t{static_cast(start_sequence.value_or(0))} | + (uint32_t{sequence_length} << 16); + ABSL_DCHECK_EQ( + kBitmapBlockSize * bitmap_values.size(), + static_cast(kBitmapBlockSize * bitmap_values.size())); + ABSL_DCHECK_EQ(fallback_values.size(), + static_cast(fallback_values.size())); + *p++ = static_cast(kBitmapBlockSize * bitmap_values.size()) | + static_cast(fallback_values.size() << 16); + p = std::copy(bitmap_values.begin(), bitmap_values.end(), p); + + EytzingerLayoutSorter{fallback_values, + absl::MakeSpan(p, fallback_values.size())} + .Sort(); + + return output; +} + } // namespace internal } // namespace protobuf } // namespace google diff --git a/src/google/protobuf/generated_enum_util.h b/src/google/protobuf/generated_enum_util.h index 1f06f2f6a7..11bf82471c 100644 --- a/src/google/protobuf/generated_enum_util.h +++ 
b/src/google/protobuf/generated_enum_util.h 
@@ -31,9 +31,15 @@ #ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ #define GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__ +#include +#include +#include #include +#include #include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "google/protobuf/explicitly_constructed.h" #include "google/protobuf/message_lite.h" // Must be included last. @@ -75,6 +81,53 @@ PROTOBUF_EXPORT bool InitializeEnumStrings( const EnumEntry* enums, const int* sorted_indices, size_t size, internal::ExplicitlyConstructed* enum_strings); +// The enum validation format is split into 3 parts: +// - A dense sequence, with start+length +// - A variable size presence bitmap (in increments of 32 bits) +// - A variable size int32_t set in Eytzinger order for everything else. +// +// The values are as follows: +// +// 0 - [ sequence start (int16_t) ] | [ sequence size (uint16_t) ] << 16 +// 1 - [ bitmap size in bits (uint16_t) ] | [ ordered size (uint16_t) ] << 16 +// x - [ variable length bitmap ] +// y - [ variable length array of int32_t values ] +// +// where bit 0 of the bitmap is the value right after the end of the sequence. +PROTOBUF_EXPORT bool ValidateEnum(int value, const uint32_t* data); +PROTOBUF_EXPORT std::vector GenerateEnumData( + absl::Span values); + +inline PROTOBUF_ALWAYS_INLINE bool ValidateEnumInlined(int value, + const uint32_t* data) { + const int16_t min_seq = static_cast(data[0] & 0xFFFF); + const uint16_t length_seq = static_cast(data[0] >> 16); + uint64_t adjusted = + static_cast(static_cast(value)) - min_seq; + // Check if the value is within the sequential part. + if (PROTOBUF_PREDICT_TRUE(adjusted < length_seq)) { + return true; + } + + const uint16_t length_bitmap = static_cast(data[1] & 0xFFFF); + adjusted -= length_seq; + // Check if the value is within the bitmap. 
+ if (PROTOBUF_PREDICT_TRUE(adjusted < length_bitmap)) { + return ((data[2 + (adjusted / 32)] >> (adjusted % 32)) & 1) == 1; + } + + // Check if the value is in the ordered part (Eytzinger-layout search). 
+ const uint16_t num_ordered = static_cast(data[1] >> 16); + data += 2 + length_bitmap / 32; + size_t pos = 0; + while (pos < num_ordered) { + const int32_t sample = static_cast(data[pos]); + if (sample == value) return true; + pos = 2 * pos + (sample > value ? 1 : 2); + } + return false; +} + } // namespace internal } // namespace protobuf } // namespace google diff --git a/src/google/protobuf/generated_enum_util_test.cc b/src/google/protobuf/generated_enum_util_test.cc new file mode 100644 index 0000000000..4c298778e7 --- /dev/null +++ b/src/google/protobuf/generated_enum_util_test.cc @@ -0,0 +1,377 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2023 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "google/protobuf/generated_enum_util.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "absl/container/btree_set.h" +#include "absl/strings/str_format.h" +#include "absl/types/span.h" + + +// Must be included last. +#include "google/protobuf/port_def.inc" + +using testing::_; +using testing::ElementsAre; +using testing::Gt; +using testing::IsEmpty; +using testing::SizeIs; + +namespace google { +namespace protobuf { +namespace internal { +namespace { + +TEST(GenerateEnumDataTest, DebugChecks) { +#if GTEST_HAS_DEATH_TEST + // Not unique + EXPECT_DEBUG_DEATH(GenerateEnumData({1, 1}), "sorted_and_unique"); + // Not sorted + EXPECT_DEBUG_DEATH(GenerateEnumData({2, 1}), "sorted_and_unique"); +#endif +} + +uint32_t Make32(uint16_t a, uint16_t b) { return a | (uint32_t{b} << 16); } +std::array Unmake32(uint32_t v) { + return {static_cast(v), static_cast(v >> 16)}; +} + +struct Header { + int16_t sequence_start; + uint16_t sequence_length; + uint16_t bitmap_length; + uint16_t ordered_length; + + std::string ToString() const { + return absl::StrFormat("(%d,%d,%d,%d)", sequence_start, sequence_length, + bitmap_length, ordered_length); + } + friend std::ostream& operator<<(std::ostream& os, Header header) { + return os << header.ToString(); + } +}; + +MATCHER_P4(HeaderHas, sequence_start, sequence_length, bitmap_length, + ordered_length, "") { + return 
testing::ExplainMatchResult(sequence_start, arg.sequence_start, + result_listener) && + testing::ExplainMatchResult(sequence_length, arg.sequence_length, + result_listener) && + testing::ExplainMatchResult(bitmap_length, arg.bitmap_length, + result_listener) && + testing::ExplainMatchResult(ordered_length, arg.ordered_length, + result_listener); +} + +Header ExtractHeader(absl::Span data) { + return { + static_cast(Unmake32(data[0])[0]), + Unmake32(data[0])[1], + Unmake32(data[1])[0], + Unmake32(data[1])[1], + }; +} + +TEST(GenerateEnumDataTest, BitmapSpaceOptimizationWorks) { + std::vector values = {0}; + + auto encoded = GenerateEnumData(values); + EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 0, 0)); + EXPECT_THAT(encoded, SizeIs(2)); + + // Adding one large value puts it on the fallback. + values.push_back(100); + encoded = GenerateEnumData(values); + EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 0, 1)); + EXPECT_THAT(encoded, SizeIs(3)); + + // Adding a second one still prefers the fallback. + values.push_back(101); + encoded = GenerateEnumData(values); + EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 0, 2)); + EXPECT_THAT(encoded, SizeIs(4)); + + // Adding two more now makes bitmap more efficient, so they are collapsed + // to it. Because we can fit the bitmap in 128 bits, which is the same as the + // ints. + values.push_back(102); + values.push_back(103); + encoded = GenerateEnumData(values); + EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 128, 0)); + EXPECT_THAT(encoded, SizeIs(6)); + + // Add one value that falls into the existing bitmap, nothing changes. + values.push_back(104); + encoded = GenerateEnumData(values); + EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 128, 0)); + EXPECT_THAT(encoded, SizeIs(6)); + + // Add one value that is in the next 32 bits. It should grow the bitmap. 
+ values.push_back(130); + encoded = GenerateEnumData(values); + EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 160, 0)); + EXPECT_THAT(encoded, SizeIs(7)); + + // Add one value far away, it should go into fallback. + values.push_back(200); + encoded = GenerateEnumData(values); + EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 160, 1)); + EXPECT_THAT(encoded, SizeIs(8)); + + // Another in the next 32-bit block will still make them fallback. + values.push_back(230); + encoded = GenerateEnumData(values); + EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 160, 2)); + EXPECT_THAT(encoded, SizeIs(9)); + + // One more in that same block should collapse them to bitmap. + values.push_back(231); + encoded = GenerateEnumData(values); + EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 256, 0)); + EXPECT_THAT(encoded, SizeIs(10)); +} + +void GatherValidValues(absl::Span data, int32_t min, + int32_t max, absl::btree_set& out) { + if (min >= max) return; + for (int32_t i = min;; ++i) { + if (ValidateEnum(i, data.data())) out.insert(i); + // We check the top limit before ++i to avoid overflows + if (i == max) break; + } +} + +std::vector GetValidValues(absl::Span data, + int32_t min, int32_t max) { + // Btree to keep them sorted. Makes testing easier. 
+ absl::btree_set s; + GatherValidValues(data, min, max, s); + return std::vector(s.begin(), s.end()); +} + +TEST(ValidateEnumTest, SequentialRangeTest) { + EXPECT_THAT(GetValidValues({0, 0}, -100, 100), ElementsAre()); + EXPECT_THAT(GetValidValues( + {// sequence start=5, length=3 + Make32(5, 3), + // no bitmap, no fallback + Make32(0, 0)}, + -100, 100), + ElementsAre(5, 6, 7)); + EXPECT_THAT(GetValidValues( + {// sequence start=-2, length=10 + Make32(-2, 10), + // no bitmap, no fallback + Make32(0, 0)}, + -100, 100), + ElementsAre(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7)); +} + +TEST(ValidateEnumTest, BitmapRangeTest) { + EXPECT_THAT(GetValidValues( + {// no sequence + Make32(0, 0), + // bitmap of 32 bits, no fallback + Make32(32, 0), + // bitmap + 0b10011010101}, + -100, 100), + ElementsAre(0, 2, 4, 6, 7, 10)); + EXPECT_THAT(GetValidValues( + {// no sequence + Make32(0, 0), + // bitmap of 64 bits, no fallback + Make32(64, 0), + // bitmap + (1 << 4) | (1 << 21), 1 << 6}, + -100, 100), + ElementsAre(4, 21, 32 + 6)); +} + +TEST(ValidateEnumTest, GenerateEnumDataSequential) { + EXPECT_THAT(GenerateEnumData({0, 1, 2, 3}), ElementsAre( + // sequence start=0, length=4 + Make32(0, 4), + // no bitmap, no fallback. + Make32(0, 0))); + EXPECT_THAT(GenerateEnumData({-2, -1, 0, 1, 2, 3}), + ElementsAre( + // sequence start=-2, length=6 + Make32(-2, 6), + // no bitmap, no fallback. + Make32(0, 0))); +} + +void TestRoundTrip(absl::Span values, int line) { + auto encoded = GenerateEnumData(values); + + absl::btree_set s; + + // We test that all elements in `values` exist in the encoded data, and also + // test a range of other values to verify that they do not exist in the + // encoded data. + + // We keep track of the max seen to avoid testing the same values many times. 
+ int64_t max_seen = std::numeric_limits::min(); + const auto gather_valid_values_around = [&](int32_t v) { + int32_t min = std::max({ + static_cast(v) - 100, + static_cast(std::numeric_limits::min()), + max_seen, + }); + int32_t max = + std::min(static_cast(v) + 100, + static_cast(std::numeric_limits::max())); + max_seen = std::max(max_seen, int64_t{max}); + GatherValidValues(encoded, min, max, s); + }; + + // We look at a few values around the expected ones. + // We could in theory test the whole int32_t domain, but that takes too long + // to run. + for (int32_t v : values) { + gather_valid_values_around(v); + } + // Also gather some around 0, just to add more coverage, especially when + // `values` is empty. + gather_valid_values_around(0); + + // Skip the checks below if we are correct because they are expensive. + if (std::equal(s.begin(), s.end(), values.begin(), values.end())) return; + + std::vector false_negatives; + for (int32_t v : values) { + if (!ValidateEnum(v, encoded.data())) false_negatives.push_back(v); + s.erase(v); + } + const auto& false_positives = s; + const auto print_data = [&] { + auto header = ExtractHeader(encoded); + return absl::StrFormat("line=%d header=%s", line, header.ToString()); + }; + EXPECT_THAT(false_negatives, IsEmpty()) + << "Missing values from the input. " << print_data() + << "\nEncoded: " << testing::PrintToString(encoded); + EXPECT_THAT(false_positives, IsEmpty()) + << "Found values not in input. " << print_data() + << "\nEncoded: " << testing::PrintToString(encoded); +} + +TEST(ValidateEnumTest, GenerateEnumDataSequentialWithOverflow) { + std::vector values; + for (int32_t i = -33000; i < 33000; ++i) { + values.push_back(i); + } + const auto data = GenerateEnumData(values); + EXPECT_THAT(ExtractHeader(data), + HeaderHas( + // The sequence starts at the minimum possible value, + std::numeric_limits::min(), + // and it is as long as possible. 
+ 0xFFFF, + // we have some values in the bitmap + Gt(0), + // we have some in the fallback + Gt(0))); + + TestRoundTrip(values, __LINE__); +} + +TEST(ValidateEnumTest, GenerateEnumDataBitmap) { + EXPECT_THAT(GenerateEnumData({0, 1, 2, 4, 8, 16, 32}), + ElementsAre(Make32(0, 3), Make32(32, 0), + 0b100000000000000010000000100010)); + TestRoundTrip({}, __LINE__); + TestRoundTrip({0, 1, 2, 4, 8, 16}, __LINE__); + TestRoundTrip({0, 1, 2, 4, 8, 16, 32, 64, 128, 256}, __LINE__); + TestRoundTrip({10000, 10001, 10002, 10004, 10006, 10008, 10010}, __LINE__); + TestRoundTrip({std::numeric_limits::min(), -123123, -123, 213, + 213213, std::numeric_limits::max()}, + __LINE__); +} + +TEST(ValidateEnumTest, GenerateEnumDataBitmapWithOverflow) { + std::vector values; + // We step by 10 to guarantee each new value is more cost effective to add to + // the bitmap, which would cause an overflow of the 16-bit bitmap size if we + // didn't prevent it in the generator. + for (int32_t i = -33000; i < 33000; i += 10) { + values.push_back(i); + } + const auto data = GenerateEnumData(values); + + EXPECT_THAT(ExtractHeader(data), + HeaderHas(_, _, + // we reached the maximum size for the bitmap. + 0x10000 - 32, + // we have some in the fallback + Gt(0))); + + TestRoundTrip(values, __LINE__); +} + +TEST(ValidateEnumTest, GenerateEnumDataWithOverflowOnBoth) { + std::vector values; + for (int32_t i = -33000; i < 100000; ++i) { + values.push_back(i); + } + const auto data = GenerateEnumData(values); + + EXPECT_THAT(ExtractHeader(data), + HeaderHas( + // The sequence starts at the minimum possible value, + std::numeric_limits::min(), + // and it is as long as possible. + 0xFFFF, + // we reached the maximum size for the bitmap. + 0x10000 - 32, + // we have some in the fallback + Gt(0))); + + TestRoundTrip(values, __LINE__); +} + + +} // namespace +} // namespace internal +} // namespace protobuf +} // namespace google + +#include "google/protobuf/port_undef.inc"