Add a new enum validation format. The format is compact for sequential and

almost sequential enums. Uses Eytzinger layout for a fallback binary search.

PiperOrigin-RevId: 553489682
pull/13451/head
Protobuf Team Bot 1 year ago committed by Copybara-Service
parent 3511f8acf9
commit ecc9e43eec
  1. 12
      src/google/protobuf/BUILD.bazel
  2. 149
      src/google/protobuf/generated_enum_util.cc
  3. 53
      src/google/protobuf/generated_enum_util.h
  4. 377
      src/google/protobuf/generated_enum_util_test.cc

@ -1490,6 +1490,18 @@ cc_test(
],
)
# Test target built from generated_enum_util_test.cc. It depends on the
# ":protobuf" library, the Abseil btree, str_format and span libraries, and
# gtest_main.
cc_test(
name = "generated_enum_util_test",
srcs = ["generated_enum_util_test.cc"],
deps = [
":protobuf",
"@com_google_absl//absl/container:btree",
"@com_google_absl//absl/strings:str_format",
"@com_google_absl//absl/types:span",
"@com_google_googletest//:gtest_main",
],
)
cc_test(
name = "retention_test",
srcs = ["retention_test.cc"],

@ -31,9 +31,19 @@
#include "google/protobuf/generated_enum_util.h"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>
#include "absl/log/absl_check.h"
#include "absl/types/optional.h"
#include "absl/types/span.h"
#include "google/protobuf/generated_message_util.h"
// Must be included last.
#include "google/protobuf/port_def.inc"
namespace google {
namespace protobuf {
namespace internal {
@ -90,6 +100,145 @@ bool InitializeEnumStrings(
return true;
}
// Out-of-line entry point for enum validation: forwards `value` and the
// encoded table `data` to ValidateEnumInlined and returns its result.
bool ValidateEnum(int value, const uint32_t* data) {
return ValidateEnumInlined(value, data);
}
// Copies `input` into `output` so that `output` forms an implicit binary tree
// in which the children of slot k live at slots 2k+1 and 2k+2.
//
// The tree is filled by an in-order walk (left subtree, node, right subtree)
// that consumes `input` front to back, so the in-order reading of `output`
// reproduces the order of `input`.
struct EytzingerLayoutSorter {
  // Elements to place, read sequentially.
  absl::Span<const int32_t> input;
  // Destination slots; indexed with the same bound as `input.size()`.
  absl::Span<uint32_t> output;
  // Position of the next unread element of `input`. It is zero when the struct
  // is aggregate-initialized with only the two spans.
  size_t i;

  // Fills the subtree rooted at `output_index`.
  // The recursion depth is bounded by log(N), so recursing is safe here.
  void Sort(size_t output_index = 0) {
    // Slots past the end of the input do not exist in the tree.
    if (output_index >= input.size()) return;

    const size_t left_child = 2 * output_index + 1;
    const size_t right_child = 2 * output_index + 2;

    Sort(left_child);
    output[output_index] = input[i];
    ++i;
    Sort(right_child);
  }
};
// Encodes `values` into a table of 32-bit words and returns it.
//
// `values` must be strictly increasing; this is verified only by a debug
// check.
//
// Output layout, as written at the end of this function:
//   word 0: sequence start in the low 16 bits, sequence length in the high 16
//   word 1: bitmap size in bits in the low 16 bits, number of fallback values
//           in the high 16
//   then the bitmap words, then the fallback values arranged by
//   EytzingerLayoutSorter.
std::vector<uint32_t> GenerateEnumData(absl::Span<const int32_t> values) {
// Precondition check: every element must be strictly less than its successor.
const auto sorted_and_unique = [&] {
for (size_t i = 0; i + 1 < values.size(); ++i) {
if (values[i] >= values[i + 1]) return false;
}
return true;
};
ABSL_DCHECK(sorted_and_unique());
// `fallback_values_too_large` collects values seen before a sequence could be
// started because they do not fit in int16_t. `fallback_values_after_bitmap`
// collects values that lie past the bitmap and were not worth growing it for.
std::vector<int32_t> fallback_values_too_large, fallback_values_after_bitmap;
std::vector<uint32_t> bitmap_values;
constexpr size_t kBitmapBlockSize = 32;
absl::optional<int16_t> start_sequence;
uint32_t sequence_length = 0;
for (int32_t v : values) {
// If we don't yet have a sequence, start it.
if (!start_sequence.has_value()) {
// But only if we can fit it in the sequence.
if (static_cast<int16_t>(v) != v) {
fallback_values_too_large.push_back(v);
continue;
}
start_sequence = v;
sequence_length = 1;
continue;
}
// If we can extend the sequence, do so.
// The length is capped at 0xFFFF because it is stored in 16 bits.
if (v == static_cast<int32_t>(*start_sequence) +
static_cast<int32_t>(sequence_length) &&
sequence_length < 0xFFFF) {
++sequence_length;
continue;
}
// We adjust the bitmap values to be relative to the end of the sequence.
const auto adjust = [&](int32_t v) -> uint32_t {
// Cast to int64_t first to avoid overflow. The result is guaranteed to be
// positive and fit in uint32_t.
int64_t a = static_cast<int64_t>(v) - *start_sequence - sequence_length;
ABSL_DCHECK(a >= 0);
ABSL_DCHECK_EQ(a, static_cast<uint32_t>(a));
return a;
};
const uint32_t adjusted = adjust(v);
// Sets the bit for an already-adjusted position; the bitmap must be large
// enough to contain it.
const auto add_bit = [&](uint32_t bit) {
bitmap_values[bit / kBitmapBlockSize] |= uint32_t{1}
<< (bit % kBitmapBlockSize);
};
// If we can fit it on the already allocated bitmap, do so.
if (adjusted < kBitmapBlockSize * bitmap_values.size()) {
// We can fit it in the existing bitmap.
ABSL_DCHECK_EQ(fallback_values_after_bitmap.size(), 0);
add_bit(adjusted);
continue;
}
// We can't fit in the sequence and we can't fit in the current bitmap.
// Evaluate if it is better to add to fallback, or to collapse all the
// fallback values after the bitmap into the bitmap.
// Both costs are counted in 32-bit output words: the fallback option keeps
// the current bitmap and spends one word per pending fallback value plus one
// for `v`; the collapse option spends one word per bitmap block up to `v`.
const size_t cost_if_fallback =
bitmap_values.size() + (1 + fallback_values_after_bitmap.size());
const size_t rounded_bitmap_size =
(adjusted + kBitmapBlockSize) / kBitmapBlockSize;
const size_t cost_if_collapse = rounded_bitmap_size;
// The second condition keeps the bitmap size in bits below 0x10000, since it
// is stored in 16 bits of the header.
if (cost_if_collapse <= cost_if_fallback &&
kBitmapBlockSize * rounded_bitmap_size < 0x10000) {
// Collapse the existing values, and add the new one.
ABSL_DCHECK_GT(rounded_bitmap_size, bitmap_values.size());
bitmap_values.resize(rounded_bitmap_size);
for (int32_t to_collapse : fallback_values_after_bitmap) {
add_bit(adjust(to_collapse));
}
fallback_values_after_bitmap.clear();
add_bit(adjusted);
} else {
fallback_values_after_bitmap.push_back(v);
}
}
// Combine the two fallback lists into one sorted list, avoiding the merge
// when either list is empty.
std::vector<int32_t> fallback_values;
if (fallback_values_after_bitmap.empty()) {
fallback_values = std::move(fallback_values_too_large);
} else if (fallback_values_too_large.empty()) {
fallback_values = std::move(fallback_values_after_bitmap);
} else {
fallback_values.resize(fallback_values_too_large.size() +
fallback_values_after_bitmap.size());
std::merge(fallback_values_too_large.begin(),
fallback_values_too_large.end(),
fallback_values_after_bitmap.begin(),
fallback_values_after_bitmap.end(), &fallback_values[0]);
}
// Two header words, followed by the bitmap and the fallback values.
std::vector<uint32_t> output(
2 /* seq start + seq len + bitmap len + ordered len */ +
bitmap_values.size() + fallback_values.size());
uint32_t* p = output.data();
ABSL_DCHECK_EQ(sequence_length, static_cast<uint16_t>(sequence_length));
// When no sequence was started, the header stores start 0 with length 0.
*p++ = uint32_t{static_cast<uint16_t>(start_sequence.value_or(0))} |
(uint32_t{sequence_length} << 16);
// NOTE(review): the 16-bit range of the bitmap size and of the fallback count
// is verified only by these debug checks.
ABSL_DCHECK_EQ(
kBitmapBlockSize * bitmap_values.size(),
static_cast<uint16_t>(kBitmapBlockSize * bitmap_values.size()));
ABSL_DCHECK_EQ(fallback_values.size(),
static_cast<uint16_t>(fallback_values.size()));
*p++ = static_cast<uint32_t>(kBitmapBlockSize * bitmap_values.size()) |
static_cast<uint32_t>(fallback_values.size() << 16);
p = std::copy(bitmap_values.begin(), bitmap_values.end(), p);
// Write the fallback values directly into the tail of `output`, rearranged
// into the implicit-tree order that the lookup loop walks.
EytzingerLayoutSorter{fallback_values,
absl::MakeSpan(p, fallback_values.size())}
.Sort();
return output;
}
} // namespace internal
} // namespace protobuf
} // namespace google

@ -31,9 +31,15 @@
#ifndef GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__
#define GOOGLE_PROTOBUF_GENERATED_ENUM_UTIL_H__
#include <cstddef>
#include <cstdint>
#include <string>
#include <type_traits>
#include <vector>
#include "absl/strings/string_view.h"
#include "absl/types/span.h"
#include "google/protobuf/explicitly_constructed.h"
#include "google/protobuf/message_lite.h"
// Must be included last.
@ -75,6 +81,53 @@ PROTOBUF_EXPORT bool InitializeEnumStrings(
const EnumEntry* enums, const int* sorted_indices, size_t size,
internal::ExplicitlyConstructed<std::string>* enum_strings);
// The enum validation format is split in 3 parts:
// - A dense sequence, with start+length
// - A variable size presence bitmap (in increments of 32 bits)
// - A variable size int32_t set for everything else, stored as an implicit
//   binary search tree: the children of slot i are at slots 2*i+1 and 2*i+2.
//
// The values are as follows:
//
// 0 - [ sequence start (int16_t) ] | [ sequence size (uint16_t) ] << 16
// 1 - [ bitmap size in bits (uint16_t) ] | [ ordered size (uint16_t) ] << 16
// x - [ variable length bitmap ]
// y - [ variable length of int32_t values ]
//
// where the bitmap starts right after the end of the sequence.

// Returns whether `value` is present in the table `data`, which must follow
// the format described above.
PROTOBUF_EXPORT bool ValidateEnum(int value, const uint32_t* data);
// Takes a span of enum numbers and returns a vector of 32-bit words.
// NOTE(review): presumably the returned words are in the format described
// above, suitable for ValidateEnum; confirm against the definition.
PROTOBUF_EXPORT std::vector<uint32_t> GenerateEnumData(
absl::Span<const int32_t> values);
// Returns true if `value` is accepted by the encoded table `data`.
//
// Three regions are consulted in order: the dense sequence described by
// data[0], the presence bitmap whose bit count is the low half of data[1], and
// finally the ordered values whose count is the high half of data[1].
inline PROTOBUF_ALWAYS_INLINE bool ValidateEnumInlined(int value,
                                                       const uint32_t* data) {
  const int16_t seq_start = static_cast<int16_t>(data[0] & 0xFFFF);
  const uint16_t seq_count = static_cast<uint16_t>(data[0] >> 16);

  // Offset of `value` from the start of the sequence. The subtraction is done
  // in unsigned 64-bit arithmetic, so a value below the start wraps to a huge
  // number and fails both range comparisons below.
  uint64_t offset =
      static_cast<uint64_t>(static_cast<int64_t>(value)) - seq_start;

  // Region 1: the dense sequence.
  if (PROTOBUF_PREDICT_TRUE(offset < seq_count)) return true;

  // Region 2: the bitmap, whose bit 0 corresponds to the first value after the
  // sequence.
  const uint16_t bitmap_bits = static_cast<uint16_t>(data[1] & 0xFFFF);
  offset -= seq_count;
  if (PROTOBUF_PREDICT_TRUE(offset < bitmap_bits)) {
    const uint32_t word = data[2 + (offset / 32)];
    return ((word >> (offset % 32)) & 1) == 1;
  }

  // Region 3: the ordered values, located right after the bitmap words. They
  // form an implicit binary tree: from slot k, smaller targets continue at
  // 2k+1 and larger targets at 2k+2.
  const uint16_t ordered_count = static_cast<uint16_t>(data[1] >> 16);
  const uint32_t* const ordered = data + 2 + bitmap_bits / 32;
  for (size_t slot = 0; slot < ordered_count;) {
    const int32_t candidate = static_cast<int32_t>(ordered[slot]);
    if (candidate == value) return true;
    if (candidate > value) {
      slot = 2 * slot + 1;
    } else {
      slot = 2 * slot + 2;
    }
  }
  return false;
}
} // namespace internal
} // namespace protobuf
} // namespace google

@ -0,0 +1,377 @@
// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google Inc. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "google/protobuf/generated_enum_util.h"
#include <algorithm>
#include <array>
#include <cstdint>
#include <limits>
#include <ostream>
#include <string>
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/container/btree_set.h"
#include "absl/strings/str_format.h"
#include "absl/types/span.h"
// Must be included last.
#include "google/protobuf/port_def.inc"
using testing::_;
using testing::ElementsAre;
using testing::Gt;
using testing::IsEmpty;
using testing::SizeIs;
namespace google {
namespace protobuf {
namespace internal {
namespace {
// Checks that GenerateEnumData trips its debug check, with a message matching
// "sorted_and_unique", for input that contains a duplicate or is out of order.
// The body is compiled only when death tests are available.
TEST(GenerateEnumDataTest, DebugChecks) {
#if GTEST_HAS_DEATH_TEST
// Not unique
EXPECT_DEBUG_DEATH(GenerateEnumData({1, 1}), "sorted_and_unique");
// Not sorted
EXPECT_DEBUG_DEATH(GenerateEnumData({2, 1}), "sorted_and_unique");
#endif
}
// Packs two 16-bit halves into a single 32-bit word: `a` occupies the low 16
// bits and `b` the high 16 bits.
uint32_t Make32(uint16_t a, uint16_t b) {
  // Widen before shifting. A uint16_t operand is promoted to signed int, so
  // shifting a value with its top bit set left by 16 would overflow int.
  return static_cast<uint32_t>(a) | (static_cast<uint32_t>(b) << 16);
}
// Splits a 32-bit word into its two 16-bit halves, returned as
// {low half, high half}.
std::array<uint16_t, 2> Unmake32(uint32_t v) {
  const uint16_t low_half = static_cast<uint16_t>(v);
  const uint16_t high_half = static_cast<uint16_t>(v >> 16);
  return {low_half, high_half};
}
// The four 16-bit fields carried by the first two words of an encoded table,
// in the order they are packed.
struct Header {
  int16_t sequence_start;
  uint16_t sequence_length;
  uint16_t bitmap_length;
  uint16_t ordered_length;

  // Renders the fields as "(start,length,bitmap,ordered)".
  std::string ToString() const {
    std::string text =
        absl::StrFormat("(%d,%d,%d,%d)", sequence_start, sequence_length,
                        bitmap_length, ordered_length);
    return text;
  }

  // Streams the same text as ToString(), which lets gtest print a Header in
  // failure messages.
  friend std::ostream& operator<<(std::ostream& os, Header header) {
    os << header.ToString();
    return os;
  }
};
// Matcher for a Header: succeeds when each of the four fields matches the
// corresponding argument. Each argument may be a plain value or another
// matcher, and the fields are checked in declaration order, stopping at the
// first mismatch.
MATCHER_P4(HeaderHas, sequence_start, sequence_length, bitmap_length,
ordered_length, "") {
return testing::ExplainMatchResult(sequence_start, arg.sequence_start,
result_listener) &&
testing::ExplainMatchResult(sequence_length, arg.sequence_length,
result_listener) &&
testing::ExplainMatchResult(bitmap_length, arg.bitmap_length,
result_listener) &&
testing::ExplainMatchResult(ordered_length, arg.ordered_length,
result_listener);
}
// Decodes the first two words of `data` into a Header. `data` must contain at
// least two elements.
Header ExtractHeader(absl::Span<const uint32_t> data) {
  const auto sequence_halves = Unmake32(data[0]);
  const auto size_halves = Unmake32(data[1]);

  Header header;
  // The sequence start is stored as the bit pattern of a signed 16-bit value.
  header.sequence_start = static_cast<int16_t>(sequence_halves[0]);
  header.sequence_length = sequence_halves[1];
  header.bitmap_length = size_halves[0];
  header.ordered_length = size_halves[1];
  return header;
}
// Grows the input one step at a time and, after each step, checks the header
// and total size of the encoding. This pins down when a value lands in the
// fallback list, when pending fallback values are collapsed into the bitmap,
// and when the bitmap grows by one 32-bit block.
TEST(GenerateEnumDataTest, BitmapSpaceOptimizationWorks) {
std::vector<int32_t> values = {0};
auto encoded = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 0, 0));
EXPECT_THAT(encoded, SizeIs(2));
// Adding one large value puts it on the fallback.
values.push_back(100);
encoded = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 0, 1));
EXPECT_THAT(encoded, SizeIs(3));
// Adding a second one still prefers the fallback.
values.push_back(101);
encoded = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 0, 2));
EXPECT_THAT(encoded, SizeIs(4));
// Adding two more now makes bitmap more efficient, so they are collapsed
// to it. Because we can fit the bitmap in 128 bits, which is the same as the
// ints.
values.push_back(102);
values.push_back(103);
encoded = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 128, 0));
EXPECT_THAT(encoded, SizeIs(6));
// Add one value that falls into the existing bitmap, nothing changes.
values.push_back(104);
encoded = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 128, 0));
EXPECT_THAT(encoded, SizeIs(6));
// Add one value that is in the next 32 bits. It should grow the bitmap.
values.push_back(130);
encoded = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 160, 0));
EXPECT_THAT(encoded, SizeIs(7));
// Add one value far away, it should go into fallback.
values.push_back(200);
encoded = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 160, 1));
EXPECT_THAT(encoded, SizeIs(8));
// Another in the next 32-bit block will still make them fallback.
values.push_back(230);
encoded = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 160, 2));
EXPECT_THAT(encoded, SizeIs(9));
// One more in that same block should collapse them to bitmap.
values.push_back(231);
encoded = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(encoded), HeaderHas(0, 1, 256, 0));
EXPECT_THAT(encoded, SizeIs(10));
}
// Probes every integer from `min` through `max` (both inclusive) with
// ValidateEnum against the table `data` and inserts the accepted ones into
// `out`. Does nothing when `min >= max`, so a range of a single value is not
// probed.
void GatherValidValues(absl::Span<const uint32_t> data, int32_t min,
                       int32_t max, absl::btree_set<int32_t>& out) {
  if (max <= min) return;

  int32_t candidate = min;
  while (true) {
    if (ValidateEnum(candidate, data.begin())) out.insert(candidate);
    // Stop before incrementing so that `max` at the top of the int32_t range
    // does not overflow.
    if (candidate == max) return;
    ++candidate;
  }
}
// Returns, in ascending order, the values that GatherValidValues collects for
// the table `data` over the range given by `min` and `max`.
std::vector<int32_t> GetValidValues(absl::Span<const uint32_t> data,
                                    int32_t min, int32_t max) {
  // An ordered set keeps the result sorted, which makes expectations easy to
  // write.
  absl::btree_set<int32_t> accepted;
  GatherValidValues(data, min, max, accepted);

  std::vector<int32_t> ordered;
  ordered.assign(accepted.begin(), accepted.end());
  return ordered;
}
// Validates hand-built tables that contain only a dense sequence (no bitmap
// and no fallback values) and checks which values in [-100, 100] are accepted.
TEST(ValidateEnumTest, SequentialRangeTest) {
// An all-zero header describes an empty table: nothing is accepted.
EXPECT_THAT(GetValidValues({0, 0}, -100, 100), ElementsAre());
EXPECT_THAT(GetValidValues(
{// sequence start=5, length=3
Make32(5, 3),
// no bitmap, no fallback
Make32(0, 0)},
-100, 100),
ElementsAre(5, 6, 7));
EXPECT_THAT(GetValidValues(
{// sequence start=-2, length=10
Make32(-2, 10),
// no bitmap, no fallback
Make32(0, 0)},
-100, 100),
ElementsAre(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7));
}
// Validates hand-built tables that contain only a bitmap (empty sequence, no
// fallback values). With an empty sequence starting at 0, bit k of the bitmap
// corresponds to the value k.
TEST(ValidateEnumTest, BitmapRangeTest) {
EXPECT_THAT(GetValidValues(
{// no sequence
Make32(0, 0),
// bitmap of 32 bits, no fallback
Make32(32, 0),
// bitmap
0b10011010101},
-100, 100),
ElementsAre(0, 2, 4, 6, 7, 10));
// Two bitmap words: bits of the second word map to values 32 and above.
EXPECT_THAT(GetValidValues(
{// no sequence
Make32(0, 0),
// bitmap of 64 bits, no fallback
Make32(64, 0),
// bitmap
(1 << 4) | (1 << 21), 1 << 6},
-100, 100),
ElementsAre(4, 21, 32 + 6));
}
// Checks that purely consecutive inputs are encoded as just the two header
// words: a sequence covering every value, with no bitmap and no fallback.
TEST(ValidateEnumTest, GenerateEnumDataSequential) {
EXPECT_THAT(GenerateEnumData({0, 1, 2, 3}), ElementsAre(
// sequence start=0, length=4
Make32(0, 4),
// no bitmap, no fallback.
Make32(0, 0)));
EXPECT_THAT(GenerateEnumData({-2, -1, 0, 1, 2, 3}),
ElementsAre(
// sequence start=-2, length=6
Make32(-2, 6),
// no bitmap, no fallback.
Make32(0, 0)));
}
// Encodes `values` with GenerateEnumData and checks, through ValidateEnum,
// that the encoding accepts the input values and rejects the other values
// probed in a window of up to 100 on each side of every input value.
// `line` is only used to identify the call site in failure messages.
void TestRoundTrip(absl::Span<const int32_t> values, int line) {
auto encoded = GenerateEnumData(values);
absl::btree_set<int32_t> s;
// We test that all elements in `values` exist in the encoded data, and also
// test a range of other values to verify that they do not exist in the
// encoded data.
// We keep track of the max seen to avoid testing the same values many times.
int64_t max_seen = std::numeric_limits<int64_t>::min();
const auto gather_valid_values_around = [&](int32_t v) {
// The window bounds are computed in 64 bits and clamped to the int32_t
// range; the lower bound is also raised to the end of the previous window.
int32_t min = std::max({
static_cast<int64_t>(v) - 100,
static_cast<int64_t>(std::numeric_limits<int32_t>::min()),
max_seen,
});
int32_t max =
std::min(static_cast<int64_t>(v) + 100,
static_cast<int64_t>(std::numeric_limits<int32_t>::max()));
max_seen = std::max(max_seen, int64_t{max});
GatherValidValues(encoded, min, max, s);
};
// We look at a few values around the expected ones.
// We could in theory test the whole int32_t domain, but that takes too long
// to run.
for (int32_t v : values) {
gather_valid_values_around(v);
}
// Also gather some around 0, just to add more coverage, specially when
// `values` is empty.
// NOTE(review): because `max_seen` carries over, this window is empty when an
// earlier window already reached 100 or beyond.
gather_valid_values_around(0);
// Skip the checks below if we are correct because they are expensive.
if (std::equal(s.begin(), s.end(), values.begin(), values.end())) return;
// Mismatch: split it into input values that were rejected and accepted values
// that are not in the input. After the loop, `s` holds only the latter.
std::vector<int32_t> false_negatives;
for (int32_t v : values) {
if (!ValidateEnum(v, encoded.data())) false_negatives.push_back(v);
s.erase(v);
}
const auto& false_positives = s;
const auto print_data = [&] {
auto header = ExtractHeader(encoded);
return absl::StrFormat("line=%d header=%s", line, header.ToString());
};
EXPECT_THAT(false_negatives, IsEmpty())
<< "Missing values from the input. " << print_data()
<< "\nEncoded: " << testing::PrintToString(encoded);
EXPECT_THAT(false_positives, IsEmpty())
<< "Found values not in input. " << print_data()
<< "\nEncoded: " << testing::PrintToString(encoded);
}
// Encodes the consecutive range [-33000, 33000), which is wider than the
// 16-bit sequence start and length can describe, and checks that the sequence
// is saturated while the remaining values spill into the bitmap and the
// fallback list. Also round-trips the result.
TEST(ValidateEnumTest, GenerateEnumDataSequentialWithOverflow) {
std::vector<int32_t> values;
for (int32_t i = -33000; i < 33000; ++i) {
values.push_back(i);
}
const auto data = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(data),
HeaderHas(
// The sequence starts at the minimum possible value,
std::numeric_limits<int16_t>::min(),
// and it is as long as possible.
0xFFFF,
// we have some values in the bitmap
Gt(0),
// we have some in the fallback
Gt(0)));
TestRoundTrip(values, __LINE__);
}
// Checks the exact encoding of a small input that needs a bitmap, then
// round-trips several inputs: empty, sparse, offset from zero, and one that
// spans the extremes of the int32_t range.
TEST(ValidateEnumTest, GenerateEnumDataBitmap) {
// 0, 1, 2 form the sequence; 4, 8, 16 and 32 become bits 1, 5, 13 and 29 of a
// single bitmap word (positions are relative to the end of the sequence).
EXPECT_THAT(GenerateEnumData({0, 1, 2, 4, 8, 16, 32}),
ElementsAre(Make32(0, 3), Make32(32, 0),
0b100000000000000010000000100010));
TestRoundTrip({}, __LINE__);
TestRoundTrip({0, 1, 2, 4, 8, 16}, __LINE__);
TestRoundTrip({0, 1, 2, 4, 8, 16, 32, 64, 128, 256}, __LINE__);
TestRoundTrip({10000, 10001, 10002, 10004, 10006, 10008, 10010}, __LINE__);
TestRoundTrip({std::numeric_limits<int32_t>::min(), -123123, -123, 213,
213213, std::numeric_limits<int32_t>::max()},
__LINE__);
}
// Encodes every tenth value in [-33000, 33000) and checks that the bitmap
// stops growing at its largest encodable size (0x10000 - 32 bits), with the
// remaining values placed in the fallback list. Also round-trips the result.
TEST(ValidateEnumTest, GenerateEnumDataBitmapWithOverflow) {
std::vector<int32_t> values;
// We step by 10 to guarantee each new value is more cost effective to add to
// the bitmap, which would cause an overflow of the 16-bit bitmap size if we
// didn't prevent it in the generator.
for (int32_t i = -33000; i < 33000; i += 10) {
values.push_back(i);
}
const auto data = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(data),
HeaderHas(_, _,
// we reached the maximum size for the bitmap.
0x10000 - 32,
// we have some in the fallback
Gt(0)));
TestRoundTrip(values, __LINE__);
}
// Encodes the consecutive range [-33000, 100000), which is long enough to
// saturate both the sequence and the bitmap, and checks that both reach their
// maximum size while the rest goes to the fallback list. Also round-trips the
// result.
TEST(ValidateEnumTest, GenerateEnumDataWithOverflowOnBoth) {
std::vector<int32_t> values;
for (int32_t i = -33000; i < 100000; ++i) {
values.push_back(i);
}
const auto data = GenerateEnumData(values);
EXPECT_THAT(ExtractHeader(data),
HeaderHas(
// The sequence starts at the minimum possible value,
std::numeric_limits<int16_t>::min(),
// and it is as long as possible.
0xFFFF,
// we reached the maximum size for the bitmap.
0x10000 - 32,
// we have some in the fallback
Gt(0)));
TestRoundTrip(values, __LINE__);
}
} // namespace
} // namespace internal
} // namespace protobuf
} // namespace google
#include "google/protobuf/port_undef.inc"
Loading…
Cancel
Save