From 9bff7b82d1a9733830a1a1cd9850896a62734a7a Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Tue, 23 Nov 2021 18:49:37 -0800 Subject: [PATCH] Unknown field comparison is passing tests. --- .bazelrc | 4 +- upb/util/BUILD | 10 ++ upb/util/compare.c | 148 +++++++++++++++++++---- upb/util/compare.h | 13 +- upb/util/compare_test.cc | 255 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 402 insertions(+), 28 deletions(-) create mode 100644 upb/util/compare_test.cc diff --git a/.bazelrc b/.bazelrc index f1783871a2..129dcbf94d 100644 --- a/.bazelrc +++ b/.bazelrc @@ -6,7 +6,7 @@ build --extra_toolchains=@system_python//:python_toolchain # Use our custom-configured c++ toolchain. build:m32 --copt=-m32 --linkopt=-m32 -build:asan --copt=-fsanitize=address --linkopt=-fsanitize=address +build:asan --copt=-fsanitize=address --linkopt=-fsanitize=address --copt=-D__SANITIZE_ADDRESS__=1 # For Valgrind, we have to disable checks of "possible" leaks because the Python # interpreter does the sorts of things that flag Valgrind "possible" leak checks. @@ -14,7 +14,7 @@ build:asan --copt=-fsanitize=address --linkopt=-fsanitize=address # know of an easy way to do that. # # We also have to disable pymalloc to avoid triggering Valgrind. -build:valgrind --run_under='valgrind --leak-check=full --trace-children=yes --show-possibly-lost=no --errors-for-leak-kinds=definite --error-exitcode=1' --action_env=PYTHONMALLOC=malloc +build:valgrind --run_under='valgrind --leak-check=full --track-origins=yes --trace-children=yes --show-possibly-lost=no --errors-for-leak-kinds=definite --error-exitcode=1' --action_env=PYTHONMALLOC=malloc build:ubsan --copt=-fsanitize=undefined --linkopt=-fsanitize=undefined --action_env=UBSAN_OPTIONS=halt_on_error=1:print_stacktrace=1 # Workaround for the fact that Bazel links with $CC, not $CXX diff --git a/upb/util/BUILD b/upb/util/BUILD index 48657a3167..355e6b7c5c 100644 --- a/upb/util/BUILD +++ b/upb/util/BUILD @@ -95,3 +95,13 @@ cc_library( hdrs = ["compare.h"], deps = ["//:reflection"], ) + +cc_test( + name = "compare_test", + srcs = ["compare_test.cc"], + deps = [ + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", + ":compare", + ], +) diff --git a/upb/util/compare.c b/upb/util/compare.c index 32588a42bc..59ac4df8c1 100644 --- a/upb/util/compare.c +++ b/upb/util/compare.c @@ -1,3 +1,29 @@ +/* + * Copyright (c) 2009-2021, Google LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Google LLC nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ #include "upb/util/compare.h" @@ -29,6 +55,8 @@ struct upb_UnknownFields { typedef struct { const char *end; upb_arena *arena; + upb_UnknownField *tmp; + size_t tmp_size; int depth; jmp_buf err; } upb_UnknownField_Context; @@ -40,8 +68,8 @@ static void upb_UnknownFields_Grow(upb_UnknownField_Context *ctx, size_t old = (*ptr - *base); size_t new = UPB_MAX(4, old * 2); - *base = upb_arena_realloc(ctx->arena, *base, old * sizeof(*base), - new * sizeof(*base)); + *base = upb_arena_realloc(ctx->arena, *base, old * sizeof(**base), + new * sizeof(**base)); if (!*base) UPB_LONGJMP(ctx->err, kUpb_UnknownCompareResult_OutOfMemory); *ptr = *base + old; @@ -67,23 +95,80 @@ static const char *upb_UnknownFields_ParseVarint(const char *ptr, return ptr; } +// We have to implement our own sort here, since qsort() is not an in-order +// sort. Here we use merge sort, the simplest in-order sort. +static void upb_UnknownFields_Merge(upb_UnknownField *arr, size_t start, + size_t mid, size_t end, + upb_UnknownField *tmp) { + memcpy(tmp, &arr[start], (end - start) * sizeof(*tmp)); + + upb_UnknownField* ptr1 = tmp; + upb_UnknownField* end1 = &tmp[mid - start]; + upb_UnknownField* ptr2 = &tmp[mid - start]; + upb_UnknownField* end2 = &tmp[end - start]; + upb_UnknownField* out = &arr[start]; + + while (ptr1 < end1 && ptr2 < end2) { + if (ptr1->tag <= ptr2->tag) { + *out++ = *ptr1++; + } else { + *out++ = *ptr2++; + } + } + + if (ptr1 < end1) { + memcpy(out, ptr1, (end1 - ptr1) * sizeof(*out)); + } else if (ptr2 < end2) { + memcpy(out, ptr1, (end2 - ptr2) * sizeof(*out)); + } +} + +static void upb_UnknownFields_SortRecursive(upb_UnknownField *arr, + size_t start, size_t end, + upb_UnknownField *tmp) { + if (end - start > 1) { + size_t mid = start + ((end - start) / 2); + upb_UnknownFields_SortRecursive(arr, start, mid, tmp); + upb_UnknownFields_SortRecursive(arr, mid, end, tmp); + upb_UnknownFields_Merge(arr, start, mid, end, tmp); + } +} + +static void upb_UnknownFields_Sort(upb_UnknownField_Context *ctx, + upb_UnknownFields *fields) { + if (ctx->tmp_size < fields->size) { + ctx->tmp_size = UPB_MAX(8, ctx->tmp_size); + while (ctx->tmp_size < fields->size) ctx->tmp_size *= 2; + ctx->tmp = realloc(ctx->tmp, ctx->tmp_size * sizeof(*ctx->tmp)); + } + upb_UnknownFields_SortRecursive(fields->fields, 0, fields->size, ctx->tmp); +} + static upb_UnknownFields *upb_UnknownFields_DoBuild( upb_UnknownField_Context *ctx, const char **buf) { upb_UnknownField *arr_base = NULL; upb_UnknownField *arr_ptr = NULL; upb_UnknownField *arr_end = NULL; const char *ptr = *buf; + uint32_t last_tag = 0; + bool sorted = true; while (ptr < ctx->end) { + uint64_t tag; + ptr = upb_UnknownFields_ParseVarint(ptr, ctx->end, &tag); + UPB_ASSERT(tag <= UINT32_MAX); + int wire_type = tag & 7; + if (wire_type == UPB_WIRE_TYPE_END_GROUP) break; + if (tag < last_tag) sorted = false; + last_tag = tag; + if (arr_ptr == arr_end) { upb_UnknownFields_Grow(ctx, &arr_base, &arr_ptr, &arr_end); } upb_UnknownField *field = arr_ptr; + field->tag = tag; arr_ptr++; - uint64_t val; - ptr = upb_UnknownFields_ParseVarint(ptr, ctx->end, &val); - UPB_ASSERT(val <= UINT32_MAX); - field->tag = val; - switch (field->tag & 7) { + + switch (wire_type) { case UPB_WIRE_TYPE_VARINT: ptr = upb_UnknownFields_ParseVarint(ptr, ctx->end, &field->data.varint); break; @@ -95,7 +180,7 @@ static upb_UnknownFields *upb_UnknownFields_DoBuild( case UPB_WIRE_TYPE_32BIT: UPB_ASSERT(ctx->end - ptr >= 4); memcpy(&field->data.uint32, ptr, 4); - ptr += 8; + ptr += 4; break; case UPB_WIRE_TYPE_DELIMITED: { uint64_t size; @@ -103,6 +188,7 @@ static upb_UnknownFields *upb_UnknownFields_DoBuild( UPB_ASSERT(ctx->end - ptr >= size); field->data.delimited.data = ptr; field->data.delimited.size = size; + ptr += size; break; } case UPB_WIRE_TYPE_START_GROUP: @@ -112,23 +198,24 @@ static upb_UnknownFields *upb_UnknownFields_DoBuild( field->data.group = upb_UnknownFields_DoBuild(ctx, &ptr); ctx->depth++; break; - case UPB_WIRE_TYPE_END_GROUP: - goto done; default: UPB_UNREACHABLE(); } } -done: *buf = ptr; upb_UnknownFields *ret = upb_arena_malloc(ctx->arena, sizeof(*ret)); if (!ret) UPB_LONGJMP(ctx->err, kUpb_UnknownCompareResult_OutOfMemory); ret->fields = arr_base; ret->size = arr_ptr - arr_base; ret->capacity = arr_end - arr_base; + if (!sorted) { + upb_UnknownFields_Sort(ctx, ret); + } return ret; } +// Builds a upb_UnknownFields data structure from the binary data in buf. static upb_UnknownFields *upb_UnknownFields_Build(upb_UnknownField_Context *ctx, const char *buf, size_t size) { @@ -138,6 +225,7 @@ static upb_UnknownFields *upb_UnknownFields_Build(upb_UnknownField_Context *ctx, return fields; } +// Compares two sorted upb_UnknwonFields structures for equality. static bool upb_UnknownFields_IsEqual(const upb_UnknownFields *uf1, const upb_UnknownFields *uf2) { if (uf1->size != uf2->size) return false; @@ -145,7 +233,8 @@ static bool upb_UnknownFields_IsEqual(const upb_UnknownFields *uf1, upb_UnknownField *f1 = &uf1->fields[i]; upb_UnknownField *f2 = &uf2->fields[i]; if (f1->tag != f2->tag) return false; - switch (f1->tag & 7) { + int wire_type = f1->tag & 7; + switch (wire_type) { case UPB_WIRE_TYPE_VARINT: if (f1->data.varint != f2->data.varint) return false; break; @@ -172,12 +261,11 @@ static bool upb_UnknownFields_IsEqual(const upb_UnknownFields *uf1, return true; } -upb_UnknownCompareResult upb_Message_UnknownFieldsAreEqual(const upb_msg *msg1, - const upb_msg *msg2, +upb_UnknownCompareResult upb_Message_UnknownFieldsAreEqual(const char *buf1, + size_t size1, + const char *buf2, + size_t size2, int max_depth) { - size_t size1, size2; - const char *buf1 = upb_msg_getunknown(msg1, &size1); - const char *buf2 = upb_msg_getunknown(msg2, &size2); if (size1 == 0 && size2 == 0) return kUpb_UnknownCompareResult_Equal; if (size1 == 0 || size2 == 0) return kUpb_UnknownCompareResult_NotEqual; if (memcmp(buf1, buf2, size1) == 0) return kUpb_UnknownCompareResult_Equal; @@ -185,13 +273,29 @@ upb_UnknownCompareResult upb_Message_UnknownFieldsAreEqual(const upb_msg *msg1, upb_UnknownField_Context ctx = { .arena = upb_arena_new(), .depth = max_depth, + .tmp = NULL, + .tmp_size = 0, }; + if (!ctx.arena) return kUpb_UnknownCompareResult_OutOfMemory; - upb_UnknownFields *uf1 = upb_UnknownFields_Build(&ctx, buf1, size1); - upb_UnknownFields *uf2 = upb_UnknownFields_Build(&ctx, buf2, size2); - bool ret = upb_UnknownFields_IsEqual(uf1, uf2); + int ret = UPB_SETJMP(ctx.err); + + if (UPB_LIKELY(ret == 0)) { + // First build both unknown fields into a sorted data structure (similar + // to the UnknownFieldSet in C++). + upb_UnknownFields *uf1 = upb_UnknownFields_Build(&ctx, buf1, size1); + upb_UnknownFields *uf2 = upb_UnknownFields_Build(&ctx, buf2, size2); + + // Now perform the equality check on the sorted structures. + if (upb_UnknownFields_IsEqual(uf1, uf2)) { + ret = kUpb_UnknownCompareResult_Equal; + } else { + ret = kUpb_UnknownCompareResult_NotEqual; + } + } + upb_arena_free(ctx.arena); - return ret ? kUpb_UnknownCompareResult_Equal - : kUpb_UnknownCompareResult_NotEqual; + free(ctx.tmp); + return ret; } diff --git a/upb/util/compare.h b/upb/util/compare.h index 944e71683a..107ea2c425 100644 --- a/upb/util/compare.h +++ b/upb/util/compare.h @@ -37,21 +37,26 @@ extern "C" { // Returns true if unknown fields from the two messages are equal when sorted // and varints are made canonical. // -// These semantics are unfortunate, as the comparison is lossy without schema -// data: +// This operation should be considered best effort; the comparison is inherently +// lossy without schema data: +// // 1. We don't know whether delimited fields are sub-messages. Unknown // sub-messages will therefore not have their fields sorted and varints // canonicalized. // 2. We don't know about oneof/non-repeated fields, which should semantically // discard every value except the last. + typedef enum { kUpb_UnknownCompareResult_Equal = 0, kUpb_UnknownCompareResult_NotEqual = 1, kUpb_UnknownCompareResult_OutOfMemory = 2, kUpb_UnknownCompareResult_MaxDepthExceeded = 3, } upb_UnknownCompareResult; -upb_UnknownCompareResult upb_Message_UnknownFieldsAreEqual(const upb_msg *msg1, - const upb_msg *msg2, + +upb_UnknownCompareResult upb_Message_UnknownFieldsAreEqual(const char *buf1, + size_t size1, + const char *buf2, + size_t size2, int max_depth); #ifdef __cplusplus diff --git a/upb/util/compare_test.cc b/upb/util/compare_test.cc new file mode 100644 index 0000000000..13a259bd43 --- /dev/null +++ b/upb/util/compare_test.cc @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2009-2021, Google LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Google LLC nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "upb/util/compare.h" + +#include "absl/strings/string_view.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#include +#include +#include + +struct UnknownField; + +using UnknownFields = std::vector; + +enum class UnknownFieldType { + kVarint, + kLongVarint, // Over-encoded to have distinct wire format. + kDelimited, + kFixed64, + kFixed32, + kGroup, +}; + +union UnknownFieldValue { + uint64_t varint; + uint64_t fixed64; + uint32_t fixed32; + // NULL-terminated (strings must not have embedded NULL). + const char* delimited; + UnknownFields* group; +}; + +struct TypeAndValue { + UnknownFieldType type; + UnknownFieldValue value; +}; + +struct UnknownField { + uint32_t field_number; + TypeAndValue value; +}; + + +TypeAndValue Varint(uint64_t val) { + TypeAndValue ret{UnknownFieldType::kVarint}; + ret.value.varint = val; + return ret; +} + +TypeAndValue LongVarint(uint64_t val) { + TypeAndValue ret{UnknownFieldType::kLongVarint}; + ret.value.varint = val; + return ret; +} + +TypeAndValue Fixed64(uint64_t val) { + TypeAndValue ret{UnknownFieldType::kFixed64}; + ret.value.fixed64 = val; + return ret; +} + +TypeAndValue Fixed32(uint32_t val) { + TypeAndValue ret{UnknownFieldType::kFixed32}; + ret.value.fixed32 = val; + return ret; +} + +TypeAndValue Delimited(const char* val) { + TypeAndValue ret{UnknownFieldType::kDelimited}; + ret.value.delimited = val; + return ret; +} + +TypeAndValue Group(UnknownFields nested) { + TypeAndValue ret{UnknownFieldType::kGroup}; + ret.value.group = &nested; + return ret; +} + +void f() { + auto fields = UnknownFields{ + {1, Varint(1)}, + {2, Fixed64(2)}, + }; + + auto fields2 = UnknownFields{ + {1, Group({ + {2, Group({ + {3, Varint(1)}, + })}, + })}, + }; +} + +void EncodeVarint(uint64_t val, std::string* str) { + do { + char byte = val & 0x7fU; + val >>= 7; + if (val) byte |= 0x80U; + str->push_back(byte); + } while (val); +} + +std::string ToBinaryPayload(const UnknownFields& fields) { + static const upb_wiretype_t wire_types[] = { + UPB_WIRE_TYPE_VARINT, + UPB_WIRE_TYPE_VARINT, + UPB_WIRE_TYPE_DELIMITED, + UPB_WIRE_TYPE_64BIT, + UPB_WIRE_TYPE_32BIT, + UPB_WIRE_TYPE_START_GROUP, + }; + std::string ret; + + for (const auto& field : fields) { + uint32_t tag = field.field_number << 3 | + (wire_types[static_cast(field.value.type)]); + EncodeVarint(tag, &ret); + switch (field.value.type) { + case UnknownFieldType::kVarint: + EncodeVarint(field.value.value.varint, &ret); + break; + case UnknownFieldType::kLongVarint: + EncodeVarint(field.value.value.varint, &ret); + ret.back() |= 0x80; + ret.push_back(0); + break; + case UnknownFieldType::kDelimited: + EncodeVarint(strlen(field.value.value.delimited), &ret); + ret.append(field.value.value.delimited); + break; + case UnknownFieldType::kFixed64: { + uint64_t val = _upb_be_swap64(field.value.value.fixed64); + ret.append(reinterpret_cast(&val), sizeof(val)); + break; + } + case UnknownFieldType::kFixed32: { + uint32_t val = _upb_be_swap32(field.value.value.fixed32); + ret.append(reinterpret_cast(&val), sizeof(val)); + break; + } + case UnknownFieldType::kGroup: { + uint32_t end_tag = field.field_number << 3 | UPB_WIRE_TYPE_END_GROUP; + ret.append(ToBinaryPayload(*field.value.value.group)); + EncodeVarint(end_tag, &ret); + break; + } + } + } + + return ret; +} + +upb_UnknownCompareResult CompareUnknownWithMaxDepth(UnknownFields uf1, + UnknownFields uf2, + int max_depth) { + std::string buf1 = ToBinaryPayload(uf1); + std::string buf2 = ToBinaryPayload(uf2); + return upb_Message_UnknownFieldsAreEqual(buf1.data(), buf1.size(), + buf2.data(), buf2.size(), max_depth); +} + +upb_UnknownCompareResult CompareUnknown(UnknownFields uf1, UnknownFields uf2) { + return CompareUnknownWithMaxDepth(uf1, uf2, 64); +} + +TEST(CompareTest, UnknownFieldsReflexive) { + EXPECT_EQ(kUpb_UnknownCompareResult_Equal, CompareUnknown({}, {})); + EXPECT_EQ(kUpb_UnknownCompareResult_Equal, + CompareUnknown({{1, Varint(123)}, {2, Fixed32(456)}}, + {{1, Varint(123)}, {2, Fixed32(456)}})); + EXPECT_EQ( + kUpb_UnknownCompareResult_Equal, + CompareUnknown( + {{1, Group({{2, Group({{3, Fixed32(456)}, {4, Fixed64(123)}})}})}}, + {{1, Group({{2, Group({{3, Fixed32(456)}, {4, Fixed64(123)}})}})}})); +} + +TEST(CompareTest, UnknownFieldsOrdering) { + EXPECT_EQ(kUpb_UnknownCompareResult_Equal, + CompareUnknown({{1, Varint(111)}, + {2, Delimited("ABC")}, + {3, Fixed32(456)}, + {4, Fixed64(123)}, + {5, Group({})}}, + {{5, Group({})}, + {4, Fixed64(123)}, + {3, Fixed32(456)}, + {2, Delimited("ABC")}, + {1, Varint(111)}})); + EXPECT_EQ(kUpb_UnknownCompareResult_NotEqual, + CompareUnknown({{1, Varint(111)}, + {2, Delimited("ABC")}, + {3, Fixed32(456)}, + {4, Fixed64(123)}, + {5, Group({})}}, + {{5, Group({})}, + {4, Fixed64(123)}, + {3, Fixed32(455)}, // Small difference. + {2, Delimited("ABC")}, + {1, Varint(111)}})); + EXPECT_EQ(kUpb_UnknownCompareResult_Equal, + CompareUnknown({{3, Fixed32(456)}, {4, Fixed64(123)}}, + {{4, Fixed64(123)}, {3, Fixed32(456)}})); + EXPECT_EQ( + kUpb_UnknownCompareResult_Equal, + CompareUnknown( + {{1, Group({{2, Group({{3, Fixed32(456)}, {4, Fixed64(123)}})}})}}, + {{1, Group({{2, Group({{4, Fixed64(123)}, {3, Fixed32(456)}})}})}})); +} + +TEST(CompareTest, LongVarint) { + EXPECT_EQ(kUpb_UnknownCompareResult_Equal, + CompareUnknown({{1, LongVarint(123)}, {2, LongVarint(456)}}, + {{1, Varint(123)}, {2, Varint(456)}})); + EXPECT_EQ(kUpb_UnknownCompareResult_Equal, + CompareUnknown({{2, LongVarint(456)}, {1, LongVarint(123)}}, + {{1, Varint(123)}, {2, Varint(456)}})); +} + +TEST(CompareTest, MaxDepth) { + EXPECT_EQ( + kUpb_UnknownCompareResult_MaxDepthExceeded, + CompareUnknownWithMaxDepth( + {{1, Group({{2, Group({{3, Fixed32(456)}, {4, Fixed64(123)}})}})}}, + {{1, Group({{2, Group({{4, Fixed64(123)}, {3, Fixed32(456)}})}})}}, + 2)); +}