From c755099a896f0c429a6f02bb3446d7396ac7f3b8 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 6 Oct 2021 13:13:04 -0700 Subject: [PATCH 1/8] WIP. --- cmake/google/protobuf/descriptor.upb.c | 73 ++++++++-- cmake/google/protobuf/descriptor.upb.h | 7 + upb/msg_internal.h | 14 +- upb/msg_test.cc | 50 +++++++ upb/msg_test.proto | 21 +++ upbc/protoc-gen-upb.cc | 181 +++++++++++++++++++++---- 6 files changed, 303 insertions(+), 43 deletions(-) diff --git a/cmake/google/protobuf/descriptor.upb.c b/cmake/google/protobuf/descriptor.upb.c index fc0e469819..588e431977 100644 --- a/cmake/google/protobuf/descriptor.upb.c +++ b/cmake/google/protobuf/descriptor.upb.c @@ -126,16 +126,18 @@ const upb_msglayout google_protobuf_ExtensionRangeOptions_msginit = { UPB_SIZE(8, 8), 1, _UPB_MSGEXT_EXTENDABLE, 0, 255, }; -static const upb_msglayout_sub google_protobuf_FieldDescriptorProto_submsgs[1] = { +static const upb_msglayout_sub google_protobuf_FieldDescriptorProto_submsgs[3] = { {.submsg = &google_protobuf_FieldOptions_msginit}, + {.subenum = &google_protobuf_FieldDescriptorProto_Label_enuminit}, + {.subenum = &google_protobuf_FieldDescriptorProto_Type_enuminit}, }; static const upb_msglayout_field google_protobuf_FieldDescriptorProto__fields[11] = { {1, UPB_SIZE(24, 24), 1, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {2, UPB_SIZE(32, 40), 2, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {3, UPB_SIZE(12, 12), 3, 0, 5, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, - {4, UPB_SIZE(4, 4), 4, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, - {5, UPB_SIZE(8, 8), 5, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {4, UPB_SIZE(4, 4), 4, 1, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {5, UPB_SIZE(8, 8), 5, 2, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, {6, UPB_SIZE(40, 56), 6, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {7, UPB_SIZE(48, 72), 7, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {8, UPB_SIZE(64, 104), 8, 0, 11, _UPB_MODE_SCALAR | (_UPB_REP_PTR << _UPB_REP_SHIFT)}, @@ -248,14 +250,15 @@ const upb_msglayout google_protobuf_MethodDescriptorProto_msginit = { UPB_SIZE(32, 64), 6, _UPB_MSGEXT_NONE, 6, 255, }; -static const upb_msglayout_sub google_protobuf_FileOptions_submsgs[1] = { +static const upb_msglayout_sub google_protobuf_FileOptions_submsgs[2] = { {.submsg = &google_protobuf_UninterpretedOption_msginit}, + {.subenum = &google_protobuf_FileOptions_OptimizeMode_enuminit}, }; static const upb_msglayout_field google_protobuf_FileOptions__fields[21] = { {1, UPB_SIZE(20, 24), 1, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {8, UPB_SIZE(28, 40), 2, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, - {9, UPB_SIZE(4, 4), 3, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {9, UPB_SIZE(4, 4), 3, 1, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, {10, UPB_SIZE(8, 8), 4, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, {11, UPB_SIZE(36, 56), 5, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {16, UPB_SIZE(9, 9), 6, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, @@ -300,16 +303,18 @@ const upb_msglayout google_protobuf_MessageOptions_msginit = { UPB_SIZE(16, 16), 5, _UPB_MSGEXT_EXTENDABLE, 3, 255, }; -static const upb_msglayout_sub google_protobuf_FieldOptions_submsgs[1] = { +static const upb_msglayout_sub google_protobuf_FieldOptions_submsgs[3] = { {.submsg = &google_protobuf_UninterpretedOption_msginit}, + {.subenum = &google_protobuf_FieldOptions_CType_enuminit}, + {.subenum = &google_protobuf_FieldOptions_JSType_enuminit}, }; static const upb_msglayout_field google_protobuf_FieldOptions__fields[7] = { - {1, UPB_SIZE(4, 4), 1, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {1, UPB_SIZE(4, 4), 1, 1, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, {2, UPB_SIZE(12, 12), 2, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, {3, UPB_SIZE(13, 13), 3, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, {5, UPB_SIZE(14, 14), 4, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, - {6, UPB_SIZE(8, 8), 5, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {6, UPB_SIZE(8, 8), 5, 2, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, {10, UPB_SIZE(15, 15), 6, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, {999, UPB_SIZE(16, 16), 0, 0, 11, _UPB_MODE_ARRAY | (_UPB_REP_PTR << _UPB_REP_SHIFT)}, }; @@ -380,13 +385,14 @@ const upb_msglayout google_protobuf_ServiceOptions_msginit = { UPB_SIZE(8, 16), 2, _UPB_MSGEXT_EXTENDABLE, 0, 255, }; -static const upb_msglayout_sub google_protobuf_MethodOptions_submsgs[1] = { +static const upb_msglayout_sub google_protobuf_MethodOptions_submsgs[2] = { {.submsg = &google_protobuf_UninterpretedOption_msginit}, + {.subenum = &google_protobuf_MethodOptions_IdempotencyLevel_enuminit}, }; static const upb_msglayout_field google_protobuf_MethodOptions__fields[3] = { {33, UPB_SIZE(8, 8), 1, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, - {34, UPB_SIZE(4, 4), 2, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {34, UPB_SIZE(4, 4), 2, 1, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, {999, UPB_SIZE(12, 16), 0, 0, 11, _UPB_MODE_ARRAY | (_UPB_REP_PTR << _UPB_REP_SHIFT)}, }; @@ -512,10 +518,57 @@ static const upb_msglayout *messages_layout[27] = { &google_protobuf_GeneratedCodeInfo_Annotation_msginit, }; +const upb_enumlayout google_protobuf_FieldDescriptorProto_Label_enuminit = { + NULL, + 0xeULL, + 0, +}; + +const upb_enumlayout google_protobuf_FieldDescriptorProto_Type_enuminit = { + NULL, + 0x7fffeULL, + 0, +}; + +const upb_enumlayout google_protobuf_FieldOptions_CType_enuminit = { + NULL, + 0x7ULL, + 0, +}; + +const upb_enumlayout google_protobuf_FieldOptions_JSType_enuminit = { + NULL, + 0x7ULL, + 0, +}; + +const upb_enumlayout google_protobuf_FileOptions_OptimizeMode_enuminit = { + NULL, + 0xeULL, + 0, +}; + +const upb_enumlayout google_protobuf_MethodOptions_IdempotencyLevel_enuminit = { + NULL, + 0x7ULL, + 0, +}; + +static const upb_enumlayout *enums_layout[6] = { + &google_protobuf_FieldDescriptorProto_Label_enuminit, + &google_protobuf_FieldDescriptorProto_Type_enuminit, + &google_protobuf_FieldOptions_CType_enuminit, + &google_protobuf_FieldOptions_JSType_enuminit, + &google_protobuf_FileOptions_OptimizeMode_enuminit, + &google_protobuf_MethodOptions_IdempotencyLevel_enuminit, +}; + const upb_msglayout_file google_protobuf_descriptor_proto_upb_file_layout = { messages_layout, + enums_layout, NULL, 27, + 6, 0, }; diff --git a/cmake/google/protobuf/descriptor.upb.h b/cmake/google/protobuf/descriptor.upb.h index f74dde71f3..ad6fb7eac0 100644 --- a/cmake/google/protobuf/descriptor.upb.h +++ b/cmake/google/protobuf/descriptor.upb.h @@ -154,6 +154,13 @@ typedef enum { } google_protobuf_MethodOptions_IdempotencyLevel; +extern const upb_enumlayout google_protobuf_FieldDescriptorProto_Label_enuminit; +extern const upb_enumlayout google_protobuf_FieldDescriptorProto_Type_enuminit; +extern const upb_enumlayout google_protobuf_FieldOptions_CType_enuminit; +extern const upb_enumlayout google_protobuf_FieldOptions_JSType_enuminit; +extern const upb_enumlayout google_protobuf_FileOptions_OptimizeMode_enuminit; +extern const upb_enumlayout google_protobuf_MethodOptions_IdempotencyLevel_enuminit; + /* google.protobuf.FileDescriptorSet */ UPB_INLINE google_protobuf_FileDescriptorSet *google_protobuf_FileDescriptorSet_new(upb_arena *arena) { diff --git a/upb/msg_internal.h b/upb/msg_internal.h index 7d024f7beb..1fbb2c7182 100644 --- a/upb/msg_internal.h +++ b/upb/msg_internal.h @@ -66,8 +66,8 @@ enum { typedef struct { uint32_t number; uint16_t offset; - int16_t presence; /* If >0, hasbit_index. If <0, ~oneof_index. */ - uint16_t submsg_index; /* undefined if descriptortype != MESSAGE or GROUP. */ + int16_t presence; // If >0, hasbit_index. If <0, ~oneof_index + uint16_t submsg_index; // undefined if descriptortype != MESSAGE/GROUP/ENUM uint8_t descriptortype; uint8_t mode; /* upb_fieldmode | upb_labelflags | (upb_rep << _UPB_REP_SHIFT) */ @@ -130,9 +130,15 @@ typedef struct { _upb_field_parser *field_parser; } _upb_fasttable_entry; +typedef struct { + const int32_t *values; // List of values <0 or >63 + uint64_t mask; // Bits are set for acceptable value 0 <= x < 64 + int value_count; +} upb_enumlayout; + typedef union { const struct upb_msglayout *submsg; - // TODO: const upb_enumlayout *subenum; + const upb_enumlayout *subenum; } upb_msglayout_sub; typedef enum { @@ -179,8 +185,10 @@ typedef struct { typedef struct { const upb_msglayout **msgs; + const upb_enumlayout **enums; const upb_msglayout_ext **exts; int msg_count; + int enum_count; int ext_count; } upb_msglayout_file; diff --git a/upb/msg_test.cc b/upb/msg_test.cc index 636fbcb9b4..24d3da3a59 100644 --- a/upb/msg_test.cc +++ b/upb/msg_test.cc @@ -26,6 +26,7 @@ */ #include "gtest/gtest.h" +#include "gmock/gmock.h" #include "src/google/protobuf/test_messages_proto3.upb.h" #include "upb/def.hpp" #include "upb/json_decode.h" @@ -152,3 +153,52 @@ TEST(MessageTest, MessageSet) { << status.error_message(); VerifyMessageSet(ext_msg3); } + +TEST(MessageTest, Proto2Enum) { + upb::Arena arena; + upb_test_Proto2FakeEnumMessage *fake_msg = upb_test_Proto2FakeEnumMessage_new(arena.ptr()); + + upb_test_Proto2FakeEnumMessage_set_optional_enum(fake_msg, 999); + int32_t *vals = upb_test_Proto2FakeEnumMessage_resize_repeated_enum( + fake_msg, 6, arena.ptr()); + vals[0] = upb_test_Proto2EnumMessage_ZERO; + vals[1] = 7; // Unknown small. + vals[2] = upb_test_Proto2EnumMessage_SMALL; + vals[3] = 888; // Unknown large. + vals[4] = upb_test_Proto2EnumMessage_LARGE; + vals[5] = upb_test_Proto2EnumMessage_NEGATIVE; + upb_test_Proto2FakeEnumMessage_int32_enum_map_set(fake_msg, 5, 777, + arena.ptr()); + + size_t size; + char *pb = + upb_test_Proto2FakeEnumMessage_serialize(fake_msg, arena.ptr(), &size); + + upb_test_Proto2EnumMessage *enum_msg = + upb_test_Proto2EnumMessage_parse(pb, size, arena.ptr()); + ASSERT_TRUE(enum_msg != nullptr); + + EXPECT_EQ(false, upb_test_Proto2EnumMessage_has_optional_enum(enum_msg)); + const int32_t *vals_const = + upb_test_Proto2EnumMessage_repeated_enum(enum_msg, &size); + EXPECT_EQ(4, size); // Two unknown values moved to the unknown field set. + + pb = upb_test_Proto2EnumMessage_serialize(enum_msg, arena.ptr(), &size); + upb_test_Proto2FakeEnumMessage *fake_msg2 = + upb_test_Proto2FakeEnumMessage_parse(pb, size, arena.ptr()); + + EXPECT_EQ(true, upb_test_Proto2FakeEnumMessage_has_optional_enum(fake_msg2)); + vals_const = + upb_test_Proto2FakeEnumMessage_repeated_enum(fake_msg2, &size); + EXPECT_EQ(6, size); + int32_t expected[] = { + upb_test_Proto2EnumMessage_ZERO, + upb_test_Proto2EnumMessage_SMALL, + upb_test_Proto2EnumMessage_LARGE, + upb_test_Proto2EnumMessage_NEGATIVE, + 7, + 888, + }; + EXPECT_THAT(std::vector(vals_const, vals_const + size), + ::testing::ElementsAreArray(expected)); +} diff --git a/upb/msg_test.proto b/upb/msg_test.proto index 6dd940a1a1..fad31206dc 100644 --- a/upb/msg_test.proto +++ b/upb/msg_test.proto @@ -58,3 +58,24 @@ message MessageSetMember { optional MessageSetMember message_set_extension = 4; } } + +message Proto2EnumMessage { + enum Proto2TestEnum { + ZERO = 0; + NEGATIVE = -1; + SMALL = 15; + LARGE = 12345; + } + + optional Proto2TestEnum optional_enum = 1; + repeated Proto2TestEnum repeated_enum = 2; + map int32_enum_map = 3; +} + +// The same fields as Proto2EnumMessage, but with int32 fields so we can fake +// wire format. +message Proto2FakeEnumMessage { + optional int32 optional_enum = 1; + repeated int32 repeated_enum = 2; + map int32_enum_map = 3; +} diff --git a/upbc/protoc-gen-upb.cc b/upbc/protoc-gen-upb.cc index d252957226..52758ff0f8 100644 --- a/upbc/protoc-gen-upb.cc +++ b/upbc/protoc-gen-upb.cc @@ -51,6 +51,10 @@ std::string MessageInit(const protobuf::Descriptor* descriptor) { return MessageName(descriptor) + "_msginit"; } +std::string EnumInit(const protobuf::EnumDescriptor* descriptor) { + return ToCIdent(descriptor->full_name()) + "_enuminit"; +} + std::string ExtensionIdentBase(const protobuf::FieldDescriptor* ext) { assert(ext->is_extension()); std::string ext_scope; @@ -65,8 +69,9 @@ std::string ExtensionLayout(const google::protobuf::FieldDescriptor* ext) { return absl::StrCat(ExtensionIdentBase(ext), "_", ext->name(), "_ext"); } -const char *kMessagesInit = "messages_layout"; +const char *kEnumsInit = "enums_layout"; const char *kExtensionsInit = "extensions_layout"; +const char *kMessagesInit = "messages_layout"; void AddEnums(const protobuf::Descriptor* message, std::vector* enums) { @@ -179,6 +184,24 @@ std::vector SortedSubmessages( return ret; } +std::vector SortedSubEnums( + const protobuf::Descriptor* message) { + std::vector ret; + for (int i = 0; i < message->field_count(); i++) { + if (message->field(i)->cpp_type() == + protobuf::FieldDescriptor::CPPTYPE_ENUM) { + ret.push_back(message->field(i)); + } + } + std::sort(ret.begin(), ret.end(), + [](const protobuf::FieldDescriptor* a, + const protobuf::FieldDescriptor* b) { + return a->enum_type()->full_name() < + b->enum_type()->full_name(); + }); + return ret; +} + std::string EnumValueSymbol(const protobuf::EnumValueDescriptor* value) { return ToCIdent(value->full_name()); } @@ -721,6 +744,14 @@ void WriteHeader(const protobuf::FileDescriptor* file, Output& output) { output("\n"); + if (file->syntax() == protobuf::FileDescriptor::SYNTAX_PROTO2) { + for (auto enumdesc : this_file_enums) { + output("extern const upb_enumlayout $0;\n", EnumInit(enumdesc)); + } + } + + output("\n"); + for (auto message : this_file_messages) { GenerateMessageInHeader(message, output); } @@ -787,45 +818,70 @@ int TableDescriptorType(const protobuf::FieldDescriptor* field) { // embedding field names on the side) we will have to revisit this, because // string vs. bytes behavior is not affected by proto2 vs proto3. return protobuf::FieldDescriptor::TYPE_BYTES; + } else if (field->enum_type() && + field->enum_type()->file()->syntax() == + protobuf::FileDescriptor::SYNTAX_PROTO3) { + // From the perspective of the binary decoder, proto3 enums are identical to + // int32 fields. Only in proto2 do we check enum values to make sure they + // are defined in the enum. + return protobuf::FieldDescriptor::TYPE_INT32; } else { return field->type(); } } -struct SubmsgArray { +struct SubLayoutArray { public: - SubmsgArray(const protobuf::Descriptor* message) : message_(message) { - MessageLayout layout(message); - std::vector sorted_submsgs = - SortedSubmessages(message); - int i = 0; - for (auto submsg : sorted_submsgs) { - if (indexes_.find(submsg->message_type()) != indexes_.end()) { - continue; - } - submsgs_.push_back(submsg->message_type()); - indexes_[submsg->message_type()] = i++; - } - } + SubLayoutArray(const protobuf::Descriptor* message); const std::vector& submsgs() const { return submsgs_; } - int GetIndex(const protobuf::FieldDescriptor* field) { - (void)message_; - assert(field->containing_type() == message_); - auto it = indexes_.find(field->message_type()); + const std::vector& subenums() const { + return subenums_; + } + + int total_count() const { return submsgs_.size() + subenums_.size(); } + + int GetIndex(const void *sub) { + auto it = indexes_.find(sub); assert(it != indexes_.end()); return it->second; } private: - const protobuf::Descriptor* message_; std::vector submsgs_; - absl::flat_hash_map indexes_; + std::vector subenums_; + absl::flat_hash_map indexes_; }; +SubLayoutArray::SubLayoutArray(const protobuf::Descriptor* message) { + MessageLayout layout(message); + std::vector sorted_submsgs = + SortedSubmessages(message); + int i = 0; + for (auto submsg : sorted_submsgs) { + if (indexes_.find(submsg->message_type()) != indexes_.end()) { + continue; + } + submsgs_.push_back(submsg->message_type()); + indexes_[submsg->message_type()] = i++; + } + + std::vector sorted_subenums = + SortedSubEnums(message); + for (auto field : sorted_subenums) { + if (field->file()->syntax() != + protobuf::FileDescriptor::SYNTAX_PROTO2 || + indexes_.find(field->enum_type()) != indexes_.end()) { + continue; + } + subenums_.push_back(field->enum_type()); + indexes_[field->enum_type()] = i++; + } +} + typedef std::pair TableEntry; uint64_t GetEncodedTag(const protobuf::FieldDescriptor* field) { @@ -957,8 +1013,8 @@ bool TryFillTableEntry(const protobuf::Descriptor* message, } if (field->cpp_type() == protobuf::FieldDescriptor::CPPTYPE_MESSAGE) { - SubmsgArray submsg_array(message); - uint64_t idx = submsg_array.GetIndex(field); + SubLayoutArray sublayout_array(message); + uint64_t idx = sublayout_array.GetIndex(field->message_type()); if (idx > 255) return false; data |= idx << 16; @@ -1083,19 +1139,22 @@ void WriteMessage(const protobuf::Descriptor* message, Output& output, uint8_t dense_below = 0; const int dense_below_max = std::numeric_limits::max(); MessageLayout layout(message); - SubmsgArray submsg_array(message); + SubLayoutArray sublayout_array(message); - if (!submsg_array.submsgs().empty()) { + if (sublayout_array.total_count()) { // TODO(haberman): could save a little bit of space by only generating a // "submsgs" array for every strongly-connected component. std::string submsgs_array_name = msg_name + "_submsgs"; submsgs_array_ref = "&" + submsgs_array_name + "[0]"; output("static const upb_msglayout_sub $0[$1] = {\n", - submsgs_array_name, submsg_array.submsgs().size()); + submsgs_array_name, sublayout_array.total_count()); - for (auto submsg : submsg_array.submsgs()) { + for (const auto* submsg : sublayout_array.submsgs()) { output(" {.submsg = &$0},\n", MessageInit(submsg)); } + for (const auto* subenum : sublayout_array.subenums()) { + output(" {.subenum = &$0},\n", EnumInit(subenum)); + } output("};\n\n"); } @@ -1109,7 +1168,7 @@ void WriteMessage(const protobuf::Descriptor* message, Output& output, fields_array_name, field_number_order.size()); for (int i = 0; i < static_cast(field_number_order.size()); i++) { auto field = field_number_order[i]; - int submsg_index = 0; + int sublayout_index = 0; if (i < dense_below_max && field->number() == i + 1 && (i == 0 || field_number_order[i - 1]->number() == i)) { @@ -1117,10 +1176,14 @@ void WriteMessage(const protobuf::Descriptor* message, Output& output, } if (field->cpp_type() == protobuf::FieldDescriptor::CPPTYPE_MESSAGE) { - submsg_index = submsg_array.GetIndex(field); + sublayout_index = sublayout_array.GetIndex(field->message_type()); + } else if (field->enum_type() && + field->enum_type()->file()->syntax() == + protobuf::FileDescriptor::SYNTAX_PROTO2) { + sublayout_index = sublayout_array.GetIndex(field->enum_type()); } - WriteMessageField(field, layout, submsg_index, output); + WriteMessageField(field, layout, sublayout_index, output); } output("};\n\n"); } @@ -1168,6 +1231,61 @@ void WriteMessage(const protobuf::Descriptor* message, Output& output, output("};\n\n"); } +int WriteEnums(const protobuf::FileDescriptor* file, Output& output) { + if (file->syntax() != protobuf::FileDescriptor::SYNTAX_PROTO2) { + return 0; + } + + std::vector this_file_enums = + SortedEnums(file); + + std::string values_init = "NULL"; + + for (auto e : this_file_enums) { + uint64_t mask = 0; + absl::flat_hash_set values; + for (int i = 0; i < e->value_count(); i++) { + int32_t number = e->value(i)->number(); + if (static_cast(number) < 64) { + mask |= 1 << number; + } else { + values.insert(number); + } + } + std::vector values_vec(values.begin(), values.end()); + std::sort(values_vec.begin(), values_vec.end()); + + if (!values_vec.empty()) { + values_init = EnumInit(e) + "_values"; + output("static const int32_t $0[$1] = {\n", values_init, + values_vec.size()); + for (auto value : values_vec) { + output(" $0,\n", value); + } + output("};\n\n"); + } + + output("const upb_enumlayout $0 = {\n", EnumInit(e)); + output(" $0,\n", values_init); + output(" 0x$0ULL,\n", absl::Hex(mask)); + output(" $0,\n", values_vec.size()); + + output("};\n\n"); + } + + if (!this_file_enums.empty()) { + output("static const upb_enumlayout *$0[$1] = {\n", kEnumsInit, + this_file_enums.size()); + for (auto e : this_file_enums) { + output(" &$0,\n", EnumInit(e)); + } + output("};\n"); + output("\n"); + } + + return this_file_enums.size(); +} + void WriteExtension(const protobuf::FieldDescriptor* ext, Output& output) { output("const upb_msglayout_ext $0 = {\n ", ExtensionLayout(ext)); WriteField(ext, "0", "0", 0, output); @@ -1264,11 +1382,14 @@ void WriteSource(const protobuf::FileDescriptor* file, Output& output, int msg_count = WriteMessages(file, output, fasttable_enabled); int ext_count = WriteExtensions(file, output); + int enum_count = WriteEnums(file, output); output("const upb_msglayout_file $0 = {\n", FileLayoutName(file)); output(" $0,\n", msg_count ? kMessagesInit : "NULL"); + output(" $0,\n", enum_count ? kEnumsInit : "NULL"); output(" $0,\n", ext_count ? kExtensionsInit : "NULL"); output(" $0,\n", msg_count); + output(" $0,\n", enum_count); output(" $0,\n", ext_count); output("};\n\n"); From 7907ed913b1d018f84882713a140a731126f7371 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 6 Oct 2021 19:21:00 -0700 Subject: [PATCH 2/8] Expanded the test to cover packed fields also. --- benchmarks/compare.py | 2 +- upb/decode.c | 190 +++++++++++++++++++++++++++++++++--------- upb/def.c | 52 ++++++++++++ upb/msg_test.cc | 34 ++++++-- upb/msg_test.proto | 4 +- 5 files changed, 234 insertions(+), 48 deletions(-) diff --git a/benchmarks/compare.py b/benchmarks/compare.py index 9cc21d556f..971cc274f3 100755 --- a/benchmarks/compare.py +++ b/benchmarks/compare.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # # Copyright (c) 2009-2021, Google LLC # All rights reserved. diff --git a/upb/decode.c b/upb/decode.c index fc074d6d52..3aabe0e1dc 100644 --- a/upb/decode.c +++ b/upb/decode.c @@ -101,12 +101,14 @@ static const unsigned FIXED64_OK_MASK = (1 << UPB_DTYPE_DOUBLE) | #define OP_MSGSET_ITEM -2 #define OP_MSGSET_TYPEID -3 #define OP_SCALAR_LG2(n) (n) /* n in [0, 2, 3] => op in [0, 2, 3] */ +#define OP_ENUM 1 #define OP_STRING 4 #define OP_BYTES 5 #define OP_SUBMSG 6 -/* Ops above are scalar-only. Repeated fields can use any op. */ +/* Scalar fields use only ops above. Repeated fields can use any op. */ #define OP_FIXPCK_LG2(n) (n + 5) /* n in [2, 3] => op in [7, 8] */ #define OP_VARPCK_LG2(n) (n + 9) /* n in [0, 2, 3] => op in [9, 11, 12] */ +#define OP_PACKED_ENUM 13 static const int8_t varint_ops[] = { OP_UNKNOWN, /* field not found */ @@ -123,7 +125,7 @@ static const int8_t varint_ops[] = { OP_UNKNOWN, /* MESSAGE */ OP_UNKNOWN, /* BYTES */ OP_SCALAR_LG2(2), /* UINT32 */ - OP_SCALAR_LG2(2), /* ENUM */ + OP_ENUM, /* ENUM */ OP_UNKNOWN, /* SFIXED32 */ OP_UNKNOWN, /* SFIXED64 */ OP_SCALAR_LG2(2), /* SINT32 */ @@ -169,7 +171,7 @@ static const int8_t delim_ops[] = { OP_SUBMSG, /* REPEATED MESSAGE */ OP_BYTES, /* REPEATED BYTES */ OP_VARPCK_LG2(2), /* REPEATED UINT32 */ - OP_VARPCK_LG2(2), /* REPEATED ENUM */ + OP_PACKED_ENUM, /* REPEATED ENUM */ OP_FIXPCK_LG2(2), /* REPEATED SFIXED32 */ OP_FIXPCK_LG2(3), /* REPEATED SFIXED64 */ OP_VARPCK_LG2(2), /* REPEATED SINT32 */ @@ -384,6 +386,134 @@ static const char *decode_togroup(upb_decstate *d, const char *ptr, return decode_group(d, ptr, submsg, subl, field->number); } +static char *encode_varint32(uint32_t val, char *ptr) { + do { + uint8_t byte = val & 0x7fU; + val >>= 7; + if (val) byte |= 0x80U; + *(ptr++) = byte; + } while (val); + return ptr; +} + +UPB_NOINLINE +static bool decode_checkenum_slow(upb_decstate *d, const char *ptr, upb_msg *msg, + const upb_enumlayout *e, + const upb_msglayout_field *field, uint32_t v) { + // OPT: binary search long lists? + int n = e->value_count; + for (int i = 0; i < n; i++) { + if ((uint32_t)e->values[i] == v) return true; + } + + // Unrecognized enum goes into unknown fields. + // For packed fields the tag could be arbitrarily far in the past, so we + // just re-encode the tag here. + char buf[20]; + char *end = buf; + uint32_t tag = ((uint32_t)field->number << 3) | UPB_WIRE_TYPE_VARINT; + end = encode_varint32(tag, end); + end = encode_varint32(v, end); + + if (!_upb_msg_addunknown(msg, buf, end - buf, &d->arena)) { + decode_err(d); + } + + return false; +} + +UPB_FORCEINLINE +static bool decode_checkenum(upb_decstate *d, const char *ptr, upb_msg *msg, + const upb_enumlayout *e, + const upb_msglayout_field *field, wireval *val) { + uint32_t v = val->uint32_val; + + if (UPB_LIKELY(v < 64) && UPB_LIKELY(((1ULL << v) & e->mask))) return true; + + return decode_checkenum_slow(d, ptr, msg, e, field, v); +} + +UPB_NOINLINE +static const char *decode_enum_toarray(upb_decstate *d, const char *ptr, + upb_msg *msg, upb_array *arr, + const upb_msglayout_sub *subs, + const upb_msglayout_field *field, + wireval *val) { + const upb_enumlayout *e = subs[field->submsg_index].subenum; + if (!decode_checkenum(d, ptr, msg, e, field, val)) return ptr; + void *mem = UPB_PTR_AT(_upb_array_ptr(arr), arr->len * 4, void); + arr->len++; + memcpy(mem, val, 4); + return ptr; +} + +#include +UPB_FORCEINLINE +static const char *decode_fixed_packed(upb_decstate *d, const char *ptr, + upb_array *arr, wireval *val, + const upb_msglayout_field *field, + int lg2) { + int mask = (1 << lg2) - 1; + size_t count = val->size >> lg2; + if ((val->size & mask) != 0) { + return decode_err(d); /* Length isn't a round multiple of elem size. */ + } + decode_reserve(d, arr, count); + void *mem = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); + arr->len += count; + memcpy(mem, ptr, val->size); /* XXX: ptr boundary. */ + return ptr + val->size; +} + +UPB_FORCEINLINE +static const char *decode_varint_packed(upb_decstate *d, const char *ptr, + upb_array *arr, wireval *val, + const upb_msglayout_field *field, + int lg2) { + int scale = 1 << lg2; + int saved_limit = decode_pushlimit(d, ptr, val->size); + char *out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); + while (!decode_isdone(d, &ptr)) { + wireval elem; + ptr = decode_varint64(d, ptr, &elem.uint64_val); + decode_munge(field->descriptortype, &elem); + if (decode_reserve(d, arr, 1)) { + out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); + } + arr->len++; + memcpy(out, &elem, scale); + out += scale; + } + decode_poplimit(d, ptr, saved_limit); + return ptr; +} + +UPB_NOINLINE +static const char *decode_enum_packed(upb_decstate *d, const char *ptr, + upb_msg *msg, upb_array *arr, + const upb_msglayout_sub *subs, + const upb_msglayout_field *field, + wireval *val) { + const upb_enumlayout *e = subs[field->submsg_index].subenum; + int saved_limit = decode_pushlimit(d, ptr, val->size); + char *out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len * 4, void); + while (!decode_isdone(d, &ptr)) { + wireval elem; + ptr = decode_varint64(d, ptr, &elem.uint64_val); + if (!decode_checkenum(d, ptr, msg, e, field, &elem)) { + continue; + } + if (decode_reserve(d, arr, 1)) { + out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len * 4, void); + } + arr->len++; + memcpy(out, &elem, 4); + out += 4; + } + decode_poplimit(d, ptr, saved_limit); + return ptr; +} + static const char *decode_toarray(upb_decstate *d, const char *ptr, upb_msg *msg, const upb_msglayout_sub *subs, @@ -433,42 +563,18 @@ static const char *decode_toarray(upb_decstate *d, const char *ptr, } } case OP_FIXPCK_LG2(2): - case OP_FIXPCK_LG2(3): { - /* Fixed packed. */ - int lg2 = op - OP_FIXPCK_LG2(0); - int mask = (1 << lg2) - 1; - size_t count = val->size >> lg2; - if ((val->size & mask) != 0) { - return decode_err(d); /* Length isn't a round multiple of elem size. */ - } - decode_reserve(d, arr, count); - mem = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); - arr->len += count; - memcpy(mem, ptr, val->size); /* XXX: ptr boundary. */ - return ptr + val->size; - } + case OP_FIXPCK_LG2(3): + return decode_fixed_packed(d, ptr, arr, val, field, + op - OP_FIXPCK_LG2(0)); case OP_VARPCK_LG2(0): case OP_VARPCK_LG2(2): - case OP_VARPCK_LG2(3): { - /* Varint packed. */ - int lg2 = op - OP_VARPCK_LG2(0); - int scale = 1 << lg2; - int saved_limit = decode_pushlimit(d, ptr, val->size); - char *out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); - while (!decode_isdone(d, &ptr)) { - wireval elem; - ptr = decode_varint64(d, ptr, &elem.uint64_val); - decode_munge(field->descriptortype, &elem); - if (decode_reserve(d, arr, 1)) { - out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); - } - arr->len++; - memcpy(out, &elem, scale); - out += scale; - } - decode_poplimit(d, ptr, saved_limit); - return ptr; - } + case OP_VARPCK_LG2(3): + return decode_varint_packed(d, ptr, arr, val, field, + op - OP_VARPCK_LG2(0)); + case OP_ENUM: + return decode_enum_toarray(d, ptr, msg, arr, subs, field, val); + case OP_PACKED_ENUM: + return decode_enum_packed(d, ptr, msg, arr, subs, field, val); default: UPB_UNREACHABLE(); } @@ -516,6 +622,12 @@ static const char *decode_tomsg(upb_decstate *d, const char *ptr, upb_msg *msg, void *mem = UPB_PTR_AT(msg, field->offset, void); int type = field->descriptortype; + if (UPB_UNLIKELY(op == OP_ENUM) && + !decode_checkenum(d, ptr, msg, subs[field->submsg_index].subenum, field, + val)) { + return ptr; + } + /* Set presence if necessary. */ if (field->presence > 0) { _upb_sethas_field(msg, field); @@ -552,6 +664,7 @@ static const char *decode_tomsg(upb_decstate *d, const char *ptr, upb_msg *msg, case OP_SCALAR_LG2(3): memcpy(mem, val, 8); break; + case OP_ENUM: case OP_SCALAR_LG2(2): memcpy(mem, val, 4); break; @@ -798,13 +911,12 @@ static const char *decode_msg(upb_decstate *d, const char *ptr, upb_msg *msg, field_number = tag >> 3; wire_type = tag & 7; - field = decode_findfield(d, layout, field_number, &last_field_index); - if (wire_type == UPB_WIRE_TYPE_END_GROUP) { d->end_group = field_number; return ptr; } + field = decode_findfield(d, layout, field_number, &last_field_index); ptr = decode_wireval(d, ptr, field, wire_type, &val, &op); if (op >= 0) { diff --git a/upb/def.c b/upb/def.c index 7541224f8f..64261aeb03 100644 --- a/upb/def.c +++ b/upb/def.c @@ -120,6 +120,7 @@ struct upb_msgdef { struct upb_enumdef { const google_protobuf_EnumOptions *opts; + const upb_enumlayout *layout; // Only for proto2. const upb_filedef *file; const upb_msgdef *containing_type; // Could be merged with "file". const char *full_name; @@ -1664,6 +1665,11 @@ static void make_layout(symtab_addctx *ctx, const upb_msgdef *m) { if (upb_fielddef_issubmsg(f)) { field->submsg_index = sublayout_count++; subs[field->submsg_index].submsg = upb_fielddef_msgsubdef(f)->layout; + } else if (upb_fielddef_type(f) == UPB_TYPE_ENUM && + f->file->syntax == UPB_SYNTAX_PROTO2) { + field->submsg_index = sublayout_count++; + subs[field->submsg_index].subenum = upb_fielddef_enumsubdef(f)->layout; + UPB_ASSERT(subs[field->submsg_index].subenum); } if (upb_fielddef_label(f) == UPB_LABEL_REQUIRED) { @@ -2357,6 +2363,40 @@ static void create_service( } } +upb_enumlayout *create_enumlayout(symtab_addctx *ctx, const upb_enumdef *e) { + int n = 0; + uint64_t mask = 0; + + for (int i = 0; i < e->value_count; i++) { + uint32_t val = (uint32_t)e->values[i].number; + if (val < 64) { + mask |= 1 << val; + } else { + n++; + } + } + + int32_t *values = symtab_alloc(ctx, sizeof(*values) * n); + int32_t *p = values; + + for (int i = 0; i < e->value_count; i++) { + int32_t val = e->values[i].number; + if ((uint32_t)val >= 64) { + *p++ = val; + } + } + + UPB_ASSERT(p == values + n); + UPB_ASSERT(upb_inttable_count(&e->iton) == n + __builtin_popcountll(mask)); + + upb_enumlayout *layout = symtab_alloc(ctx, sizeof(*layout)); + layout->value_count = n; + layout->mask = mask; + layout->values = values; + + return layout; +} + static void create_enumdef( symtab_addctx *ctx, const char *prefix, const google_protobuf_EnumDescriptorProto *enum_proto, @@ -2418,6 +2458,18 @@ static void create_enumdef( } upb_inttable_compact(&e->iton, ctx->arena); + + if (e->file->syntax == UPB_SYNTAX_PROTO2) { + if (ctx->layout) { + UPB_ASSERT(ctx->enum_count < ctx->layout->enum_count); + e->layout = ctx->layout->enums[ctx->enum_count++]; + UPB_ASSERT(n == e->layout->value_count + __builtin_popcountll(e->layout->mask)); + } else { + e->layout = create_enumlayout(ctx, e); + } + } else { + e->layout = NULL; + } } static void create_msgdef(symtab_addctx *ctx, const char *prefix, diff --git a/upb/msg_test.cc b/upb/msg_test.cc index 24d3da3a59..a136bda0f3 100644 --- a/upb/msg_test.cc +++ b/upb/msg_test.cc @@ -100,7 +100,9 @@ TEST(MessageTest, Extensions) { } void VerifyMessageSet(const upb_test_TestMessageSet *mset_msg) { - EXPECT_TRUE(upb_test_MessageSetMember_has_message_set_extension(mset_msg)); + bool has = upb_test_MessageSetMember_has_message_set_extension(mset_msg); + EXPECT_TRUE(has); + if (!has) return; const upb_test_MessageSetMember *member = upb_test_MessageSetMember_message_set_extension(mset_msg); EXPECT_TRUE(member != nullptr); @@ -159,6 +161,7 @@ TEST(MessageTest, Proto2Enum) { upb_test_Proto2FakeEnumMessage *fake_msg = upb_test_Proto2FakeEnumMessage_new(arena.ptr()); upb_test_Proto2FakeEnumMessage_set_optional_enum(fake_msg, 999); + int32_t *vals = upb_test_Proto2FakeEnumMessage_resize_repeated_enum( fake_msg, 6, arena.ptr()); vals[0] = upb_test_Proto2EnumMessage_ZERO; @@ -167,13 +170,21 @@ TEST(MessageTest, Proto2Enum) { vals[3] = 888; // Unknown large. vals[4] = upb_test_Proto2EnumMessage_LARGE; vals[5] = upb_test_Proto2EnumMessage_NEGATIVE; - upb_test_Proto2FakeEnumMessage_int32_enum_map_set(fake_msg, 5, 777, - arena.ptr()); + + vals = upb_test_Proto2FakeEnumMessage_resize_packed_enum( + fake_msg, 6, arena.ptr()); + vals[0] = upb_test_Proto2EnumMessage_ZERO; + vals[1] = 7; // Unknown small. + vals[2] = upb_test_Proto2EnumMessage_SMALL; + vals[3] = 888; // Unknown large. + vals[4] = upb_test_Proto2EnumMessage_LARGE; + vals[5] = upb_test_Proto2EnumMessage_NEGATIVE; size_t size; char *pb = upb_test_Proto2FakeEnumMessage_serialize(fake_msg, arena.ptr(), &size); + // Parsing as enums puts unknown values into unknown fields. upb_test_Proto2EnumMessage *enum_msg = upb_test_Proto2EnumMessage_parse(pb, size, arena.ptr()); ASSERT_TRUE(enum_msg != nullptr); @@ -183,14 +194,15 @@ TEST(MessageTest, Proto2Enum) { upb_test_Proto2EnumMessage_repeated_enum(enum_msg, &size); EXPECT_EQ(4, size); // Two unknown values moved to the unknown field set. + // Parsing back into the fake message shows the original data, except the + // repeated enum is rearranged. pb = upb_test_Proto2EnumMessage_serialize(enum_msg, arena.ptr(), &size); upb_test_Proto2FakeEnumMessage *fake_msg2 = upb_test_Proto2FakeEnumMessage_parse(pb, size, arena.ptr()); EXPECT_EQ(true, upb_test_Proto2FakeEnumMessage_has_optional_enum(fake_msg2)); - vals_const = - upb_test_Proto2FakeEnumMessage_repeated_enum(fake_msg2, &size); - EXPECT_EQ(6, size); + EXPECT_EQ(999, upb_test_Proto2FakeEnumMessage_optional_enum(fake_msg2)); + int32_t expected[] = { upb_test_Proto2EnumMessage_ZERO, upb_test_Proto2EnumMessage_SMALL, @@ -199,6 +211,16 @@ TEST(MessageTest, Proto2Enum) { 7, 888, }; + + vals_const = + upb_test_Proto2FakeEnumMessage_repeated_enum(fake_msg2, &size); + EXPECT_EQ(6, size); + EXPECT_THAT(std::vector(vals_const, vals_const + size), + ::testing::ElementsAreArray(expected)); + + vals_const = + upb_test_Proto2FakeEnumMessage_packed_enum(fake_msg2, &size); + EXPECT_EQ(6, size); EXPECT_THAT(std::vector(vals_const, vals_const + size), ::testing::ElementsAreArray(expected)); } diff --git a/upb/msg_test.proto b/upb/msg_test.proto index fad31206dc..d7cfc1ca57 100644 --- a/upb/msg_test.proto +++ b/upb/msg_test.proto @@ -69,7 +69,7 @@ message Proto2EnumMessage { optional Proto2TestEnum optional_enum = 1; repeated Proto2TestEnum repeated_enum = 2; - map int32_enum_map = 3; + repeated Proto2TestEnum packed_enum = 3 [packed = true]; } // The same fields as Proto2EnumMessage, but with int32 fields so we can fake @@ -77,5 +77,5 @@ message Proto2EnumMessage { message Proto2FakeEnumMessage { optional int32 optional_enum = 1; repeated int32 repeated_enum = 2; - map int32_enum_map = 3; + repeated int32 packed_enum = 3 [packed = true]; } From 9d26c706e03edec3faa6e82d34f14c514419ffe8 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 6 Oct 2021 19:39:37 -0700 Subject: [PATCH 3/8] Removed dependency on popcount() intrinsic. --- upb/def.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/upb/def.c b/upb/def.c index 64261aeb03..97db5e86aa 100644 --- a/upb/def.c +++ b/upb/def.c @@ -2363,6 +2363,16 @@ static void create_service( } } +static int popcount(uint64_t x) { + // For assertions only, speed does not matter. + int n = 0; + while (x) { + if (x & 1) n++; + x >>= 1; + } + return n; +} + upb_enumlayout *create_enumlayout(symtab_addctx *ctx, const upb_enumdef *e) { int n = 0; uint64_t mask = 0; @@ -2387,7 +2397,7 @@ upb_enumlayout *create_enumlayout(symtab_addctx *ctx, const upb_enumdef *e) { } UPB_ASSERT(p == values + n); - UPB_ASSERT(upb_inttable_count(&e->iton) == n + __builtin_popcountll(mask)); + UPB_ASSERT(upb_inttable_count(&e->iton) == n + popcount(mask)); upb_enumlayout *layout = symtab_alloc(ctx, sizeof(*layout)); layout->value_count = n; @@ -2463,7 +2473,7 @@ static void create_enumdef( if (ctx->layout) { UPB_ASSERT(ctx->enum_count < ctx->layout->enum_count); e->layout = ctx->layout->enums[ctx->enum_count++]; - UPB_ASSERT(n == e->layout->value_count + __builtin_popcountll(e->layout->mask)); + UPB_ASSERT(n == e->layout->value_count + popcount(e->layout->mask)); } else { e->layout = create_enumlayout(ctx, e); } From 16f763e4d69fef36be8a90d80e6401f211061a80 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Thu, 7 Oct 2021 08:26:59 -0700 Subject: [PATCH 4/8] Addressed PR comments. --- upb/decode.c | 4 ++- upb/def.c | 73 ++++++++++++++++++++++++------------------ upbc/protoc-gen-upb.cc | 2 +- 3 files changed, 45 insertions(+), 34 deletions(-) diff --git a/upb/decode.c b/upb/decode.c index 3aabe0e1dc..abbbe80e6a 100644 --- a/upb/decode.c +++ b/upb/decode.c @@ -461,7 +461,9 @@ static const char *decode_fixed_packed(upb_decstate *d, const char *ptr, decode_reserve(d, arr, count); void *mem = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); arr->len += count; - memcpy(mem, ptr, val->size); /* XXX: ptr boundary. */ + // Note: if/when the decoder supports multi-buffer input, we will need to + // handle buffer seams here. + memcpy(mem, ptr, val->size); return ptr + val->size; } diff --git a/upb/def.c b/upb/def.c index 97db5e86aa..47be1caf07 100644 --- a/upb/def.c +++ b/upb/def.c @@ -2363,7 +2363,7 @@ static void create_service( } } -static int popcount(uint64_t x) { +static int count_bits_debug(uint64_t x) { // For assertions only, speed does not matter. int n = 0; while (x) { @@ -2387,17 +2387,20 @@ upb_enumlayout *create_enumlayout(symtab_addctx *ctx, const upb_enumdef *e) { } int32_t *values = symtab_alloc(ctx, sizeof(*values) * n); - int32_t *p = values; - for (int i = 0; i < e->value_count; i++) { - int32_t val = e->values[i].number; - if ((uint32_t)val >= 64) { - *p++ = val; + if (n) { + int32_t *p = values; + + for (int i = 0; i < e->value_count; i++) { + int32_t val = e->values[i].number; + if ((uint32_t)val >= 64) { + *p++ = val; + } } + UPB_ASSERT(p == values + n); } - UPB_ASSERT(p == values + n); - UPB_ASSERT(upb_inttable_count(&e->iton) == n + popcount(mask)); + UPB_ASSERT(upb_inttable_count(&e->iton) == n + count_bits_debug(mask)); upb_enumlayout *layout = symtab_alloc(ctx, sizeof(*layout)); layout->value_count = n; @@ -2407,6 +2410,34 @@ upb_enumlayout *create_enumlayout(symtab_addctx *ctx, const upb_enumdef *e) { return layout; } +static void create_enumvaldef( + symtab_addctx *ctx, const char *prefix, + const google_protobuf_EnumValueDescriptorProto *val_proto, upb_enumdef *e, + int i) { + upb_enumvaldef *val = (upb_enumvaldef *)&e->values[i]; + upb_strview name = google_protobuf_EnumValueDescriptorProto_name(val_proto); + upb_value v = upb_value_constptr(val); + + val->enum_ = e; /* Must happen prior to symtab_add(). */ + val->full_name = makefullname(ctx, prefix, name); + val->number = google_protobuf_EnumValueDescriptorProto_number(val_proto); + symtab_add(ctx, val->full_name, pack_def(val, UPB_DEFTYPE_ENUMVAL)); + + SET_OPTIONS(val->opts, EnumValueDescriptorProto, EnumValueOptions, val_proto); + + if (i == 0 && e->file->syntax == UPB_SYNTAX_PROTO3 && val->number != 0) { + symtab_errf(ctx, "for proto3, the first enum value must be zero (%s)", + e->full_name); + } + + CHK_OOM(upb_strtable_insert(&e->ntoi, name.data, name.size, v, ctx->arena)); + + // Multiple enumerators can have the same number, first one wins. + if (!upb_inttable_lookup(&e->iton, val->number, NULL)) { + CHK_OOM(upb_inttable_insert(&e->iton, val->number, v, ctx->arena)); + } +} + static void create_enumdef( symtab_addctx *ctx, const char *prefix, const google_protobuf_EnumDescriptorProto *enum_proto, @@ -2442,29 +2473,7 @@ static void create_enumdef( SET_OPTIONS(e->opts, EnumDescriptorProto, EnumOptions, enum_proto); for (i = 0; i < n; i++) { - const google_protobuf_EnumValueDescriptorProto *val_proto = values[i]; - upb_enumvaldef *val = (upb_enumvaldef*)&e->values[i]; - upb_strview name = google_protobuf_EnumValueDescriptorProto_name(val_proto); - upb_value v = upb_value_constptr(val); - - val->enum_ = e; /* Must happen prior to symtab_add(). */ - val->full_name = makefullname(ctx, prefix, name); - val->number = google_protobuf_EnumValueDescriptorProto_number(val_proto); - symtab_add(ctx, val->full_name, pack_def(val, UPB_DEFTYPE_ENUMVAL)); - - SET_OPTIONS(val->opts, EnumValueDescriptorProto, EnumValueOptions, val_proto); - - if (i == 0 && e->file->syntax == UPB_SYNTAX_PROTO3 && val->number != 0) { - symtab_errf(ctx, "for proto3, the first enum value must be zero (%s)", - e->full_name); - } - - CHK_OOM(upb_strtable_insert(&e->ntoi, name.data, name.size, v, ctx->arena)); - - // Multiple enumerators can have the same number, first one wins. - if (!upb_inttable_lookup(&e->iton, val->number, NULL)) { - CHK_OOM(upb_inttable_insert(&e->iton, val->number, v, ctx->arena)); - } + create_enumvaldef(ctx, prefix, values[i], e, i); } upb_inttable_compact(&e->iton, ctx->arena); @@ -2473,7 +2482,7 @@ static void create_enumdef( if (ctx->layout) { UPB_ASSERT(ctx->enum_count < ctx->layout->enum_count); e->layout = ctx->layout->enums[ctx->enum_count++]; - UPB_ASSERT(n == e->layout->value_count + popcount(e->layout->mask)); + UPB_ASSERT(n == e->layout->value_count + count_bits_debug(e->layout->mask)); } else { e->layout = create_enumlayout(ctx, e); } diff --git a/upbc/protoc-gen-upb.cc b/upbc/protoc-gen-upb.cc index 52758ff0f8..f157335d1d 100644 --- a/upbc/protoc-gen-upb.cc +++ b/upbc/protoc-gen-upb.cc @@ -745,7 +745,7 @@ void WriteHeader(const protobuf::FileDescriptor* file, Output& output) { output("\n"); if (file->syntax() == protobuf::FileDescriptor::SYNTAX_PROTO2) { - for (auto enumdesc : this_file_enums) { + for (const auto* enumdesc : this_file_enums) { output("extern const upb_enumlayout $0;\n", EnumInit(enumdesc)); } } From f21ce7255d63eed67b269a2774db61cb3d3188ed Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Thu, 7 Oct 2021 09:02:15 -0700 Subject: [PATCH 5/8] Fixed double-lookups. --- upbc/protoc-gen-upb.cc | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/upbc/protoc-gen-upb.cc b/upbc/protoc-gen-upb.cc index f157335d1d..44974aa3f5 100644 --- a/upbc/protoc-gen-upb.cc +++ b/upbc/protoc-gen-upb.cc @@ -862,23 +862,26 @@ SubLayoutArray::SubLayoutArray(const protobuf::Descriptor* message) { SortedSubmessages(message); int i = 0; for (auto submsg : sorted_submsgs) { - if (indexes_.find(submsg->message_type()) != indexes_.end()) { + if (!indexes_.insert(std::make_pair(submsg->message_type(), i)).second) { + // Already present. continue; } submsgs_.push_back(submsg->message_type()); - indexes_[submsg->message_type()] = i++; + i++; } std::vector sorted_subenums = SortedSubEnums(message); for (auto field : sorted_subenums) { - if (field->file()->syntax() != - protobuf::FileDescriptor::SYNTAX_PROTO2 || - indexes_.find(field->enum_type()) != indexes_.end()) { + if (field->file()->syntax() != protobuf::FileDescriptor::SYNTAX_PROTO2) { + continue; + } + if (!indexes_.insert(std::make_pair(field->enum_type(), i)).second) { + // Already present. continue; } subenums_.push_back(field->enum_type()); - indexes_[field->enum_type()] = i++; + i++; } } From 7771a0515b206de82d23a54ed47ce43035866ebd Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Thu, 7 Oct 2021 11:01:18 -0700 Subject: [PATCH 6/8] Addressed PR comments. --- upb/def.c | 2 ++ upbc/protoc-gen-upb.cc | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/upb/def.c b/upb/def.c index 47be1caf07..a2d533d614 100644 --- a/upb/def.c +++ b/upb/def.c @@ -2391,6 +2391,8 @@ upb_enumlayout *create_enumlayout(symtab_addctx *ctx, const upb_enumdef *e) { if (n) { int32_t *p = values; + // Add values outside the bitmask range to the list, as described in the + // comments for upb_enumlayout. for (int i = 0; i < e->value_count; i++) { int32_t val = e->values[i].number; if ((uint32_t)val >= 64) { diff --git a/upbc/protoc-gen-upb.cc b/upbc/protoc-gen-upb.cc index 44974aa3f5..13df6da2e0 100644 --- a/upbc/protoc-gen-upb.cc +++ b/upbc/protoc-gen-upb.cc @@ -861,8 +861,8 @@ SubLayoutArray::SubLayoutArray(const protobuf::Descriptor* message) { std::vector sorted_submsgs = SortedSubmessages(message); int i = 0; - for (auto submsg : sorted_submsgs) { - if (!indexes_.insert(std::make_pair(submsg->message_type(), i)).second) { + for (const auto* submsg : sorted_submsgs) { + if (!indexes_.try_emplace(submsg->message_type(), i).second) { // Already present. continue; } @@ -872,11 +872,11 @@ SubLayoutArray::SubLayoutArray(const protobuf::Descriptor* message) { std::vector sorted_subenums = SortedSubEnums(message); - for (auto field : sorted_subenums) { + for (const auto* field : sorted_subenums) { if (field->file()->syntax() != protobuf::FileDescriptor::SYNTAX_PROTO2) { continue; } - if (!indexes_.insert(std::make_pair(field->enum_type(), i)).second) { + if (!indexes_.try_emplace(field->enum_type(), i).second) { // Already present. continue; } @@ -1244,7 +1244,7 @@ int WriteEnums(const protobuf::FileDescriptor* file, Output& output) { std::string values_init = "NULL"; - for (auto e : this_file_enums) { + for (const auto* e : this_file_enums) { uint64_t mask = 0; absl::flat_hash_set values; for (int i = 0; i < e->value_count(); i++) { @@ -1279,7 +1279,7 @@ int WriteEnums(const protobuf::FileDescriptor* file, Output& output) { if (!this_file_enums.empty()) { output("static const upb_enumlayout *$0[$1] = {\n", kEnumsInit, this_file_enums.size()); - for (auto e : this_file_enums) { + for (const auto* e : this_file_enums) { output(" &$0,\n", EnumInit(e)); } output("};\n"); From 42be22faea391d55699f5404698afa457f7123ec Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 11 Oct 2021 18:55:14 -0700 Subject: [PATCH 7/8] Opted the table-driven parser out of fasttable parsing. --- upbc/protoc-gen-upb.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/upbc/protoc-gen-upb.cc b/upbc/protoc-gen-upb.cc index 13df6da2e0..f16188653f 100644 --- a/upbc/protoc-gen-upb.cc +++ b/upbc/protoc-gen-upb.cc @@ -920,8 +920,12 @@ bool TryFillTableEntry(const protobuf::Descriptor* message, case protobuf::FieldDescriptor::TYPE_BOOL: type = "b1"; break; - case protobuf::FieldDescriptor::TYPE_INT32: case protobuf::FieldDescriptor::TYPE_ENUM: + if (field->file()->syntax() == protobuf::FileDescriptor::SYNTAX_PROTO2) { + // We don't have the means to test proto2 enum fields for valid values. + return false; + } + case protobuf::FieldDescriptor::TYPE_INT32: case protobuf::FieldDescriptor::TYPE_UINT32: type = "v4"; break; From e595ceebc56391799ce487286ceb917ffbdef85c Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 11 Oct 2021 19:30:13 -0700 Subject: [PATCH 8/8] Added FALLTHROUGH_INTENDED to fix GCC warning. --- upbc/protoc-gen-upb.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/upbc/protoc-gen-upb.cc b/upbc/protoc-gen-upb.cc index f16188653f..0b77713ffb 100644 --- a/upbc/protoc-gen-upb.cc +++ b/upbc/protoc-gen-upb.cc @@ -925,6 +925,7 @@ bool TryFillTableEntry(const protobuf::Descriptor* message, // We don't have the means to test proto2 enum fields for valid values. return false; } + ABSL_FALLTHROUGH_INTENDED; case protobuf::FieldDescriptor::TYPE_INT32: case protobuf::FieldDescriptor::TYPE_UINT32: type = "v4";