diff --git a/benchmarks/compare.py b/benchmarks/compare.py index 9cc21d556f..971cc274f3 100755 --- a/benchmarks/compare.py +++ b/benchmarks/compare.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # # Copyright (c) 2009-2021, Google LLC # All rights reserved. diff --git a/upb/decode.c b/upb/decode.c index fc074d6d52..3aabe0e1dc 100644 --- a/upb/decode.c +++ b/upb/decode.c @@ -101,12 +101,14 @@ static const unsigned FIXED64_OK_MASK = (1 << UPB_DTYPE_DOUBLE) | #define OP_MSGSET_ITEM -2 #define OP_MSGSET_TYPEID -3 #define OP_SCALAR_LG2(n) (n) /* n in [0, 2, 3] => op in [0, 2, 3] */ +#define OP_ENUM 1 #define OP_STRING 4 #define OP_BYTES 5 #define OP_SUBMSG 6 -/* Ops above are scalar-only. Repeated fields can use any op. */ +/* Scalar fields use only ops above. Repeated fields can use any op. */ #define OP_FIXPCK_LG2(n) (n + 5) /* n in [2, 3] => op in [7, 8] */ #define OP_VARPCK_LG2(n) (n + 9) /* n in [0, 2, 3] => op in [9, 11, 12] */ +#define OP_PACKED_ENUM 13 static const int8_t varint_ops[] = { OP_UNKNOWN, /* field not found */ @@ -123,7 +125,7 @@ static const int8_t varint_ops[] = { OP_UNKNOWN, /* MESSAGE */ OP_UNKNOWN, /* BYTES */ OP_SCALAR_LG2(2), /* UINT32 */ - OP_SCALAR_LG2(2), /* ENUM */ + OP_ENUM, /* ENUM */ OP_UNKNOWN, /* SFIXED32 */ OP_UNKNOWN, /* SFIXED64 */ OP_SCALAR_LG2(2), /* SINT32 */ @@ -169,7 +171,7 @@ static const int8_t delim_ops[] = { OP_SUBMSG, /* REPEATED MESSAGE */ OP_BYTES, /* REPEATED BYTES */ OP_VARPCK_LG2(2), /* REPEATED UINT32 */ - OP_VARPCK_LG2(2), /* REPEATED ENUM */ + OP_PACKED_ENUM, /* REPEATED ENUM */ OP_FIXPCK_LG2(2), /* REPEATED SFIXED32 */ OP_FIXPCK_LG2(3), /* REPEATED SFIXED64 */ OP_VARPCK_LG2(2), /* REPEATED SINT32 */ @@ -384,6 +386,134 @@ static const char *decode_togroup(upb_decstate *d, const char *ptr, return decode_group(d, ptr, submsg, subl, field->number); } +static char *encode_varint32(uint32_t val, char *ptr) { + do { + uint8_t byte = val & 0x7fU; + val >>= 7; + if (val) byte |= 0x80U; + *(ptr++) = byte; + } while (val); + return ptr; +} + +UPB_NOINLINE +static bool decode_checkenum_slow(upb_decstate *d, const char *ptr, upb_msg *msg, + const upb_enumlayout *e, + const upb_msglayout_field *field, uint32_t v) { + // OPT: binary search long lists? + int n = e->value_count; + for (int i = 0; i < n; i++) { + if ((uint32_t)e->values[i] == v) return true; + } + + // Unrecognized enum goes into unknown fields. + // For packed fields the tag could be arbitrarily far in the past, so we + // just re-encode the tag here. + char buf[20]; + char *end = buf; + uint32_t tag = ((uint32_t)field->number << 3) | UPB_WIRE_TYPE_VARINT; + end = encode_varint32(tag, end); + end = encode_varint32(v, end); + + if (!_upb_msg_addunknown(msg, buf, end - buf, &d->arena)) { + decode_err(d); + } + + return false; +} + +UPB_FORCEINLINE +static bool decode_checkenum(upb_decstate *d, const char *ptr, upb_msg *msg, + const upb_enumlayout *e, + const upb_msglayout_field *field, wireval *val) { + uint32_t v = val->uint32_val; + + if (UPB_LIKELY(v < 64) && UPB_LIKELY(((1ULL << v) & e->mask))) return true; + + return decode_checkenum_slow(d, ptr, msg, e, field, v); +} + +UPB_NOINLINE +static const char *decode_enum_toarray(upb_decstate *d, const char *ptr, + upb_msg *msg, upb_array *arr, + const upb_msglayout_sub *subs, + const upb_msglayout_field *field, + wireval *val) { + const upb_enumlayout *e = subs[field->submsg_index].subenum; + if (!decode_checkenum(d, ptr, msg, e, field, val)) return ptr; + void *mem = UPB_PTR_AT(_upb_array_ptr(arr), arr->len * 4, void); + arr->len++; + memcpy(mem, val, 4); + return ptr; +} + +#include +UPB_FORCEINLINE +static const char *decode_fixed_packed(upb_decstate *d, const char *ptr, + upb_array *arr, wireval *val, + const upb_msglayout_field *field, + int lg2) { + int mask = (1 << lg2) - 1; + size_t count = val->size >> lg2; + if ((val->size & mask) != 0) { + return decode_err(d); /* Length isn't a round multiple of elem size. */ + } + decode_reserve(d, arr, count); + void *mem = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); + arr->len += count; + memcpy(mem, ptr, val->size); /* XXX: ptr boundary. */ + return ptr + val->size; +} + +UPB_FORCEINLINE +static const char *decode_varint_packed(upb_decstate *d, const char *ptr, + upb_array *arr, wireval *val, + const upb_msglayout_field *field, + int lg2) { + int scale = 1 << lg2; + int saved_limit = decode_pushlimit(d, ptr, val->size); + char *out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); + while (!decode_isdone(d, &ptr)) { + wireval elem; + ptr = decode_varint64(d, ptr, &elem.uint64_val); + decode_munge(field->descriptortype, &elem); + if (decode_reserve(d, arr, 1)) { + out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); + } + arr->len++; + memcpy(out, &elem, scale); + out += scale; + } + decode_poplimit(d, ptr, saved_limit); + return ptr; +} + +UPB_NOINLINE +static const char *decode_enum_packed(upb_decstate *d, const char *ptr, + upb_msg *msg, upb_array *arr, + const upb_msglayout_sub *subs, + const upb_msglayout_field *field, + wireval *val) { + const upb_enumlayout *e = subs[field->submsg_index].subenum; + int saved_limit = decode_pushlimit(d, ptr, val->size); + char *out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len * 4, void); + while (!decode_isdone(d, &ptr)) { + wireval elem; + ptr = decode_varint64(d, ptr, &elem.uint64_val); + if (!decode_checkenum(d, ptr, msg, e, field, &elem)) { + continue; + } + if (decode_reserve(d, arr, 1)) { + out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len * 4, void); + } + arr->len++; + memcpy(out, &elem, 4); + out += 4; + } + decode_poplimit(d, ptr, saved_limit); + return ptr; +} + static const char *decode_toarray(upb_decstate *d, const char *ptr, upb_msg *msg, const upb_msglayout_sub *subs, @@ -433,42 +563,18 @@ static const char *decode_toarray(upb_decstate *d, const char *ptr, } } case OP_FIXPCK_LG2(2): - case OP_FIXPCK_LG2(3): { - /* Fixed packed. */ - int lg2 = op - OP_FIXPCK_LG2(0); - int mask = (1 << lg2) - 1; - size_t count = val->size >> lg2; - if ((val->size & mask) != 0) { - return decode_err(d); /* Length isn't a round multiple of elem size. */ - } - decode_reserve(d, arr, count); - mem = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); - arr->len += count; - memcpy(mem, ptr, val->size); /* XXX: ptr boundary. */ - return ptr + val->size; - } + case OP_FIXPCK_LG2(3): + return decode_fixed_packed(d, ptr, arr, val, field, + op - OP_FIXPCK_LG2(0)); case OP_VARPCK_LG2(0): case OP_VARPCK_LG2(2): - case OP_VARPCK_LG2(3): { - /* Varint packed. */ - int lg2 = op - OP_VARPCK_LG2(0); - int scale = 1 << lg2; - int saved_limit = decode_pushlimit(d, ptr, val->size); - char *out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); - while (!decode_isdone(d, &ptr)) { - wireval elem; - ptr = decode_varint64(d, ptr, &elem.uint64_val); - decode_munge(field->descriptortype, &elem); - if (decode_reserve(d, arr, 1)) { - out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); - } - arr->len++; - memcpy(out, &elem, scale); - out += scale; - } - decode_poplimit(d, ptr, saved_limit); - return ptr; - } + case OP_VARPCK_LG2(3): + return decode_varint_packed(d, ptr, arr, val, field, + op - OP_VARPCK_LG2(0)); + case OP_ENUM: + return decode_enum_toarray(d, ptr, msg, arr, subs, field, val); + case OP_PACKED_ENUM: + return decode_enum_packed(d, ptr, msg, arr, subs, field, val); default: UPB_UNREACHABLE(); } @@ -516,6 +622,12 @@ static const char *decode_tomsg(upb_decstate *d, const char *ptr, upb_msg *msg, void *mem = UPB_PTR_AT(msg, field->offset, void); int type = field->descriptortype; + if (UPB_UNLIKELY(op == OP_ENUM) && + !decode_checkenum(d, ptr, msg, subs[field->submsg_index].subenum, field, + val)) { + return ptr; + } + /* Set presence if necessary. */ if (field->presence > 0) { _upb_sethas_field(msg, field); @@ -552,6 +664,7 @@ static const char *decode_tomsg(upb_decstate *d, const char *ptr, upb_msg *msg, case OP_SCALAR_LG2(3): memcpy(mem, val, 8); break; + case OP_ENUM: case OP_SCALAR_LG2(2): memcpy(mem, val, 4); break; @@ -798,13 +911,12 @@ static const char *decode_msg(upb_decstate *d, const char *ptr, upb_msg *msg, field_number = tag >> 3; wire_type = tag & 7; - field = decode_findfield(d, layout, field_number, &last_field_index); - if (wire_type == UPB_WIRE_TYPE_END_GROUP) { d->end_group = field_number; return ptr; } + field = decode_findfield(d, layout, field_number, &last_field_index); ptr = decode_wireval(d, ptr, field, wire_type, &val, &op); if (op >= 0) { diff --git a/upb/def.c b/upb/def.c index 7541224f8f..64261aeb03 100644 --- a/upb/def.c +++ b/upb/def.c @@ -120,6 +120,7 @@ struct upb_msgdef { struct upb_enumdef { const google_protobuf_EnumOptions *opts; + const upb_enumlayout *layout; // Only for proto2. const upb_filedef *file; const upb_msgdef *containing_type; // Could be merged with "file". const char *full_name; @@ -1664,6 +1665,11 @@ static void make_layout(symtab_addctx *ctx, const upb_msgdef *m) { if (upb_fielddef_issubmsg(f)) { field->submsg_index = sublayout_count++; subs[field->submsg_index].submsg = upb_fielddef_msgsubdef(f)->layout; + } else if (upb_fielddef_type(f) == UPB_TYPE_ENUM && + f->file->syntax == UPB_SYNTAX_PROTO2) { + field->submsg_index = sublayout_count++; + subs[field->submsg_index].subenum = upb_fielddef_enumsubdef(f)->layout; + UPB_ASSERT(subs[field->submsg_index].subenum); } if (upb_fielddef_label(f) == UPB_LABEL_REQUIRED) { @@ -2357,6 +2363,40 @@ static void create_service( } } +upb_enumlayout *create_enumlayout(symtab_addctx *ctx, const upb_enumdef *e) { + int n = 0; + uint64_t mask = 0; + + for (int i = 0; i < e->value_count; i++) { + uint32_t val = (uint32_t)e->values[i].number; + if (val < 64) { + mask |= 1 << val; + } else { + n++; + } + } + + int32_t *values = symtab_alloc(ctx, sizeof(*values) * n); + int32_t *p = values; + + for (int i = 0; i < e->value_count; i++) { + int32_t val = e->values[i].number; + if ((uint32_t)val >= 64) { + *p++ = val; + } + } + + UPB_ASSERT(p == values + n); + UPB_ASSERT(upb_inttable_count(&e->iton) == n + __builtin_popcountll(mask)); + + upb_enumlayout *layout = symtab_alloc(ctx, sizeof(*layout)); + layout->value_count = n; + layout->mask = mask; + layout->values = values; + + return layout; +} + static void create_enumdef( symtab_addctx *ctx, const char *prefix, const google_protobuf_EnumDescriptorProto *enum_proto, @@ -2418,6 +2458,18 @@ static void create_enumdef( } upb_inttable_compact(&e->iton, ctx->arena); + + if (e->file->syntax == UPB_SYNTAX_PROTO2) { + if (ctx->layout) { + UPB_ASSERT(ctx->enum_count < ctx->layout->enum_count); + e->layout = ctx->layout->enums[ctx->enum_count++]; + UPB_ASSERT(n == e->layout->value_count + __builtin_popcountll(e->layout->mask)); + } else { + e->layout = create_enumlayout(ctx, e); + } + } else { + e->layout = NULL; + } } static void create_msgdef(symtab_addctx *ctx, const char *prefix, diff --git a/upb/msg_test.cc b/upb/msg_test.cc index 24d3da3a59..a136bda0f3 100644 --- a/upb/msg_test.cc +++ b/upb/msg_test.cc @@ -100,7 +100,9 @@ TEST(MessageTest, Extensions) { } void VerifyMessageSet(const upb_test_TestMessageSet *mset_msg) { - EXPECT_TRUE(upb_test_MessageSetMember_has_message_set_extension(mset_msg)); + bool has = upb_test_MessageSetMember_has_message_set_extension(mset_msg); + EXPECT_TRUE(has); + if (!has) return; const upb_test_MessageSetMember *member = upb_test_MessageSetMember_message_set_extension(mset_msg); EXPECT_TRUE(member != nullptr); @@ -159,6 +161,7 @@ TEST(MessageTest, Proto2Enum) { upb_test_Proto2FakeEnumMessage *fake_msg = upb_test_Proto2FakeEnumMessage_new(arena.ptr()); upb_test_Proto2FakeEnumMessage_set_optional_enum(fake_msg, 999); + int32_t *vals = upb_test_Proto2FakeEnumMessage_resize_repeated_enum( fake_msg, 6, arena.ptr()); vals[0] = upb_test_Proto2EnumMessage_ZERO; @@ -167,13 +170,21 @@ TEST(MessageTest, Proto2Enum) { vals[3] = 888; // Unknown large. vals[4] = upb_test_Proto2EnumMessage_LARGE; vals[5] = upb_test_Proto2EnumMessage_NEGATIVE; - upb_test_Proto2FakeEnumMessage_int32_enum_map_set(fake_msg, 5, 777, - arena.ptr()); + + vals = upb_test_Proto2FakeEnumMessage_resize_packed_enum( + fake_msg, 6, arena.ptr()); + vals[0] = upb_test_Proto2EnumMessage_ZERO; + vals[1] = 7; // Unknown small. + vals[2] = upb_test_Proto2EnumMessage_SMALL; + vals[3] = 888; // Unknown large. + vals[4] = upb_test_Proto2EnumMessage_LARGE; + vals[5] = upb_test_Proto2EnumMessage_NEGATIVE; size_t size; char *pb = upb_test_Proto2FakeEnumMessage_serialize(fake_msg, arena.ptr(), &size); + // Parsing as enums puts unknown values into unknown fields. upb_test_Proto2EnumMessage *enum_msg = upb_test_Proto2EnumMessage_parse(pb, size, arena.ptr()); ASSERT_TRUE(enum_msg != nullptr); @@ -183,14 +194,15 @@ TEST(MessageTest, Proto2Enum) { upb_test_Proto2EnumMessage_repeated_enum(enum_msg, &size); EXPECT_EQ(4, size); // Two unknown values moved to the unknown field set. + // Parsing back into the fake message shows the original data, except the + // repeated enum is rearranged. pb = upb_test_Proto2EnumMessage_serialize(enum_msg, arena.ptr(), &size); upb_test_Proto2FakeEnumMessage *fake_msg2 = upb_test_Proto2FakeEnumMessage_parse(pb, size, arena.ptr()); EXPECT_EQ(true, upb_test_Proto2FakeEnumMessage_has_optional_enum(fake_msg2)); - vals_const = - upb_test_Proto2FakeEnumMessage_repeated_enum(fake_msg2, &size); - EXPECT_EQ(6, size); + EXPECT_EQ(999, upb_test_Proto2FakeEnumMessage_optional_enum(fake_msg2)); + int32_t expected[] = { upb_test_Proto2EnumMessage_ZERO, upb_test_Proto2EnumMessage_SMALL, @@ -199,6 +211,16 @@ TEST(MessageTest, Proto2Enum) { 7, 888, }; + + vals_const = + upb_test_Proto2FakeEnumMessage_repeated_enum(fake_msg2, &size); + EXPECT_EQ(6, size); + EXPECT_THAT(std::vector(vals_const, vals_const + size), + ::testing::ElementsAreArray(expected)); + + vals_const = + upb_test_Proto2FakeEnumMessage_packed_enum(fake_msg2, &size); + EXPECT_EQ(6, size); EXPECT_THAT(std::vector(vals_const, vals_const + size), ::testing::ElementsAreArray(expected)); } diff --git a/upb/msg_test.proto b/upb/msg_test.proto index fad31206dc..d7cfc1ca57 100644 --- a/upb/msg_test.proto +++ b/upb/msg_test.proto @@ -69,7 +69,7 @@ message Proto2EnumMessage { optional Proto2TestEnum optional_enum = 1; repeated Proto2TestEnum repeated_enum = 2; - map int32_enum_map = 3; + repeated Proto2TestEnum packed_enum = 3 [packed = true]; } // The same fields as Proto2EnumMessage, but with int32 fields so we can fake @@ -77,5 +77,5 @@ message Proto2EnumMessage { message Proto2FakeEnumMessage { optional int32 optional_enum = 1; repeated int32 repeated_enum = 2; - map int32_enum_map = 3; + repeated int32 packed_enum = 3 [packed = true]; }