diff --git a/benchmarks/compare.py b/benchmarks/compare.py index 9cc21d556f..971cc274f3 100755 --- a/benchmarks/compare.py +++ b/benchmarks/compare.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # # Copyright (c) 2009-2021, Google LLC # All rights reserved. diff --git a/cmake/google/protobuf/descriptor.upb.c b/cmake/google/protobuf/descriptor.upb.c index fc0e469819..588e431977 100644 --- a/cmake/google/protobuf/descriptor.upb.c +++ b/cmake/google/protobuf/descriptor.upb.c @@ -126,16 +126,18 @@ const upb_msglayout google_protobuf_ExtensionRangeOptions_msginit = { UPB_SIZE(8, 8), 1, _UPB_MSGEXT_EXTENDABLE, 0, 255, }; -static const upb_msglayout_sub google_protobuf_FieldDescriptorProto_submsgs[1] = { +static const upb_msglayout_sub google_protobuf_FieldDescriptorProto_submsgs[3] = { {.submsg = &google_protobuf_FieldOptions_msginit}, + {.subenum = &google_protobuf_FieldDescriptorProto_Label_enuminit}, + {.subenum = &google_protobuf_FieldDescriptorProto_Type_enuminit}, }; static const upb_msglayout_field google_protobuf_FieldDescriptorProto__fields[11] = { {1, UPB_SIZE(24, 24), 1, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {2, UPB_SIZE(32, 40), 2, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {3, UPB_SIZE(12, 12), 3, 0, 5, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, - {4, UPB_SIZE(4, 4), 4, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, - {5, UPB_SIZE(8, 8), 5, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {4, UPB_SIZE(4, 4), 4, 1, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {5, UPB_SIZE(8, 8), 5, 2, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, {6, UPB_SIZE(40, 56), 6, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {7, UPB_SIZE(48, 72), 7, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {8, UPB_SIZE(64, 104), 8, 0, 11, _UPB_MODE_SCALAR | (_UPB_REP_PTR << _UPB_REP_SHIFT)}, @@ -248,14 +250,15 @@ const upb_msglayout google_protobuf_MethodDescriptorProto_msginit = { UPB_SIZE(32, 64), 6, _UPB_MSGEXT_NONE, 6, 255, }; -static const upb_msglayout_sub google_protobuf_FileOptions_submsgs[1] = { +static const upb_msglayout_sub google_protobuf_FileOptions_submsgs[2] = { {.submsg = &google_protobuf_UninterpretedOption_msginit}, + {.subenum = &google_protobuf_FileOptions_OptimizeMode_enuminit}, }; static const upb_msglayout_field google_protobuf_FileOptions__fields[21] = { {1, UPB_SIZE(20, 24), 1, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {8, UPB_SIZE(28, 40), 2, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, - {9, UPB_SIZE(4, 4), 3, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {9, UPB_SIZE(4, 4), 3, 1, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, {10, UPB_SIZE(8, 8), 4, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, {11, UPB_SIZE(36, 56), 5, 0, 12, _UPB_MODE_SCALAR | (_UPB_REP_STRVIEW << _UPB_REP_SHIFT)}, {16, UPB_SIZE(9, 9), 6, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, @@ -300,16 +303,18 @@ const upb_msglayout google_protobuf_MessageOptions_msginit = { UPB_SIZE(16, 16), 5, _UPB_MSGEXT_EXTENDABLE, 3, 255, }; -static const upb_msglayout_sub google_protobuf_FieldOptions_submsgs[1] = { +static const upb_msglayout_sub google_protobuf_FieldOptions_submsgs[3] = { {.submsg = &google_protobuf_UninterpretedOption_msginit}, + {.subenum = &google_protobuf_FieldOptions_CType_enuminit}, + {.subenum = &google_protobuf_FieldOptions_JSType_enuminit}, }; static const upb_msglayout_field google_protobuf_FieldOptions__fields[7] = { - {1, UPB_SIZE(4, 4), 1, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {1, UPB_SIZE(4, 4), 1, 1, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, {2, UPB_SIZE(12, 12), 2, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, {3, UPB_SIZE(13, 13), 3, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, {5, UPB_SIZE(14, 14), 4, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, - {6, UPB_SIZE(8, 8), 5, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {6, UPB_SIZE(8, 8), 5, 2, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, {10, UPB_SIZE(15, 15), 6, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, {999, UPB_SIZE(16, 16), 0, 0, 11, _UPB_MODE_ARRAY | (_UPB_REP_PTR << _UPB_REP_SHIFT)}, }; @@ -380,13 +385,14 @@ const upb_msglayout google_protobuf_ServiceOptions_msginit = { UPB_SIZE(8, 16), 2, _UPB_MSGEXT_EXTENDABLE, 0, 255, }; -static const upb_msglayout_sub google_protobuf_MethodOptions_submsgs[1] = { +static const upb_msglayout_sub google_protobuf_MethodOptions_submsgs[2] = { {.submsg = &google_protobuf_UninterpretedOption_msginit}, + {.subenum = &google_protobuf_MethodOptions_IdempotencyLevel_enuminit}, }; static const upb_msglayout_field google_protobuf_MethodOptions__fields[3] = { {33, UPB_SIZE(8, 8), 1, 0, 8, _UPB_MODE_SCALAR | (_UPB_REP_1BYTE << _UPB_REP_SHIFT)}, - {34, UPB_SIZE(4, 4), 2, 0, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, + {34, UPB_SIZE(4, 4), 2, 1, 14, _UPB_MODE_SCALAR | (_UPB_REP_4BYTE << _UPB_REP_SHIFT)}, {999, UPB_SIZE(12, 16), 0, 0, 11, _UPB_MODE_ARRAY | (_UPB_REP_PTR << _UPB_REP_SHIFT)}, }; @@ -512,10 +518,57 @@ static const upb_msglayout *messages_layout[27] = { &google_protobuf_GeneratedCodeInfo_Annotation_msginit, }; +const upb_enumlayout google_protobuf_FieldDescriptorProto_Label_enuminit = { + NULL, + 0xeULL, + 0, +}; + +const upb_enumlayout google_protobuf_FieldDescriptorProto_Type_enuminit = { + NULL, + 0x7fffeULL, + 0, +}; + +const upb_enumlayout google_protobuf_FieldOptions_CType_enuminit = { + NULL, + 0x7ULL, + 0, +}; + +const upb_enumlayout google_protobuf_FieldOptions_JSType_enuminit = { + NULL, + 0x7ULL, + 0, +}; + +const upb_enumlayout google_protobuf_FileOptions_OptimizeMode_enuminit = { + NULL, + 0xeULL, + 0, +}; + +const upb_enumlayout google_protobuf_MethodOptions_IdempotencyLevel_enuminit = { + NULL, + 0x7ULL, + 0, +}; + +static const upb_enumlayout *enums_layout[6] = { + &google_protobuf_FieldDescriptorProto_Label_enuminit, + &google_protobuf_FieldDescriptorProto_Type_enuminit, + &google_protobuf_FieldOptions_CType_enuminit, + &google_protobuf_FieldOptions_JSType_enuminit, + &google_protobuf_FileOptions_OptimizeMode_enuminit, + &google_protobuf_MethodOptions_IdempotencyLevel_enuminit, +}; + const upb_msglayout_file google_protobuf_descriptor_proto_upb_file_layout = { messages_layout, + enums_layout, NULL, 27, + 6, 0, }; diff --git a/cmake/google/protobuf/descriptor.upb.h b/cmake/google/protobuf/descriptor.upb.h index f74dde71f3..ad6fb7eac0 100644 --- a/cmake/google/protobuf/descriptor.upb.h +++ b/cmake/google/protobuf/descriptor.upb.h @@ -154,6 +154,13 @@ typedef enum { } google_protobuf_MethodOptions_IdempotencyLevel; +extern const upb_enumlayout google_protobuf_FieldDescriptorProto_Label_enuminit; +extern const upb_enumlayout google_protobuf_FieldDescriptorProto_Type_enuminit; +extern const upb_enumlayout google_protobuf_FieldOptions_CType_enuminit; +extern const upb_enumlayout google_protobuf_FieldOptions_JSType_enuminit; +extern const upb_enumlayout google_protobuf_FileOptions_OptimizeMode_enuminit; +extern const upb_enumlayout google_protobuf_MethodOptions_IdempotencyLevel_enuminit; + /* google.protobuf.FileDescriptorSet */ UPB_INLINE google_protobuf_FileDescriptorSet *google_protobuf_FileDescriptorSet_new(upb_arena *arena) { diff --git a/upb/decode.c b/upb/decode.c index fc074d6d52..abbbe80e6a 100644 --- a/upb/decode.c +++ b/upb/decode.c @@ -101,12 +101,14 @@ static const unsigned FIXED64_OK_MASK = (1 << UPB_DTYPE_DOUBLE) | #define OP_MSGSET_ITEM -2 #define OP_MSGSET_TYPEID -3 #define OP_SCALAR_LG2(n) (n) /* n in [0, 2, 3] => op in [0, 2, 3] */ +#define OP_ENUM 1 #define OP_STRING 4 #define OP_BYTES 5 #define OP_SUBMSG 6 -/* Ops above are scalar-only. Repeated fields can use any op. */ +/* Scalar fields use only ops above. Repeated fields can use any op. */ #define OP_FIXPCK_LG2(n) (n + 5) /* n in [2, 3] => op in [7, 8] */ #define OP_VARPCK_LG2(n) (n + 9) /* n in [0, 2, 3] => op in [9, 11, 12] */ +#define OP_PACKED_ENUM 13 static const int8_t varint_ops[] = { OP_UNKNOWN, /* field not found */ @@ -123,7 +125,7 @@ static const int8_t varint_ops[] = { OP_UNKNOWN, /* MESSAGE */ OP_UNKNOWN, /* BYTES */ OP_SCALAR_LG2(2), /* UINT32 */ - OP_SCALAR_LG2(2), /* ENUM */ + OP_ENUM, /* ENUM */ OP_UNKNOWN, /* SFIXED32 */ OP_UNKNOWN, /* SFIXED64 */ OP_SCALAR_LG2(2), /* SINT32 */ @@ -169,7 +171,7 @@ static const int8_t delim_ops[] = { OP_SUBMSG, /* REPEATED MESSAGE */ OP_BYTES, /* REPEATED BYTES */ OP_VARPCK_LG2(2), /* REPEATED UINT32 */ - OP_VARPCK_LG2(2), /* REPEATED ENUM */ + OP_PACKED_ENUM, /* REPEATED ENUM */ OP_FIXPCK_LG2(2), /* REPEATED SFIXED32 */ OP_FIXPCK_LG2(3), /* REPEATED SFIXED64 */ OP_VARPCK_LG2(2), /* REPEATED SINT32 */ @@ -384,6 +386,136 @@ static const char *decode_togroup(upb_decstate *d, const char *ptr, return decode_group(d, ptr, submsg, subl, field->number); } +static char *encode_varint32(uint32_t val, char *ptr) { + do { + uint8_t byte = val & 0x7fU; + val >>= 7; + if (val) byte |= 0x80U; + *(ptr++) = byte; + } while (val); + return ptr; +} + +UPB_NOINLINE +static bool decode_checkenum_slow(upb_decstate *d, const char *ptr, upb_msg *msg, + const upb_enumlayout *e, + const upb_msglayout_field *field, uint32_t v) { + // OPT: binary search long lists? + int n = e->value_count; + for (int i = 0; i < n; i++) { + if ((uint32_t)e->values[i] == v) return true; + } + + // Unrecognized enum goes into unknown fields. + // For packed fields the tag could be arbitrarily far in the past, so we + // just re-encode the tag here. + char buf[20]; + char *end = buf; + uint32_t tag = ((uint32_t)field->number << 3) | UPB_WIRE_TYPE_VARINT; + end = encode_varint32(tag, end); + end = encode_varint32(v, end); + + if (!_upb_msg_addunknown(msg, buf, end - buf, &d->arena)) { + decode_err(d); + } + + return false; +} + +UPB_FORCEINLINE +static bool decode_checkenum(upb_decstate *d, const char *ptr, upb_msg *msg, + const upb_enumlayout *e, + const upb_msglayout_field *field, wireval *val) { + uint32_t v = val->uint32_val; + + if (UPB_LIKELY(v < 64) && UPB_LIKELY(((1ULL << v) & e->mask))) return true; + + return decode_checkenum_slow(d, ptr, msg, e, field, v); +} + +UPB_NOINLINE +static const char *decode_enum_toarray(upb_decstate *d, const char *ptr, + upb_msg *msg, upb_array *arr, + const upb_msglayout_sub *subs, + const upb_msglayout_field *field, + wireval *val) { + const upb_enumlayout *e = subs[field->submsg_index].subenum; + if (!decode_checkenum(d, ptr, msg, e, field, val)) return ptr; + void *mem = UPB_PTR_AT(_upb_array_ptr(arr), arr->len * 4, void); + arr->len++; + memcpy(mem, val, 4); + return ptr; +} + +#include +UPB_FORCEINLINE +static const char *decode_fixed_packed(upb_decstate *d, const char *ptr, + upb_array *arr, wireval *val, + const upb_msglayout_field *field, + int lg2) { + int mask = (1 << lg2) - 1; + size_t count = val->size >> lg2; + if ((val->size & mask) != 0) { + return decode_err(d); /* Length isn't a round multiple of elem size. */ + } + decode_reserve(d, arr, count); + void *mem = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); + arr->len += count; + // Note: if/when the decoder supports multi-buffer input, we will need to + // handle buffer seams here. + memcpy(mem, ptr, val->size); + return ptr + val->size; +} + +UPB_FORCEINLINE +static const char *decode_varint_packed(upb_decstate *d, const char *ptr, + upb_array *arr, wireval *val, + const upb_msglayout_field *field, + int lg2) { + int scale = 1 << lg2; + int saved_limit = decode_pushlimit(d, ptr, val->size); + char *out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); + while (!decode_isdone(d, &ptr)) { + wireval elem; + ptr = decode_varint64(d, ptr, &elem.uint64_val); + decode_munge(field->descriptortype, &elem); + if (decode_reserve(d, arr, 1)) { + out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); + } + arr->len++; + memcpy(out, &elem, scale); + out += scale; + } + decode_poplimit(d, ptr, saved_limit); + return ptr; +} + +UPB_NOINLINE +static const char *decode_enum_packed(upb_decstate *d, const char *ptr, + upb_msg *msg, upb_array *arr, + const upb_msglayout_sub *subs, + const upb_msglayout_field *field, + wireval *val) { + const upb_enumlayout *e = subs[field->submsg_index].subenum; + int saved_limit = decode_pushlimit(d, ptr, val->size); + char *out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len * 4, void); + while (!decode_isdone(d, &ptr)) { + wireval elem; + ptr = decode_varint64(d, ptr, &elem.uint64_val); + if (!decode_checkenum(d, ptr, msg, e, field, &elem)) { + continue; + } + if (decode_reserve(d, arr, 1)) { + out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len * 4, void); + } + arr->len++; + memcpy(out, &elem, 4); + out += 4; + } + decode_poplimit(d, ptr, saved_limit); + return ptr; +} + static const char *decode_toarray(upb_decstate *d, const char *ptr, upb_msg *msg, const upb_msglayout_sub *subs, @@ -433,42 +565,18 @@ static const char *decode_toarray(upb_decstate *d, const char *ptr, } } case OP_FIXPCK_LG2(2): - case OP_FIXPCK_LG2(3): { - /* Fixed packed. */ - int lg2 = op - OP_FIXPCK_LG2(0); - int mask = (1 << lg2) - 1; - size_t count = val->size >> lg2; - if ((val->size & mask) != 0) { - return decode_err(d); /* Length isn't a round multiple of elem size. */ - } - decode_reserve(d, arr, count); - mem = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); - arr->len += count; - memcpy(mem, ptr, val->size); /* XXX: ptr boundary. */ - return ptr + val->size; - } + case OP_FIXPCK_LG2(3): + return decode_fixed_packed(d, ptr, arr, val, field, + op - OP_FIXPCK_LG2(0)); case OP_VARPCK_LG2(0): case OP_VARPCK_LG2(2): - case OP_VARPCK_LG2(3): { - /* Varint packed. */ - int lg2 = op - OP_VARPCK_LG2(0); - int scale = 1 << lg2; - int saved_limit = decode_pushlimit(d, ptr, val->size); - char *out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); - while (!decode_isdone(d, &ptr)) { - wireval elem; - ptr = decode_varint64(d, ptr, &elem.uint64_val); - decode_munge(field->descriptortype, &elem); - if (decode_reserve(d, arr, 1)) { - out = UPB_PTR_AT(_upb_array_ptr(arr), arr->len << lg2, void); - } - arr->len++; - memcpy(out, &elem, scale); - out += scale; - } - decode_poplimit(d, ptr, saved_limit); - return ptr; - } + case OP_VARPCK_LG2(3): + return decode_varint_packed(d, ptr, arr, val, field, + op - OP_VARPCK_LG2(0)); + case OP_ENUM: + return decode_enum_toarray(d, ptr, msg, arr, subs, field, val); + case OP_PACKED_ENUM: + return decode_enum_packed(d, ptr, msg, arr, subs, field, val); default: UPB_UNREACHABLE(); } @@ -516,6 +624,12 @@ static const char *decode_tomsg(upb_decstate *d, const char *ptr, upb_msg *msg, void *mem = UPB_PTR_AT(msg, field->offset, void); int type = field->descriptortype; + if (UPB_UNLIKELY(op == OP_ENUM) && + !decode_checkenum(d, ptr, msg, subs[field->submsg_index].subenum, field, + val)) { + return ptr; + } + /* Set presence if necessary. */ if (field->presence > 0) { _upb_sethas_field(msg, field); @@ -552,6 +666,7 @@ static const char *decode_tomsg(upb_decstate *d, const char *ptr, upb_msg *msg, case OP_SCALAR_LG2(3): memcpy(mem, val, 8); break; + case OP_ENUM: case OP_SCALAR_LG2(2): memcpy(mem, val, 4); break; @@ -798,13 +913,12 @@ static const char *decode_msg(upb_decstate *d, const char *ptr, upb_msg *msg, field_number = tag >> 3; wire_type = tag & 7; - field = decode_findfield(d, layout, field_number, &last_field_index); - if (wire_type == UPB_WIRE_TYPE_END_GROUP) { d->end_group = field_number; return ptr; } + field = decode_findfield(d, layout, field_number, &last_field_index); ptr = decode_wireval(d, ptr, field, wire_type, &val, &op); if (op >= 0) { diff --git a/upb/def.c b/upb/def.c index 7541224f8f..a2d533d614 100644 --- a/upb/def.c +++ b/upb/def.c @@ -120,6 +120,7 @@ struct upb_msgdef { struct upb_enumdef { const google_protobuf_EnumOptions *opts; + const upb_enumlayout *layout; // Only for proto2. const upb_filedef *file; const upb_msgdef *containing_type; // Could be merged with "file". const char *full_name; @@ -1664,6 +1665,11 @@ static void make_layout(symtab_addctx *ctx, const upb_msgdef *m) { if (upb_fielddef_issubmsg(f)) { field->submsg_index = sublayout_count++; subs[field->submsg_index].submsg = upb_fielddef_msgsubdef(f)->layout; + } else if (upb_fielddef_type(f) == UPB_TYPE_ENUM && + f->file->syntax == UPB_SYNTAX_PROTO2) { + field->submsg_index = sublayout_count++; + subs[field->submsg_index].subenum = upb_fielddef_enumsubdef(f)->layout; + UPB_ASSERT(subs[field->submsg_index].subenum); } if (upb_fielddef_label(f) == UPB_LABEL_REQUIRED) { @@ -2357,6 +2363,83 @@ static void create_service( } } +static int count_bits_debug(uint64_t x) { + // For assertions only, speed does not matter. + int n = 0; + while (x) { + if (x & 1) n++; + x >>= 1; + } + return n; +} + +upb_enumlayout *create_enumlayout(symtab_addctx *ctx, const upb_enumdef *e) { + int n = 0; + uint64_t mask = 0; + + for (int i = 0; i < e->value_count; i++) { + uint32_t val = (uint32_t)e->values[i].number; + if (val < 64) { + mask |= 1 << val; + } else { + n++; + } + } + + int32_t *values = symtab_alloc(ctx, sizeof(*values) * n); + + if (n) { + int32_t *p = values; + + // Add values outside the bitmask range to the list, as described in the + // comments for upb_enumlayout. + for (int i = 0; i < e->value_count; i++) { + int32_t val = e->values[i].number; + if ((uint32_t)val >= 64) { + *p++ = val; + } + } + UPB_ASSERT(p == values + n); + } + + UPB_ASSERT(upb_inttable_count(&e->iton) == n + count_bits_debug(mask)); + + upb_enumlayout *layout = symtab_alloc(ctx, sizeof(*layout)); + layout->value_count = n; + layout->mask = mask; + layout->values = values; + + return layout; +} + +static void create_enumvaldef( + symtab_addctx *ctx, const char *prefix, + const google_protobuf_EnumValueDescriptorProto *val_proto, upb_enumdef *e, + int i) { + upb_enumvaldef *val = (upb_enumvaldef *)&e->values[i]; + upb_strview name = google_protobuf_EnumValueDescriptorProto_name(val_proto); + upb_value v = upb_value_constptr(val); + + val->enum_ = e; /* Must happen prior to symtab_add(). */ + val->full_name = makefullname(ctx, prefix, name); + val->number = google_protobuf_EnumValueDescriptorProto_number(val_proto); + symtab_add(ctx, val->full_name, pack_def(val, UPB_DEFTYPE_ENUMVAL)); + + SET_OPTIONS(val->opts, EnumValueDescriptorProto, EnumValueOptions, val_proto); + + if (i == 0 && e->file->syntax == UPB_SYNTAX_PROTO3 && val->number != 0) { + symtab_errf(ctx, "for proto3, the first enum value must be zero (%s)", + e->full_name); + } + + CHK_OOM(upb_strtable_insert(&e->ntoi, name.data, name.size, v, ctx->arena)); + + // Multiple enumerators can have the same number, first one wins. + if (!upb_inttable_lookup(&e->iton, val->number, NULL)) { + CHK_OOM(upb_inttable_insert(&e->iton, val->number, v, ctx->arena)); + } +} + static void create_enumdef( symtab_addctx *ctx, const char *prefix, const google_protobuf_EnumDescriptorProto *enum_proto, @@ -2392,32 +2475,22 @@ static void create_enumdef( SET_OPTIONS(e->opts, EnumDescriptorProto, EnumOptions, enum_proto); for (i = 0; i < n; i++) { - const google_protobuf_EnumValueDescriptorProto *val_proto = values[i]; - upb_enumvaldef *val = (upb_enumvaldef*)&e->values[i]; - upb_strview name = google_protobuf_EnumValueDescriptorProto_name(val_proto); - upb_value v = upb_value_constptr(val); - - val->enum_ = e; /* Must happen prior to symtab_add(). */ - val->full_name = makefullname(ctx, prefix, name); - val->number = google_protobuf_EnumValueDescriptorProto_number(val_proto); - symtab_add(ctx, val->full_name, pack_def(val, UPB_DEFTYPE_ENUMVAL)); - - SET_OPTIONS(val->opts, EnumValueDescriptorProto, EnumValueOptions, val_proto); - - if (i == 0 && e->file->syntax == UPB_SYNTAX_PROTO3 && val->number != 0) { - symtab_errf(ctx, "for proto3, the first enum value must be zero (%s)", - e->full_name); - } + create_enumvaldef(ctx, prefix, values[i], e, i); + } - CHK_OOM(upb_strtable_insert(&e->ntoi, name.data, name.size, v, ctx->arena)); + upb_inttable_compact(&e->iton, ctx->arena); - // Multiple enumerators can have the same number, first one wins. - if (!upb_inttable_lookup(&e->iton, val->number, NULL)) { - CHK_OOM(upb_inttable_insert(&e->iton, val->number, v, ctx->arena)); + if (e->file->syntax == UPB_SYNTAX_PROTO2) { + if (ctx->layout) { + UPB_ASSERT(ctx->enum_count < ctx->layout->enum_count); + e->layout = ctx->layout->enums[ctx->enum_count++]; + UPB_ASSERT(n == e->layout->value_count + count_bits_debug(e->layout->mask)); + } else { + e->layout = create_enumlayout(ctx, e); } + } else { + e->layout = NULL; } - - upb_inttable_compact(&e->iton, ctx->arena); } static void create_msgdef(symtab_addctx *ctx, const char *prefix, diff --git a/upb/msg_internal.h b/upb/msg_internal.h index 7d024f7beb..1fbb2c7182 100644 --- a/upb/msg_internal.h +++ b/upb/msg_internal.h @@ -66,8 +66,8 @@ enum { typedef struct { uint32_t number; uint16_t offset; - int16_t presence; /* If >0, hasbit_index. If <0, ~oneof_index. */ - uint16_t submsg_index; /* undefined if descriptortype != MESSAGE or GROUP. */ + int16_t presence; // If >0, hasbit_index. If <0, ~oneof_index + uint16_t submsg_index; // undefined if descriptortype != MESSAGE/GROUP/ENUM uint8_t descriptortype; uint8_t mode; /* upb_fieldmode | upb_labelflags | (upb_rep << _UPB_REP_SHIFT) */ @@ -130,9 +130,15 @@ typedef struct { _upb_field_parser *field_parser; } _upb_fasttable_entry; +typedef struct { + const int32_t *values; // List of values <0 or >63 + uint64_t mask; // Bits are set for acceptable value 0 <= x < 64 + int value_count; +} upb_enumlayout; + typedef union { const struct upb_msglayout *submsg; - // TODO: const upb_enumlayout *subenum; + const upb_enumlayout *subenum; } upb_msglayout_sub; typedef enum { @@ -179,8 +185,10 @@ typedef struct { typedef struct { const upb_msglayout **msgs; + const upb_enumlayout **enums; const upb_msglayout_ext **exts; int msg_count; + int enum_count; int ext_count; } upb_msglayout_file; diff --git a/upb/msg_test.cc b/upb/msg_test.cc index 636fbcb9b4..a136bda0f3 100644 --- a/upb/msg_test.cc +++ b/upb/msg_test.cc @@ -26,6 +26,7 @@ */ #include "gtest/gtest.h" +#include "gmock/gmock.h" #include "src/google/protobuf/test_messages_proto3.upb.h" #include "upb/def.hpp" #include "upb/json_decode.h" @@ -99,7 +100,9 @@ TEST(MessageTest, Extensions) { } void VerifyMessageSet(const upb_test_TestMessageSet *mset_msg) { - EXPECT_TRUE(upb_test_MessageSetMember_has_message_set_extension(mset_msg)); + bool has = upb_test_MessageSetMember_has_message_set_extension(mset_msg); + EXPECT_TRUE(has); + if (!has) return; const upb_test_MessageSetMember *member = upb_test_MessageSetMember_message_set_extension(mset_msg); EXPECT_TRUE(member != nullptr); @@ -152,3 +155,72 @@ TEST(MessageTest, MessageSet) { << status.error_message(); VerifyMessageSet(ext_msg3); } + +TEST(MessageTest, Proto2Enum) { + upb::Arena arena; + upb_test_Proto2FakeEnumMessage *fake_msg = upb_test_Proto2FakeEnumMessage_new(arena.ptr()); + + upb_test_Proto2FakeEnumMessage_set_optional_enum(fake_msg, 999); + + int32_t *vals = upb_test_Proto2FakeEnumMessage_resize_repeated_enum( + fake_msg, 6, arena.ptr()); + vals[0] = upb_test_Proto2EnumMessage_ZERO; + vals[1] = 7; // Unknown small. + vals[2] = upb_test_Proto2EnumMessage_SMALL; + vals[3] = 888; // Unknown large. + vals[4] = upb_test_Proto2EnumMessage_LARGE; + vals[5] = upb_test_Proto2EnumMessage_NEGATIVE; + + vals = upb_test_Proto2FakeEnumMessage_resize_packed_enum( + fake_msg, 6, arena.ptr()); + vals[0] = upb_test_Proto2EnumMessage_ZERO; + vals[1] = 7; // Unknown small. + vals[2] = upb_test_Proto2EnumMessage_SMALL; + vals[3] = 888; // Unknown large. + vals[4] = upb_test_Proto2EnumMessage_LARGE; + vals[5] = upb_test_Proto2EnumMessage_NEGATIVE; + + size_t size; + char *pb = + upb_test_Proto2FakeEnumMessage_serialize(fake_msg, arena.ptr(), &size); + + // Parsing as enums puts unknown values into unknown fields. + upb_test_Proto2EnumMessage *enum_msg = + upb_test_Proto2EnumMessage_parse(pb, size, arena.ptr()); + ASSERT_TRUE(enum_msg != nullptr); + + EXPECT_EQ(false, upb_test_Proto2EnumMessage_has_optional_enum(enum_msg)); + const int32_t *vals_const = + upb_test_Proto2EnumMessage_repeated_enum(enum_msg, &size); + EXPECT_EQ(4, size); // Two unknown values moved to the unknown field set. + + // Parsing back into the fake message shows the original data, except the + // repeated enum is rearranged. + pb = upb_test_Proto2EnumMessage_serialize(enum_msg, arena.ptr(), &size); + upb_test_Proto2FakeEnumMessage *fake_msg2 = + upb_test_Proto2FakeEnumMessage_parse(pb, size, arena.ptr()); + + EXPECT_EQ(true, upb_test_Proto2FakeEnumMessage_has_optional_enum(fake_msg2)); + EXPECT_EQ(999, upb_test_Proto2FakeEnumMessage_optional_enum(fake_msg2)); + + int32_t expected[] = { + upb_test_Proto2EnumMessage_ZERO, + upb_test_Proto2EnumMessage_SMALL, + upb_test_Proto2EnumMessage_LARGE, + upb_test_Proto2EnumMessage_NEGATIVE, + 7, + 888, + }; + + vals_const = + upb_test_Proto2FakeEnumMessage_repeated_enum(fake_msg2, &size); + EXPECT_EQ(6, size); + EXPECT_THAT(std::vector(vals_const, vals_const + size), + ::testing::ElementsAreArray(expected)); + + vals_const = + upb_test_Proto2FakeEnumMessage_packed_enum(fake_msg2, &size); + EXPECT_EQ(6, size); + EXPECT_THAT(std::vector(vals_const, vals_const + size), + ::testing::ElementsAreArray(expected)); +} diff --git a/upb/msg_test.proto b/upb/msg_test.proto index 6dd940a1a1..d7cfc1ca57 100644 --- a/upb/msg_test.proto +++ b/upb/msg_test.proto @@ -58,3 +58,24 @@ message MessageSetMember { optional MessageSetMember message_set_extension = 4; } } + +message Proto2EnumMessage { + enum Proto2TestEnum { + ZERO = 0; + NEGATIVE = -1; + SMALL = 15; + LARGE = 12345; + } + + optional Proto2TestEnum optional_enum = 1; + repeated Proto2TestEnum repeated_enum = 2; + repeated Proto2TestEnum packed_enum = 3 [packed = true]; +} + +// The same fields as Proto2EnumMessage, but with int32 fields so we can fake +// wire format. +message Proto2FakeEnumMessage { + optional int32 optional_enum = 1; + repeated int32 repeated_enum = 2; + repeated int32 packed_enum = 3 [packed = true]; +} diff --git a/upbc/protoc-gen-upb.cc b/upbc/protoc-gen-upb.cc index d252957226..0b77713ffb 100644 --- a/upbc/protoc-gen-upb.cc +++ b/upbc/protoc-gen-upb.cc @@ -51,6 +51,10 @@ std::string MessageInit(const protobuf::Descriptor* descriptor) { return MessageName(descriptor) + "_msginit"; } +std::string EnumInit(const protobuf::EnumDescriptor* descriptor) { + return ToCIdent(descriptor->full_name()) + "_enuminit"; +} + std::string ExtensionIdentBase(const protobuf::FieldDescriptor* ext) { assert(ext->is_extension()); std::string ext_scope; @@ -65,8 +69,9 @@ std::string ExtensionLayout(const google::protobuf::FieldDescriptor* ext) { return absl::StrCat(ExtensionIdentBase(ext), "_", ext->name(), "_ext"); } -const char *kMessagesInit = "messages_layout"; +const char *kEnumsInit = "enums_layout"; const char *kExtensionsInit = "extensions_layout"; +const char *kMessagesInit = "messages_layout"; void AddEnums(const protobuf::Descriptor* message, std::vector* enums) { @@ -179,6 +184,24 @@ std::vector SortedSubmessages( return ret; } +std::vector SortedSubEnums( + const protobuf::Descriptor* message) { + std::vector ret; + for (int i = 0; i < message->field_count(); i++) { + if (message->field(i)->cpp_type() == + protobuf::FieldDescriptor::CPPTYPE_ENUM) { + ret.push_back(message->field(i)); + } + } + std::sort(ret.begin(), ret.end(), + [](const protobuf::FieldDescriptor* a, + const protobuf::FieldDescriptor* b) { + return a->enum_type()->full_name() < + b->enum_type()->full_name(); + }); + return ret; +} + std::string EnumValueSymbol(const protobuf::EnumValueDescriptor* value) { return ToCIdent(value->full_name()); } @@ -721,6 +744,14 @@ void WriteHeader(const protobuf::FileDescriptor* file, Output& output) { output("\n"); + if (file->syntax() == protobuf::FileDescriptor::SYNTAX_PROTO2) { + for (const auto* enumdesc : this_file_enums) { + output("extern const upb_enumlayout $0;\n", EnumInit(enumdesc)); + } + } + + output("\n"); + for (auto message : this_file_messages) { GenerateMessageInHeader(message, output); } @@ -787,45 +818,73 @@ int TableDescriptorType(const protobuf::FieldDescriptor* field) { // embedding field names on the side) we will have to revisit this, because // string vs. bytes behavior is not affected by proto2 vs proto3. return protobuf::FieldDescriptor::TYPE_BYTES; + } else if (field->enum_type() && + field->enum_type()->file()->syntax() == + protobuf::FileDescriptor::SYNTAX_PROTO3) { + // From the perspective of the binary decoder, proto3 enums are identical to + // int32 fields. Only in proto2 do we check enum values to make sure they + // are defined in the enum. + return protobuf::FieldDescriptor::TYPE_INT32; } else { return field->type(); } } -struct SubmsgArray { +struct SubLayoutArray { public: - SubmsgArray(const protobuf::Descriptor* message) : message_(message) { - MessageLayout layout(message); - std::vector sorted_submsgs = - SortedSubmessages(message); - int i = 0; - for (auto submsg : sorted_submsgs) { - if (indexes_.find(submsg->message_type()) != indexes_.end()) { - continue; - } - submsgs_.push_back(submsg->message_type()); - indexes_[submsg->message_type()] = i++; - } - } + SubLayoutArray(const protobuf::Descriptor* message); const std::vector& submsgs() const { return submsgs_; } - int GetIndex(const protobuf::FieldDescriptor* field) { - (void)message_; - assert(field->containing_type() == message_); - auto it = indexes_.find(field->message_type()); + const std::vector& subenums() const { + return subenums_; + } + + int total_count() const { return submsgs_.size() + subenums_.size(); } + + int GetIndex(const void *sub) { + auto it = indexes_.find(sub); assert(it != indexes_.end()); return it->second; } private: - const protobuf::Descriptor* message_; std::vector submsgs_; - absl::flat_hash_map indexes_; + std::vector subenums_; + absl::flat_hash_map indexes_; }; +SubLayoutArray::SubLayoutArray(const protobuf::Descriptor* message) { + MessageLayout layout(message); + std::vector sorted_submsgs = + SortedSubmessages(message); + int i = 0; + for (const auto* submsg : sorted_submsgs) { + if (!indexes_.try_emplace(submsg->message_type(), i).second) { + // Already present. + continue; + } + submsgs_.push_back(submsg->message_type()); + i++; + } + + std::vector sorted_subenums = + SortedSubEnums(message); + for (const auto* field : sorted_subenums) { + if (field->file()->syntax() != protobuf::FileDescriptor::SYNTAX_PROTO2) { + continue; + } + if (!indexes_.try_emplace(field->enum_type(), i).second) { + // Already present. + continue; + } + subenums_.push_back(field->enum_type()); + i++; + } +} + typedef std::pair TableEntry; uint64_t GetEncodedTag(const protobuf::FieldDescriptor* field) { @@ -861,8 +920,13 @@ bool TryFillTableEntry(const protobuf::Descriptor* message, case protobuf::FieldDescriptor::TYPE_BOOL: type = "b1"; break; - case protobuf::FieldDescriptor::TYPE_INT32: case protobuf::FieldDescriptor::TYPE_ENUM: + if (field->file()->syntax() == protobuf::FileDescriptor::SYNTAX_PROTO2) { + // We don't have the means to test proto2 enum fields for valid values. + return false; + } + ABSL_FALLTHROUGH_INTENDED; + case protobuf::FieldDescriptor::TYPE_INT32: case protobuf::FieldDescriptor::TYPE_UINT32: type = "v4"; break; @@ -957,8 +1021,8 @@ bool TryFillTableEntry(const protobuf::Descriptor* message, } if (field->cpp_type() == protobuf::FieldDescriptor::CPPTYPE_MESSAGE) { - SubmsgArray submsg_array(message); - uint64_t idx = submsg_array.GetIndex(field); + SubLayoutArray sublayout_array(message); + uint64_t idx = sublayout_array.GetIndex(field->message_type()); if (idx > 255) return false; data |= idx << 16; @@ -1083,19 +1147,22 @@ void WriteMessage(const protobuf::Descriptor* message, Output& output, uint8_t dense_below = 0; const int dense_below_max = std::numeric_limits::max(); MessageLayout layout(message); - SubmsgArray submsg_array(message); + SubLayoutArray sublayout_array(message); - if (!submsg_array.submsgs().empty()) { + if (sublayout_array.total_count()) { // TODO(haberman): could save a little bit of space by only generating a // "submsgs" array for every strongly-connected component. std::string submsgs_array_name = msg_name + "_submsgs"; submsgs_array_ref = "&" + submsgs_array_name + "[0]"; output("static const upb_msglayout_sub $0[$1] = {\n", - submsgs_array_name, submsg_array.submsgs().size()); + submsgs_array_name, sublayout_array.total_count()); - for (auto submsg : submsg_array.submsgs()) { + for (const auto* submsg : sublayout_array.submsgs()) { output(" {.submsg = &$0},\n", MessageInit(submsg)); } + for (const auto* subenum : sublayout_array.subenums()) { + output(" {.subenum = &$0},\n", EnumInit(subenum)); + } output("};\n\n"); } @@ -1109,7 +1176,7 @@ void WriteMessage(const protobuf::Descriptor* message, Output& output, fields_array_name, field_number_order.size()); for (int i = 0; i < static_cast(field_number_order.size()); i++) { auto field = field_number_order[i]; - int submsg_index = 0; + int sublayout_index = 0; if (i < dense_below_max && field->number() == i + 1 && (i == 0 || field_number_order[i - 1]->number() == i)) { @@ -1117,10 +1184,14 @@ void WriteMessage(const protobuf::Descriptor* message, Output& output, } if (field->cpp_type() == protobuf::FieldDescriptor::CPPTYPE_MESSAGE) { - submsg_index = submsg_array.GetIndex(field); + sublayout_index = sublayout_array.GetIndex(field->message_type()); + } else if (field->enum_type() && + field->enum_type()->file()->syntax() == + protobuf::FileDescriptor::SYNTAX_PROTO2) { + sublayout_index = sublayout_array.GetIndex(field->enum_type()); } - WriteMessageField(field, layout, submsg_index, output); + WriteMessageField(field, layout, sublayout_index, output); } output("};\n\n"); } @@ -1168,6 +1239,61 @@ void WriteMessage(const protobuf::Descriptor* message, Output& output, output("};\n\n"); } +int WriteEnums(const protobuf::FileDescriptor* file, Output& output) { + if (file->syntax() != protobuf::FileDescriptor::SYNTAX_PROTO2) { + return 0; + } + + std::vector this_file_enums = + SortedEnums(file); + + std::string values_init = "NULL"; + + for (const auto* e : this_file_enums) { + uint64_t mask = 0; + absl::flat_hash_set values; + for (int i = 0; i < e->value_count(); i++) { + int32_t number = e->value(i)->number(); + if (static_cast(number) < 64) { + mask |= 1 << number; + } else { + values.insert(number); + } + } + std::vector values_vec(values.begin(), values.end()); + std::sort(values_vec.begin(), values_vec.end()); + + if (!values_vec.empty()) { + values_init = EnumInit(e) + "_values"; + output("static const int32_t $0[$1] = {\n", values_init, + values_vec.size()); + for (auto value : values_vec) { + output(" $0,\n", value); + } + output("};\n\n"); + } + + output("const upb_enumlayout $0 = {\n", EnumInit(e)); + output(" $0,\n", values_init); + output(" 0x$0ULL,\n", absl::Hex(mask)); + output(" $0,\n", values_vec.size()); + + output("};\n\n"); + } + + if (!this_file_enums.empty()) { + output("static const upb_enumlayout *$0[$1] = {\n", kEnumsInit, + this_file_enums.size()); + for (const auto* e : this_file_enums) { + output(" &$0,\n", EnumInit(e)); + } + output("};\n"); + output("\n"); + } + + return this_file_enums.size(); +} + void WriteExtension(const protobuf::FieldDescriptor* ext, Output& output) { output("const upb_msglayout_ext $0 = {\n ", ExtensionLayout(ext)); WriteField(ext, "0", "0", 0, output); @@ -1264,11 +1390,14 @@ void WriteSource(const protobuf::FileDescriptor* file, Output& output, int msg_count = WriteMessages(file, output, fasttable_enabled); int ext_count = WriteExtensions(file, output); + int enum_count = WriteEnums(file, output); output("const upb_msglayout_file $0 = {\n", FileLayoutName(file)); output(" $0,\n", msg_count ? kMessagesInit : "NULL"); + output(" $0,\n", enum_count ? kEnumsInit : "NULL"); output(" $0,\n", ext_count ? kExtensionsInit : "NULL"); output(" $0,\n", msg_count); + output(" $0,\n", enum_count); output(" $0,\n", ext_count); output("};\n\n");