Merge pull request #423 from haberman/msgset

Implemented support for MessageSet.
pull/13171/head
Joshua Haberman 3 years ago committed by GitHub
commit 0e0de7d9f9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 7
      WORKSPACE
  2. 1
      bazel/workspace_deps.bzl
  3. 12
      benchmarks/compare.py
  4. 108
      upb/decode.c
  5. 58
      upb/def.c
  6. 23
      upb/encode.c
  7. 3
      upb/json_encode.c
  8. 21
      upb/msg_internal.h
  9. 56
      upb/msg_test.cc
  10. 12
      upb/msg_test.proto
  11. 6
      upbc/protoc-gen-upb.cc

@ -26,9 +26,9 @@ http_archive(
http_archive(
name = "com_github_google_benchmark",
urls = ["https://github.com/google/benchmark/archive/16703ff83c1ae6d53e5155df3bb3ab0bc96083be.zip"],
strip_prefix = "benchmark-16703ff83c1ae6d53e5155df3bb3ab0bc96083be",
sha256 = "59f918c8ccd4d74b6ac43484467b500f1d64b40cc1010daa055375b322a43ba3",
urls = ["https://github.com/google/benchmark/archive/0baacde3618ca617da95375e0af13ce1baadea47.zip"],
strip_prefix = "benchmark-0baacde3618ca617da95375e0af13ce1baadea47",
sha256 = "62e2f2e6d8a744d67e4bbc212fcfd06647080de4253c97ad5c6749e09faf2cb0",
)
http_archive(
@ -37,6 +37,7 @@ http_archive(
build_file = "//benchmarks:BUILD.googleapis",
strip_prefix = "googleapis-master",
patch_cmds = ["find google -type f -name BUILD.bazel -delete"],
sha256 = "a3353ef2cea09074eac8a99b6ad8e1c802c9bfac6a6192401287de8d6991b6bc",
)
system_python(

@ -7,6 +7,7 @@ def upb_deps():
name = "com_google_absl",
url = "https://github.com/abseil/abseil-cpp/archive/refs/heads/master.zip",
strip_prefix = "abseil-cpp-master",
sha256 = "9da85425cda33e13c619c35473c9653944d0dd9aab1a69ee9b3543cdaed277e5",
)
maybe(

@ -64,26 +64,28 @@ def Benchmark(outbase, bench_cpu=True, runs=12, fasttable=False):
if bench_cpu:
Run("CC=clang bazel build -c opt --copt=-march=native benchmarks:benchmark" + extra_args)
Run("./bazel-bin/benchmarks/benchmark --benchmark_out_format=json --benchmark_out={} --benchmark_repetitions={}".format(tmpfile, runs))
Run("./bazel-bin/benchmarks/benchmark --benchmark_out_format=json --benchmark_out={} --benchmark_repetitions={} --benchmark_min_time=0.05 --benchmark_enable_random_interleaving=true".format(tmpfile, runs))
with open(tmpfile) as f:
bench_json = json.load(f)
# Translate into the format expected by benchstat.
with open(outbase + ".txt", "w") as f:
txt_filename = outbase + ".txt"
with open(txt_filename, "w") as f:
for run in bench_json["benchmarks"]:
if run["run_type"] == "aggregate":
continue
name = run["name"]
name = name.replace(" ", "")
name = re.sub(r'^BM_', 'Benchmark', name)
if name.endswith("_mean") or name.endswith("_median") or name.endswith("_stddev"):
continue
values = (name, run["iterations"], run["cpu_time"])
print("{} {} {} ns/op".format(*values), file=f)
Run("sort {} -o {} ".format(txt_filename, txt_filename))
Run("CC=clang bazel build -c opt --copt=-g tests:conformance_upb" + extra_args)
Run("cp -f bazel-bin/tests/conformance_upb {}.bin".format(outbase))
baseline = "master"
baseline = "bm-interleave"
bench_cpu = True
fasttable = False

@ -91,8 +91,15 @@ static const unsigned FIXED64_OK_MASK = (1 << UPB_DTYPE_DOUBLE) |
(1 << UPB_DTYPE_FIXED64) |
(1 << UPB_DTYPE_SFIXED64);
/* Three fake field types for MessageSet. */
#define TYPE_MSGSET_ITEM 19
#define TYPE_MSGSET_TYPE_ID 20
#define TYPE_COUNT 20
/* Op: an action to be performed for a wire-type/field-type combination. */
#define OP_UNKNOWN -1
#define OP_UNKNOWN -1 /* Unknown field. */
#define OP_MSGSET_ITEM -2
#define OP_MSGSET_TYPEID -3
#define OP_SCALAR_LG2(n) (n) /* n in [0, 2, 3] => op in [0, 2, 3] */
#define OP_STRING 4
#define OP_BYTES 5
@ -101,7 +108,7 @@ static const unsigned FIXED64_OK_MASK = (1 << UPB_DTYPE_DOUBLE) |
#define OP_FIXPCK_LG2(n) (n + 5) /* n in [2, 3] => op in [7, 8] */
#define OP_VARPCK_LG2(n) (n + 9) /* n in [0, 2, 3] => op in [9, 11, 12] */
static const int8_t varint_ops[19] = {
static const int8_t varint_ops[] = {
OP_UNKNOWN, /* field not found */
OP_UNKNOWN, /* DOUBLE */
OP_UNKNOWN, /* FLOAT */
@ -121,9 +128,11 @@ static const int8_t varint_ops[19] = {
OP_UNKNOWN, /* SFIXED64 */
OP_SCALAR_LG2(2), /* SINT32 */
OP_SCALAR_LG2(3), /* SINT64 */
OP_UNKNOWN, /* MSGSET_ITEM */
OP_MSGSET_TYPEID, /* MSGSET TYPEID */
};
static const int8_t delim_ops[37] = {
static const int8_t delim_ops[] = {
/* For non-repeated field type. */
OP_UNKNOWN, /* field not found */
OP_UNKNOWN, /* DOUBLE */
@ -144,6 +153,8 @@ static const int8_t delim_ops[37] = {
OP_UNKNOWN, /* SFIXED64 */
OP_UNKNOWN, /* SINT32 */
OP_UNKNOWN, /* SINT64 */
OP_UNKNOWN, /* MSGSET_ITEM */
OP_UNKNOWN, /* MSGSET TYPEID */
/* For repeated field type. */
OP_FIXPCK_LG2(3), /* REPEATED DOUBLE */
OP_FIXPCK_LG2(2), /* REPEATED FLOAT */
@ -163,6 +174,7 @@ static const int8_t delim_ops[37] = {
OP_FIXPCK_LG2(3), /* REPEATED SFIXED64 */
OP_VARPCK_LG2(2), /* REPEATED SINT32 */
OP_VARPCK_LG2(3), /* REPEATED SINT64 */
/* Omitting MSGSET_*, because we never emit a repeated msgset type */
};
typedef union {
@ -567,12 +579,30 @@ static bool decode_tryfastdispatch(upb_decstate *d, const char **ptr,
return false;
}
static const char *decode_msgset(upb_decstate *d, const char *ptr, upb_msg *msg,
const upb_msglayout *layout) {
// We create a temporary upb_msglayout here and abuse its fields as temporary
// storage, to avoid creating lots of MessageSet-specific parsing code-paths:
// 1. We store 'layout' in item_layout.subs. We will need this later as
// a key to look up extensions for this MessageSet.
// 2. We use item_layout.fields as temporary storage to store the extension we
// found when parsing the type id.
upb_msglayout item_layout = {
.subs = (const upb_msglayout_sub[]){{.submsg = layout}},
.fields = NULL,
.size = 0,
.field_count = 0,
.ext = _UPB_MSGEXT_MSGSET_ITEM,
.dense_below = 0,
.table_mask = -1};
return decode_group(d, ptr, msg, &item_layout, 1);
}
static const upb_msglayout_field *decode_findfield(upb_decstate *d,
const upb_msglayout *l,
uint32_t field_number,
int *last_field_index) {
static upb_msglayout_field none = {0, 0, 0, 0, 0, 0};
if (l == NULL) return &none;
size_t idx = ((size_t)field_number) - 1; // 0 wraps to SIZE_MAX
@ -598,9 +628,41 @@ static const upb_msglayout_field *decode_findfield(upb_decstate *d,
}
}
if (l->ext == _UPB_MSGEXT_EXTENDABLE && d->extreg) {
const upb_msglayout_ext *ext = _upb_extreg_get(d->extreg, l, field_number);
if (ext) return &ext->field;
if (d->extreg) {
switch (l->ext) {
case _UPB_MSGEXT_EXTENDABLE: {
const upb_msglayout_ext *ext =
_upb_extreg_get(d->extreg, l, field_number);
if (ext) return &ext->field;
break;
}
case _UPB_MSGEXT_MSGSET:
if (field_number == _UPB_MSGSET_ITEM) {
static upb_msglayout_field item = {0, 0, 0, 0, TYPE_MSGSET_ITEM, 0};
return &item;
}
break;
case _UPB_MSGEXT_MSGSET_ITEM:
switch (field_number) {
case _UPB_MSGSET_TYPEID: {
static upb_msglayout_field type_id = {
0, 0, 0, 0, TYPE_MSGSET_TYPE_ID, 0};
return &type_id;
}
case _UPB_MSGSET_MESSAGE:
if (l->fields) {
// We saw type_id previously and succeeded in looking up msg.
return l->fields;
} else {
// TODO: out of order MessageSet.
// This is a very rare case: all serializers will emit in-order
// MessageSets. To hit this case there has to be some kind of
// re-ordering proxy. We should eventually handle this case, but
// not today.
}
break;
}
}
}
return &none; /* Unknown field. */
@ -640,7 +702,7 @@ static const char *decode_wireval(upb_decstate *d, const char *ptr,
case UPB_WIRE_TYPE_DELIMITED: {
int ndx = field->descriptortype;
uint64_t size;
if (_upb_getmode(field) == _UPB_MODE_ARRAY) ndx += 18;
if (_upb_getmode(field) == _UPB_MODE_ARRAY) ndx += TYPE_COUNT;
ptr = decode_varint64(d, ptr, &size);
if (size >= INT32_MAX || ptr - d->end + (int32_t)size > d->limit) {
break; /* Length overflow. */
@ -651,8 +713,13 @@ static const char *decode_wireval(upb_decstate *d, const char *ptr,
}
case UPB_WIRE_TYPE_START_GROUP:
val->uint32_val = field->number;
*op = OP_SUBMSG;
if (field->descriptortype != UPB_DTYPE_GROUP) *op = OP_UNKNOWN;
if (field->descriptortype == UPB_DTYPE_GROUP) {
*op = OP_SUBMSG;
} else if (field->descriptortype == TYPE_MSGSET_ITEM) {
*op = OP_MSGSET_ITEM;
} else {
*op = OP_UNKNOWN;
}
return ptr;
default:
break;
@ -693,6 +760,7 @@ static const char *decode_unknown(upb_decstate *d, const char *ptr,
upb_msg *msg, int field_number, int wire_type,
wireval val, const char **field_start) {
if (field_number == 0) return decode_err(d);
if (wire_type == UPB_WIRE_TYPE_DELIMITED) ptr += val.size;
if (msg) {
if (wire_type == UPB_WIRE_TYPE_START_GROUP) {
@ -742,8 +810,21 @@ static const char *decode_msg(upb_decstate *d, const char *ptr, upb_msg *msg,
if (op >= 0) {
ptr = decode_known(d, ptr, msg, layout, field, op, &val);
} else {
ptr = decode_unknown(d, ptr, msg, field_number, wire_type, val,
&field_start);
switch (op) {
case OP_UNKNOWN:
ptr = decode_unknown(d, ptr, msg, field_number, wire_type, val,
&field_start);
break;
case OP_MSGSET_ITEM:
ptr = decode_msgset(d, ptr, msg, layout);
break;
case OP_MSGSET_TYPEID: {
const upb_msglayout_ext *ext = _upb_extreg_get(
d->extreg, layout->subs[0].submsg, val.uint64_val);
if (ext) ((upb_msglayout *)layout)->fields = &ext->field;
break;
}
}
}
if (decode_isdone(d, &ptr)) return ptr;
@ -811,8 +892,11 @@ bool _upb_decode(const char *buf, size_t size, void *msg,
return ok;
}
#undef OP_UNKNOWN
#undef OP_SKIP
#undef OP_SCALAR_LG2
#undef OP_FIXPCK_LG2
#undef OP_VARPCK_LG2
#undef OP_STRING
#undef OP_BYTES
#undef OP_SUBMSG

@ -99,7 +99,9 @@ struct upb_msgdef {
/* Is this a map-entry message? */
bool map_entry;
bool is_message_set;
upb_wellknowntype_t well_known_type;
const upb_fielddef *message_set_ext;
};
struct upb_enumdef {
@ -159,6 +161,8 @@ struct upb_symtab {
/* Inside a symtab we store tagged pointers to specific def types. */
typedef enum {
UPB_DEFTYPE_MASK = 7,
UPB_DEFTYPE_FIELD = 0,
/* Only inside symtab table. */
@ -171,13 +175,22 @@ typedef enum {
UPB_DEFTYPE_FIELD_JSONNAME = 2
} upb_deftype_t;
static upb_deftype_t deftype(upb_value v) {
uintptr_t num = (uintptr_t)upb_value_getconstptr(v);
return num & UPB_DEFTYPE_MASK;
}
static const void *unpack_def(upb_value v, upb_deftype_t type) {
uintptr_t num = (uintptr_t)upb_value_getconstptr(v);
return (num & 3) == type ? (const void*)(num & ~3) : NULL;
return (num & UPB_DEFTYPE_MASK) == type
? (const void *)(num & ~UPB_DEFTYPE_MASK)
: NULL;
}
static upb_value pack_def(const void *ptr, upb_deftype_t type) {
uintptr_t num = (uintptr_t)ptr | type;
uintptr_t num = (uintptr_t)ptr;
UPB_ASSERT((num & UPB_DEFTYPE_MASK) == 0);
num |= type;
return upb_value_constptr((const void*)num);
}
@ -955,15 +968,27 @@ const upb_enumvaldef *upb_symtab_lookupenumval(const upb_symtab *s,
: NULL;
}
const upb_fielddef *upb_symtab_lookupext(const upb_symtab *s, const char *sym) {
const upb_fielddef *upb_symtab_lookupext2(const upb_symtab *s, const char *name,
size_t size) {
upb_value v;
return upb_strtable_lookup(&s->syms, sym, &v) ?
unpack_def(v, UPB_DEFTYPE_FIELD) : NULL;
if (!upb_strtable_lookup2(&s->syms, name, size, &v)) return NULL;
switch (deftype(v)) {
case UPB_DEFTYPE_FIELD:
return unpack_def(v, UPB_DEFTYPE_FIELD);
case UPB_DEFTYPE_MSG: {
const upb_msgdef *m = unpack_def(v, UPB_DEFTYPE_MSG);
return m->message_set_ext; /* May be NULL if not in MessageeSet. */
}
default:
break;
}
return NULL;
}
const upb_fielddef *upb_symtab_lookupext2(const upb_symtab *s, const char *sym,
size_t size) {
return symtab_lookup2(s, sym, size, UPB_DEFTYPE_FIELD);
const upb_fielddef *upb_symtab_lookupext(const upb_symtab *s, const char *sym) {
return upb_symtab_lookupext2(s, sym, strlen(sym));
}
const upb_filedef *upb_symtab_lookupfile(const upb_symtab *s, const char *name) {
@ -1208,7 +1233,11 @@ static void make_layout(symtab_addctx *ctx, const upb_msgdef *m) {
l->table_mask = 0;
if (upb_msgdef_extrangecount(m) > 0) {
l->ext = _UPB_MSGEXT_EXTENDABLE;
if (m->is_message_set) {
l->ext = _UPB_MSGEXT_MSGSET;
} else {
l->ext = _UPB_MSGEXT_EXTENDABLE;
}
} else {
l->ext = _UPB_MSGEXT_NONE;
}
@ -1921,11 +1950,15 @@ static void create_msgdef(symtab_addctx *ctx, const char *prefix,
m->file = ctx->file;
m->map_entry = false;
m->is_message_set = false;
m->message_set_ext = NULL;
options = google_protobuf_DescriptorProto_options(msg_proto);
if (options) {
m->map_entry = google_protobuf_MessageOptions_map_entry(options);
m->is_message_set =
google_protobuf_MessageOptions_message_set_wire_format(options);
}
if (ctx->layout) {
@ -2058,6 +2091,13 @@ static void resolve_fielddef(symtab_addctx *ctx, const char *prefix,
if (upb_fielddef_issubmsg(f)) {
f->sub.msgdef = symtab_resolve(ctx, f, prefix, name, UPB_DEFTYPE_MSG);
if (f->is_extension_ && f->msgdef->is_message_set &&
f->file == f->msgdef->file) {
// TODO: When defs are restructured to follow message nesting, we can make
// this check more robust. The actual rules for what make something
// qualify as a MessageSet item are more strict.
((upb_msgdef*)f->sub.msgdef)->message_set_ext = f;
}
} else if (f->type_ == UPB_DESCRIPTOR_TYPE_ENUM) {
f->sub.enumdef = symtab_resolve(ctx, f, prefix, name, UPB_DEFTYPE_ENUM);
}

@ -477,6 +477,23 @@ static void encode_field(upb_encstate *e, const upb_msg *msg,
}
}
/* message MessageSet {
* repeated group Item = 1 {
* required int32 type_id = 2;
* required string message = 3;
* }
* } */
static void encode_msgset_item(upb_encstate *e, const upb_msg_ext *ext) {
size_t size;
encode_tag(e, 1, UPB_WIRE_TYPE_END_GROUP);
encode_message(e, ext->data.ptr, ext->ext->sub.submsg, &size);
encode_varint(e, size);
encode_tag(e, 3, UPB_WIRE_TYPE_DELIMITED);
encode_varint(e, ext->ext->field.number);
encode_tag(e, 2, UPB_WIRE_TYPE_VARINT);
encode_tag(e, 1, UPB_WIRE_TYPE_START_GROUP);
}
static void encode_message(upb_encstate *e, const upb_msg *msg,
const upb_msglayout *m, size_t *size) {
size_t pre_len = e->limit - e->ptr;
@ -499,7 +516,11 @@ static void encode_message(upb_encstate *e, const upb_msg *msg,
const upb_msg_ext *end = ext + ext_count;
if (ext_count) {
for (; ext != end; ext++) {
encode_field(e, &ext->data, &ext->ext->sub, &ext->ext->field);
if (UPB_UNLIKELY(m->ext == _UPB_MSGEXT_MSGSET)) {
encode_msgset_item(e, ext);
} else {
encode_field(e, &ext->data, &ext->ext->sub, &ext->ext->field);
}
}
}
}

@ -673,6 +673,9 @@ static void jsonenc_fieldval(jsonenc *e, const upb_fielddef *f,
jsonenc_putsep(e, ",", first);
if (upb_fielddef_isextension(f)) {
// TODO: For MessageSet, I would have expected this to print the message
// name here, but Python doesn't appear to do this. We should do more
// research here about what various implementations do.
jsonenc_printf(e, "\"[%s]\":", upb_fielddef_fullname(f));
} else {
if (e->options & UPB_JSONENC_PROTONAMES) {

@ -136,11 +136,26 @@ typedef union {
} upb_msglayout_sub;
typedef enum {
_UPB_MSGEXT_NONE = 0, // Non-extendable message.
_UPB_MSGEXT_EXTENDABLE = 1, // Normal extendable message.
// TODO: MessageSet
_UPB_MSGEXT_NONE = 0, // Non-extendable message.
_UPB_MSGEXT_EXTENDABLE = 1, // Normal extendable message.
_UPB_MSGEXT_MSGSET = 2, // MessageSet message.
_UPB_MSGEXT_MSGSET_ITEM = 3, // MessageSet item (temporary only, see decode.c)
} upb_msgext_mode;
/* MessageSet wire format is:
* message MessageSet {
* repeated group Item = 1 {
* required int32 type_id = 2;
* required string message = 3;
* }
* }
*/
typedef enum {
_UPB_MSGSET_ITEM = 1,
_UPB_MSGSET_TYPEID = 2,
_UPB_MSGSET_MESSAGE = 3,
} upb_msgext_fieldnum;
struct upb_msglayout {
const upb_msglayout_sub *subs;
const upb_msglayout_field *fields;

@ -92,9 +92,63 @@ TEST(MessageTest, Extensions) {
upb_json_encode(ext_msg, m.ptr(), symtab.ptr(), 0, json_buf, json_size + 1,
status.ptr());
upb_test_TestExtensions *ext_msg3 = upb_test_TestExtensions_new(arena.ptr());
fprintf(stderr, "%.*s\n", (int)json_size, json_buf);
EXPECT_TRUE(upb_json_decode(json_buf, json_size, ext_msg3, m.ptr(),
symtab.ptr(), 0, arena.ptr(), status.ptr()))
<< status.error_message();
VerifyMessage(ext_msg3);
}
void VerifyMessageSet(const upb_test_TestMessageSet *mset_msg) {
EXPECT_TRUE(upb_test_MessageSetMember_has_message_set_extension(mset_msg));
const upb_test_MessageSetMember *member =
upb_test_MessageSetMember_message_set_extension(mset_msg);
EXPECT_TRUE(member != nullptr);
EXPECT_TRUE(upb_test_MessageSetMember_has_optional_int32(member));
EXPECT_EQ(234, upb_test_MessageSetMember_optional_int32(member));
}
TEST(MessageTest, MessageSet) {
upb::Arena arena;
upb_test_TestMessageSet *ext_msg = upb_test_TestMessageSet_new(arena.ptr());
EXPECT_FALSE(upb_test_MessageSetMember_has_message_set_extension(ext_msg));
upb::SymbolTable symtab;
upb::MessageDefPtr m(upb_test_TestMessageSet_getmsgdef(symtab.ptr()));
EXPECT_TRUE(m.ptr() != nullptr);
std::string json = R"json(
{
"[upb_test.MessageSetMember]": {"optional_int32": 234}
}
)json";
upb::Status status;
EXPECT_TRUE(upb_json_decode(json.data(), json.size(), ext_msg, m.ptr(),
symtab.ptr(), 0, arena.ptr(), status.ptr()))
<< status.error_message();
VerifyMessageSet(ext_msg);
// Test round-trip through binary format.
size_t size;
char *serialized = upb_test_TestMessageSet_serialize(ext_msg, arena.ptr(), &size);
ASSERT_TRUE(serialized != nullptr);
ASSERT_GE(size, 0);
upb_test_TestMessageSet *ext_msg2 = upb_test_TestMessageSet_parse_ex(
serialized, size, upb_symtab_extreg(symtab.ptr()), 0, arena.ptr());
VerifyMessageSet(ext_msg2);
// Test round-trip through JSON format.
size_t json_size =
upb_json_encode(ext_msg, m.ptr(), symtab.ptr(), 0, NULL, 0, status.ptr());
char *json_buf =
static_cast<char *>(upb_arena_malloc(arena.ptr(), json_size + 1));
upb_json_encode(ext_msg, m.ptr(), symtab.ptr(), 0, json_buf, json_size + 1,
status.ptr());
upb_test_TestMessageSet *ext_msg3 = upb_test_TestMessageSet_new(arena.ptr());
EXPECT_TRUE(upb_json_decode(json_buf, json_size, ext_msg3, m.ptr(),
symtab.ptr(), 0, arena.ptr(), status.ptr()))
<< status.error_message();
VerifyMessageSet(ext_msg3);
}

@ -46,3 +46,15 @@ message TestExtensions {
extend TestExtensions {
optional protobuf_test_messages.proto3.TestAllTypesProto3 optional_msg_ext = 1002;
}
message TestMessageSet {
extensions 4 to max;
option message_set_wire_format = true;
}
message MessageSetMember {
optional int32 optional_int32 = 1;
extend TestMessageSet {
optional MessageSetMember message_set_extension = 4;
}
}

@ -1106,7 +1106,11 @@ void WriteMessage(const protobuf::Descriptor* message, Output& output,
std::string msgext = "_UPB_MSGEXT_NONE";
if (message->extension_range_count()) {
msgext = "_UPB_MSGEXT_EXTENDABLE";
if (message->options().message_set_wire_format()) {
msgext = "_UPB_MSGEXT_MSGSET";
} else {
msgext = "_UPB_MSGEXT_EXTENDABLE";
}
}
output("const upb_msglayout $0 = {\n", MessageInit(message));

Loading…
Cancel
Save