From 6f59f1256e37ac70deef7a908a25fce67aa1d940 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 14 Oct 2020 12:44:46 -0700 Subject: [PATCH 01/21] Optimizations to descriptor loading. --- BUILD | 1 + tests/benchmark.cc | 29 ++++++ tools/amalgamate.py | 2 +- upb/def.c | 31 +++--- upb/json/parser.rl | 2 +- upb/msg.c | 2 +- upb/table.c | 230 +++++++------------------------------------- upb/table.int.h | 8 +- 8 files changed, 88 insertions(+), 217 deletions(-) diff --git a/BUILD b/BUILD index 9e49b42304..e7c89c5635 100644 --- a/BUILD +++ b/BUILD @@ -86,6 +86,7 @@ cc_library( "upb/table.int.h", "upb/upb.c", "upb/upb.int.h", + "third_party/wyhash/wyhash.h", ], hdrs = [ "upb/decode.h", diff --git a/tests/benchmark.cc b/tests/benchmark.cc index 15544afe1b..1a1d9b55aa 100644 --- a/tests/benchmark.cc +++ b/tests/benchmark.cc @@ -4,6 +4,7 @@ #include "google/protobuf/descriptor.upb.h" #include "google/protobuf/descriptor.upbdefs.h" #include "google/protobuf/descriptor.pb.h" +#include "upb/def.hpp" upb_strview descriptor = google_protobuf_descriptor_proto_upbdefinit.descriptor; namespace protobuf = ::google::protobuf; @@ -11,6 +12,34 @@ namespace protobuf = ::google::protobuf; /* A buffer big enough to parse descriptor.proto without going to heap. */ char buf[65535]; +static void BM_LoadDescriptor(benchmark::State& state) { + FILE *f = fopen("/tmp/ads-descriptor.pb", "rb"); + fseek(f, 0, SEEK_END); + size_t size = ftell(f); + fseek(f, 0, SEEK_SET); + std::string str(size, '\0'); + fread(&str[0], 1, size, f); + fclose(f); + fprintf(stderr, "size: %zu\n", str.size()); + for (auto _ : state) { + upb::SymbolTable symtab; + upb::Arena arena; + google_protobuf_FileDescriptorSet* set = + google_protobuf_FileDescriptorSet_parse(str.data(), str.size(), + arena.ptr()); + const google_protobuf_FileDescriptorProto*const * files = + google_protobuf_FileDescriptorSet_file(set, &size); + for (size_t i = 0; i < size; i++) { + upb::FileDefPtr file_def = symtab.AddFile(files[i], NULL); + if (!file_def) { + printf("Failed to add file.\n"); + exit(1); + } + } + } +} +BENCHMARK(BM_LoadDescriptor); + static void BM_ArenaOneAlloc(benchmark::State& state) { for (auto _ : state) { upb_arena* arena = upb_arena_new(); diff --git a/tools/amalgamate.py b/tools/amalgamate.py index b244eff8e8..950e9aa9ac 100755 --- a/tools/amalgamate.py +++ b/tools/amalgamate.py @@ -52,7 +52,7 @@ class Amalgamator: include = parse_include(line) if not include: return False - if not (include.startswith("upb") or include.startswith("google")): + if not (include.startswith("upb") or include.startswith("google") or include.startswith("third_party")): return False if include.endswith("hpp"): # Skip, we don't support the amalgamation from C++. diff --git a/upb/def.c b/upb/def.c index 497f402b99..603deb2013 100644 --- a/upb/def.c +++ b/upb/def.c @@ -1309,7 +1309,7 @@ static bool create_oneofdef( CHK_OOM(upb_strtable_insert3(&m->ntof, name.data, name.size, v, ctx->alloc)); CHK_OOM(upb_inttable_init2(&o->itof, UPB_CTYPE_CONSTPTR, ctx->alloc)); - CHK_OOM(upb_strtable_init2(&o->ntof, UPB_CTYPE_CONSTPTR, ctx->alloc)); + CHK_OOM(upb_strtable_init2(&o->ntof, UPB_CTYPE_CONSTPTR, 4, ctx->alloc)); return true; } @@ -1633,14 +1633,13 @@ static bool create_enumdef( e->full_name = makefullname(ctx, prefix, name); CHK_OOM(symtab_add(ctx, e->full_name, pack_def(e, UPB_DEFTYPE_ENUM))); - CHK_OOM(upb_strtable_init2(&e->ntoi, UPB_CTYPE_INT32, ctx->alloc)); + values = google_protobuf_EnumDescriptorProto_value(enum_proto, &n); + CHK_OOM(upb_strtable_init2(&e->ntoi, UPB_CTYPE_INT32, n, ctx->alloc)); CHK_OOM(upb_inttable_init2(&e->iton, UPB_CTYPE_CSTR, ctx->alloc)); e->file = ctx->file; e->defaultval = 0; - values = google_protobuf_EnumDescriptorProto_value(enum_proto, &n); - if (n == 0) { upb_status_seterrf(ctx->status, "enums must contain at least one value (%s)", @@ -1690,7 +1689,7 @@ static bool create_msgdef(symtab_addctx *ctx, const char *prefix, const google_protobuf_FieldDescriptorProto *const *fields; const google_protobuf_EnumDescriptorProto *const *enums; const google_protobuf_DescriptorProto *const *msgs; - size_t i, n; + size_t i, n_oneof, n_field, n; upb_strview name; name = google_protobuf_DescriptorProto_name(msg_proto); @@ -1700,8 +1699,12 @@ static bool create_msgdef(symtab_addctx *ctx, const char *prefix, m->full_name = makefullname(ctx, prefix, name); CHK_OOM(symtab_add(ctx, m->full_name, pack_def(m, UPB_DEFTYPE_MSG))); + oneofs = google_protobuf_DescriptorProto_oneof_decl(msg_proto, &n_oneof); + fields = google_protobuf_DescriptorProto_field(msg_proto, &n_field); + CHK_OOM(upb_inttable_init2(&m->itof, UPB_CTYPE_CONSTPTR, ctx->alloc)); - CHK_OOM(upb_strtable_init2(&m->ntof, UPB_CTYPE_CONSTPTR, ctx->alloc)); + CHK_OOM(upb_strtable_init2(&m->ntof, UPB_CTYPE_CONSTPTR, n_oneof + n_field, + ctx->alloc)); m->file = ctx->file; m->map_entry = false; @@ -1720,17 +1723,15 @@ static bool create_msgdef(symtab_addctx *ctx, const char *prefix, m->layout = upb_malloc(ctx->alloc, sizeof(*m->layout)); } - oneofs = google_protobuf_DescriptorProto_oneof_decl(msg_proto, &n); m->oneof_count = 0; - m->oneofs = upb_malloc(ctx->alloc, sizeof(*m->oneofs) * n); - for (i = 0; i < n; i++) { + m->oneofs = upb_malloc(ctx->alloc, sizeof(*m->oneofs) * n_oneof); + for (i = 0; i < n_oneof; i++) { CHK(create_oneofdef(ctx, m, oneofs[i])); } - fields = google_protobuf_DescriptorProto_field(msg_proto, &n); m->field_count = 0; - m->fields = upb_malloc(ctx->alloc, sizeof(*m->fields) * n); - for (i = 0; i < n; i++) { + m->fields = upb_malloc(ctx->alloc, sizeof(*m->fields) * n_field); + for (i = 0; i < n_field; i++) { CHK(create_fielddef(ctx, m->full_name, m, fields[i])); } @@ -2085,8 +2086,8 @@ upb_symtab *upb_symtab_new(void) { s->arena = upb_arena_new(); alloc = upb_arena_alloc(s->arena); - if (!upb_strtable_init2(&s->syms, UPB_CTYPE_CONSTPTR, alloc) || - !upb_strtable_init2(&s->files, UPB_CTYPE_CONSTPTR, alloc)) { + if (!upb_strtable_init2(&s->syms, UPB_CTYPE_CONSTPTR, 32, alloc) || + !upb_strtable_init2(&s->files, UPB_CTYPE_CONSTPTR, 4, alloc)) { upb_arena_free(s->arena); upb_gfree(s); s = NULL; @@ -2148,7 +2149,7 @@ static const upb_filedef *_upb_symtab_addfile( ctx.layouts = layouts; ctx.status = status; - ok = file && upb_strtable_init2(&addtab, UPB_CTYPE_CONSTPTR, ctx.tmp) && + ok = file && upb_strtable_init2(&addtab, UPB_CTYPE_CONSTPTR, 8, ctx.tmp) && build_filedef(&ctx, file, file_proto) && upb_symtab_addtotabs(s, &ctx); upb_arena_free(tmparena); diff --git a/upb/json/parser.rl b/upb/json/parser.rl index d7dcc54a8b..47ba15ec2e 100644 --- a/upb/json/parser.rl +++ b/upb/json/parser.rl @@ -2869,7 +2869,7 @@ static upb_json_parsermethod *parsermethod_new(upb_json_codecache *c, upb_byteshandler_setstring(&m->input_handler_, parse, m); upb_byteshandler_setendstr(&m->input_handler_, end, m); - upb_strtable_init2(&m->name_table, UPB_CTYPE_CONSTPTR, alloc); + upb_strtable_init2(&m->name_table, UPB_CTYPE_CONSTPTR, 4, alloc); /* Build name_table */ diff --git a/upb/msg.c b/upb/msg.c index 5f6d1ce170..baafe59293 100644 --- a/upb/msg.c +++ b/upb/msg.c @@ -130,7 +130,7 @@ upb_map *_upb_map_new(upb_arena *a, size_t key_size, size_t value_size) { return NULL; } - upb_strtable_init2(&map->table, UPB_CTYPE_INT32, upb_arena_alloc(a)); + upb_strtable_init2(&map->table, UPB_CTYPE_INT32, 4, upb_arena_alloc(a)); map->key_size = key_size; map->val_size = value_size; diff --git a/upb/table.c b/upb/table.c index 34a20530d8..9af5163061 100644 --- a/upb/table.c +++ b/upb/table.c @@ -4,10 +4,12 @@ ** Implementation is heavily inspired by Lua's ltable.c. */ -#include "upb/table.int.h" - #include +#include "third_party/wyhash/wyhash.h" +#include "upb/table.int.h" + +/* Must be last. */ #include "upb/port_def.inc" #define UPB_MAXARRSIZE 16 /* 64k. */ @@ -87,11 +89,7 @@ static upb_tabent *mutable_entries(upb_table *t) { } static bool isfull(upb_table *t) { - if (upb_table_size(t) == 0) { - return true; - } else { - return ((double)(t->count + 1) / upb_table_size(t)) > MAX_LOAD; - } + return t->count == t->max_count; } static bool init(upb_table *t, uint8_t size_lg2, upb_alloc *a) { @@ -100,6 +98,7 @@ static bool init(upb_table *t, uint8_t size_lg2, upb_alloc *a) { t->count = 0; t->size_lg2 = size_lg2; t->mask = upb_table_size(t) ? upb_table_size(t) - 1 : 0; + t->max_count = upb_table_size(t) * MAX_LOAD; bytes = upb_table_size(t) * sizeof(upb_tabent); if (bytes > 0) { t->entries = upb_malloc(a, bytes); @@ -115,9 +114,17 @@ static void uninit(upb_table *t, upb_alloc *a) { upb_free(a, mutable_entries(t)); } -static upb_tabent *emptyent(upb_table *t) { - upb_tabent *e = mutable_entries(t) + upb_table_size(t); - while (1) { if (upb_tabent_isempty(--e)) return e; UPB_ASSERT(e > t->entries); } +static upb_tabent *emptyent(upb_table *t, upb_tabent *e) { + upb_tabent *begin = mutable_entries(t); + upb_tabent *end = begin + upb_table_size(t); + for (e = e + 1; e < end; e++) { + if (upb_tabent_isempty(e)) return e; + } + for (e = begin; e < end; e++) { + if (upb_tabent_isempty(e)) return e; + } + UPB_ASSERT(false); + return NULL; } static upb_tabent *getentry_mutable(upb_table *t, uint32_t hash) { @@ -173,7 +180,7 @@ static void insert(upb_table *t, lookupkey_t key, upb_tabkey tabkey, our_e->next = NULL; } else { /* Collision. */ - upb_tabent *new_e = emptyent(t); + upb_tabent *new_e = emptyent(t, mainpos_e); /* Head of collider's chain. */ upb_tabent *chain = getentry_mutable(t, hashfunc(mainpos_e->key)); if (chain == mainpos_e) { @@ -268,10 +275,14 @@ static upb_tabkey strcopy(lookupkey_t k2, upb_alloc *a) { return (uintptr_t)str; } +static uint32_t table_hash(const char *p, size_t n) { + return wyhash(p, n, 0, _wyp); +} + static uint32_t strhash(upb_tabkey key) { uint32_t len; char *str = upb_tabstr(key, &len); - return upb_murmur_hash2(str, len, 0); + return table_hash(str, len); } static bool streql(upb_tabkey k1, lookupkey_t k2) { @@ -280,9 +291,15 @@ static bool streql(upb_tabkey k1, lookupkey_t k2) { return len == k2.str.len && (len == 0 || memcmp(str, k2.str.str, len) == 0); } -bool upb_strtable_init2(upb_strtable *t, upb_ctype_t ctype, upb_alloc *a) { +bool upb_strtable_init2(upb_strtable *t, upb_ctype_t ctype, + size_t expected_size, upb_alloc *a) { UPB_UNUSED(ctype); /* TODO(haberman): rm */ - return init(&t->t, 2, a); + size_t need_entries = (expected_size + 1) / MAX_LOAD; + int size_lg2 = 2; + while (1 << size_lg2 < need_entries) { + size_lg2++; + } + return init(&t->t, size_lg2, a); } void upb_strtable_clear(upb_strtable *t) { @@ -333,20 +350,20 @@ bool upb_strtable_insert3(upb_strtable *t, const char *k, size_t len, tabkey = strcopy(key, a); if (tabkey == 0) return false; - hash = upb_murmur_hash2(key.str.str, key.str.len, 0); + hash = table_hash(key.str.str, key.str.len); insert(&t->t, key, tabkey, v, hash, &strhash, &streql); return true; } bool upb_strtable_lookup2(const upb_strtable *t, const char *key, size_t len, upb_value *v) { - uint32_t hash = upb_murmur_hash2(key, len, 0); + uint32_t hash = table_hash(key, len); return lookup(&t->t, strkey2(key, len), v, hash, &streql); } bool upb_strtable_remove3(upb_strtable *t, const char *key, size_t len, upb_value *val, upb_alloc *alloc) { - uint32_t hash = upb_murmur_hash2(key, len, 0); + uint32_t hash = table_hash(key, len); upb_tabkey tabkey; if (rm(&t->t, strkey2(key, len), val, &tabkey, hash, &streql)) { if (alloc) { @@ -699,182 +716,3 @@ bool upb_inttable_iter_isequal(const upb_inttable_iter *i1, return i1->t == i2->t && i1->index == i2->index && i1->array_part == i2->array_part; } - -#if defined(UPB_UNALIGNED_READS_OK) || defined(__s390x__) -/* ----------------------------------------------------------------------------- - * MurmurHash2, by Austin Appleby (released as public domain). - * Reformatted and C99-ified by Joshua Haberman. - * Note - This code makes a few assumptions about how your machine behaves - - * 1. We can read a 4-byte value from any address without crashing - * 2. sizeof(int) == 4 (in upb this limitation is removed by using uint32_t - * And it has a few limitations - - * 1. It will not work incrementally. - * 2. It will not produce the same results on little-endian and big-endian - * machines. */ -uint32_t upb_murmur_hash2(const void *key, size_t len, uint32_t seed) { - /* 'm' and 'r' are mixing constants generated offline. - * They're not really 'magic', they just happen to work well. */ - const uint32_t m = 0x5bd1e995; - const int32_t r = 24; - - /* Initialize the hash to a 'random' value */ - uint32_t h = seed ^ len; - - /* Mix 4 bytes at a time into the hash */ - const uint8_t * data = (const uint8_t *)key; - while(len >= 4) { - uint32_t k; - memcpy(&k, data, sizeof(k)); - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - data += 4; - len -= 4; - } - - /* Handle the last few bytes of the input array */ - switch(len) { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; h *= m; - }; - - /* Do a few final mixes of the hash to ensure the last few - * bytes are well-incorporated. */ - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; -} - -#else /* !UPB_UNALIGNED_READS_OK */ - -/* ----------------------------------------------------------------------------- - * MurmurHashAligned2, by Austin Appleby - * Same algorithm as MurmurHash2, but only does aligned reads - should be safer - * on certain platforms. - * Performance will be lower than MurmurHash2 */ - -#define MIX(h,k,m) { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; } - -uint32_t upb_murmur_hash2(const void * key, size_t len, uint32_t seed) { - const uint32_t m = 0x5bd1e995; - const int32_t r = 24; - const uint8_t * data = (const uint8_t *)key; - uint32_t h = (uint32_t)(seed ^ len); - uint8_t align = (uintptr_t)data & 3; - - if(align && (len >= 4)) { - /* Pre-load the temp registers */ - uint32_t t = 0, d = 0; - int32_t sl; - int32_t sr; - - switch(align) { - case 1: t |= data[2] << 16; /* fallthrough */ - case 2: t |= data[1] << 8; /* fallthrough */ - case 3: t |= data[0]; - } - - t <<= (8 * align); - - data += 4-align; - len -= 4-align; - - sl = 8 * (4-align); - sr = 8 * align; - - /* Mix */ - - while(len >= 4) { - uint32_t k; - - d = *(uint32_t *)data; - t = (t >> sr) | (d << sl); - - k = t; - - MIX(h,k,m); - - t = d; - - data += 4; - len -= 4; - } - - /* Handle leftover data in temp registers */ - - d = 0; - - if(len >= align) { - uint32_t k; - - switch(align) { - case 3: d |= data[2] << 16; /* fallthrough */ - case 2: d |= data[1] << 8; /* fallthrough */ - case 1: d |= data[0]; /* fallthrough */ - } - - k = (t >> sr) | (d << sl); - MIX(h,k,m); - - data += align; - len -= align; - - /* ---------- - * Handle tail bytes */ - - switch(len) { - case 3: h ^= data[2] << 16; /* fallthrough */ - case 2: h ^= data[1] << 8; /* fallthrough */ - case 1: h ^= data[0]; h *= m; /* fallthrough */ - }; - } else { - switch(len) { - case 3: d |= data[2] << 16; /* fallthrough */ - case 2: d |= data[1] << 8; /* fallthrough */ - case 1: d |= data[0]; /* fallthrough */ - case 0: h ^= (t >> sr) | (d << sl); h *= m; - } - } - - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; - } else { - while(len >= 4) { - uint32_t k = *(uint32_t *)data; - - MIX(h,k,m); - - data += 4; - len -= 4; - } - - /* ---------- - * Handle tail bytes */ - - switch(len) { - case 3: h ^= data[2] << 16; /* fallthrough */ - case 2: h ^= data[1] << 8; /* fallthrough */ - case 1: h ^= data[0]; h *= m; - }; - - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; - } -} -#undef MIX - -#endif /* UPB_UNALIGNED_READS_OK */ diff --git a/upb/table.int.h b/upb/table.int.h index 600637eef2..1e6c232cf8 100644 --- a/upb/table.int.h +++ b/upb/table.int.h @@ -171,7 +171,8 @@ typedef struct _upb_tabent { typedef struct { size_t count; /* Number of entries in the hash part. */ - size_t mask; /* Mask to turn hash value -> bucket. */ + uint32_t mask; /* Mask to turn hash value -> bucket. */ + uint32_t max_count; /* Max count before we hit our load limit. */ uint8_t size_lg2; /* Size of the hashtable part is 2^size_lg2 entries. */ /* Hash table entries. @@ -230,7 +231,8 @@ UPB_INLINE bool upb_arrhas(upb_tabval key) { /* Initialize and uninitialize a table, respectively. If memory allocation * failed, false is returned that the table is uninitialized. */ bool upb_inttable_init2(upb_inttable *table, upb_ctype_t ctype, upb_alloc *a); -bool upb_strtable_init2(upb_strtable *table, upb_ctype_t ctype, upb_alloc *a); +bool upb_strtable_init2(upb_strtable *table, upb_ctype_t ctype, + size_t expected_size, upb_alloc *a); void upb_inttable_uninit2(upb_inttable *table, upb_alloc *a); void upb_strtable_uninit2(upb_strtable *table, upb_alloc *a); @@ -239,7 +241,7 @@ UPB_INLINE bool upb_inttable_init(upb_inttable *table, upb_ctype_t ctype) { } UPB_INLINE bool upb_strtable_init(upb_strtable *table, upb_ctype_t ctype) { - return upb_strtable_init2(table, ctype, &upb_alloc_global); + return upb_strtable_init2(table, ctype, 4, &upb_alloc_global); } UPB_INLINE void upb_inttable_uninit(upb_inttable *table) { From 4f901b643086c10620267c6f854f6e75d346f5d3 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 14 Oct 2020 16:32:43 -0700 Subject: [PATCH 02/21] Passes all tests. --- BUILD | 1 + CMakeLists.txt | 1 + generated_for_cmake/upb/json/parser.c | 2 +- tests/pb/test_decoder.cc | 55 ++++++--------------------- 4 files changed, 15 insertions(+), 44 deletions(-) diff --git a/BUILD b/BUILD index e7c89c5635..9b5c2e0dd7 100644 --- a/BUILD +++ b/BUILD @@ -855,6 +855,7 @@ filegroup( "upbc/**/*", "upb/**/*", "tests/**/*", + "third_party/**/*", ]), ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 67eeb2c698..4c2ee01bef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,6 +71,7 @@ add_library(upb upb/table.int.h upb/upb.c upb/upb.int.h + third_party/wyhash/wyhash.h upb/decode.h upb/encode.h upb/upb.h diff --git a/generated_for_cmake/upb/json/parser.c b/generated_for_cmake/upb/json/parser.c index 7cdb4de83b..0526979762 100644 --- a/generated_for_cmake/upb/json/parser.c +++ b/generated_for_cmake/upb/json/parser.c @@ -3306,7 +3306,7 @@ static upb_json_parsermethod *parsermethod_new(upb_json_codecache *c, upb_byteshandler_setstring(&m->input_handler_, parse, m); upb_byteshandler_setendstr(&m->input_handler_, end, m); - upb_strtable_init2(&m->name_table, UPB_CTYPE_CONSTPTR, alloc); + upb_strtable_init2(&m->name_table, UPB_CTYPE_CONSTPTR, 4, alloc); /* Build name_table */ diff --git a/tests/pb/test_decoder.cc b/tests/pb/test_decoder.cc index 14fd72abbc..990cfff819 100644 --- a/tests/pb/test_decoder.cc +++ b/tests/pb/test_decoder.cc @@ -53,17 +53,6 @@ #define PRINT_FAILURE(expr) \ fprintf(stderr, "Assertion failed: %s:%d\n", __FILE__, __LINE__); \ fprintf(stderr, "expr: %s\n", #expr); \ - if (testhash) { \ - fprintf(stderr, "assertion failed running test %x.\n", testhash); \ - if (!filter_hash) { \ - fprintf(stderr, \ - "Run with the arg %x to run only this test. " \ - "(This will also turn on extra debugging output)\n", \ - testhash); \ - } \ - fprintf(stderr, "Failed at %02.2f%% through tests.\n", \ - (float)completed * 100 / total); \ - } #define MAX_NESTING 64 @@ -445,17 +434,6 @@ upb::pb::DecoderPtr CreateDecoder(upb::Arena* arena, return ret; } -uint32_t Hash(const string& proto, const string* expected_output, size_t seam1, - size_t seam2, bool may_skip) { - uint32_t hash = upb_murmur_hash2(proto.c_str(), proto.size(), 0); - if (expected_output) - hash = upb_murmur_hash2(expected_output->c_str(), expected_output->size(), hash); - hash = upb_murmur_hash2(&seam1, sizeof(seam1), hash); - hash = upb_murmur_hash2(&seam2, sizeof(seam2), hash); - hash = upb_murmur_hash2(&may_skip, sizeof(may_skip), hash); - return hash; -} - void CheckBytesParsed(upb::pb::DecoderPtr decoder, size_t ofs) { // We can't have parsed more data than the decoder callback is telling us it // parsed. @@ -484,13 +462,11 @@ void do_run_decoder(VerboseParserEnvironment* env, upb::pb::DecoderPtr decoder, env->Reset(proto.c_str(), proto.size(), may_skip, expected_output == NULL); decoder.Reset(); - testhash = Hash(proto, expected_output, i, j, may_skip); - if (filter_hash && testhash != filter_hash) return; if (test_mode != COUNT_ONLY) { output.clear(); if (filter_hash) { - fprintf(stderr, "RUNNING TEST CASE, hash=%x\n", testhash); + fprintf(stderr, "RUNNING TEST CASE\n"); fprintf(stderr, "Input (len=%u): ", (unsigned)proto.size()); PrintBinary(proto); fprintf(stderr, "\n"); @@ -549,7 +525,6 @@ void run_decoder(const string& proto, const string* expected_output) { } } } - testhash = 0; } const static string thirty_byte_nop = cat( @@ -849,23 +824,17 @@ void test_valid() { // Empty protobuf where we never call PutString between // StartString/EndString. - // Randomly generated hash for this test, hope it doesn't conflict with others - // by chance. - const uint32_t emptyhash = 0x5709be8e; - if (!filter_hash || filter_hash == testhash) { - testhash = emptyhash; - upb::Status status; - upb::Arena arena; - upb::Sink sink(global_handlers, &closures[0]); - upb::pb::DecoderPtr decoder = - CreateDecoder(&arena, global_method, sink, &status); - output.clear(); - bool ok = upb::PutBuffer(std::string(), decoder.input()); - ASSERT(ok); - ASSERT(status.ok()); - if (test_mode == ALL_HANDLERS) { - ASSERT(output == string("<\n>\n")); - } + upb::Status status; + upb::Arena arena; + upb::Sink sink(global_handlers, &closures[0]); + upb::pb::DecoderPtr decoder = + CreateDecoder(&arena, global_method, sink, &status); + output.clear(); + bool ok = upb::PutBuffer(std::string(), decoder.input()); + ASSERT(ok); + ASSERT(status.ok()); + if (test_mode == ALL_HANDLERS) { + ASSERT(output == string("<\n>\n")); } test_valid_data_for_signed_type(UPB_DESCRIPTOR_TYPE_DOUBLE, From b5bd5807a77ccd4b311328d9bda30f354f108532 Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Wed, 28 Oct 2020 21:56:58 -0500 Subject: [PATCH 03/21] Migrate to modern Starlark linking api. See https://github.com/bazelbuild/bazel/issues/10860. --- bazel/upb_proto_library.bzl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bazel/upb_proto_library.bzl b/bazel/upb_proto_library.bzl index 85cbad8051..8a1b2857f5 100644 --- a/bazel/upb_proto_library.bzl +++ b/bazel/upb_proto_library.bzl @@ -183,10 +183,7 @@ def _upb_proto_rule_impl(ctx): fail("proto_library rule must generate _UpbWrappedCcInfo or " + "_UpbDefsWrappedCcInfo (aspect should have handled this).") - if type(cc_info.linking_context.libraries_to_link) == "list": - lib = cc_info.linking_context.libraries_to_link[0] - else: - lib = cc_info.linking_context.libraries_to_link.to_list()[0] + lib = cc_info.linking_context.linker_inputs.to_list()[0].libraries[0] files = _filter_none([ lib.static_library, lib.pic_static_library, From 723cd8ffc13c15a6dade94ef4381f7fec7ae5bd5 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Thu, 29 Oct 2020 19:18:53 -0700 Subject: [PATCH 04/21] Added wyhash code and LICENSE, and removed temporary benchmark. --- benchmarks/benchmark.cc | 28 ------- third_party/wyhash/LICENSE | 25 +++++++ third_party/wyhash/wyhash.h | 142 ++++++++++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+), 28 deletions(-) create mode 100644 third_party/wyhash/LICENSE create mode 100644 third_party/wyhash/wyhash.h diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index 3b0367cb77..7f6b5f0f77 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -20,34 +20,6 @@ namespace protobuf = ::google::protobuf; /* A buffer big enough to parse descriptor.proto without going to heap. */ char buf[65535]; -static void BM_LoadDescriptor(benchmark::State& state) { - FILE *f = fopen("/tmp/ads-descriptor.pb", "rb"); - fseek(f, 0, SEEK_END); - size_t size = ftell(f); - fseek(f, 0, SEEK_SET); - std::string str(size, '\0'); - fread(&str[0], 1, size, f); - fclose(f); - fprintf(stderr, "size: %zu\n", str.size()); - for (auto _ : state) { - upb::SymbolTable symtab; - upb::Arena arena; - google_protobuf_FileDescriptorSet* set = - google_protobuf_FileDescriptorSet_parse(str.data(), str.size(), - arena.ptr()); - const google_protobuf_FileDescriptorProto*const * files = - google_protobuf_FileDescriptorSet_file(set, &size); - for (size_t i = 0; i < size; i++) { - upb::FileDefPtr file_def = symtab.AddFile(files[i], NULL); - if (!file_def) { - printf("Failed to add file.\n"); - exit(1); - } - } - } -} -BENCHMARK(BM_LoadDescriptor); - static void BM_ArenaOneAlloc(benchmark::State& state) { for (auto _ : state) { upb_arena* arena = upb_arena_new(); diff --git a/third_party/wyhash/LICENSE b/third_party/wyhash/LICENSE new file mode 100644 index 0000000000..471f09f4cf --- /dev/null +++ b/third_party/wyhash/LICENSE @@ -0,0 +1,25 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to + diff --git a/third_party/wyhash/wyhash.h b/third_party/wyhash/wyhash.h new file mode 100644 index 0000000000..1265169fd6 --- /dev/null +++ b/third_party/wyhash/wyhash.h @@ -0,0 +1,142 @@ +//Author: Wang Yi +#ifndef wyhash_final_version +#define wyhash_final_version +//defines that change behavior +#ifndef WYHASH_CONDOM +#define WYHASH_CONDOM 1 //0: read 8 bytes before and after boudaries, dangerous but fastest. 1: normal valid behavior 2: extra protection against entropy loss (probability=2^-63), aka. "blind multiplication" +#endif +#define WYHASH_32BIT_MUM 0 //faster on 32 bit system +//includes +#include +#include +#if defined(_MSC_VER) && defined(_M_X64) + #include + #pragma intrinsic(_umul128) +#endif +#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) + #define _likely_(x) __builtin_expect(x,1) + #define _unlikely_(x) __builtin_expect(x,0) +#else + #define _likely_(x) (x) + #define _unlikely_(x) (x) +#endif +//mum function +static inline uint64_t _wyrot(uint64_t x) { return (x>>32)|(x<<32); } +static inline void _wymum(uint64_t *A, uint64_t *B){ +#if(WYHASH_32BIT_MUM) + uint64_t hh=(*A>>32)*(*B>>32), hl=(*A>>32)*(unsigned)*B, lh=(unsigned)*A*(*B>>32), ll=(uint64_t)(unsigned)*A*(unsigned)*B; + #if(WYHASH_CONDOM>1) + *A^=_wyrot(hl)^hh; *B^=_wyrot(lh)^ll; + #else + *A=_wyrot(hl)^hh; *B=_wyrot(lh)^ll; + #endif +#elif defined(__SIZEOF_INT128__) + __uint128_t r=*A; r*=*B; + #if(WYHASH_CONDOM>1) + *A^=(uint64_t)r; *B^=(uint64_t)(r>>64); + #else + *A=(uint64_t)r; *B=(uint64_t)(r>>64); + #endif +#elif defined(_MSC_VER) && defined(_M_X64) + #if(WYHASH_CONDOM>1) + uint64_t a, b; + a=_umul128(*A,*B,&b); + *A^=a; *B^=b; + #else + *A=_umul128(*A,*B,B); + #endif +#else + uint64_t ha=*A>>32, hb=*B>>32, la=(uint32_t)*A, lb=(uint32_t)*B, hi, lo; + uint64_t rh=ha*hb, rm0=ha*lb, rm1=hb*la, rl=la*lb, t=rl+(rm0<<32), c=t>32)+(rm1>>32)+c; + #if(WYHASH_CONDOM>1) + *A^=lo; *B^=hi; + #else + *A=lo; *B=hi; + #endif +#endif +} +static inline uint64_t _wymix(uint64_t A, uint64_t B){ _wymum(&A,&B); return A^B; } +//read functions +#ifndef WYHASH_LITTLE_ENDIAN + #if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + #define WYHASH_LITTLE_ENDIAN 1 + #elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + #define WYHASH_LITTLE_ENDIAN 0 + #endif +#endif +#if (WYHASH_LITTLE_ENDIAN) +static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return v;} +static inline uint64_t _wyr4(const uint8_t *p) { unsigned v; memcpy(&v, p, 4); return v;} +#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) +static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return __builtin_bswap64(v);} +static inline uint64_t _wyr4(const uint8_t *p) { unsigned v; memcpy(&v, p, 4); return __builtin_bswap32(v);} +#elif defined(_MSC_VER) +static inline uint64_t _wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return _byteswap_uint64(v);} +static inline uint64_t _wyr4(const uint8_t *p) { unsigned v; memcpy(&v, p, 4); return _byteswap_ulong(v);} +#endif +static inline uint64_t _wyr3(const uint8_t *p, unsigned k) { return (((uint64_t)p[0])<<16)|(((uint64_t)p[k>>1])<<8)|p[k-1];} +//wyhash function +static inline uint64_t _wyfinish16(const uint8_t *p, uint64_t len, uint64_t seed, const uint64_t *secret, uint64_t i){ +#if(WYHASH_CONDOM>0) + uint64_t a, b; + if(_likely_(i<=8)){ + if(_likely_(i>=4)){ a=_wyr4(p); b=_wyr4(p+i-4); } + else if (_likely_(i)){ a=_wyr3(p,i); b=0; } + else a=b=0; + } + else{ a=_wyr8(p); b=_wyr8(p+i-8); } + return _wymix(secret[1]^len,_wymix(a^secret[1], b^seed)); +#else + #define oneshot_shift ((i<8)*((8-i)<<3)) + return _wymix(secret[1]^len,_wymix((_wyr8(p)<>oneshot_shift)^seed)); +#endif +} + +static inline uint64_t _wyfinish(const uint8_t *p, uint64_t len, uint64_t seed, const uint64_t *secret, uint64_t i){ + if(_likely_(i<=16)) return _wyfinish16(p,len,seed,secret,i); + return _wyfinish(p+16,len,_wymix(_wyr8(p)^secret[1],_wyr8(p+8)^seed),secret,i-16); +} + +static inline uint64_t wyhash(const void *key, uint64_t len, uint64_t seed, const uint64_t *secret){ + const uint8_t *p=(const uint8_t *)key; + uint64_t i=len; seed^=*secret; + if(_unlikely_(i>64)){ + uint64_t see1=seed; + do{ + seed=_wymix(_wyr8(p)^secret[1],_wyr8(p+8)^seed)^_wymix(_wyr8(p+16)^secret[2],_wyr8(p+24)^seed); + see1=_wymix(_wyr8(p+32)^secret[3],_wyr8(p+40)^see1)^_wymix(_wyr8(p+48)^secret[4],_wyr8(p+56)^see1); + p+=64; i-=64; + }while(i>64); + seed^=see1; + } + return _wyfinish(p,len,seed,secret,i); +} +//utility functions +const uint64_t _wyp[5] = {0xa0761d6478bd642full, 0xe7037ed1a0b428dbull, 0x8ebc6af09c88c6e3ull, 0x589965cc75374cc3ull, 0x1d8e4e27c47d124full}; +static inline uint64_t wyhash64(uint64_t A, uint64_t B){ A^=_wyp[0]; B^=_wyp[1]; _wymum(&A,&B); return _wymix(A^_wyp[0],B^_wyp[1]);} +static inline uint64_t wyrand(uint64_t *seed){ *seed+=_wyp[0]; return _wymix(*seed,*seed^_wyp[1]);} +static inline double wy2u01(uint64_t r){ const double _wynorm=1.0/(1ull<<52); return (r>>12)*_wynorm;} +static inline double wy2gau(uint64_t r){ const double _wynorm=1.0/(1ull<<20); return ((r&0x1fffff)+((r>>21)&0x1fffff)+((r>>42)&0x1fffff))*_wynorm-3.0;} +static inline uint64_t wy2u0k(uint64_t r, uint64_t k){ _wymum(&r,&k); return k; } + +static inline void make_secret(uint64_t seed, uint64_t *secret){ + uint8_t c[] = {15, 23, 27, 29, 30, 39, 43, 45, 46, 51, 53, 54, 57, 58, 60, 71, 75, 77, 78, 83, 85, 86, 89, 90, 92, 99, 101, 102, 105, 106, 108, 113, 114, 116, 120, 135, 139, 141, 142, 147, 149, 150, 153, 154, 156, 163, 165, 166, 169, 170, 172, 177, 178, 180, 184, 195, 197, 198, 201, 202, 204, 209, 210, 212, 216, 225, 226, 228, 232, 240 }; + for(size_t i=0;i<5;i++){ + uint8_t ok; + do{ + ok=1; secret[i]=0; + for(size_t j=0;j<64;j+=8) secret[i]|=((uint64_t)c[wyrand(&seed)%sizeof(c)])< Date: Thu, 29 Oct 2020 21:17:34 -0700 Subject: [PATCH 05/21] Got rid of floating-point division in table init. Ideally we would get rid of all floating-point operations in table.c, but that's a job for another day. --- tests/test_table.cc | 10 ++++++++++ upb/table.c | 8 +++----- upb/upb.h | 11 +++++++++++ 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/tests/test_table.cc b/tests/test_table.cc index e19a74a50d..83b49f5c20 100644 --- a/tests/test_table.cc +++ b/tests/test_table.cc @@ -618,6 +618,16 @@ void test_delete() { upb_inttable_uninit(&t); } +void test_init() { + for (int i = 0; i < 2048; i++) { + /* Tests that the size calculations in init() (lg2 size for target load) + * work for all expected sizes. */ + upb_strtable t; + upb_strtable_init2(&t, UPB_CTYPE_BOOL, i, &upb_alloc_global); + upb_strtable_uninit(&t); + } +} + extern "C" { int run_tests(int argc, char *argv[]) { diff --git a/upb/table.c b/upb/table.c index 9af5163061..d9201f72ae 100644 --- a/upb/table.c +++ b/upb/table.c @@ -294,11 +294,9 @@ static bool streql(upb_tabkey k1, lookupkey_t k2) { bool upb_strtable_init2(upb_strtable *t, upb_ctype_t ctype, size_t expected_size, upb_alloc *a) { UPB_UNUSED(ctype); /* TODO(haberman): rm */ - size_t need_entries = (expected_size + 1) / MAX_LOAD; - int size_lg2 = 2; - while (1 << size_lg2 < need_entries) { - size_lg2++; - } + size_t need_entries = (expected_size + 1) * 1204 / 1024; + UPB_ASSERT(need_entries >= expected_size * 0.85); + int size_lg2 = _upb_lg2ceil(need_entries); return init(&t->t, size_lg2, a); } diff --git a/upb/upb.h b/upb/upb.h index 85205a5a76..f52e812478 100644 --- a/upb/upb.h +++ b/upb/upb.h @@ -313,6 +313,17 @@ UPB_INLINE uint64_t _upb_be_swap64(uint64_t val) { } } +UPB_INLINE int _upb_lg2ceil(int x) { + if (x == 0) return 0; +#ifdef __GNUC__ + return 32 - __builtin_clz(x - 1); +#else + int lg2 = 0; + while (1 << lg2 < x) lg2++; + return lg2; +#endif +} + #include "upb/port_undef.inc" #ifdef __cplusplus From 8113ebd6c7da7888474d29d8b9e9d91077de916c Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Thu, 29 Oct 2020 21:56:48 -0700 Subject: [PATCH 06/21] Added explanatory comment about integer constants. --- upb/table.c | 1 + 1 file changed, 1 insertion(+) diff --git a/upb/table.c b/upb/table.c index d9201f72ae..4b1cf9b466 100644 --- a/upb/table.c +++ b/upb/table.c @@ -294,6 +294,7 @@ static bool streql(upb_tabkey k1, lookupkey_t k2) { bool upb_strtable_init2(upb_strtable *t, upb_ctype_t ctype, size_t expected_size, upb_alloc *a) { UPB_UNUSED(ctype); /* TODO(haberman): rm */ + // Multiply by approximate reciprocal of MAX_LOAD (0.85), with pow2 denominator. size_t need_entries = (expected_size + 1) * 1204 / 1024; UPB_ASSERT(need_entries >= expected_size * 0.85); int size_lg2 = _upb_lg2ceil(need_entries); From acd72c6d3f0d7464e2d61668fd73d23d67857021 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sat, 31 Oct 2020 11:04:08 -0700 Subject: [PATCH 07/21] WIP. --- upb/def.c | 119 ++++++++++++++++++++++++++---------------------------- 1 file changed, 58 insertions(+), 61 deletions(-) diff --git a/upb/def.c b/upb/def.c index 603deb2013..4d9713f22f 100644 --- a/upb/def.c +++ b/upb/def.c @@ -864,7 +864,40 @@ void upb_oneof_iter_setdone(upb_oneof_iter *iter) { upb_inttable_iter_setdone(iter); } -/* Dynamic Layout Generation. *************************************************/ +/* Code to build defs from descriptor protos. *********************************/ + +/* There is a question of how much validation to do here. It will be difficult + * to perfectly match the amount of validation performed by proto2. But since + * this code is used to directly build defs from Ruby (for example) we do need + * to validate important constraints like uniqueness of names and numbers. */ + +#define CHK_OOM(ctx, x) if (!(x)) { symtab_oomerr(ctx); } + +typedef struct { + const upb_symtab *symtab; + upb_filedef *file; /* File we are building. */ + upb_alloc *alloc; /* Allocate defs here. */ + upb_alloc *tmp; /* Alloc for addtab and any other tmp data. */ + upb_strtable *addtab; /* full_name -> packed def ptr for new defs */ + const upb_msglayout **layouts; /* NULL if we should build layouts. */ + upb_status *status; /* Record errors here. */ + jmp_buf err; /* longjmp() on error. */ +} symtab_addctx; + +static void symtab_err(symtab_addctx *ctx) { longjmp(ctx->err, 1); } + +static void symtab_errf(symtab_addctx *ctx, const char *fmt, ...) { + va_list argp; + va_start(argp, fmt); + upb_status_vseterrf(d->status, fmt, argp); + va_end(argp); + longjmp(ctx->err, 1); +} + +static void symtab_oomerr(symtab_addctx *ctx) { + upb_status_setoom(ctx->status); + longjmp(ctx->err, 1); +} static size_t div_round_up(size_t n, size_t d) { return (n + d - 1) / d; @@ -931,7 +964,8 @@ static void assign_layout_indices(const upb_msgdef *m, upb_msglayout_field *fiel /* This function is the dynamic equivalent of message_layout.{cc,h} in upbc. * It computes a dynamic layout for all of the fields in |m|. */ -static bool make_layout(const upb_symtab *symtab, const upb_msgdef *m) { +static void make_layout(symtab_addctx *ctx, const upb_symtab *symtab, + const upb_msgdef *m) { upb_msglayout *l = (upb_msglayout*)m->layout; upb_msg_field_iter it; upb_msg_oneof_iter oit; @@ -948,8 +982,7 @@ static bool make_layout(const upb_symtab *symtab, const upb_msgdef *m) { if ((!fields && upb_msgdef_numfields(m)) || (!submsgs && submsg_count)) { - /* OOM. */ - return false; + symtab_oomerr(ctx); } l->field_count = upb_msgdef_numfields(m); @@ -980,7 +1013,7 @@ static bool make_layout(const upb_symtab *symtab, const upb_msgdef *m) { l->field_count = 2; l->size = 2 * sizeof(upb_strview); l->size = UPB_ALIGN_UP(l->size, 8); - return true; + return; } /* Allocate data offsets in three stages: @@ -1092,30 +1125,8 @@ static bool make_layout(const upb_symtab *symtab, const upb_msgdef *m) { /* Sort fields by number. */ qsort(fields, upb_msgdef_numfields(m), sizeof(*fields), field_number_cmp); assign_layout_indices(m, fields); - - return true; } -/* Code to build defs from descriptor protos. *********************************/ - -/* There is a question of how much validation to do here. It will be difficult - * to perfectly match the amount of validation performed by proto2. But since - * this code is used to directly build defs from Ruby (for example) we do need - * to validate important constraints like uniqueness of names and numbers. */ - -#define CHK(x) if (!(x)) { return false; } -#define CHK_OOM(x) if (!(x)) { upb_status_setoom(ctx->status); return false; } - -typedef struct { - const upb_symtab *symtab; - upb_filedef *file; /* File we are building. */ - upb_alloc *alloc; /* Allocate defs here. */ - upb_alloc *tmp; /* Alloc for addtab and any other tmp data. */ - upb_strtable *addtab; /* full_name -> packed def ptr for new defs */ - const upb_msglayout **layouts; /* NULL if we should build layouts. */ - upb_status *status; /* Record errors here. */ -} symtab_addctx; - static char* strviewdup(const symtab_addctx *ctx, upb_strview view) { return upb_strdup2(view.data, view.size, ctx->alloc); } @@ -1145,7 +1156,7 @@ static const char *makefullname(const symtab_addctx *ctx, const char *prefix, } } -static bool finalize_oneofs(symtab_addctx *ctx, upb_msgdef *m) { +static void finalize_oneofs(symtab_addctx *ctx, upb_msgdef *m) { int i; int synthetic_count = 0; upb_oneofdef *mutable_oneofs = (upb_oneofdef*)m->oneofs; @@ -1154,19 +1165,15 @@ static bool finalize_oneofs(symtab_addctx *ctx, upb_msgdef *m) { upb_oneofdef *o = &mutable_oneofs[i]; if (o->synthetic && o->field_count != 1) { - upb_status_seterrf( - ctx->status, "Synthetic oneofs must have one field, not %d: %s", - o->field_count, upb_oneofdef_name(o)); - return false; + symtab_errf(ctx, "Synthetic oneofs must have one field, not %d: %s", + o->field_count, upb_oneofdef_name(o)); } if (o->synthetic) { synthetic_count++; } else if (synthetic_count != 0) { - upb_status_seterrf( - ctx->status, "Synthetic oneofs must be after all other oneofs: %s", - upb_oneofdef_name(o)); - return false; + symtab_errf(ctx, "Synthetic oneofs must be after all other oneofs: %s", + upb_oneofdef_name(o)); } o->fields = upb_malloc(ctx->alloc, sizeof(upb_fielddef*) * o->field_count); @@ -1182,7 +1189,6 @@ static bool finalize_oneofs(symtab_addctx *ctx, upb_msgdef *m) { } m->real_oneof_count = m->oneof_count - synthetic_count; - return true; } size_t getjsonname(const char *name, char *buf, size_t len) { @@ -1230,44 +1236,38 @@ static char* makejsonname(const char* name, upb_alloc *alloc) { return json_name; } -static bool symtab_add(const symtab_addctx *ctx, const char *name, +static void symtab_add(const symtab_addctx *ctx, const char *name, upb_value v) { upb_value tmp; if (upb_strtable_lookup(ctx->addtab, name, &tmp) || upb_strtable_lookup(&ctx->symtab->syms, name, &tmp)) { - upb_status_seterrf(ctx->status, "duplicate symbol '%s'", name); - return false; + status_errf(ctx, "duplicate symbol '%s'", name); } CHK_OOM(upb_strtable_insert3(ctx->addtab, name, strlen(name), v, ctx->tmp)); - return true; } /* Given a symbol and the base symbol inside which it is defined, find the * symbol's definition in t. */ -static bool resolvename(const upb_strtable *t, const upb_fielddef *f, - const char *base, upb_strview sym, - upb_deftype_t type, upb_status *status, - const void **def) { +static const void *resolvename(symtab_addctx *ctx, const upb_strtable *t, + const upb_fielddef *f, const char *base, + upb_strview sym, upb_deftype_t type, + const void **def) { if(sym.size == 0) return false; if(sym.data[0] == '.') { /* Symbols starting with '.' are absolute, so we do a single lookup. * Slice to omit the leading '.' */ upb_value v; if (!upb_strtable_lookup2(t, sym.data + 1, sym.size - 1, &v)) { - return false; + return NULL; } - *def = unpack_def(v, type); - - if (!*def) { - upb_status_seterrf(status, - "type mismatch when resolving field %s, name %s", - f->full_name, sym.data); - return false; + const void *ret = unpack_def(v, type); + if (ret) { + symtab_errf(ctx, "type mismatch when resolving field %s, name %s", + f->full_name, sym.data); } - - return true; + return ret; } else { /* Remove components from base until we find an entry or run out. * TODO: This branch is totally broken, but currently not used. */ @@ -1281,12 +1281,9 @@ const void *symtab_resolve(const symtab_addctx *ctx, const upb_fielddef *f, const char *base, upb_strview sym, upb_deftype_t type) { const void *ret; - if (!resolvename(ctx->addtab, f, base, sym, type, ctx->status, &ret) && - !resolvename(&ctx->symtab->syms, f, base, sym, type, ctx->status, &ret)) { - if (upb_ok(ctx->status)) { - upb_status_seterrf(ctx->status, "couldn't resolve name '%s'", sym.data); - } - return false; + if (!resolvename(ctx, ctx->addtab, f, base, sym, type, &ret) && + !resolvename(ctx, &ctx->symtab->syms, f, base, sym, type, &ret)) { + symtab_errf(ctx, "couldn't resolve name '%s'", sym.data); } return ret; } From c3b56376461cf71b34980abaa7ab4aaf81d7a603 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sat, 31 Oct 2020 13:12:18 -0700 Subject: [PATCH 08/21] Added benchmark for loading ads descriptor. Generally this seems to track the speed of loading descriptor.proto. ---------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------------------------------------------------------- BM_LoadDescriptor_Upb 59091 ns 59086 ns 11747 121.182MB/s BM_LoadAdsDescriptor_Upb 4218587 ns 4218582 ns 166 120.544MB/s BM_LoadDescriptor_Proto2 241083 ns 241049 ns 2903 29.7043MB/s BM_LoadAdsDescriptor_Proto2 13442631 ns 13442099 ns 52 34.8975MB/s --- WORKSPACE | 9 +++++ benchmarks/BUILD | 12 +++++++ benchmarks/BUILD.googleapis | 29 +++++++++++++++ benchmarks/benchmark.cc | 72 +++++++++++++++++++++++++++++++------ upb/def.c | 7 ++++ upb/def.h | 1 + 6 files changed, 119 insertions(+), 11 deletions(-) create mode 100644 benchmarks/BUILD.googleapis diff --git a/WORKSPACE b/WORKSPACE index c1c8c3be9b..a0b04ad044 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,6 +1,7 @@ workspace(name = "upb") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository") load("//bazel:workspace_deps.bzl", "upb_deps") upb_deps() @@ -37,3 +38,11 @@ http_archive( strip_prefix = "benchmark-16703ff83c1ae6d53e5155df3bb3ab0bc96083be", sha256 = "59f918c8ccd4d74b6ac43484467b500f1d64b40cc1010daa055375b322a43ba3", ) + +new_git_repository( + name = "com_google_googleapis", + remote = "https://github.com/googleapis/googleapis.git", + branch = "master", + build_file = "//benchmarks:BUILD.googleapis", + patch_cmds = ["find google -type f -name BUILD.bazel -delete"], +) diff --git a/benchmarks/BUILD b/benchmarks/BUILD index 87315a34e9..ed80b185f1 100644 --- a/benchmarks/BUILD +++ b/benchmarks/BUILD @@ -21,6 +21,16 @@ upb_proto_reflection_library( deps = [":benchmark_descriptor_proto"], ) +upb_proto_reflection_library( + name = "ads_upb_proto_reflection", + deps = ["@com_google_googleapis//:ads_proto"], +) + +cc_proto_library( + name = "ads_cc_proto", + deps = ["@com_google_googleapis//:ads_proto"], +) + cc_proto_library( name = "benchmark_descriptor_cc_proto", deps = [":benchmark_descriptor_proto"], @@ -45,6 +55,8 @@ cc_binary( ":benchmark_descriptor_sv_cc_proto", ":benchmark_descriptor_upb_proto", ":benchmark_descriptor_upb_proto_reflection", + ":ads_cc_proto", + ":ads_upb_proto_reflection", "//:descriptor_upb_proto", "//:reflection", "@com_github_google_benchmark//:benchmark_main", diff --git a/benchmarks/BUILD.googleapis b/benchmarks/BUILD.googleapis new file mode 100644 index 0000000000..c90815b09a --- /dev/null +++ b/benchmarks/BUILD.googleapis @@ -0,0 +1,29 @@ +load( + "@rules_proto//proto:defs.bzl", + "proto_library", +) + +proto_library( + name = "ads_proto", + srcs = glob([ + "google/ads/googleads/v5/**/*.proto", + "google/api/**/*.proto", + "google/rpc/**/*.proto", + "google/longrunning/**/*.proto", + "google/logging/**/*.proto", + ]), + #srcs = ["google/ads/googleads/v5/services/google_ads_service.proto"], + visibility = ["//visibility:public"], + deps = [ + "@com_google_protobuf//:any_proto", + "@com_google_protobuf//:empty_proto", + "@com_google_protobuf//:descriptor_proto", + "@com_google_protobuf//:field_mask_proto", + "@com_google_protobuf//:duration_proto", + "@com_google_protobuf//:timestamp_proto", + "@com_google_protobuf//:struct_proto", + "@com_google_protobuf//:api_proto", + "@com_google_protobuf//:type_proto", + "@com_google_protobuf//:wrappers_proto", + ], +) diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index 7f6b5f0f77..69340cf21e 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -9,9 +9,11 @@ #include "benchmarks/descriptor_sv.pb.h" // For for benchmarks of building descriptors. -#include "google/protobuf/descriptor.upb.h" +#include "google/ads/googleads/v5/services/google_ads_service.pb.h" +#include "google/ads/googleads/v5/services/google_ads_service.upbdefs.h" #include "google/protobuf/descriptor.pb.h" - +#include "google/protobuf/descriptor.upb.h" +#include "google/protobuf/descriptor.upbdefs.h" #include "upb/def.hpp" upb_strview descriptor = benchmarks_descriptor_proto_upbdefinit.descriptor; @@ -20,6 +22,20 @@ namespace protobuf = ::google::protobuf; /* A buffer big enough to parse descriptor.proto without going to heap. */ char buf[65535]; +void CollectFileDescriptors(const protobuf::FileDescriptor* file, + std::vector& serialized_files, + std::unordered_set& seen) { + if (!seen.insert(file).second) return; + for (int i = 0; i < file->dependency_count(); i++) { + CollectFileDescriptors(file->dependency(i), serialized_files, seen); + } + protobuf::FileDescriptorProto file_proto; + file->CopyTo(&file_proto); + std::string serialized; + file_proto.SerializeToString(&serialized); + serialized_files.push_back(std::move(serialized)); +} + static void BM_ArenaOneAlloc(benchmark::State& state) { for (auto _ : state) { upb_arena* arena = upb_arena_new(); @@ -39,22 +55,28 @@ static void BM_ArenaInitialBlockOneAlloc(benchmark::State& state) { BENCHMARK(BM_ArenaInitialBlockOneAlloc); static void BM_LoadDescriptor_Upb(benchmark::State& state) { + size_t bytes_per_iter; for (auto _ : state) { upb::SymbolTable symtab; - upb::Arena arena; - google_protobuf_FileDescriptorProto* file_proto = - google_protobuf_FileDescriptorProto_parse(descriptor.data, - descriptor.size, arena.ptr()); - upb::FileDefPtr file_def = symtab.AddFile(file_proto, NULL); - if (!file_def) { - printf("Failed to add file.\n"); - exit(1); - } + google_protobuf_DescriptorProto_getmsgdef(symtab.ptr()); + bytes_per_iter = _upb_symtab_bytesloaded(symtab.ptr()); } state.SetBytesProcessed(state.iterations() * descriptor.size); } BENCHMARK(BM_LoadDescriptor_Upb); +static void BM_LoadAdsDescriptor_Upb(benchmark::State& state) { + size_t bytes_per_iter; + for (auto _ : state) { + upb::SymbolTable symtab; + google_ads_googleads_v5_services_SearchGoogleAdsRequest_getmsgdef( + symtab.ptr()); + bytes_per_iter = _upb_symtab_bytesloaded(symtab.ptr()); + } + state.SetBytesProcessed(state.iterations() * bytes_per_iter); +} +BENCHMARK(BM_LoadAdsDescriptor_Upb); + static void BM_LoadDescriptor_Proto2(benchmark::State& state) { for (auto _ : state) { protobuf::Arena arena; @@ -73,6 +95,34 @@ static void BM_LoadDescriptor_Proto2(benchmark::State& state) { } BENCHMARK(BM_LoadDescriptor_Proto2); +static void BM_LoadAdsDescriptor_Proto2(benchmark::State& state) { + std::vector serialized_files; + std::unordered_set seen_files; + CollectFileDescriptors( + google::ads::googleads::v5::services::SearchGoogleAdsRequest::descriptor() + ->file(), + serialized_files, seen_files); + size_t bytes_per_iter; + for (auto _ : state) { + bytes_per_iter = 0; + protobuf::Arena arena; + protobuf::DescriptorPool pool; + for (auto file : serialized_files) { + auto proto = protobuf::Arena::CreateMessage( + &arena); + bool ok = proto->ParseFrom(file) && + pool.BuildFile(*proto) != nullptr; + if (!ok) { + printf("Failed to add file.\n"); + exit(1); + } + bytes_per_iter += file.size(); + } + } + state.SetBytesProcessed(state.iterations() * bytes_per_iter); +} +BENCHMARK(BM_LoadAdsDescriptor_Proto2); + static void BM_Parse_Upb_FileDesc_WithArena(benchmark::State& state) { size_t bytes = 0; for (auto _ : state) { diff --git a/upb/def.c b/upb/def.c index 603deb2013..249cb26055 100644 --- a/upb/def.c +++ b/upb/def.c @@ -118,6 +118,7 @@ struct upb_symtab { upb_arena *arena; upb_strtable syms; /* full_name -> packed def ptr */ upb_strtable files; /* file_name -> upb_filedef* */ + size_t bytes_loaded; }; /* Inside a symtab we store tagged pointers to specific def types. */ @@ -2084,6 +2085,7 @@ upb_symtab *upb_symtab_new(void) { } s->arena = upb_arena_new(); + s->bytes_loaded = 0; alloc = upb_arena_alloc(s->arena); if (!upb_strtable_init2(&s->syms, UPB_CTYPE_CONSTPTR, 32, alloc) || @@ -2187,6 +2189,7 @@ bool _upb_symtab_loaddefinit(upb_symtab *s, const upb_def_init *init) { file = google_protobuf_FileDescriptorProto_parse( init->descriptor.data, init->descriptor.size, arena); + s->bytes_loaded += init->descriptor.size; if (!file) { upb_status_seterrf( @@ -2209,5 +2212,9 @@ err: return false; } +size_t _upb_symtab_bytesloaded(const upb_symtab *s) { + return s->bytes_loaded; +} + #undef CHK #undef CHK_OOM diff --git a/upb/def.h b/upb/def.h index d88c2872ea..6b73ef5be3 100644 --- a/upb/def.h +++ b/upb/def.h @@ -293,6 +293,7 @@ int upb_symtab_filecount(const upb_symtab *s); const upb_filedef *upb_symtab_addfile( upb_symtab *s, const google_protobuf_FileDescriptorProto *file, upb_status *status); +size_t _upb_symtab_bytesloaded(const upb_symtab *s); /* For generated code only: loads a generated descriptor. */ typedef struct upb_def_init { From 43c207ea7ee6e689ad78e9c301d86012ab02e18f Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sat, 31 Oct 2020 13:28:24 -0700 Subject: [PATCH 09/21] Added CMake dummy rule. --- cmake/make_cmakelists.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/make_cmakelists.py b/cmake/make_cmakelists.py index 8e024d1526..ee9760a47c 100755 --- a/cmake/make_cmakelists.py +++ b/cmake/make_cmakelists.py @@ -177,6 +177,9 @@ class WorkspaceFileFunctions(object): def git_repository(self, **kwargs): pass + def new_git_repository(self, **kwargs): + pass + def bazel_version_repository(self, **kwargs): pass From 5ec1d39224c24f323f4f33d5eb7569dcdcd5fdfb Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sat, 31 Oct 2020 14:13:39 -0700 Subject: [PATCH 10/21] Avoid building .pb.cc for ads protos, as C++ takes forever to compile. --- benchmarks/BUILD | 6 ------ benchmarks/benchmark.cc | 30 +++++++++++++----------------- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/benchmarks/BUILD b/benchmarks/BUILD index ed80b185f1..45bb870bc2 100644 --- a/benchmarks/BUILD +++ b/benchmarks/BUILD @@ -26,11 +26,6 @@ upb_proto_reflection_library( deps = ["@com_google_googleapis//:ads_proto"], ) -cc_proto_library( - name = "ads_cc_proto", - deps = ["@com_google_googleapis//:ads_proto"], -) - cc_proto_library( name = "benchmark_descriptor_cc_proto", deps = [":benchmark_descriptor_proto"], @@ -55,7 +50,6 @@ cc_binary( ":benchmark_descriptor_sv_cc_proto", ":benchmark_descriptor_upb_proto", ":benchmark_descriptor_upb_proto_reflection", - ":ads_cc_proto", ":ads_upb_proto_reflection", "//:descriptor_upb_proto", "//:reflection", diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index 69340cf21e..ebeaed7654 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -9,7 +9,6 @@ #include "benchmarks/descriptor_sv.pb.h" // For for benchmarks of building descriptors. -#include "google/ads/googleads/v5/services/google_ads_service.pb.h" #include "google/ads/googleads/v5/services/google_ads_service.upbdefs.h" #include "google/protobuf/descriptor.pb.h" #include "google/protobuf/descriptor.upb.h" @@ -22,18 +21,14 @@ namespace protobuf = ::google::protobuf; /* A buffer big enough to parse descriptor.proto without going to heap. */ char buf[65535]; -void CollectFileDescriptors(const protobuf::FileDescriptor* file, - std::vector& serialized_files, - std::unordered_set& seen) { +void CollectFileDescriptors(const upb_def_init* file, + std::vector& serialized_files, + std::unordered_set& seen) { if (!seen.insert(file).second) return; - for (int i = 0; i < file->dependency_count(); i++) { - CollectFileDescriptors(file->dependency(i), serialized_files, seen); + for (upb_def_init **deps = file->deps; *deps; deps++) { + CollectFileDescriptors(*deps, serialized_files, seen); } - protobuf::FileDescriptorProto file_proto; - file->CopyTo(&file_proto); - std::string serialized; - file_proto.SerializeToString(&serialized); - serialized_files.push_back(std::move(serialized)); + serialized_files.push_back(file->descriptor); } static void BM_ArenaOneAlloc(benchmark::State& state) { @@ -96,11 +91,11 @@ static void BM_LoadDescriptor_Proto2(benchmark::State& state) { BENCHMARK(BM_LoadDescriptor_Proto2); static void BM_LoadAdsDescriptor_Proto2(benchmark::State& state) { - std::vector serialized_files; - std::unordered_set seen_files; + extern upb_def_init google_ads_googleads_v5_services_google_ads_service_proto_upbdefinit; + std::vector serialized_files; + std::unordered_set seen_files; CollectFileDescriptors( - google::ads::googleads::v5::services::SearchGoogleAdsRequest::descriptor() - ->file(), + &google_ads_googleads_v5_services_google_ads_service_proto_upbdefinit, serialized_files, seen_files); size_t bytes_per_iter; for (auto _ : state) { @@ -108,15 +103,16 @@ static void BM_LoadAdsDescriptor_Proto2(benchmark::State& state) { protobuf::Arena arena; protobuf::DescriptorPool pool; for (auto file : serialized_files) { + protobuf::StringPiece input(file.data, file.size); auto proto = protobuf::Arena::CreateMessage( &arena); - bool ok = proto->ParseFrom(file) && + bool ok = proto->ParseFrom(input) && pool.BuildFile(*proto) != nullptr; if (!ok) { printf("Failed to add file.\n"); exit(1); } - bytes_per_iter += file.size(); + bytes_per_iter += input.size(); } } state.SetBytesProcessed(state.iterations() * bytes_per_iter); From c9f9668234bdc13df99b917d83c22e19bfb6af33 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sun, 1 Nov 2020 00:06:23 -0700 Subject: [PATCH 11/21] symtab: use longjmp() for errors and avoid intermediate table. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We used to use a separate "add table" during the upb_symtab_addfile() operation to make it easier to back out the file if it contained errors. But this created unnecessary work of re-adding the same symbols to the main symtab once everything was validated. Instead we directly add symbols to the main symbols table. If there is an error in validation, we remove precisely the set of symbols that were already added. This also requires using a separate arena for each file. We can fuse it with the symtab's main arena if the operation is successful. LoadDescriptor_Upb 61.2µs ± 4% 53.5µs ± 1% -12.50% (p=0.000 n=12+12) LoadAdsDescriptor_Upb 4.43ms ± 1% 3.06ms ± 0% -31.00% (p=0.000 n=12+12) LoadDescriptor_Proto2 257µs ± 0% 259µs ± 0% +1.00% (p=0.000 n=12+12) LoadAdsDescriptor_Proto2 13.9ms ± 1% 13.9ms ± 1% ~ (p=0.128 n=12+12) --- tests/bindings/lua/test_upb.lua | 14 + upb/def.c | 821 +++++++++++++++----------------- 2 files changed, 396 insertions(+), 439 deletions(-) diff --git a/tests/bindings/lua/test_upb.lua b/tests/bindings/lua/test_upb.lua index 345828d500..3a490c6f6d 100644 --- a/tests/bindings/lua/test_upb.lua +++ b/tests/bindings/lua/test_upb.lua @@ -512,6 +512,20 @@ function test_foo() assert_equal(set.file[1].name, "google/protobuf/descriptor.proto") end +function test_descriptor_error() + local symtab = upb.SymbolTable() + local file = descriptor.FileDescriptorProto() + file.name = "test.proto" + file.message_type[1] = descriptor.DescriptorProto{ + name = "ABC" + } + file.message_type[2] = descriptor.DescriptorProto{ + name = "BC." + } + assert_error(function () symtab:add_file(upb.encode(file)) end) + assert_nil(symtab:lookup_msg("ABC")) +end + function test_gc() local top = test_messages_proto3.TestAllTypesProto3() local n = 100 diff --git a/upb/def.c b/upb/def.c index 56497e5cca..7379a768a0 100644 --- a/upb/def.c +++ b/upb/def.c @@ -3,10 +3,11 @@ #include #include +#include #include #include -#include "google/protobuf/descriptor.upb.h" +#include "google/protobuf/descriptor.upb.h" #include "upb/port_def.inc" typedef struct { @@ -14,15 +15,6 @@ typedef struct { char str[1]; /* Null-terminated string data follows. */ } str_t; -static str_t *newstr(upb_alloc *alloc, const char *data, size_t len) { - str_t *ret = upb_malloc(alloc, sizeof(*ret) + len); - if (!ret) return NULL; - ret->len = len; - if (len) memcpy(ret->str, data, len); - ret->str[len] = '\0'; - return ret; -} - struct upb_fielddef { const upb_filedef *file; const upb_msgdef *msgdef; @@ -157,38 +149,6 @@ static bool upb_isalphanum(char c) { return upb_isletter(c) || upb_isbetween(c, '0', '9'); } -static bool upb_isident(upb_strview name, bool full, upb_status *s) { - const char *str = name.data; - size_t len = name.size; - bool start = true; - size_t i; - for (i = 0; i < len; i++) { - char c = str[i]; - if (c == '.') { - if (start || !full) { - upb_status_seterrf(s, "invalid name: unexpected '.' (%s)", str); - return false; - } - start = true; - } else if (start) { - if (!upb_isletter(c)) { - upb_status_seterrf( - s, "invalid name: path components must start with a letter (%s)", - str); - return false; - } - start = false; - } else { - if (!upb_isalphanum(c)) { - upb_status_seterrf(s, "invalid name: non-alphanumeric character (%s)", - str); - return false; - } - } - } - return !start; -} - static const char *shortdefname(const char *fullname) { const char *p; @@ -248,54 +208,6 @@ static void upb_status_setoom(upb_status *status) { upb_status_seterrmsg(status, "out of memory"); } -static bool assign_msg_indices(upb_msgdef *m, upb_status *s) { - /* Sort fields. upb internally relies on UPB_TYPE_MESSAGE fields having the - * lowest indexes, but we do not publicly guarantee this. */ - upb_msg_field_iter j; - int i; - uint32_t selector; - int n = upb_msgdef_numfields(m); - upb_fielddef **fields; - - if (n == 0) { - m->selector_count = UPB_STATIC_SELECTOR_COUNT; - m->submsg_field_count = 0; - return true; - } - - fields = upb_gmalloc(n * sizeof(*fields)); - if (!fields) { - upb_status_setoom(s); - return false; - } - - m->submsg_field_count = 0; - for(i = 0, upb_msg_field_begin(&j, m); - !upb_msg_field_done(&j); - upb_msg_field_next(&j), i++) { - upb_fielddef *f = upb_msg_iter_field(&j); - UPB_ASSERT(f->msgdef == m); - if (upb_fielddef_issubmsg(f)) { - m->submsg_field_count++; - } - fields[i] = f; - } - - qsort(fields, n, sizeof(*fields), cmp_fields); - - selector = UPB_STATIC_SELECTOR_COUNT + m->submsg_field_count; - for (i = 0; i < n; i++) { - upb_fielddef *f = fields[i]; - f->index_ = i; - f->selector_base = selector + upb_handlers_selectorbaseoffset(f); - selector += upb_handlers_selectorcount(f); - } - m->selector_count = selector; - - upb_gfree(fields); - return true; -} - static void assign_msg_wellknowntype(upb_msgdef *m) { const char *name = upb_msgdef_fullname(m); if (name == NULL) { @@ -865,6 +777,114 @@ void upb_oneof_iter_setdone(upb_oneof_iter *iter) { upb_inttable_iter_setdone(iter); } +/* upb_filedef ****************************************************************/ + +const char *upb_filedef_name(const upb_filedef *f) { + return f->name; +} + +const char *upb_filedef_package(const upb_filedef *f) { + return f->package; +} + +const char *upb_filedef_phpprefix(const upb_filedef *f) { + return f->phpprefix; +} + +const char *upb_filedef_phpnamespace(const upb_filedef *f) { + return f->phpnamespace; +} + +upb_syntax_t upb_filedef_syntax(const upb_filedef *f) { + return f->syntax; +} + +int upb_filedef_msgcount(const upb_filedef *f) { + return f->msg_count; +} + +int upb_filedef_depcount(const upb_filedef *f) { + return f->dep_count; +} + +int upb_filedef_enumcount(const upb_filedef *f) { + return f->enum_count; +} + +const upb_filedef *upb_filedef_dep(const upb_filedef *f, int i) { + return i < 0 || i >= f->dep_count ? NULL : f->deps[i]; +} + +const upb_msgdef *upb_filedef_msg(const upb_filedef *f, int i) { + return i < 0 || i >= f->msg_count ? NULL : &f->msgs[i]; +} + +const upb_enumdef *upb_filedef_enum(const upb_filedef *f, int i) { + return i < 0 || i >= f->enum_count ? NULL : &f->enums[i]; +} + +void upb_symtab_free(upb_symtab *s) { + upb_arena_free(s->arena); + upb_gfree(s); +} + +upb_symtab *upb_symtab_new(void) { + upb_symtab *s = upb_gmalloc(sizeof(*s)); + upb_alloc *alloc; + + if (!s) { + return NULL; + } + + s->arena = upb_arena_new(); + s->bytes_loaded = 0; + alloc = upb_arena_alloc(s->arena); + + if (!upb_strtable_init2(&s->syms, UPB_CTYPE_CONSTPTR, 32, alloc) || + !upb_strtable_init2(&s->files, UPB_CTYPE_CONSTPTR, 4, alloc)) { + upb_arena_free(s->arena); + upb_gfree(s); + s = NULL; + } + return s; +} + +const upb_msgdef *upb_symtab_lookupmsg(const upb_symtab *s, const char *sym) { + upb_value v; + return upb_strtable_lookup(&s->syms, sym, &v) ? + unpack_def(v, UPB_DEFTYPE_MSG) : NULL; +} + +const upb_msgdef *upb_symtab_lookupmsg2(const upb_symtab *s, const char *sym, + size_t len) { + upb_value v; + return upb_strtable_lookup2(&s->syms, sym, len, &v) ? + unpack_def(v, UPB_DEFTYPE_MSG) : NULL; +} + +const upb_enumdef *upb_symtab_lookupenum(const upb_symtab *s, const char *sym) { + upb_value v; + return upb_strtable_lookup(&s->syms, sym, &v) ? + unpack_def(v, UPB_DEFTYPE_ENUM) : NULL; +} + +const upb_filedef *upb_symtab_lookupfile(const upb_symtab *s, const char *name) { + upb_value v; + return upb_strtable_lookup(&s->files, name, &v) ? upb_value_getconstptr(v) + : NULL; +} + +const upb_filedef *upb_symtab_lookupfile2( + const upb_symtab *s, const char *name, size_t len) { + upb_value v; + return upb_strtable_lookup2(&s->files, name, len, &v) ? + upb_value_getconstptr(v) : NULL; +} + +int upb_symtab_filecount(const upb_symtab *s) { + return (int)upb_strtable_count(&s->files); +} + /* Code to build defs from descriptor protos. *********************************/ /* There is a question of how much validation to do here. It will be difficult @@ -872,34 +892,71 @@ void upb_oneof_iter_setdone(upb_oneof_iter *iter) { * this code is used to directly build defs from Ruby (for example) we do need * to validate important constraints like uniqueness of names and numbers. */ -#define CHK_OOM(ctx, x) if (!(x)) { symtab_oomerr(ctx); } +#define CHK_OOM(x) if (!(x)) { symtab_oomerr(ctx); } typedef struct { - const upb_symtab *symtab; + upb_symtab *symtab; upb_filedef *file; /* File we are building. */ - upb_alloc *alloc; /* Allocate defs here. */ - upb_alloc *tmp; /* Alloc for addtab and any other tmp data. */ - upb_strtable *addtab; /* full_name -> packed def ptr for new defs */ + upb_arena *file_arena; /* Allocate defs here. */ + upb_alloc *alloc; /* Alloc of file_arena, for tables. */ const upb_msglayout **layouts; /* NULL if we should build layouts. */ upb_status *status; /* Record errors here. */ jmp_buf err; /* longjmp() on error. */ } symtab_addctx; -static void symtab_err(symtab_addctx *ctx) { longjmp(ctx->err, 1); } - +UPB_NORETURN UPB_NOINLINE static void symtab_errf(symtab_addctx *ctx, const char *fmt, ...) { va_list argp; va_start(argp, fmt); - upb_status_vseterrf(d->status, fmt, argp); + upb_status_vseterrf(ctx->status, fmt, argp); va_end(argp); longjmp(ctx->err, 1); } +UPB_NORETURN UPB_NOINLINE static void symtab_oomerr(symtab_addctx *ctx) { upb_status_setoom(ctx->status); longjmp(ctx->err, 1); } +void *symtab_alloc(symtab_addctx *ctx, size_t bytes) { + void *ret = upb_arena_malloc(ctx->file_arena, bytes); + if (!ret) symtab_oomerr(ctx); + return ret; +} + +static void check_ident(symtab_addctx *ctx, upb_strview name, bool full) { + const char *str = name.data; + size_t len = name.size; + bool start = true; + size_t i; + for (i = 0; i < len; i++) { + char c = str[i]; + if (c == '.') { + if (start || !full) { + symtab_errf(ctx, "invalid name: unexpected '.' (%.*s)", (int)len, str); + } + start = true; + } else if (start) { + if (!upb_isletter(c)) { + symtab_errf( + ctx, + "invalid name: path components must start with a letter (%.*s)", + (int)len, str); + } + start = false; + } else { + if (!upb_isalphanum(c)) { + symtab_errf(ctx, "invalid name: non-alphanumeric character (%.*s)", + (int)len, str); + } + } + } + if (start) { + symtab_errf(ctx, "invalid name: empty part (%.*s)", (int)len, str); + } +} + static size_t div_round_up(size_t n, size_t d) { return (n + d - 1) / d; } @@ -965,8 +1022,7 @@ static void assign_layout_indices(const upb_msgdef *m, upb_msglayout_field *fiel /* This function is the dynamic equivalent of message_layout.{cc,h} in upbc. * It computes a dynamic layout for all of the fields in |m|. */ -static void make_layout(symtab_addctx *ctx, const upb_symtab *symtab, - const upb_msgdef *m) { +static void make_layout(symtab_addctx *ctx, const upb_msgdef *m) { upb_msglayout *l = (upb_msglayout*)m->layout; upb_msg_field_iter it; upb_msg_oneof_iter oit; @@ -974,17 +1030,11 @@ static void make_layout(symtab_addctx *ctx, const upb_symtab *symtab, size_t submsg_count = m->submsg_field_count; const upb_msglayout **submsgs; upb_msglayout_field *fields; - upb_alloc *alloc = upb_arena_alloc(symtab->arena); memset(l, 0, sizeof(*l)); - fields = upb_malloc(alloc, upb_msgdef_numfields(m) * sizeof(*fields)); - submsgs = upb_malloc(alloc, submsg_count * sizeof(*submsgs)); - - if ((!fields && upb_msgdef_numfields(m)) || - (!submsgs && submsg_count)) { - symtab_oomerr(ctx); - } + fields = symtab_alloc(ctx, upb_msgdef_numfields(m) * sizeof(*fields)); + submsgs = symtab_alloc(ctx, submsg_count * sizeof(*submsgs)); l->field_count = upb_msgdef_numfields(m); l->fields = fields; @@ -1128,7 +1178,50 @@ static void make_layout(symtab_addctx *ctx, const upb_symtab *symtab, assign_layout_indices(m, fields); } -static char* strviewdup(const symtab_addctx *ctx, upb_strview view) { +static void assign_msg_indices(symtab_addctx *ctx, upb_msgdef *m) { + /* Sort fields. upb internally relies on UPB_TYPE_MESSAGE fields having the + * lowest indexes, but we do not publicly guarantee this. */ + upb_msg_field_iter j; + int i; + uint32_t selector; + int n = upb_msgdef_numfields(m); + upb_fielddef **fields; + + if (n == 0) { + m->selector_count = UPB_STATIC_SELECTOR_COUNT; + m->submsg_field_count = 0; + return; + } + + fields = upb_gmalloc(n * sizeof(*fields)); + + m->submsg_field_count = 0; + for(i = 0, upb_msg_field_begin(&j, m); + !upb_msg_field_done(&j); + upb_msg_field_next(&j), i++) { + upb_fielddef *f = upb_msg_iter_field(&j); + UPB_ASSERT(f->msgdef == m); + if (upb_fielddef_issubmsg(f)) { + m->submsg_field_count++; + } + fields[i] = f; + } + + qsort(fields, n, sizeof(*fields), cmp_fields); + + selector = UPB_STATIC_SELECTOR_COUNT + m->submsg_field_count; + for (i = 0; i < n; i++) { + upb_fielddef *f = fields[i]; + f->index_ = i; + f->selector_base = selector + upb_handlers_selectorbaseoffset(f); + selector += upb_handlers_selectorcount(f); + } + m->selector_count = selector; + + upb_gfree(fields); +} + +static char *strviewdup(symtab_addctx *ctx, upb_strview view) { return upb_strdup2(view.data, view.size, ctx->alloc); } @@ -1140,13 +1233,12 @@ static bool streql_view(upb_strview view, const char *b) { return streql2(view.data, view.size, b); } -static const char *makefullname(const symtab_addctx *ctx, const char *prefix, +static const char *makefullname(symtab_addctx *ctx, const char *prefix, upb_strview name) { if (prefix) { /* ret = prefix + '.' + name; */ size_t n = strlen(prefix); - char *ret = upb_malloc(ctx->alloc, n + name.size + 2); - CHK_OOM(ret); + char *ret = symtab_alloc(ctx, n + name.size + 2); strcpy(ret, prefix); ret[n] = '.'; memcpy(&ret[n + 1], name.data, name.size); @@ -1177,7 +1269,7 @@ static void finalize_oneofs(symtab_addctx *ctx, upb_msgdef *m) { upb_oneofdef_name(o)); } - o->fields = upb_malloc(ctx->alloc, sizeof(upb_fielddef*) * o->field_count); + o->fields = symtab_alloc(ctx, sizeof(upb_fielddef *) * o->field_count); o->field_count = 0; } @@ -1230,41 +1322,39 @@ size_t getjsonname(const char *name, char *buf, size_t len) { #undef WRITE } -static char* makejsonname(const char* name, upb_alloc *alloc) { +static char* makejsonname(symtab_addctx *ctx, const char* name) { size_t size = getjsonname(name, NULL, 0); - char* json_name = upb_malloc(alloc, size); + char* json_name = symtab_alloc(ctx, size); getjsonname(name, json_name, size); return json_name; } -static void symtab_add(const symtab_addctx *ctx, const char *name, - upb_value v) { - upb_value tmp; - if (upb_strtable_lookup(ctx->addtab, name, &tmp) || - upb_strtable_lookup(&ctx->symtab->syms, name, &tmp)) { - status_errf(ctx, "duplicate symbol '%s'", name); +static void symtab_add(symtab_addctx *ctx, const char *name, upb_value v) { + if (upb_strtable_lookup(&ctx->symtab->syms, name, NULL)) { + symtab_errf(ctx, "duplicate symbol '%s'", name); } - - CHK_OOM(upb_strtable_insert3(ctx->addtab, name, strlen(name), v, ctx->tmp)); + upb_alloc *alloc = upb_arena_alloc(ctx->symtab->arena); + size_t len = strlen(name); + CHK_OOM(upb_strtable_insert3(&ctx->symtab->syms, name, len, v, alloc)); } /* Given a symbol and the base symbol inside which it is defined, find the * symbol's definition in t. */ -static const void *resolvename(symtab_addctx *ctx, const upb_strtable *t, - const upb_fielddef *f, const char *base, - upb_strview sym, upb_deftype_t type, - const void **def) { - if(sym.size == 0) return false; +static const void *symtab_resolve(symtab_addctx *ctx, const upb_fielddef *f, + const char *base, upb_strview sym, + upb_deftype_t type) { + const upb_strtable *t = &ctx->symtab->syms; + if(sym.size == 0) goto notfound; if(sym.data[0] == '.') { /* Symbols starting with '.' are absolute, so we do a single lookup. * Slice to omit the leading '.' */ upb_value v; if (!upb_strtable_lookup2(t, sym.data + 1, sym.size - 1, &v)) { - return NULL; + goto notfound; } const void *ret = unpack_def(v, type); - if (ret) { + if (!ret) { symtab_errf(ctx, "type mismatch when resolving field %s, name %s", f->full_name, sym.data); } @@ -1274,23 +1364,15 @@ static const void *resolvename(symtab_addctx *ctx, const upb_strtable *t, * TODO: This branch is totally broken, but currently not used. */ (void)base; UPB_ASSERT(false); - return false; + goto notfound; } -} -const void *symtab_resolve(const symtab_addctx *ctx, const upb_fielddef *f, - const char *base, upb_strview sym, - upb_deftype_t type) { - const void *ret; - if (!resolvename(ctx, ctx->addtab, f, base, sym, type, &ret) && - !resolvename(ctx, &ctx->symtab->syms, f, base, sym, type, &ret)) { - symtab_errf(ctx, "couldn't resolve name '%s'", sym.data); - } - return ret; +notfound: + symtab_errf(ctx, "couldn't resolve name '%s'", sym.data); } -static bool create_oneofdef( - const symtab_addctx *ctx, upb_msgdef *m, +static void create_oneofdef( + symtab_addctx *ctx, upb_msgdef *m, const google_protobuf_OneofDescriptorProto *oneof_proto) { upb_oneofdef *o; upb_strview name = google_protobuf_OneofDescriptorProto_name(oneof_proto); @@ -1303,16 +1385,23 @@ static bool create_oneofdef( o->synthetic = false; v = pack_def(o, UPB_DEFTYPE_ONEOF); - CHK_OOM(symtab_add(ctx, o->full_name, v)); + symtab_add(ctx, o->full_name, v); CHK_OOM(upb_strtable_insert3(&m->ntof, name.data, name.size, v, ctx->alloc)); CHK_OOM(upb_inttable_init2(&o->itof, UPB_CTYPE_CONSTPTR, ctx->alloc)); CHK_OOM(upb_strtable_init2(&o->ntof, UPB_CTYPE_CONSTPTR, 4, ctx->alloc)); +} - return true; +static str_t *newstr(symtab_addctx *ctx, const char *data, size_t len) { + str_t *ret = symtab_alloc(ctx, sizeof(*ret) + len); + if (!ret) return NULL; + ret->len = len; + if (len) memcpy(ret->str, data, len); + ret->str[len] = '\0'; + return ret; } -static bool parse_default(const symtab_addctx *ctx, const char *str, size_t len, +static void parse_default(symtab_addctx *ctx, const char *str, size_t len, upb_fielddef *f) { char *end; char nullz[64]; @@ -1327,7 +1416,7 @@ static bool parse_default(const symtab_addctx *ctx, const char *str, size_t len, case UPB_TYPE_FLOAT: /* Standard C number parsing functions expect null-terminated strings. */ if (len >= sizeof(nullz) - 1) { - return false; + symtab_errf(ctx, "Default too long: %.*s", (int)len, str); } memcpy(nullz, str, len); nullz[len] = '\0'; @@ -1340,47 +1429,61 @@ static bool parse_default(const symtab_addctx *ctx, const char *str, size_t len, switch (upb_fielddef_type(f)) { case UPB_TYPE_INT32: { long val = strtol(str, &end, 0); - CHK(val <= INT32_MAX && val >= INT32_MIN && errno != ERANGE && !*end); + if (val > INT32_MAX || val < INT32_MIN || errno == ERANGE || *end) { + goto invalid; + } f->defaultval.sint = val; break; } case UPB_TYPE_ENUM: { const upb_enumdef *e = f->sub.enumdef; int32_t val; - CHK(upb_enumdef_ntoi(e, str, len, &val)); + if (!upb_enumdef_ntoi(e, str, len, &val)) { + goto invalid; + } f->defaultval.sint = val; break; } case UPB_TYPE_INT64: { /* XXX: Need to write our own strtoll, since it's not available in c89. */ int64_t val = strtol(str, &end, 0); - CHK(val <= INT64_MAX && val >= INT64_MIN && errno != ERANGE && !*end); + if (val > INT64_MAX || val < INT64_MIN || errno == ERANGE || *end) { + goto invalid; + } f->defaultval.sint = val; break; } case UPB_TYPE_UINT32: { unsigned long val = strtoul(str, &end, 0); - CHK(val <= UINT32_MAX && errno != ERANGE && !*end); + if (val > UINT32_MAX || errno == ERANGE || *end) { + goto invalid; + } f->defaultval.uint = val; break; } case UPB_TYPE_UINT64: { /* XXX: Need to write our own strtoull, since it's not available in c89. */ uint64_t val = strtoul(str, &end, 0); - CHK(val <= UINT64_MAX && errno != ERANGE && !*end); + if (val > UINT64_MAX || errno == ERANGE || *end) { + goto invalid; + } f->defaultval.uint = val; break; } case UPB_TYPE_DOUBLE: { double val = strtod(str, &end); - CHK(errno != ERANGE && !*end); + if (errno == ERANGE || *end) { + goto invalid; + } f->defaultval.dbl = val; break; } case UPB_TYPE_FLOAT: { /* XXX: Need to write our own strtof, since it's not available in c89. */ float val = strtod(str, &end); - CHK(errno != ERANGE && !*end); + if (errno == ERANGE || *end) { + goto invalid; + } f->defaultval.flt = val; break; } @@ -1390,25 +1493,30 @@ static bool parse_default(const symtab_addctx *ctx, const char *str, size_t len, } else if (streql2(str, len, "true")) { f->defaultval.boolean = true; } else { - return false; } break; } case UPB_TYPE_STRING: - f->defaultval.str = newstr(ctx->alloc, str, len); + f->defaultval.str = newstr(ctx, str, len); break; case UPB_TYPE_BYTES: /* XXX: need to interpret the C-escaped value. */ - f->defaultval.str = newstr(ctx->alloc, str, len); + f->defaultval.str = newstr(ctx, str, len); break; case UPB_TYPE_MESSAGE: /* Should not have a default value. */ - return false; + symtab_errf(ctx, "Message should not have a default (%s)", + upb_fielddef_fullname(f)); } - return true; + + return; + +invalid: + symtab_errf(ctx, "Invalid default '%.*s' for field %f", (int)len, str, + upb_fielddef_fullname(f)); } -static void set_default_default(const symtab_addctx *ctx, upb_fielddef *f) { +static void set_default_default(symtab_addctx *ctx, upb_fielddef *f) { switch (upb_fielddef_type(f)) { case UPB_TYPE_INT32: case UPB_TYPE_INT64: @@ -1425,7 +1533,7 @@ static void set_default_default(const symtab_addctx *ctx, upb_fielddef *f) { break; case UPB_TYPE_STRING: case UPB_TYPE_BYTES: - f->defaultval.str = newstr(ctx->alloc, NULL, 0); + f->defaultval.str = newstr(ctx, NULL, 0); break; case UPB_TYPE_BOOL: f->defaultval.boolean = false; @@ -1435,8 +1543,8 @@ static void set_default_default(const symtab_addctx *ctx, upb_fielddef *f) { } } -static bool create_fielddef( - const symtab_addctx *ctx, const char *prefix, upb_msgdef *m, +static void create_fielddef( + symtab_addctx *ctx, const char *prefix, upb_msgdef *m, const google_protobuf_FieldDescriptorProto *field_proto) { upb_alloc *alloc = ctx->alloc; upb_fielddef *f; @@ -1448,12 +1556,11 @@ static bool create_fielddef( uint32_t field_number; if (!google_protobuf_FieldDescriptorProto_has_name(field_proto)) { - upb_status_seterrmsg(ctx->status, "field has no name"); - return false; + symtab_errf(ctx, "field has no name (%s)", upb_msgdef_fullname(m)); } name = google_protobuf_FieldDescriptorProto_name(field_proto); - CHK(upb_isident(name, false, ctx->status)); + check_ident(ctx, name, false); full_name = makefullname(ctx, prefix, name); shortname = shortdefname(full_name); @@ -1461,14 +1568,13 @@ static bool create_fielddef( json_name = strviewdup( ctx, google_protobuf_FieldDescriptorProto_json_name(field_proto)); } else { - json_name = makejsonname(shortname, ctx->alloc); + json_name = makejsonname(ctx, shortname); } field_number = google_protobuf_FieldDescriptorProto_number(field_proto); if (field_number == 0 || field_number > UPB_MAX_FIELDNUMBER) { - upb_status_seterrf(ctx->status, "invalid field number (%u)", field_number); - return false; + symtab_errf(ctx, "invalid field number (%u)", field_number); } if (m) { @@ -1481,19 +1587,15 @@ static bool create_fielddef( f->is_extension_ = false; if (upb_strtable_lookup(&m->ntof, shortname, NULL)) { - upb_status_seterrf(ctx->status, "duplicate field name (%s)", shortname); - return false; + symtab_errf(ctx, "duplicate field name (%s)", shortname); } if (upb_strtable_lookup(&m->ntof, json_name, NULL)) { - upb_status_seterrf(ctx->status, "duplicate json_name (%s)", json_name); - return false; + symtab_errf(ctx, "duplicate json_name (%s)", json_name); } if (upb_inttable_lookup(&m->itof, field_number, NULL)) { - upb_status_seterrf(ctx->status, "duplicate field number (%u)", - field_number); - return false; + symtab_errf(ctx, "duplicate field number (%u)", field_number); } field_v = pack_def(f, UPB_DEFTYPE_FIELD); @@ -1527,7 +1629,7 @@ static bool create_fielddef( /* extension field. */ f = (upb_fielddef*)&ctx->file->exts[ctx->file->ext_count++]; f->is_extension_ = true; - CHK_OOM(symtab_add(ctx, full_name, pack_def(f, UPB_DEFTYPE_FIELD))); + symtab_add(ctx, full_name, pack_def(f, UPB_DEFTYPE_FIELD)); } f->full_name = full_name; @@ -1546,9 +1648,7 @@ static bool create_fielddef( f->sub.unresolved = field_proto; if (f->label_ == UPB_LABEL_REQUIRED && f->file->syntax == UPB_SYNTAX_PROTO3) { - upb_status_seterrf(ctx->status, "proto3 fields cannot be required (%s)", - f->full_name); - return false; + symtab_errf(ctx, "proto3 fields cannot be required (%s)", f->full_name); } if (google_protobuf_FieldDescriptorProto_has_oneof_index(field_proto)) { @@ -1558,23 +1658,17 @@ static bool create_fielddef( upb_value v = upb_value_constptr(f); if (upb_fielddef_label(f) != UPB_LABEL_OPTIONAL) { - upb_status_seterrf(ctx->status, - "fields in oneof must have OPTIONAL label (%s)", - f->full_name); - return false; + symtab_errf(ctx, "fields in oneof must have OPTIONAL label (%s)", + f->full_name); } if (!m) { - upb_status_seterrf(ctx->status, - "oneof_index provided for extension field (%s)", - f->full_name); - return false; + symtab_errf(ctx, "oneof_index provided for extension field (%s)", + f->full_name); } if (oneof_index >= m->oneof_count) { - upb_status_seterrf(ctx->status, "oneof_index out of range (%s)", - f->full_name); - return false; + symtab_errf(ctx, "oneof_index out of range (%s)", f->full_name); } oneof = (upb_oneofdef*)&m->oneofs[oneof_index]; @@ -1584,15 +1678,13 @@ static bool create_fielddef( if (f->proto3_optional_) { oneof->synthetic = true; } - CHK(upb_inttable_insert2(&oneof->itof, f->number_, v, alloc)); - CHK(upb_strtable_insert3(&oneof->ntof, name.data, name.size, v, alloc)); + CHK_OOM(upb_inttable_insert2(&oneof->itof, f->number_, v, alloc)); + CHK_OOM(upb_strtable_insert3(&oneof->ntof, name.data, name.size, v, alloc)); } else { f->oneof = NULL; if (f->proto3_optional_) { - upb_status_seterrf(ctx->status, - "field with proto3_optional was not in a oneof (%s)", - f->full_name); - return false; + symtab_errf(ctx, "field with proto3_optional was not in a oneof (%s)", + f->full_name); } } @@ -1612,12 +1704,10 @@ static bool create_fielddef( } else { f->lazy_ = false; } - - return true; } -static bool create_enumdef( - const symtab_addctx *ctx, const char *prefix, +static void create_enumdef( + symtab_addctx *ctx, const char *prefix, const google_protobuf_EnumDescriptorProto *enum_proto) { upb_enumdef *e; const google_protobuf_EnumValueDescriptorProto *const *values; @@ -1625,11 +1715,11 @@ static bool create_enumdef( size_t i, n; name = google_protobuf_EnumDescriptorProto_name(enum_proto); - CHK(upb_isident(name, false, ctx->status)); + check_ident(ctx, name, false); e = (upb_enumdef*)&ctx->file->enums[ctx->file->enum_count++]; e->full_name = makefullname(ctx, prefix, name); - CHK_OOM(symtab_add(ctx, e->full_name, pack_def(e, UPB_DEFTYPE_ENUM))); + symtab_add(ctx, e->full_name, pack_def(e, UPB_DEFTYPE_ENUM)); values = google_protobuf_EnumDescriptorProto_value(enum_proto, &n); CHK_OOM(upb_strtable_init2(&e->ntoi, UPB_CTYPE_INT32, n, ctx->alloc)); @@ -1639,10 +1729,8 @@ static bool create_enumdef( e->defaultval = 0; if (n == 0) { - upb_status_seterrf(ctx->status, - "enums must contain at least one value (%s)", - e->full_name); - return false; + symtab_errf(ctx, "enums must contain at least one value (%s)", + e->full_name); } for (i = 0; i < n; i++) { @@ -1653,15 +1741,12 @@ static bool create_enumdef( upb_value v = upb_value_int32(num); if (i == 0 && e->file->syntax == UPB_SYNTAX_PROTO3 && num != 0) { - upb_status_seterrf(ctx->status, - "for proto3, the first enum value must be zero (%s)", - e->full_name); - return false; + symtab_errf(ctx, "for proto3, the first enum value must be zero (%s)", + e->full_name); } if (upb_strtable_lookup(&e->ntoi, name2, NULL)) { - upb_status_seterrf(ctx->status, "duplicate enum label '%s'", name2); - return false; + symtab_errf(ctx, "duplicate enum label '%s'", name2); } CHK_OOM(name2) @@ -1675,11 +1760,9 @@ static bool create_enumdef( } upb_inttable_compact2(&e->iton, ctx->alloc); - - return true; } -static bool create_msgdef(symtab_addctx *ctx, const char *prefix, +static void create_msgdef(symtab_addctx *ctx, const char *prefix, const google_protobuf_DescriptorProto *msg_proto) { upb_msgdef *m; const google_protobuf_MessageOptions *options; @@ -1691,11 +1774,11 @@ static bool create_msgdef(symtab_addctx *ctx, const char *prefix, upb_strview name; name = google_protobuf_DescriptorProto_name(msg_proto); - CHK(upb_isident(name, false, ctx->status)); + check_ident(ctx, name, false); m = (upb_msgdef*)&ctx->file->msgs[ctx->file->msg_count++]; m->full_name = makefullname(ctx, prefix, name); - CHK_OOM(symtab_add(ctx, m->full_name, pack_def(m, UPB_DEFTYPE_MSG))); + symtab_add(ctx, m->full_name, pack_def(m, UPB_DEFTYPE_MSG)); oneofs = google_protobuf_DescriptorProto_oneof_decl(msg_proto, &n_oneof); fields = google_protobuf_DescriptorProto_field(msg_proto, &n_field); @@ -1718,23 +1801,23 @@ static bool create_msgdef(symtab_addctx *ctx, const char *prefix, ctx->layouts++; } else { /* Allocate now (to allow cross-linking), populate later. */ - m->layout = upb_malloc(ctx->alloc, sizeof(*m->layout)); + m->layout = symtab_alloc(ctx, sizeof(*m->layout)); } m->oneof_count = 0; - m->oneofs = upb_malloc(ctx->alloc, sizeof(*m->oneofs) * n_oneof); + m->oneofs = symtab_alloc(ctx, sizeof(*m->oneofs) * n_oneof); for (i = 0; i < n_oneof; i++) { - CHK(create_oneofdef(ctx, m, oneofs[i])); + create_oneofdef(ctx, m, oneofs[i]); } m->field_count = 0; - m->fields = upb_malloc(ctx->alloc, sizeof(*m->fields) * n_field); + m->fields = symtab_alloc(ctx, sizeof(*m->fields) * n_field); for (i = 0; i < n_field; i++) { - CHK(create_fielddef(ctx, m->full_name, m, fields[i])); + create_fielddef(ctx, m->full_name, m, fields[i]); } - CHK(assign_msg_indices(m, ctx->status)); - CHK(finalize_oneofs(ctx, m)); + assign_msg_indices(ctx, m); + finalize_oneofs(ctx, m); assign_msg_wellknowntype(m); upb_inttable_compact2(&m->itof, ctx->alloc); @@ -1742,93 +1825,78 @@ static bool create_msgdef(symtab_addctx *ctx, const char *prefix, enums = google_protobuf_DescriptorProto_enum_type(msg_proto, &n); for (i = 0; i < n; i++) { - CHK(create_enumdef(ctx, m->full_name, enums[i])); + create_enumdef(ctx, m->full_name, enums[i]); } msgs = google_protobuf_DescriptorProto_nested_type(msg_proto, &n); for (i = 0; i < n; i++) { - CHK(create_msgdef(ctx, m->full_name, msgs[i])); + create_msgdef(ctx, m->full_name, msgs[i]); } - - return true; } -typedef struct { - int msg_count; - int enum_count; - int ext_count; -} decl_counts; - static void count_types_in_msg(const google_protobuf_DescriptorProto *msg_proto, - decl_counts *counts) { + upb_filedef *file) { const google_protobuf_DescriptorProto *const *msgs; size_t i, n; - counts->msg_count++; + file->msg_count++; msgs = google_protobuf_DescriptorProto_nested_type(msg_proto, &n); for (i = 0; i < n; i++) { - count_types_in_msg(msgs[i], counts); + count_types_in_msg(msgs[i], file); } google_protobuf_DescriptorProto_enum_type(msg_proto, &n); - counts->enum_count += n; + file->enum_count += n; google_protobuf_DescriptorProto_extension(msg_proto, &n); - counts->ext_count += n; + file->ext_count += n; } static void count_types_in_file( const google_protobuf_FileDescriptorProto *file_proto, - decl_counts *counts) { + upb_filedef *file) { const google_protobuf_DescriptorProto *const *msgs; size_t i, n; msgs = google_protobuf_FileDescriptorProto_message_type(file_proto, &n); for (i = 0; i < n; i++) { - count_types_in_msg(msgs[i], counts); + count_types_in_msg(msgs[i], file); } google_protobuf_FileDescriptorProto_enum_type(file_proto, &n); - counts->enum_count += n; + file->enum_count += n; google_protobuf_FileDescriptorProto_extension(file_proto, &n); - counts->ext_count += n; + file->ext_count += n; } -static bool resolve_fielddef(const symtab_addctx *ctx, const char *prefix, +static void resolve_fielddef(symtab_addctx *ctx, const char *prefix, upb_fielddef *f) { upb_strview name; const google_protobuf_FieldDescriptorProto *field_proto = f->sub.unresolved; if (f->is_extension_) { if (!google_protobuf_FieldDescriptorProto_has_extendee(field_proto)) { - upb_status_seterrf(ctx->status, - "extension for field '%s' had no extendee", - f->full_name); - return false; + symtab_errf(ctx, "extension for field '%s' had no extendee", + f->full_name); } name = google_protobuf_FieldDescriptorProto_extendee(field_proto); f->msgdef = symtab_resolve(ctx, f, prefix, name, UPB_DEFTYPE_MSG); - CHK(f->msgdef); } if ((upb_fielddef_issubmsg(f) || f->type_ == UPB_DESCRIPTOR_TYPE_ENUM) && !google_protobuf_FieldDescriptorProto_has_type_name(field_proto)) { - upb_status_seterrf(ctx->status, "field '%s' is missing type name", - f->full_name); - return false; + symtab_errf(ctx, "field '%s' is missing type name", f->full_name); } name = google_protobuf_FieldDescriptorProto_type_name(field_proto); if (upb_fielddef_issubmsg(f)) { f->sub.msgdef = symtab_resolve(ctx, f, prefix, name, UPB_DEFTYPE_MSG); - CHK(f->sub.msgdef); } else if (f->type_ == UPB_DESCRIPTOR_TYPE_ENUM) { f->sub.enumdef = symtab_resolve(ctx, f, prefix, name, UPB_DEFTYPE_ENUM); - CHK(f->sub.enumdef); } /* Have to delay resolving of the default value until now because of the enum @@ -1838,54 +1906,36 @@ static bool resolve_fielddef(const symtab_addctx *ctx, const char *prefix, google_protobuf_FieldDescriptorProto_default_value(field_proto); if (f->file->syntax == UPB_SYNTAX_PROTO3) { - upb_status_seterrf(ctx->status, - "proto3 fields cannot have explicit defaults (%s)", - f->full_name); - return false; + symtab_errf(ctx, "proto3 fields cannot have explicit defaults (%s)", + f->full_name); } if (upb_fielddef_issubmsg(f)) { - upb_status_seterrf(ctx->status, - "message fields cannot have explicit defaults (%s)", - f->full_name); - return false; + symtab_errf(ctx, "message fields cannot have explicit defaults (%s)", + f->full_name); } - if (!parse_default(ctx, defaultval.data, defaultval.size, f)) { - upb_status_seterrf(ctx->status, - "couldn't parse default '" UPB_STRVIEW_FORMAT - "' for field (%s)", - UPB_STRVIEW_ARGS(defaultval), f->full_name); - return false; - } + parse_default(ctx, defaultval.data, defaultval.size, f); } else { set_default_default(ctx, f); } - - return true; } -static bool build_filedef( +static void build_filedef( symtab_addctx *ctx, upb_filedef *file, const google_protobuf_FileDescriptorProto *file_proto) { - upb_alloc *alloc = ctx->alloc; const google_protobuf_FileOptions *file_options_proto; const google_protobuf_DescriptorProto *const *msgs; const google_protobuf_EnumDescriptorProto *const *enums; const google_protobuf_FieldDescriptorProto *const *exts; const upb_strview* strs; size_t i, n; - decl_counts counts = {0, 0, 0}; - count_types_in_file(file_proto, &counts); + count_types_in_file(file_proto, file); - file->msgs = upb_malloc(alloc, sizeof(*file->msgs) * counts.msg_count); - file->enums = upb_malloc(alloc, sizeof(*file->enums) * counts.enum_count); - file->exts = upb_malloc(alloc, sizeof(*file->exts) * counts.ext_count); - - CHK_OOM(counts.msg_count == 0 || file->msgs); - CHK_OOM(counts.enum_count == 0 || file->enums); - CHK_OOM(counts.ext_count == 0 || file->exts); + file->msgs = symtab_alloc(ctx, sizeof(*file->msgs) * file->msg_count); + file->enums = symtab_alloc(ctx, sizeof(*file->enums) * file->enum_count); + file->exts = symtab_alloc(ctx, sizeof(*file->exts) * file->ext_count); /* We increment these as defs are added. */ file->msg_count = 0; @@ -1893,8 +1943,7 @@ static bool build_filedef( file->ext_count = 0; if (!google_protobuf_FileDescriptorProto_has_name(file_proto)) { - upb_status_seterrmsg(ctx->status, "File has no name"); - return false; + symtab_errf(ctx, "File has no name"); } file->name = @@ -1905,7 +1954,7 @@ static bool build_filedef( if (google_protobuf_FileDescriptorProto_has_package(file_proto)) { upb_strview package = google_protobuf_FileDescriptorProto_package(file_proto); - CHK(upb_isident(package, true, ctx->status)); + check_ident(ctx, package, true); file->package = strviewdup(ctx, package); } else { file->package = NULL; @@ -1920,9 +1969,8 @@ static bool build_filedef( } else if (streql_view(syntax, "proto3")) { file->syntax = UPB_SYNTAX_PROTO3; } else { - upb_status_seterrf(ctx->status, "Invalid syntax '" UPB_STRVIEW_FORMAT "'", - UPB_STRVIEW_ARGS(syntax)); - return false; + symtab_errf(ctx, "Invalid syntax '" UPB_STRVIEW_FORMAT "'", + UPB_STRVIEW_ARGS(syntax)); } } else { file->syntax = UPB_SYNTAX_PROTO2; @@ -1944,19 +1992,17 @@ static bool build_filedef( /* Verify dependencies. */ strs = google_protobuf_FileDescriptorProto_dependency(file_proto, &n); - file->deps = upb_malloc(alloc, sizeof(*file->deps) * n) ; - CHK_OOM(n == 0 || file->deps); + file->deps = symtab_alloc(ctx, sizeof(*file->deps) * n); for (i = 0; i < n; i++) { upb_strview dep_name = strs[i]; upb_value v; if (!upb_strtable_lookup2(&ctx->symtab->files, dep_name.data, dep_name.size, &v)) { - upb_status_seterrf(ctx->status, - "Depends on file '" UPB_STRVIEW_FORMAT - "', but it has not been loaded", - UPB_STRVIEW_ARGS(dep_name)); - return false; + symtab_errf(ctx, + "Depends on file '" UPB_STRVIEW_FORMAT + "', but it has not been loaded", + UPB_STRVIEW_ARGS(dep_name)); } file->deps[i] = upb_value_getconstptr(v); } @@ -1964,194 +2010,92 @@ static bool build_filedef( /* Create messages. */ msgs = google_protobuf_FileDescriptorProto_message_type(file_proto, &n); for (i = 0; i < n; i++) { - CHK(create_msgdef(ctx, file->package, msgs[i])); + create_msgdef(ctx, file->package, msgs[i]); } /* Create enums. */ enums = google_protobuf_FileDescriptorProto_enum_type(file_proto, &n); for (i = 0; i < n; i++) { - CHK(create_enumdef(ctx, file->package, enums[i])); + create_enumdef(ctx, file->package, enums[i]); } /* Create extensions. */ exts = google_protobuf_FileDescriptorProto_extension(file_proto, &n); - file->exts = upb_malloc(alloc, sizeof(*file->exts) * n); - CHK_OOM(n == 0 || file->exts); + file->exts = symtab_alloc(ctx, sizeof(*file->exts) * n); for (i = 0; i < n; i++) { - CHK(create_fielddef(ctx, file->package, NULL, exts[i])); + create_fielddef(ctx, file->package, NULL, exts[i]); } /* Now that all names are in the table, build layouts and resolve refs. */ for (i = 0; i < (size_t)file->ext_count; i++) { - CHK(resolve_fielddef(ctx, file->package, (upb_fielddef*)&file->exts[i])); + resolve_fielddef(ctx, file->package, (upb_fielddef*)&file->exts[i]); } for (i = 0; i < (size_t)file->msg_count; i++) { const upb_msgdef *m = &file->msgs[i]; int j; for (j = 0; j < m->field_count; j++) { - CHK(resolve_fielddef(ctx, m->full_name, (upb_fielddef*)&m->fields[j])); + resolve_fielddef(ctx, m->full_name, (upb_fielddef*)&m->fields[j]); } } if (!ctx->layouts) { for (i = 0; i < (size_t)file->msg_count; i++) { const upb_msgdef *m = &file->msgs[i]; - make_layout(ctx->symtab, m); + make_layout(ctx, m); } } +} - return true; - } - -static bool upb_symtab_addtotabs(upb_symtab *s, symtab_addctx *ctx) { - const upb_filedef *file = ctx->file; +static void remove_filedef(upb_symtab *s, upb_filedef *file) { upb_alloc *alloc = upb_arena_alloc(s->arena); - upb_strtable_iter iter; - - CHK_OOM(upb_strtable_insert3(&s->files, file->name, strlen(file->name), - upb_value_constptr(file), alloc)); - - upb_strtable_begin(&iter, ctx->addtab); - for (; !upb_strtable_done(&iter); upb_strtable_next(&iter)) { - upb_strview key = upb_strtable_iter_key(&iter); - upb_value value = upb_strtable_iter_value(&iter); - CHK_OOM(upb_strtable_insert3(&s->syms, key.data, key.size, value, alloc)); + int i; + for (i = 0; i < file->msg_count; i++) { + const char *name = file->msgs[i].full_name; + upb_strtable_remove3(&s->syms, name, strlen(name), NULL, alloc); } - - return true; -} - -/* upb_filedef ****************************************************************/ - -const char *upb_filedef_name(const upb_filedef *f) { - return f->name; -} - -const char *upb_filedef_package(const upb_filedef *f) { - return f->package; -} - -const char *upb_filedef_phpprefix(const upb_filedef *f) { - return f->phpprefix; -} - -const char *upb_filedef_phpnamespace(const upb_filedef *f) { - return f->phpnamespace; -} - -upb_syntax_t upb_filedef_syntax(const upb_filedef *f) { - return f->syntax; -} - -int upb_filedef_msgcount(const upb_filedef *f) { - return f->msg_count; -} - -int upb_filedef_depcount(const upb_filedef *f) { - return f->dep_count; -} - -int upb_filedef_enumcount(const upb_filedef *f) { - return f->enum_count; -} - -const upb_filedef *upb_filedef_dep(const upb_filedef *f, int i) { - return i < 0 || i >= f->dep_count ? NULL : f->deps[i]; -} - -const upb_msgdef *upb_filedef_msg(const upb_filedef *f, int i) { - return i < 0 || i >= f->msg_count ? NULL : &f->msgs[i]; -} - -const upb_enumdef *upb_filedef_enum(const upb_filedef *f, int i) { - return i < 0 || i >= f->enum_count ? NULL : &f->enums[i]; -} - -void upb_symtab_free(upb_symtab *s) { - upb_arena_free(s->arena); - upb_gfree(s); -} - -upb_symtab *upb_symtab_new(void) { - upb_symtab *s = upb_gmalloc(sizeof(*s)); - upb_alloc *alloc; - - if (!s) { - return NULL; + for (i = 0; i < file->enum_count; i++) { + const char *name = file->enums[i].full_name; + upb_strtable_remove3(&s->syms, name, strlen(name), NULL, alloc); } - - s->arena = upb_arena_new(); - s->bytes_loaded = 0; - alloc = upb_arena_alloc(s->arena); - - if (!upb_strtable_init2(&s->syms, UPB_CTYPE_CONSTPTR, 32, alloc) || - !upb_strtable_init2(&s->files, UPB_CTYPE_CONSTPTR, 4, alloc)) { - upb_arena_free(s->arena); - upb_gfree(s); - s = NULL; + for (i = 0; i < file->ext_count; i++) { + const char *name = file->exts[i].full_name; + upb_strtable_remove3(&s->syms, name, strlen(name), NULL, alloc); } - return s; -} - -const upb_msgdef *upb_symtab_lookupmsg(const upb_symtab *s, const char *sym) { - upb_value v; - return upb_strtable_lookup(&s->syms, sym, &v) ? - unpack_def(v, UPB_DEFTYPE_MSG) : NULL; -} - -const upb_msgdef *upb_symtab_lookupmsg2(const upb_symtab *s, const char *sym, - size_t len) { - upb_value v; - return upb_strtable_lookup2(&s->syms, sym, len, &v) ? - unpack_def(v, UPB_DEFTYPE_MSG) : NULL; -} - -const upb_enumdef *upb_symtab_lookupenum(const upb_symtab *s, const char *sym) { - upb_value v; - return upb_strtable_lookup(&s->syms, sym, &v) ? - unpack_def(v, UPB_DEFTYPE_ENUM) : NULL; -} - -const upb_filedef *upb_symtab_lookupfile(const upb_symtab *s, const char *name) { - upb_value v; - return upb_strtable_lookup(&s->files, name, &v) ? upb_value_getconstptr(v) - : NULL; -} - -const upb_filedef *upb_symtab_lookupfile2( - const upb_symtab *s, const char *name, size_t len) { - upb_value v; - return upb_strtable_lookup2(&s->files, name, len, &v) ? - upb_value_getconstptr(v) : NULL; -} - -int upb_symtab_filecount(const upb_symtab *s) { - return (int)upb_strtable_count(&s->files); } static const upb_filedef *_upb_symtab_addfile( upb_symtab *s, const google_protobuf_FileDescriptorProto *file_proto, const upb_msglayout **layouts, upb_status *status) { - upb_arena *tmparena = upb_arena_new(); - upb_strtable addtab; - upb_alloc *alloc = upb_arena_alloc(s->arena); - upb_filedef *file = upb_malloc(alloc, sizeof(*file)); - bool ok; + upb_arena *file_arena = upb_arena_new(); + upb_filedef *file = upb_arena_malloc(file_arena, sizeof(*file)); + bool ok = true; symtab_addctx ctx; ctx.file = file; ctx.symtab = s; - ctx.alloc = alloc; - ctx.tmp = upb_arena_alloc(tmparena); - ctx.addtab = &addtab; + ctx.file_arena = file_arena; + ctx.alloc = upb_arena_alloc(file_arena); ctx.layouts = layouts; ctx.status = status; - ok = file && upb_strtable_init2(&addtab, UPB_CTYPE_CONSTPTR, 8, ctx.tmp) && - build_filedef(&ctx, file, file_proto) && upb_symtab_addtotabs(s, &ctx); + file->msg_count = 0; + file->enum_count = 0; + file->ext_count = 0; + + if (UPB_UNLIKELY(setjmp(ctx.err))) { + UPB_ASSERT(!upb_ok(status)); + ok = false; + remove_filedef(s, file); + } else { + build_filedef(&ctx, file, file_proto); + upb_strtable_insert3(&s->files, file->name, strlen(file->name), + upb_value_constptr(file), ctx.alloc); + UPB_ASSERT(upb_ok(status)); + upb_arena_fuse(s->arena, file_arena); + } - upb_arena_free(tmparena); + upb_arena_free(file_arena); return ok ? file : NULL; } @@ -2213,5 +2157,4 @@ size_t _upb_symtab_bytesloaded(const upb_symtab *s) { return s->bytes_loaded; } -#undef CHK #undef CHK_OOM From dd0994d377f2f10a2dd5d6e562a628d5ea86f719 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sun, 1 Nov 2020 15:42:10 -0800 Subject: [PATCH 12/21] Bugfix for JSON decoding: only check real oneofs for duplicates. Also fixed upb_msg_whichoneof() to work properly for synthetic fields, and to be simpler in general. --- upb/json_decode.c | 2 +- upb/reflection.c | 25 +++++++++++-------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/upb/json_decode.c b/upb/json_decode.c index 769c9ea062..fb53c3de35 100644 --- a/upb/json_decode.c +++ b/upb/json_decode.c @@ -910,7 +910,7 @@ static void jsondec_field(jsondec *d, upb_msg *msg, const upb_msgdef *m) { return; } - if (upb_fielddef_containingoneof(f) && + if (upb_fielddef_realcontainingoneof(f) && upb_msg_whichoneof(msg, upb_fielddef_containingoneof(f))) { jsondec_err(d, "More than one field for this oneof."); } diff --git a/upb/reflection.c b/upb/reflection.c index 738bb186ec..fd8623253a 100644 --- a/upb/reflection.c +++ b/upb/reflection.c @@ -96,20 +96,17 @@ bool upb_msg_has(const upb_msg *msg, const upb_fielddef *f) { const upb_fielddef *upb_msg_whichoneof(const upb_msg *msg, const upb_oneofdef *o) { - upb_oneof_iter i; - const upb_fielddef *f; - const upb_msglayout_field *field; - const upb_msgdef *m = upb_oneofdef_containingtype(o); - uint32_t oneof_case; - - /* This is far from optimal. */ - upb_oneof_begin(&i, o); - if (upb_oneof_done(&i)) return false; - f = upb_oneof_iter_field(&i); - field = upb_fielddef_layout(f); - oneof_case = _upb_getoneofcase_field(msg, field); - - return oneof_case ? upb_msgdef_itof(m, oneof_case) : NULL; + const upb_fielddef *f = upb_oneofdef_field(o, 0); + if (upb_oneofdef_issynthetic(o)) { + UPB_ASSERT(upb_oneofdef_fieldcount(o) == 1); + return upb_msg_has(msg, f) ? f : NULL; + } else { + const upb_msglayout_field *field = upb_fielddef_layout(f); + uint32_t oneof_case = _upb_getoneofcase_field(msg, field); + f = oneof_case ? upb_oneofdef_itof(o, oneof_case) : NULL; + UPB_ASSERT((f != NULL) == (oneof_case != 0)); + return f; + } } upb_msgval upb_msg_get(const upb_msg *msg, const upb_fielddef *f) { From 64abb5eb111de9ce2c53dbd41601124db60ed094 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 2 Nov 2020 09:00:53 -0800 Subject: [PATCH 13/21] Amalgamation no longer bundles wyhash, but #includes it. Also fixed a few spelling mistakes. --- BUILD | 16 +++++++++++++--- benchmarks/benchmark.cc | 2 +- tests/bindings/lua/test_upb.lua | 2 +- tools/amalgamate.py | 2 +- upb/json_encode.h | 2 +- upb/table.int.h | 2 +- 6 files changed, 18 insertions(+), 8 deletions(-) diff --git a/BUILD b/BUILD index 6c4cbfa5a6..2c86b3cb43 100644 --- a/BUILD +++ b/BUILD @@ -289,7 +289,10 @@ upb_amalgamation( cc_library( name = "amalgamation", - srcs = ["upb.c"], + srcs = [ + "upb.c", + "third_party/wyhash/wyhash.h", + ], hdrs = ["upb.h"], copts = UPB_DEFAULT_COPTS, ) @@ -314,9 +317,13 @@ upb_amalgamation( cc_library( name = "php_amalgamation", - srcs = ["php-upb.c"], + srcs = [ + "php-upb.c", + "third_party/wyhash/wyhash.h", + ], hdrs = ["php-upb.h"], copts = UPB_DEFAULT_COPTS, + ) upb_amalgamation( @@ -338,7 +345,10 @@ upb_amalgamation( cc_library( name = "ruby_amalgamation", - srcs = ["ruby-upb.c"], + srcs = [ + "ruby-upb.c", + "third_party/wyhash/wyhash.h", + ], hdrs = ["ruby-upb.h"], copts = UPB_DEFAULT_COPTS, ) diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index ebeaed7654..9f41588453 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -56,7 +56,7 @@ static void BM_LoadDescriptor_Upb(benchmark::State& state) { google_protobuf_DescriptorProto_getmsgdef(symtab.ptr()); bytes_per_iter = _upb_symtab_bytesloaded(symtab.ptr()); } - state.SetBytesProcessed(state.iterations() * descriptor.size); + state.SetBytesProcessed(state.iterations() * bytes_per_iter); } BENCHMARK(BM_LoadDescriptor_Upb); diff --git a/tests/bindings/lua/test_upb.lua b/tests/bindings/lua/test_upb.lua index 3a490c6f6d..9d8efa2856 100644 --- a/tests/bindings/lua/test_upb.lua +++ b/tests/bindings/lua/test_upb.lua @@ -104,7 +104,7 @@ function test_utf8() upb.decode(test_messages_proto3.TestAllTypesProto3, serialized) end) - -- TOOD(haberman): should proto3 accessors also check UTF-8 at set time? + -- TODO(haberman): should proto3 accessors also check UTF-8 at set time? end function test_string_double_map() diff --git a/tools/amalgamate.py b/tools/amalgamate.py index 950e9aa9ac..b244eff8e8 100755 --- a/tools/amalgamate.py +++ b/tools/amalgamate.py @@ -52,7 +52,7 @@ class Amalgamator: include = parse_include(line) if not include: return False - if not (include.startswith("upb") or include.startswith("google") or include.startswith("third_party")): + if not (include.startswith("upb") or include.startswith("google")): return False if include.endswith("hpp"): # Skip, we don't support the amalgamation from C++. diff --git a/upb/json_encode.h b/upb/json_encode.h index 8396fbf80f..0ad4163cdf 100644 --- a/upb/json_encode.h +++ b/upb/json_encode.h @@ -9,7 +9,7 @@ extern "C" { #endif enum { - /* When set, emits 0/default values. TOOD(haberman): proto3 only? */ + /* When set, emits 0/default values. TODO(haberman): proto3 only? */ UPB_JSONENC_EMITDEFAULTS = 1, /* When set, use normal (snake_caes) field names instead of JSON (camelCase) diff --git a/upb/table.int.h b/upb/table.int.h index 1e6c232cf8..ab67cc34d6 100644 --- a/upb/table.int.h +++ b/upb/table.int.h @@ -13,7 +13,7 @@ ** store pointers or integers of at least 32 bits (upb isn't really useful on ** systems where sizeof(void*) < 4). ** -** The table must be homogenous (all values of the same type). In debug +** The table must be homogeneous (all values of the same type). In debug ** mode, we check this on insert and lookup. */ From 0497f8deed605bd38735e061a738c181dbec3982 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 2 Nov 2020 19:00:23 -0800 Subject: [PATCH 14/21] Fixed a critical bug on 32-bit builds, and added much more Kokoro testing. There was a bug in our arena code where we assumed that sizeof(upb_array) would be a multiple of 8. On i386 it was not, and this was causing memory corruption on 32-bit builds. --- .bazelrc | 11 +++++++++++ kokoro/ubuntu/build.sh | 26 ++++++++++++++------------ tests/test_generated_code.c | 4 ++-- upb/decode.c | 3 ++- upb/msg.c | 2 +- upb/msg.h | 9 +++++++-- upb/upb.h | 2 +- 7 files changed, 38 insertions(+), 19 deletions(-) create mode 100644 .bazelrc diff --git a/.bazelrc b/.bazelrc new file mode 100644 index 0000000000..a4867e6c3a --- /dev/null +++ b/.bazelrc @@ -0,0 +1,11 @@ +# Use our custom-configured c++ toolchain. + +build:m32 --copt=-m32 --linkopt=-m32 +build:asan --copt=-fsanitize=address --linkopt=-fsanitize=address +build:valgrind --run_under='valgrind --leak-check=full --error-exitcode=1' + +build:ubsan --copt=-fsanitize=undefined --linkopt=-fsanitize=undefined --action_env=UBSAN_OPTIONS=halt_on_error=1:print_stacktrace=1 +# Workaround for the fact that Bazel links with $CC, not $CXX +# https://github.com/bazelbuild/bazel/issues/11122#issuecomment-613746748 +build:ubsan --copt=-fno-sanitize=function --copt=-fno-sanitize=vptr + diff --git a/kokoro/ubuntu/build.sh b/kokoro/ubuntu/build.sh index 1b1089097e..fb92fceb1e 100644 --- a/kokoro/ubuntu/build.sh +++ b/kokoro/ubuntu/build.sh @@ -11,29 +11,31 @@ fi echo PATH=$PATH ls -l `which cmake` cmake --version -echo CC=${CC:-cc} -${CC:-cc} --version # Log the bazel path and version. which bazel bazel version cd $(dirname $0)/../.. -bazel test --test_output=errors ... -if [[ $(uname) = "Linux" ]]; then - # Verify the ASAN build. Have to exclude test_conformance_upb as protobuf - # currently leaks memory in the conformance test runner. - bazel test --copt=-fsanitize=address --linkopt=-fsanitize=address --test_output=errors ... +if which gcc; then + gcc --version + CC=gcc bazel test --test_output=errors ... +fi - # Verify the UBSan build. Have to exclude Lua as the version we are using - # fails some UBSan tests. +if which clang; then + CC=clang bazel test --test_output=errors ... + CC=clang bazel test --test_output=errors --config=m32 ... + CC=clang bazel test --test_output=errors -c opt ... - # For some reason kokoro doesn't have Clang available right now. - #CC=clang CXX=clang++ bazel test -c dbg --copt=-fsanitize=undefined --copt=-fno-sanitize=function,vptr --linkopt=-fsanitize=undefined --action_env=UBSAN_OPTIONS=halt_on_error=1:print_stacktrace=1 -- :all -:test_lua + if [[ $(uname) = "Linux" ]]; then + CC=clang bazel test --test_output=errors --config=asan ... + # TODO: update to a newer Lua that hopefully does not trigger UBSAN. + CC=clang bazel test --test_output=errors --config=ubsan ... -- -tests/bindings/lua:test_lua + fi fi if which valgrind; then - bazel test --run_under='valgrind --leak-check=full --error-exitcode=1' ... -- -tests:test_conformance_upb -cmake:cmake_build + bazel test --config=valgrind ... -- -tests:test_conformance_upb -cmake:cmake_build fi diff --git a/tests/test_generated_code.c b/tests/test_generated_code.c index d2c3748e55..1cfeb2733b 100644 --- a/tests/test_generated_code.c +++ b/tests/test_generated_code.c @@ -56,9 +56,9 @@ static void test_scalars(void) { ASSERT(protobuf_test_messages_proto3_TestAllTypesProto3_optional_uint64( msg2) == 40); ASSERT(protobuf_test_messages_proto3_TestAllTypesProto3_optional_float( - msg2) == 50.5); + msg2) - 50.5 < 0.01); ASSERT(protobuf_test_messages_proto3_TestAllTypesProto3_optional_double( - msg2) == 60.6); + msg2) - 60.6 < 0.01); ASSERT(protobuf_test_messages_proto3_TestAllTypesProto3_optional_bool( msg2) == 1); ASSERT(upb_strview_eql( diff --git a/upb/decode.c b/upb/decode.c index 347f964a91..01997f2d30 100644 --- a/upb/decode.c +++ b/upb/decode.c @@ -615,7 +615,8 @@ static const char *decode_msg(upb_decstate *d, const char *ptr, upb_msg *msg, int ndx = field->descriptortype; if (_upb_isrepeated(field)) ndx += 18; ptr = decode_varint32(d, ptr, &val.size); - if (val.size >= INT32_MAX || ptr - d->end + val.size > d->limit) { + if (val.size >= INT32_MAX || + ptr - d->end + (int32_t)val.size > d->limit) { decode_err(d); /* Length overflow. */ } op = delim_ops[ndx]; diff --git a/upb/msg.c b/upb/msg.c index baafe59293..e074f43312 100644 --- a/upb/msg.c +++ b/upb/msg.c @@ -11,7 +11,7 @@ static const size_t overhead = sizeof(upb_msg_internal); static const upb_msg_internal *upb_msg_getinternal_const(const upb_msg *msg) { ptrdiff_t size = sizeof(upb_msg_internal); - return UPB_PTR_AT(msg, -size, upb_msg_internal); + return (upb_msg_internal*)((char*)msg - size); } upb_msg *_upb_msg_new(const upb_msglayout *l, upb_arena *a) { diff --git a/upb/msg.h b/upb/msg.h index 290c762079..fcce4c2f5d 100644 --- a/upb/msg.h +++ b/upb/msg.h @@ -94,7 +94,8 @@ UPB_INLINE upb_msg *_upb_msg_new_inl(const upb_msglayout *l, upb_arena *a) { upb_msg *_upb_msg_new(const upb_msglayout *l, upb_arena *a); UPB_INLINE upb_msg_internal *upb_msg_getinternal(upb_msg *msg) { - return UPB_PTR_AT(msg, -sizeof(upb_msg_internal), upb_msg_internal); + ptrdiff_t size = sizeof(upb_msg_internal); + return (upb_msg_internal*)((char*)msg - size); } /* Clears the given message. */ @@ -189,9 +190,11 @@ typedef struct { uintptr_t data; /* Tagged ptr: low 3 bits of ptr are lg2(elem size). */ size_t len; /* Measured in elements. */ size_t size; /* Measured in elements. */ + uint64_t junk; } upb_array; UPB_INLINE const void *_upb_array_constptr(const upb_array *arr) { + UPB_ASSERT((arr->data & 7) <= 4); return (void*)(arr->data & ~(uintptr_t)7); } @@ -201,15 +204,17 @@ UPB_INLINE void *_upb_array_ptr(upb_array *arr) { UPB_INLINE uintptr_t _upb_tag_arrptr(void* ptr, int elem_size_lg2) { UPB_ASSERT(elem_size_lg2 <= 4); + UPB_ASSERT(((uintptr_t)ptr & 7) == 0); return (uintptr_t)ptr | (unsigned)elem_size_lg2; } UPB_INLINE upb_array *_upb_array_new(upb_arena *a, size_t init_size, int elem_size_lg2) { + const size_t arr_size = UPB_ALIGN_UP(sizeof(upb_array), 8); const size_t bytes = sizeof(upb_array) + (init_size << elem_size_lg2); upb_array *arr = (upb_array*)upb_arena_malloc(a, bytes); if (!arr) return NULL; - arr->data = _upb_tag_arrptr(arr + 1, elem_size_lg2); + arr->data = _upb_tag_arrptr(UPB_PTR_AT(arr, arr_size, void), elem_size_lg2); arr->len = 0; arr->size = init_size; return arr; diff --git a/upb/upb.h b/upb/upb.h index f52e812478..055f9c104a 100644 --- a/upb/upb.h +++ b/upb/upb.h @@ -314,7 +314,7 @@ UPB_INLINE uint64_t _upb_be_swap64(uint64_t val) { } UPB_INLINE int _upb_lg2ceil(int x) { - if (x == 0) return 0; + if (x <= 1) return 0; #ifdef __GNUC__ return 32 - __builtin_clz(x - 1); #else From 5b1f0d86a126ea43986267fee7c4a7f0cf26a0dd Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 2 Nov 2020 21:22:51 -0800 Subject: [PATCH 15/21] For Kokoro, only build/test -m32 on Linux. Also fixed a bunch of bugs found by gcc's -fanalyzer. --- .bazelrc | 9 +++++++++ bazel/build_defs.bzl | 5 +++++ benchmarks/benchmark.cc | 6 +++--- kokoro/ubuntu/build.sh | 7 ++++++- upb/def.c | 13 +++++++++---- upb/json_decode.c | 2 ++ upb/msg.c | 13 ++++++++----- upb/msg.h | 15 ++++++++------- 8 files changed, 50 insertions(+), 20 deletions(-) diff --git a/.bazelrc b/.bazelrc index a4867e6c3a..df04df138f 100644 --- a/.bazelrc +++ b/.bazelrc @@ -9,3 +9,12 @@ build:ubsan --copt=-fsanitize=undefined --linkopt=-fsanitize=undefined --action_ # https://github.com/bazelbuild/bazel/issues/11122#issuecomment-613746748 build:ubsan --copt=-fno-sanitize=function --copt=-fno-sanitize=vptr +build:Werror --copt=-Werror +build:Werror --per_file_copt=json/parser@-Wno-error +build:Werror --per_file_copt=com_google_protobuf@-Wno-error + +# GCC's -fanalyzer, a deeper static analysis than normal warnings. +build:analyzer --copt=-fanalyzer --copt=-Werror +build:analyzer --per_file_copt=json/parser@-fno-analyzer +build:analyzer --per_file_copt=com_google_protobuf@-fno-analyzer +build:analyzer --per_file_copt=com_github_google_benchmark@-fno-analyzer diff --git a/bazel/build_defs.bzl b/bazel/build_defs.bzl index a199243947..4413d946ce 100644 --- a/bazel/build_defs.bzl +++ b/bazel/build_defs.bzl @@ -21,7 +21,12 @@ UPB_DEFAULT_COPTS = select({ "-std=c99", "-pedantic", "-Werror=pedantic", + "-Wall", "-Wstrict-prototypes", + # GCC (at least) emits spurious warnings for this that cannot be fixed + # without introducing redundant initialization (with runtime cost): + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80635 + #"-Wno-maybe-uninitialized", # copybara:strip_end ], }) diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index 9f41588453..7bb7f12364 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -50,7 +50,7 @@ static void BM_ArenaInitialBlockOneAlloc(benchmark::State& state) { BENCHMARK(BM_ArenaInitialBlockOneAlloc); static void BM_LoadDescriptor_Upb(benchmark::State& state) { - size_t bytes_per_iter; + size_t bytes_per_iter = 0; for (auto _ : state) { upb::SymbolTable symtab; google_protobuf_DescriptorProto_getmsgdef(symtab.ptr()); @@ -61,7 +61,7 @@ static void BM_LoadDescriptor_Upb(benchmark::State& state) { BENCHMARK(BM_LoadDescriptor_Upb); static void BM_LoadAdsDescriptor_Upb(benchmark::State& state) { - size_t bytes_per_iter; + size_t bytes_per_iter = 0; for (auto _ : state) { upb::SymbolTable symtab; google_ads_googleads_v5_services_SearchGoogleAdsRequest_getmsgdef( @@ -97,7 +97,7 @@ static void BM_LoadAdsDescriptor_Proto2(benchmark::State& state) { CollectFileDescriptors( &google_ads_googleads_v5_services_google_ads_service_proto_upbdefinit, serialized_files, seen_files); - size_t bytes_per_iter; + size_t bytes_per_iter = 0; for (auto _ : state) { bytes_per_iter = 0; protobuf::Arena arena; diff --git a/kokoro/ubuntu/build.sh b/kokoro/ubuntu/build.sh index fb92fceb1e..9ea1e67f0d 100644 --- a/kokoro/ubuntu/build.sh +++ b/kokoro/ubuntu/build.sh @@ -21,14 +21,19 @@ cd $(dirname $0)/../.. if which gcc; then gcc --version CC=gcc bazel test --test_output=errors ... + CC=gcc bazel test -c opt --test_output=errors ... + # TODO: work through these errors and enable this. + # if gcc -fanalyzer -x c /dev/null -c -o /dev/null; then + # CC=gcc bazel test --copt=-fanalyzer --test_output=errors ... + # fi fi if which clang; then CC=clang bazel test --test_output=errors ... - CC=clang bazel test --test_output=errors --config=m32 ... CC=clang bazel test --test_output=errors -c opt ... if [[ $(uname) = "Linux" ]]; then + CC=clang bazel test --test_output=errors --config=m32 ... CC=clang bazel test --test_output=errors --config=asan ... # TODO: update to a newer Lua that hopefully does not trigger UBSAN. diff --git a/upb/def.c b/upb/def.c index 7379a768a0..0f17f1b9db 100644 --- a/upb/def.c +++ b/upb/def.c @@ -2068,10 +2068,14 @@ static const upb_filedef *_upb_symtab_addfile( upb_symtab *s, const google_protobuf_FileDescriptorProto *file_proto, const upb_msglayout **layouts, upb_status *status) { upb_arena *file_arena = upb_arena_new(); - upb_filedef *file = upb_arena_malloc(file_arena, sizeof(*file)); - bool ok = true; + upb_filedef *file; symtab_addctx ctx; + if (!file_arena) return NULL; + + file = upb_arena_malloc(file_arena, sizeof(*file)); + if (!file) goto done; + ctx.file = file; ctx.symtab = s; ctx.file_arena = file_arena; @@ -2085,8 +2089,8 @@ static const upb_filedef *_upb_symtab_addfile( if (UPB_UNLIKELY(setjmp(ctx.err))) { UPB_ASSERT(!upb_ok(status)); - ok = false; remove_filedef(s, file); + file = NULL; } else { build_filedef(&ctx, file, file_proto); upb_strtable_insert3(&s->files, file->name, strlen(file->name), @@ -2095,8 +2099,9 @@ static const upb_filedef *_upb_symtab_addfile( upb_arena_fuse(s->arena, file_arena); } +done: upb_arena_free(file_arena); - return ok ? file : NULL; + return file; } const upb_filedef *upb_symtab_addfile( diff --git a/upb/json_decode.c b/upb/json_decode.c index fb53c3de35..44ee56a857 100644 --- a/upb/json_decode.c +++ b/upb/json_decode.c @@ -396,6 +396,8 @@ static void jsondec_resize(jsondec *d, char **buf, char **end, char **buf_end) { size_t size = UPB_MAX(8, 2 * oldsize); *buf = upb_arena_realloc(d->arena, *buf, len, size); + if (!*buf) jsondec_err(d, "Out of memory"); + *end = *buf + len; *buf_end = *buf + size; } diff --git a/upb/msg.c b/upb/msg.c index e074f43312..1dbb8b0fab 100644 --- a/upb/msg.c +++ b/upb/msg.c @@ -111,13 +111,16 @@ void *_upb_array_resize_fallback(upb_array **arr_ptr, size_t size, bool _upb_array_append_fallback(upb_array **arr_ptr, const void *value, int elem_size_lg2, upb_arena *arena) { upb_array *arr = getorcreate_array(arr_ptr, elem_size_lg2, arena); - size_t elem = arr->len; - char *data; + if (!arr) return false; - if (!arr || !_upb_array_resize(arr, elem + 1, arena)) return false; + size_t elems = arr->len; - data = _upb_array_ptr(arr); - memcpy(data + (elem << elem_size_lg2), value, 1 << elem_size_lg2); + if (!_upb_array_resize(arr, elems + 1, arena)) { + return false; + } + + char *data = _upb_array_ptr(arr); + memcpy(data + (elems << elem_size_lg2), value, 1 << elem_size_lg2); return true; } diff --git a/upb/msg.h b/upb/msg.h index fcce4c2f5d..06381820e3 100644 --- a/upb/msg.h +++ b/upb/msg.h @@ -387,17 +387,17 @@ UPB_INLINE void _upb_map_fromkey(upb_strview key, void* out, size_t size) { } } -UPB_INLINE upb_value _upb_map_tovalue(const void *val, size_t size, - upb_arena *a) { - upb_value ret = {0}; +UPB_INLINE bool _upb_map_tovalue(const void *val, size_t size, upb_value *msgval, + upb_arena *a) { if (size == UPB_MAPTYPE_STRING) { upb_strview *strp = (upb_strview*)upb_arena_malloc(a, sizeof(*strp)); + if (!strp) return false; *strp = *(upb_strview*)val; - ret = upb_value_ptr(strp); + *msgval = upb_value_ptr(strp); } else { - memcpy(&ret, val, size); + memcpy(msgval, val, size); } - return ret; + return true; } UPB_INLINE void _upb_map_fromvalue(upb_value val, void* out, size_t size) { @@ -439,7 +439,8 @@ UPB_INLINE void* _upb_map_next(const upb_map *map, size_t *iter) { UPB_INLINE bool _upb_map_set(upb_map *map, const void *key, size_t key_size, void *val, size_t val_size, upb_arena *arena) { upb_strview strkey = _upb_map_tokey(key, key_size); - upb_value tabval = _upb_map_tovalue(val, val_size, arena); + upb_value tabval = {0}; + if (!_upb_map_tovalue(val, val_size, &tabval, arena)) return false; upb_alloc *a = upb_arena_alloc(arena); /* TODO(haberman): add overwrite operation to minimize number of lookups. */ From 484d8f746a7e8260f3407e412c3b957aa58c082d Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 2 Nov 2020 22:59:05 -0800 Subject: [PATCH 16/21] Updated comment in wyhash.h to correct spelling mistake. --- third_party/wyhash/wyhash.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/third_party/wyhash/wyhash.h b/third_party/wyhash/wyhash.h index 1265169fd6..5658f02df3 100644 --- a/third_party/wyhash/wyhash.h +++ b/third_party/wyhash/wyhash.h @@ -1,9 +1,12 @@ -//Author: Wang Yi +/* Copyright 2020 王一 Wang Yi + This is free and unencumbered software released into the public domain. http://unlicense.org/ + See github.com/wangyi-fudan/wyhash/ LICENSE + */ #ifndef wyhash_final_version #define wyhash_final_version //defines that change behavior #ifndef WYHASH_CONDOM -#define WYHASH_CONDOM 1 //0: read 8 bytes before and after boudaries, dangerous but fastest. 1: normal valid behavior 2: extra protection against entropy loss (probability=2^-63), aka. "blind multiplication" +#define WYHASH_CONDOM 1 //0: read 8 bytes before and after boundaries, dangerous but fastest. 1: normal valid behavior 2: extra protection against entropy loss (probability=2^-63), aka. "blind multiplication" #endif #define WYHASH_32BIT_MUM 0 //faster on 32 bit system //includes From 6c16cba83f8b692c558d60a5a967808bb02d0e2a Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Tue, 3 Nov 2020 12:30:10 -0800 Subject: [PATCH 17/21] Removed obsolete port.c file. --- BUILD | 3 --- cmake/CMakeLists.txt | 3 +-- upb/port.c | 26 -------------------------- 3 files changed, 1 insertion(+), 31 deletions(-) delete mode 100644 upb/port.c diff --git a/BUILD b/BUILD index 2c86b3cb43..68933101fb 100644 --- a/BUILD +++ b/BUILD @@ -46,9 +46,6 @@ upb_proto_library_copts( cc_library( name = "port", - srcs = [ - "upb/port.c", - ], copts = UPB_DEFAULT_COPTS, textual_hdrs = [ "upb/port_def.inc", diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index e23703fcfd..638482d438 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -61,8 +61,7 @@ endif() enable_testing() -add_library(port - ../upb/port.c) +add_library(port INTERFACE) add_library(upb ../upb/decode.c ../upb/encode.c diff --git a/upb/port.c b/upb/port.c deleted file mode 100644 index 9ecf135167..0000000000 --- a/upb/port.c +++ /dev/null @@ -1,26 +0,0 @@ - -#include "upb/port_def.inc" - -#ifdef UPB_MSVC_VSNPRINTF -/* Visual C++ earlier than 2015 doesn't have standard C99 snprintf and - * vsnprintf. To support them, missing functions are manually implemented - * using the existing secure functions. */ -int msvc_vsnprintf(char* s, size_t n, const char* format, va_list arg) { - if (!s) { - return _vscprintf(format, arg); - } - int ret = _vsnprintf_s(s, n, _TRUNCATE, format, arg); - if (ret < 0) { - ret = _vscprintf(format, arg); - } - return ret; -} - -int msvc_snprintf(char* s, size_t n, const char* format, ...) { - va_list arg; - va_start(arg, format); - int ret = msvc_vsnprintf(s, n, format, arg); - va_end(arg); - return ret; -} -#endif From e9b79542adbfa1096e590b7c79cd526d71dfcd33 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 4 Nov 2020 11:12:36 -0800 Subject: [PATCH 18/21] Added a BUILD file for wyhash. This will make the build more closely resemble the google3 build. The CMake output from this is a bit busted, but the build does succeed. --- BUILD | 25 ++++++++++--------------- cmake/BUILD | 2 ++ cmake/CMakeLists.txt | 5 +++-- cmake/make_cmakelists.py | 1 + third_party/wyhash/BUILD | 18 ++++++++++++++++++ 5 files changed, 34 insertions(+), 17 deletions(-) create mode 100644 third_party/wyhash/BUILD diff --git a/BUILD b/BUILD index 68933101fb..0c95f89be7 100644 --- a/BUILD +++ b/BUILD @@ -65,7 +65,6 @@ cc_library( "upb/table.int.h", "upb/upb.c", "upb/upb.int.h", - "third_party/wyhash/wyhash.h", ], hdrs = [ "upb/decode.h", @@ -75,7 +74,10 @@ cc_library( ], copts = UPB_DEFAULT_COPTS, visibility = ["//visibility:public"], - deps = [":port"], + deps = [ + ":port", + "//third_party/wyhash", + ], ) # Common support routines used by generated code. This library has no @@ -286,12 +288,10 @@ upb_amalgamation( cc_library( name = "amalgamation", - srcs = [ - "upb.c", - "third_party/wyhash/wyhash.h", - ], + srcs = ["upb.c"], hdrs = ["upb.h"], copts = UPB_DEFAULT_COPTS, + deps = ["//third_party/wyhash"], ) upb_amalgamation( @@ -314,13 +314,10 @@ upb_amalgamation( cc_library( name = "php_amalgamation", - srcs = [ - "php-upb.c", - "third_party/wyhash/wyhash.h", - ], + srcs = ["php-upb.c"], hdrs = ["php-upb.h"], copts = UPB_DEFAULT_COPTS, - + deps = ["//third_party/wyhash"], ) upb_amalgamation( @@ -342,12 +339,10 @@ upb_amalgamation( cc_library( name = "ruby_amalgamation", - srcs = [ - "ruby-upb.c", - "third_party/wyhash/wyhash.h", - ], + srcs = ["ruby-upb.c"], hdrs = ["ruby-upb.h"], copts = UPB_DEFAULT_COPTS, + deps = ["//third_party/wyhash"], ) exports_files( diff --git a/cmake/BUILD b/cmake/BUILD index d59475d909..53fbd07abe 100644 --- a/cmake/BUILD +++ b/cmake/BUILD @@ -28,6 +28,7 @@ genrule( "//:BUILD", "//:WORKSPACE", "//:cmake_files", + "//third_party/wyhash:cmake_files", ":cmake_files", ], outs = ["generated-in/CMakeLists.txt"], @@ -84,6 +85,7 @@ sh_test( data = [ ":cmake_files", "//:cmake_files", + "//third_party/wyhash:cmake_files", ], deps = ["@bazel_tools//tools/bash/runfiles"], ) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 638482d438..8ec855aedb 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -71,13 +71,13 @@ add_library(upb ../upb/table.int.h ../upb/upb.c ../upb/upb.int.h - ../third_party/wyhash/wyhash.h ../upb/decode.h ../upb/encode.h ../upb/upb.h ../upb/upb.hpp) target_link_libraries(upb - port) + port + /third_party/wyhash) add_library(generated_code_support__only_for_generated_code_do_not_use__i_give_permission_to_break_me INTERFACE) target_link_libraries(generated_code_support__only_for_generated_code_do_not_use__i_give_permission_to_break_me INTERFACE table @@ -150,5 +150,6 @@ add_library(upb_json target_link_libraries(upb_json upb upb_pb) +add_library(wyhash INTERFACE) diff --git a/cmake/make_cmakelists.py b/cmake/make_cmakelists.py index ee9760a47c..13ba7a5127 100755 --- a/cmake/make_cmakelists.py +++ b/cmake/make_cmakelists.py @@ -280,6 +280,7 @@ globs = GetDict(converter) exec(open("WORKSPACE").read(), GetDict(WorkspaceFileFunctions(converter))) exec(open("BUILD").read(), GetDict(BuildFileFunctions(converter))) +exec(open("third_party/wyhash/BUILD").read(), GetDict(BuildFileFunctions(converter))) with open(sys.argv[1], "w") as f: f.write(converter.convert()) diff --git a/third_party/wyhash/BUILD b/third_party/wyhash/BUILD new file mode 100644 index 0000000000..28fa3be6a5 --- /dev/null +++ b/third_party/wyhash/BUILD @@ -0,0 +1,18 @@ + +licenses(["unencumbered"]) + +exports_files(["LICENSE"]) + +cc_library( + name = "wyhash", + hdrs = ["wyhash.h"], + visibility = ["//:__pkg__"], +) + +filegroup( + name = "cmake_files", + srcs = glob([ + "**/*", + ]), + visibility = ["//cmake:__pkg__"], +) From fe62fc83e174c75901ed5628c5687e2262105c82 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 4 Nov 2020 13:29:07 -0800 Subject: [PATCH 19/21] Removed obsolete includes in benchmark. --- benchmarks/benchmark.cc | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index 7bb7f12364..8921ac7f8b 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -2,17 +2,12 @@ #include #include -// For benchmarks of parsing speed. #include "benchmarks/descriptor.pb.h" #include "benchmarks/descriptor.upb.h" #include "benchmarks/descriptor.upbdefs.h" #include "benchmarks/descriptor_sv.pb.h" - -// For for benchmarks of building descriptors. #include "google/ads/googleads/v5/services/google_ads_service.upbdefs.h" #include "google/protobuf/descriptor.pb.h" -#include "google/protobuf/descriptor.upb.h" -#include "google/protobuf/descriptor.upbdefs.h" #include "upb/def.hpp" upb_strview descriptor = benchmarks_descriptor_proto_upbdefinit.descriptor; @@ -53,7 +48,7 @@ static void BM_LoadDescriptor_Upb(benchmark::State& state) { size_t bytes_per_iter = 0; for (auto _ : state) { upb::SymbolTable symtab; - google_protobuf_DescriptorProto_getmsgdef(symtab.ptr()); + upb_benchmark_DescriptorProto_getmsgdef(symtab.ptr()); bytes_per_iter = _upb_symtab_bytesloaded(symtab.ptr()); } state.SetBytesProcessed(state.iterations() * bytes_per_iter); From 7b4e376f793eebbf911718974c67008fa3d51429 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 4 Nov 2020 13:43:44 -0800 Subject: [PATCH 20/21] Switch unordered_set -> absl::flat_hash_set. --- benchmarks/BUILD | 1 + benchmarks/benchmark.cc | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/BUILD b/benchmarks/BUILD index 45bb870bc2..4ed4042a6c 100644 --- a/benchmarks/BUILD +++ b/benchmarks/BUILD @@ -53,6 +53,7 @@ cc_binary( ":ads_upb_proto_reflection", "//:descriptor_upb_proto", "//:reflection", + "@com_google_absl//absl/container:flat_hash_set", "@com_github_google_benchmark//:benchmark_main", "@com_google_protobuf//:protobuf", ], diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index 8921ac7f8b..0fa995d965 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -2,6 +2,7 @@ #include #include +#include "absl/container/flat_hash_set.h" #include "benchmarks/descriptor.pb.h" #include "benchmarks/descriptor.upb.h" #include "benchmarks/descriptor.upbdefs.h" @@ -18,7 +19,7 @@ char buf[65535]; void CollectFileDescriptors(const upb_def_init* file, std::vector& serialized_files, - std::unordered_set& seen) { + absl::flat_hash_set& seen) { if (!seen.insert(file).second) return; for (upb_def_init **deps = file->deps; *deps; deps++) { CollectFileDescriptors(*deps, serialized_files, seen); @@ -88,7 +89,7 @@ BENCHMARK(BM_LoadDescriptor_Proto2); static void BM_LoadAdsDescriptor_Proto2(benchmark::State& state) { extern upb_def_init google_ads_googleads_v5_services_google_ads_service_proto_upbdefinit; std::vector serialized_files; - std::unordered_set seen_files; + absl::flat_hash_set seen_files; CollectFileDescriptors( &google_ads_googleads_v5_services_google_ads_service_proto_upbdefinit, serialized_files, seen_files); From 3a3efe69a29e2975ba2943c7402f0420b532f0f1 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 4 Nov 2020 13:52:54 -0800 Subject: [PATCH 21/21] Added incompatible_use_toolchain_transition = True per https://github.com/bazelbuild/bazel/issues/11584 --- bazel/upb_proto_library.bzl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bazel/upb_proto_library.bzl b/bazel/upb_proto_library.bzl index 8a1b2857f5..9bdfc718ee 100644 --- a/bazel/upb_proto_library.bzl +++ b/bazel/upb_proto_library.bzl @@ -267,6 +267,7 @@ _upb_proto_library_aspect = aspect( attr_aspects = ["deps"], fragments = ["cpp"], toolchains = ["@bazel_tools//tools/cpp:toolchain_type"], + incompatible_use_toolchain_transition = True, ) upb_proto_library = rule( @@ -329,6 +330,7 @@ _upb_proto_reflection_library_aspect = aspect( attr_aspects = ["deps"], fragments = ["cpp"], toolchains = ["@bazel_tools//tools/cpp:toolchain_type"], + incompatible_use_toolchain_transition = True, ) upb_proto_reflection_library = rule(