I think this may have reached the optimization limit.

-------------------------------------------------------------------------
Benchmark                                  Time           CPU Iterations
-------------------------------------------------------------------------
BM_ArenaOneAlloc                          21 ns         21 ns   32994231
BM_ArenaInitialBlockOneAlloc               6 ns          6 ns  116318005
BM_ParseDescriptorNoHeap                3028 ns       3028 ns     231138   2.34354GB/s
BM_ParseDescriptor                      3557 ns       3557 ns     196583   1.99498GB/s
BM_ParseDescriptorProto2NoArena        33228 ns      33226 ns      21196   218.688MB/s
BM_ParseDescriptorProto2WithArena      22863 ns      22861 ns      30666   317.831MB/s
BM_SerializeDescriptorProto2            5444 ns       5444 ns     127368   1.30348GB/s
BM_SerializeDescriptor                 12509 ns      12508 ns      55816   580.914MB/s

$ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap
2020-10-08 14:07:06
Running bazel-bin/benchmark
Run on (72 X 3700 MHz CPU s)
CPU Caches:
  L1 Data 32K (x36)
  L1 Instruction 32K (x36)
  L2 Unified 1024K (x36)
  L3 Unified 25344K (x2)
----------------------------------------------------------------
Benchmark                         Time           CPU Iterations
----------------------------------------------------------------
BM_ParseDescriptorNoHeap       3071 ns       3071 ns     227743   2.31094GB/s

 Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap':

          1,050.22 msec task-clock                #    0.978 CPUs utilized
                 4      context-switches          #    0.004 K/sec
                 0      cpu-migrations            #    0.000 K/sec
               179      page-faults               #    0.170 K/sec
     3,875,796,334      cycles                    #    3.690 GHz
    13,282,835,967      instructions              #    3.43  insn per cycle
     2,887,725,848      branches                  # 2749.627 M/sec
         8,324,912      branch-misses             #    0.29% of all branches

       1.073924364 seconds time elapsed

       1.042806000 seconds user
       0.008021000 seconds sys

Profile:
  23.96%  benchmark  benchmark          [.] upb_prm_1bt_max192b
  22.44%  benchmark  benchmark          [.] fastdecode_dispatch
  18.96%  benchmark  benchmark          [.] upb_pss_1bt
  14.20%  benchmark  benchmark          [.] upb_psv4_1bt
   8.33%  benchmark  benchmark          [.] upb_prm_1bt_max64b
   6.66%  benchmark  benchmark          [.] upb_prm_1bt_max128b
   1.29%  benchmark  benchmark          [.] upb_psm_1bt_max64b
   0.77%  benchmark  benchmark          [.] fastdecode_generic
   0.55%  benchmark  [kernel.kallsyms]  [k] smp_call_function_single
   0.42%  benchmark  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
   0.42%  benchmark  benchmark          [.] upb_psm_1bt_max256b
   0.31%  benchmark  benchmark          [.] upb_psb1_1bt
   0.21%  benchmark  benchmark          [.] upb_plv4_5bv
   0.14%  benchmark  benchmark          [.] upb_psb1_2bt
   0.12%  benchmark  benchmark          [.] decode_longvarint64
   0.08%  benchmark  [kernel.kallsyms]  [k] vsnprintf
   0.07%  benchmark  [kernel.kallsyms]  [k] _raw_spin_lock
   0.07%  benchmark  benchmark          [.] _upb_msg_new
   0.06%  benchmark  ld-2.31.so         [.] check_match
pull/13171/head
Joshua Haberman 4 years ago
parent 4c65b25daf
commit 526e430794
Changed files (number of changed lines):
  1. benchmark.py (4)
  2. generated_for_cmake/google/protobuf/descriptor.upb.c (56)
  3. upb/decode.h (43)
  4. upb/decode_fast.c (45)
  5. upbc/generator.cc (23)

@ -30,8 +30,8 @@ def Run(cmd):
def Benchmark(outbase, runs=12):
tmpfile = "/tmp/bench-output.json"
Run("rm -rf {}".format(tmpfile))
#Run("CC=clang bazel test :all")
Run("CC=clang bazel build -c opt :benchmark")
Run("CC=clang bazel test :all")
Run("CC=clang bazel build -c opt --copt=-march=native :benchmark")
Run("./bazel-bin/benchmark --benchmark_out_format=json --benchmark_out={} --benchmark_repetitions={}".format(tmpfile, runs))

@ -23,7 +23,7 @@ static const upb_msglayout_field google_protobuf_FileDescriptorSet__fields[1] =
const upb_msglayout google_protobuf_FileDescriptorSet_msginit = {
{
&fastdecode_generic,
&upb_prm_1bt,
&upb_prm_1bt_max192b,
&fastdecode_generic,
&fastdecode_generic,
&fastdecode_generic,
@ -124,12 +124,12 @@ const upb_msglayout google_protobuf_FileDescriptorProto_msginit = {
&upb_pss_1bt,
&upb_pss_1bt,
&fastdecode_generic,
&upb_prm_1bt,
&upb_prm_1bt,
&upb_prm_1bt,
&upb_prm_1bt,
&upb_psm_1bt,
&upb_psm_1bt,
&upb_prm_1bt_max128b,
&upb_prm_1bt_max128b,
&upb_prm_1bt_max64b,
&upb_prm_1bt_max192b,
&upb_psm_1bt_max256b,
&upb_psm_1bt_max64b,
&fastdecode_generic,
&fastdecode_generic,
&upb_pss_1bt,
@ -219,14 +219,14 @@ const upb_msglayout google_protobuf_DescriptorProto_msginit = {
{
&fastdecode_generic,
&upb_pss_1bt,
&upb_prm_1bt,
&upb_prm_1bt,
&upb_prm_1bt,
&upb_prm_1bt,
&upb_prm_1bt,
&upb_psm_1bt,
&upb_prm_1bt,
&upb_prm_1bt,
&upb_prm_1bt_max192b,
&upb_prm_1bt_max128b,
&upb_prm_1bt_max128b,
&upb_prm_1bt_max64b,
&upb_prm_1bt_max192b,
&upb_psm_1bt_max64b,
&upb_prm_1bt_max64b,
&upb_prm_1bt_max64b,
&fastdecode_generic,
&fastdecode_generic,
&fastdecode_generic,
@ -304,7 +304,7 @@ const upb_msglayout google_protobuf_DescriptorProto_ExtensionRange_msginit = {
&fastdecode_generic,
&upb_psv4_1bt,
&upb_psv4_1bt,
&upb_psm_1bt,
&upb_psm_1bt_max64b,
&fastdecode_generic,
&fastdecode_generic,
&fastdecode_generic,
@ -562,7 +562,7 @@ const upb_msglayout google_protobuf_FieldDescriptorProto_msginit = {
&upb_psv4_1bt,
&upb_pss_1bt,
&upb_pss_1bt,
&upb_psm_1bt,
&upb_psm_1bt_max64b,
&upb_psv4_1bt,
&upb_pss_1bt,
&fastdecode_generic,
@ -639,7 +639,7 @@ const upb_msglayout google_protobuf_OneofDescriptorProto_msginit = {
{
&fastdecode_generic,
&upb_pss_1bt,
&upb_psm_1bt,
&upb_psm_1bt_max64b,
&fastdecode_generic,
&fastdecode_generic,
&fastdecode_generic,
@ -727,9 +727,9 @@ const upb_msglayout google_protobuf_EnumDescriptorProto_msginit = {
{
&fastdecode_generic,
&upb_pss_1bt,
&upb_prm_1bt,
&upb_psm_1bt,
&upb_prm_1bt,
&upb_prm_1bt_max64b,
&upb_psm_1bt_max64b,
&upb_prm_1bt_max64b,
&fastdecode_generic,
&fastdecode_generic,
&fastdecode_generic,
@ -891,7 +891,7 @@ const upb_msglayout google_protobuf_EnumValueDescriptorProto_msginit = {
&fastdecode_generic,
&upb_pss_1bt,
&upb_psv4_1bt,
&upb_psm_1bt,
&upb_psm_1bt_max64b,
&fastdecode_generic,
&fastdecode_generic,
&fastdecode_generic,
@ -975,8 +975,8 @@ const upb_msglayout google_protobuf_ServiceDescriptorProto_msginit = {
{
&fastdecode_generic,
&upb_pss_1bt,
&upb_prm_1bt,
&upb_psm_1bt,
&upb_prm_1bt_max128b,
&upb_psm_1bt_max64b,
&fastdecode_generic,
&fastdecode_generic,
&fastdecode_generic,
@ -1064,7 +1064,7 @@ const upb_msglayout google_protobuf_MethodDescriptorProto_msginit = {
&upb_pss_1bt,
&upb_pss_1bt,
&upb_pss_1bt,
&upb_psm_1bt,
&upb_psm_1bt_max64b,
&upb_psb1_1bt,
&upb_psb1_1bt,
&fastdecode_generic,
@ -1842,7 +1842,7 @@ const upb_msglayout google_protobuf_UninterpretedOption_msginit = {
{
&fastdecode_generic,
&fastdecode_generic,
&upb_prm_1bt,
&upb_prm_1bt_max64b,
&upb_pss_1bt,
&upb_psv8_1bt,
&upb_psv8_1bt,
@ -2002,7 +2002,7 @@ static const upb_msglayout_field google_protobuf_SourceCodeInfo__fields[1] = {
const upb_msglayout google_protobuf_SourceCodeInfo_msginit = {
{
&fastdecode_generic,
&upb_prm_1bt,
&upb_prm_1bt_max128b,
&fastdecode_generic,
&fastdecode_generic,
&fastdecode_generic,
@ -2166,7 +2166,7 @@ static const upb_msglayout_field google_protobuf_GeneratedCodeInfo__fields[1] =
const upb_msglayout google_protobuf_GeneratedCodeInfo_msginit = {
{
&fastdecode_generic,
&upb_prm_1bt,
&upb_prm_1bt_max64b,
&fastdecode_generic,
&fastdecode_generic,
&fastdecode_generic,

@ -53,13 +53,15 @@ static void *decode_malloc(upb_decstate *d, size_t size) {
}
UPB_INLINE
upb_msg *decode_newmsg(upb_decstate *d, const upb_msglayout *l) {
const size_t cutoff = 192;
upb_msg *decode_newmsg_ceil(upb_decstate *d, const upb_msglayout *l,
int msg_ceil_bytes) {
size_t size = l->size + sizeof(upb_msg_internal);
char *msg_data;
if (size <= cutoff && (size_t)(d->arena_end - d->arena_ptr) >= cutoff) {
if (msg_ceil_bytes > 0 &&
(size_t)(d->arena_end - d->arena_ptr) >= (size_t)msg_ceil_bytes) {
UPB_ASSERT(size <= (size_t)msg_ceil_bytes);
msg_data = d->arena_ptr;
memset(msg_data, 0, cutoff);
memset(msg_data, 0, msg_ceil_bytes);
d->arena_ptr += size;
} else {
msg_data = (char*)decode_malloc(d, size);
@ -68,6 +70,10 @@ upb_msg *decode_newmsg(upb_decstate *d, const upb_msglayout *l) {
return msg_data + sizeof(upb_msg_internal);
}
UPB_INLINE
upb_msg *decode_newmsg(upb_decstate *d, const upb_msglayout *l) {
return decode_newmsg_ceil(d, l, -1);
}
#define UPB_PARSE_PARAMS \
upb_decstate *d, const char *ptr, upb_msg *msg, const upb_msglayout *table, \
@ -95,16 +101,33 @@ const char *upb_pss_1bt(UPB_PARSE_PARAMS);
const char *upb_pss_2bt(UPB_PARSE_PARAMS);
const char *upb_pos_1bt(UPB_PARSE_PARAMS);
const char *upb_pos_2bt(UPB_PARSE_PARAMS);
const char *upb_psm_1bt(UPB_PARSE_PARAMS);
const char *upb_pom_1bt(UPB_PARSE_PARAMS);
const char *upb_prm_1bt(UPB_PARSE_PARAMS);
const char *upb_psm_2bt(UPB_PARSE_PARAMS);
const char *upb_pom_2bt(UPB_PARSE_PARAMS);
const char *upb_prm_2bt(UPB_PARSE_PARAMS);
#undef F
#undef TYPES
#undef TAGBYTES
#define F(card, tagbytes, size_ceil, ceil_arg) \
const char *upb_p##card##m_##tagbytes##bt_max##size_ceil##b(UPB_PARSE_PARAMS);
#define SIZES(card, tagbytes) \
F(card, tagbytes, 64, 64) \
F(card, tagbytes, 128, 128) \
F(card, tagbytes, 192, 192) \
F(card, tagbytes, 256, 256) \
F(card, tagbytes, max, -1)
#define TAGBYTES(card) \
SIZES(card, 1) \
SIZES(card, 2)
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
#undef TAGBYTES
#undef SIZES
#undef F
#undef UPB_PARSE_PARAMS
#ifdef __cplusplus

@ -81,7 +81,7 @@ static void *fastdecode_getfield_ofs(upb_decstate *d, const char *ptr,
*(uint32_t*)msg |= *hasbits;
*hasbits = 0;
if (UPB_LIKELY(!*arr_p)) {
const size_t initial_len = 8;
const size_t initial_len = 32;
size_t need = (valbytes * initial_len) + sizeof(upb_array);
if (UPB_UNLIKELY((size_t)(d->arena_end - d->arena_ptr) < need)) {
*outarr = NULL;
@ -348,7 +348,7 @@ const char *upb_pos_2bt(UPB_PARSE_PARAMS) {
UPB_FORCEINLINE
static const char *fastdecode_submsg(UPB_PARSE_PARAMS, int tagbytes,
upb_card card) {
int msg_ceil_bytes, upb_card card) {
const char *saved_limit = d->limit;
const char *saved_fastlimit = d->fastlimit;
const upb_msglayout_field *field = &table->fields[data >> 48];
@ -403,7 +403,7 @@ again:
}
if (card == CARD_r || !*submsg) {
*submsg = decode_newmsg(d, subl);
*submsg = decode_newmsg_ceil(d, subl, msg_ceil_bytes);
}
ptr = fastdecode_dispatch(d, ptr, *submsg, subl, 0);
submsg++;
@ -427,26 +427,27 @@ again:
return fastdecode_dispatch(d, ptr, msg, table, hasbits);
}
const char *upb_psm_1bt(UPB_PARSE_PARAMS) {
return fastdecode_submsg(UPB_PARSE_ARGS, 1, CARD_s);
}
const char *upb_pom_1bt(UPB_PARSE_PARAMS) {
return fastdecode_submsg(UPB_PARSE_ARGS, 1, CARD_o);
}
#define F(card, tagbytes, size_ceil, ceil_arg) \
const char *upb_p##card##m_##tagbytes##bt_max##size_ceil##b( \
UPB_PARSE_PARAMS) { \
return fastdecode_submsg(UPB_PARSE_ARGS, tagbytes, ceil_arg, CARD_##card); \
}
const char *upb_prm_1bt(UPB_PARSE_PARAMS) {
return fastdecode_submsg(UPB_PARSE_ARGS, 1, CARD_r);
}
#define SIZES(card, tagbytes) \
F(card, tagbytes, 64, 64) \
F(card, tagbytes, 128, 128) \
F(card, tagbytes, 192, 192) \
F(card, tagbytes, 256, 256) \
F(card, tagbytes, max, -1)
const char *upb_psm_2bt(UPB_PARSE_PARAMS) {
return fastdecode_submsg(UPB_PARSE_ARGS, 2, CARD_s);
}
#define TAGBYTES(card) \
SIZES(card, 1) \
SIZES(card, 2)
const char *upb_pom_2bt(UPB_PARSE_PARAMS) {
return fastdecode_submsg(UPB_PARSE_ARGS, 2, CARD_o);
}
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
const char *upb_prm_2bt(UPB_PARSE_PARAMS) {
return fastdecode_submsg(UPB_PARSE_ARGS, 2, CARD_r);
}
#undef TAGBYTES
#undef SIZES
#undef F

@ -803,8 +803,27 @@ void TryFillTableEntry(const protobuf::Descriptor* message,
data.size64 |= (uint64_t)hasbit_mask << 16;
}
ent.first = absl::Substitute("upb_p$0$1_$2bt", cardinality, type,
(num > 15) ? "2" : "1");
if (field->type() == protobuf::FieldDescriptor::TYPE_MESSAGE) {
std::string size_ceil = "max";
size_t size = SIZE_MAX;
if (field->message_type()->file() == field->file()) {
MessageLayout sub_layout(field->message_type());
size = sub_layout.message_size().size64 + 8;
}
std::vector<size_t> breaks = {64, 128, 192, 256};
for (auto brk : breaks) {
if (size <= brk) {
size_ceil = std::to_string(brk);
break;
}
}
ent.first = absl::Substitute("upb_p$0$1_$2bt_max$3b", cardinality, type,
(num > 15) ? "2" : "1", size_ceil);
} else {
ent.first = absl::Substitute("upb_p$0$1_$2bt", cardinality, type,
(num > 15) ? "2" : "1");
}
ent.second = data;
}

Loading…
Cancel
Save