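Optimized short string copying.

This sped up the alias=false case, where string data is copied into the arena
instead of aliasing the input buffer. When the tag, the one-byte length, and
the string payload all fit within 16 bytes (and the arena has 16 bytes
available), the fast path now performs a single fixed-size 16-byte memcpy
rather than falling back to the generic copy routine: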

Before:

------------------------------------------------------------------------------
Benchmark                                       Time           CPU Iterations
------------------------------------------------------------------------------
BM_Parse_Upb_FileDesc_WithInitialBlock       4562 ns       4562 ns     153251   1.53276GB/s

 Performance counter stats for 'bazel-bin/benchmarks/benchmark --benchmark_filter=BM_Parse_Upb_FileDesc_WithInitialBlock':

          1,216.65 msec task-clock                #    0.936 CPUs utilized
                 6      context-switches          #    0.005 K/sec
                 0      cpu-migrations            #    0.000 K/sec
               200      page-faults               #    0.164 K/sec
     4,490,925,650      cycles                    #    3.691 GHz
    16,516,403,731      instructions              #    3.68  insn per cycle
     2,828,536,650      branches                  # 2324.861 M/sec
         5,425,830      branch-misses             #    0.19% of all branches

       1.300178903 seconds time elapsed

       1.211475000 seconds user
       0.072207000 seconds sys

After:

------------------------------------------------------------------------------
Benchmark                                       Time           CPU Iterations
------------------------------------------------------------------------------
BM_Parse_Upb_FileDesc_WithInitialBlock       3587 ns       3587 ns     195749   1.94935GB/s

 Performance counter stats for 'bazel-bin/benchmarks/benchmark --benchmark_filter=BM_Parse_Upb_FileDesc_WithInitialBlock':

          1,109.69 msec task-clock                #    0.930 CPUs utilized
                 5      context-switches          #    0.005 K/sec
                 0      cpu-migrations            #    0.000 K/sec
               198      page-faults               #    0.178 K/sec
     4,094,010,257      cycles                    #    3.689 GHz
    15,672,677,812      instructions              #    3.83  insn per cycle
     2,589,291,160      branches                  # 2333.346 M/sec
         3,306,386      branch-misses             #    0.13% of all branches

       1.193221789 seconds time elapsed

       1.102538000 seconds user
       0.072166000 seconds sys
pull/13171/head
Joshua Haberman 4 years ago
parent f3a2a79349
commit d81ba58215
Changed files:
  1. benchmarks/compare.py (2 lines changed)
  2. upb/decode_fast.c (30 lines changed)

@@ -53,7 +53,7 @@ def Benchmark(outbase, bench_cpu=True, runs=12):
baseline = "master"
bench_cpu = True
bench_cpu = False
if len(sys.argv) > 1:
baseline = sys.argv[1]

@@ -48,11 +48,6 @@ upb_msg *decode_newmsg_ceil(upb_decstate *d, const upb_msglayout *l,
return msg_data + sizeof(upb_msg_internal);
}
typedef struct {
const char *limit_ptr;
int val; /* If <=0, the old limit, else a delta */
} fastdecode_savedlimit;
UPB_FORCEINLINE
static const char *fastdecode_tagdispatch(upb_decstate *d, const char *ptr,
upb_msg *msg,
@@ -308,10 +303,23 @@ static const char *fastdecode_string(UPB_PARSE_PARAMS, int tagbytes,
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits,
sizeof(upb_strview), card);
if (UPB_UNLIKELY(!d->alias)) {
return func(d, ptr + tagbytes, msg, table, hasbits, dst);
len = (uint8_t)ptr[tagbytes];
if (UPB_UNLIKELY(len > 15 - tagbytes || !_upb_arenahas(&d->arena, 16))) {
return func(d, ptr + tagbytes, msg, table, hasbits, dst);
}
char *data = d->arena.head.ptr;
d->arena.head.ptr += 16;
UPB_UNPOISON_MEMORY_REGION(data, 16);
memcpy(data, ptr, 16);
UPB_ASSERT(tagbytes + 1 + len <= 16);
ptr += tagbytes + 1;
dst->data = data + tagbytes + 1;
dst->size = len;
UPB_POISON_MEMORY_REGION(data, 1);
UPB_POISON_MEMORY_REGION(data + 1 + len, 16 - len - 1);
return fastdecode_dispatch(d, ptr + len, msg, table, hasbits);
}
len = (int8_t)ptr[tagbytes];
ptr += tagbytes + 1;
dst->data = ptr;
@@ -356,14 +364,6 @@ const char *upb_pos_2bt(UPB_PARSE_PARAMS) {
/* message fields *************************************************************/
UPB_NOINLINE
static const char *fastdecode_longsubmsg(upb_decstate *d, const char *ptr,
upb_msg *msg,
const upb_msglayout *table,
size_t len) {
return ptr;
}
UPB_FORCEINLINE
static bool fastdecode_boundscheck2(const char *ptr, size_t len,
const char *end) {

Loading…
Cancel
Save