From d81ba58215752aadb3dec5b24e201505aae30266 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Thu, 22 Oct 2020 11:01:42 -0700 Subject: [PATCH] Optimized short string copying. This sped up the alias=false case: Before: ------------------------------------------------------------------------------ Benchmark Time CPU Iterations ------------------------------------------------------------------------------ BM_Parse_Upb_FileDesc_WithInitialBlock 4562 ns 4562 ns 153251 1.53276GB/s Performance counter stats for 'bazel-bin/benchmarks/benchmark --benchmark_filter=BM_Parse_Upb_FileDesc_WithInitialBlock': 1,216.65 msec task-clock # 0.936 CPUs utilized 6 context-switches # 0.005 K/sec 0 cpu-migrations # 0.000 K/sec 200 page-faults # 0.164 K/sec 4,490,925,650 cycles # 3.691 GHz 16,516,403,731 instructions # 3.68 insn per cycle 2,828,536,650 branches # 2324.861 M/sec 5,425,830 branch-misses # 0.19% of all branches 1.300178903 seconds time elapsed 1.211475000 seconds user 0.072207000 seconds sys After: ------------------------------------------------------------------------------ Benchmark Time CPU Iterations ------------------------------------------------------------------------------ BM_Parse_Upb_FileDesc_WithInitialBlock 3587 ns 3587 ns 195749 1.94935GB/s Performance counter stats for 'bazel-bin/benchmarks/benchmark --benchmark_filter=BM_Parse_Upb_FileDesc_WithInitialBlock': 1,109.69 msec task-clock # 0.930 CPUs utilized 5 context-switches # 0.005 K/sec 0 cpu-migrations # 0.000 K/sec 198 page-faults # 0.178 K/sec 4,094,010,257 cycles # 3.689 GHz 15,672,677,812 instructions # 3.83 insn per cycle 2,589,291,160 branches # 2333.346 M/sec 3,306,386 branch-misses # 0.13% of all branches 1.193221789 seconds time elapsed 1.102538000 seconds user 0.072166000 seconds sys --- benchmarks/compare.py | 2 +- upb/decode_fast.c | 30 +++++++++++++++--------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/benchmarks/compare.py b/benchmarks/compare.py index ad8a1901e8..0bfa3db7c3 100755 --- a/benchmarks/compare.py +++ b/benchmarks/compare.py @@ -53,7 +53,7 @@ def Benchmark(outbase, bench_cpu=True, runs=12): baseline = "master" -bench_cpu = True +bench_cpu = False if len(sys.argv) > 1: baseline = sys.argv[1] diff --git a/upb/decode_fast.c b/upb/decode_fast.c index a93708c07c..c8cdf97c48 100644 --- a/upb/decode_fast.c +++ b/upb/decode_fast.c @@ -48,11 +48,6 @@ upb_msg *decode_newmsg_ceil(upb_decstate *d, const upb_msglayout *l, return msg_data + sizeof(upb_msg_internal); } -typedef struct { - const char *limit_ptr; - int val; /* If <=0, the old limit, else a delta */ -} fastdecode_savedlimit; - UPB_FORCEINLINE static const char *fastdecode_tagdispatch(upb_decstate *d, const char *ptr, upb_msg *msg, @@ -308,10 +303,23 @@ static const char *fastdecode_string(UPB_PARSE_PARAMS, int tagbytes, dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, sizeof(upb_strview), card); if (UPB_UNLIKELY(!d->alias)) { - return func(d, ptr + tagbytes, msg, table, hasbits, dst); + len = (uint8_t)ptr[tagbytes]; + if (UPB_UNLIKELY(len > 15 - tagbytes || !_upb_arenahas(&d->arena, 16))) { + return func(d, ptr + tagbytes, msg, table, hasbits, dst); + } + char *data = d->arena.head.ptr; + d->arena.head.ptr += 16; + UPB_UNPOISON_MEMORY_REGION(data, 16); + memcpy(data, ptr, 16); + UPB_ASSERT(tagbytes + 1 + len <= 16); + ptr += tagbytes + 1; + dst->data = data + tagbytes + 1; + dst->size = len; + UPB_POISON_MEMORY_REGION(data, 1); + UPB_POISON_MEMORY_REGION(data + 1 + len, 16 - len - 1); + return fastdecode_dispatch(d, ptr + len, msg, table, hasbits); } - len = (int8_t)ptr[tagbytes]; ptr += tagbytes + 1; dst->data = ptr; @@ -356,14 +364,6 @@ const char *upb_pos_2bt(UPB_PARSE_PARAMS) { /* message fields *************************************************************/ -UPB_NOINLINE -static const char *fastdecode_longsubmsg(upb_decstate *d, const char *ptr, - upb_msg *msg, - const upb_msglayout *table, - size_t len) { - return ptr; -} - UPB_FORCEINLINE static bool fastdecode_boundscheck2(const char *ptr, size_t len, const char *end) {