Protocol Buffers - Google's data interchange format (grpc dependency) https://developers.google.com/protocol-buffers/


#!/usr/bin/env python3
"""Benchmarks the current working directory against a given baseline.
This script benchmarks both size and speed. Sample output:
"""
import contextlib
import json
import os
import re
import subprocess
import sys
import tempfile


@contextlib.contextmanager
def GitWorktree(commit):
  """Checks out `commit` into a temporary git worktree and chdirs into it."""
  tmpdir = tempfile.mkdtemp()
  subprocess.run(['git', 'worktree', 'add', '-q', '-d', tmpdir, commit], check=True)
  cwd = os.getcwd()
  os.chdir(tmpdir)
  try:
    yield tmpdir
  finally:
    os.chdir(cwd)
    subprocess.run(['git', 'worktree', 'remove', tmpdir], check=True)


def Run(cmd):
  subprocess.check_call(cmd, shell=True)


def Benchmark(outbase, bench_cpu=True, runs=12, fasttable=False):
  tmpfile = "/tmp/bench-output.json"
  Run("rm -rf {}".format(tmpfile))
  #Run("CC=clang bazel test ...")

  if fasttable:
    extra_args = " --//:fasttable_enabled=true"
  else:
    extra_args = ""

  if bench_cpu:
    Run("CC=clang bazel build -c opt --copt=-march=native benchmarks:benchmark" + extra_args)
    Run("./bazel-bin/benchmarks/benchmark --benchmark_out_format=json --benchmark_out={} --benchmark_repetitions={}".format(tmpfile, runs))
    with open(tmpfile) as f:
      bench_json = json.load(f)

    # Translate into the format expected by benchstat.
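    # benchstat expects Go-benchmark-style lines; each line written below has
    # the shape "BenchmarkFoo 1000 25.3 ns/op" (name and numbers illustrative,
    # not taken from a real run).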
    with open(outbase + ".txt", "w") as f:
      for run in bench_json["benchmarks"]:
        name = run["name"]
        name = name.replace(" ", "")
        name = re.sub(r'^BM_', 'Benchmark', name)
        if name.endswith("_mean") or name.endswith("_median") or name.endswith("_stddev"):
          continue
        values = (name, run["iterations"], run["cpu_time"])
        print("{} {} {} ns/op".format(*values), file=f)

  Run("CC=clang bazel build -c opt --copt=-g tests:conformance_upb" + extra_args)
  Run("cp -f bazel-bin/tests/conformance_upb {}.bin".format(outbase))
baseline = "master"
bench_cpu = False
fasttable = False

if len(sys.argv) > 1:
  baseline = sys.argv[1]

  # Quickly verify that the baseline exists.
  with GitWorktree(baseline):
    pass

# Benchmark our current directory first, since it's more likely to be broken.
Benchmark("/tmp/new", bench_cpu, fasttable=fasttable)

# Benchmark the baseline.
with GitWorktree(baseline):
  Benchmark("/tmp/old", bench_cpu, fasttable=fasttable)
print()
print()

if bench_cpu:
  Run("~/go/bin/benchstat /tmp/old.txt /tmp/new.txt")
print()
print()
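
# Size comparison: strip debug info from both conformance binaries so the diff
# reflects code/data size only, then have bloaty diff new vs. old, reading
# symbol names from the unstripped binaries via --debug-file. Note the bloaty
# and benchstat paths are hard-coded to the author's home directory.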
Run("objcopy --strip-debug /tmp/old.bin /tmp/old.bin.stripped")
Run("objcopy --strip-debug /tmp/new.bin /tmp/new.bin.stripped")
Run("~/code/bloaty/bloaty /tmp/new.bin.stripped -- /tmp/old.bin.stripped --debug-file=/tmp/old.bin --debug-file=/tmp/new.bin -d compileunits,symbols")