#!/usr/bin/env python3
#
# Copyright 2017 gRPC authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Computes the diff between two bm runs and outputs significant results """

import argparse
import collections
import json
import os
import subprocess
import sys

sys.path.append(os.path.join(os.path.dirname(sys.argv[0]), '..'))

import bm_constants
import bm_json
import bm_speedup
import tabulate

verbose = False

# Values of ``--verbose`` that are interpreted as "off" (case-insensitive).
_FALSE_STRINGS = ('', '0', 'false', 'f', 'no', 'n', 'off')


def _median(ary):
    """Return the median of a non-empty sequence of numbers.

    For an even number of elements the mean of the two middle elements is
    returned as a float; for an odd number the middle element is returned
    unchanged.
    """
    assert (len(ary))
    ordered = sorted(ary)
    n = len(ordered)
    mid = n // 2
    if n % 2 == 0:
        return (ordered[mid - 1] + ordered[mid]) / 2.0
    return ordered[mid]


def _str_to_bool(value):
    """Parse a command-line string as a boolean.

    ``type=bool`` treats every non-empty string (including "False") as true,
    so explicit "off" spellings are recognised here; anything else is true.
    """
    return value.strip().lower() not in _FALSE_STRINGS


def _args():
    """Parse and validate the command-line arguments.

    Sets the module-level ``verbose`` flag as a side effect. Exits with an
    argparse usage error if ``--new`` or ``--old`` is missing or empty.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    argp = argparse.ArgumentParser(
        description='Perform diff on microbenchmarks')
    argp.add_argument('-t',
                      '--track',
                      choices=sorted(bm_constants._INTERESTING),
                      nargs='+',
                      default=sorted(bm_constants._INTERESTING),
                      help='Which metrics to track')
    argp.add_argument('-b',
                      '--benchmarks',
                      nargs='+',
                      choices=bm_constants._AVAILABLE_BENCHMARK_TESTS,
                      default=bm_constants._AVAILABLE_BENCHMARK_TESTS,
                      help='Which benchmarks to run')
    argp.add_argument(
        '-l',
        '--loops',
        type=int,
        default=20,
        help=
        'Number of times to loops the benchmarks. Must match what was passed to bm_run.py'
    )
    argp.add_argument('-r',
                      '--regex',
                      type=str,
                      default="",
                      help='Regex to filter benchmarks run')
    argp.add_argument('-n', '--new', type=str, help='New benchmark name')
    argp.add_argument('-o', '--old', type=str, help='Old benchmark name')
    # Accept both a bare ``-v`` and the legacy ``-v VALUE`` spelling.
    argp.add_argument('-v',
                      '--verbose',
                      nargs='?',
                      const=True,
                      default=False,
                      type=_str_to_bool,
                      help='Print details of before/after')
    args = argp.parse_args()
    global verbose
    if args.verbose:
        verbose = True
    # Explicit checks instead of ``assert`` so they survive ``python -O``.
    if not args.new:
        argp.error('--new is required')
    if not args.old:
        argp.error('--old is required')
    return args


def _maybe_print(str):
    """Print ``str`` only when the module-level ``verbose`` flag is set."""
    if verbose:
        print(str)


class Benchmark:
    """Accumulates new/old samples for one benchmark and summarises them."""

    def __init__(self):
        # samples[True] holds the "new" run, samples[False] the "old" run;
        # each maps a tracked field name to its list of float samples.
        self.samples = {
            True: collections.defaultdict(list),
            False: collections.defaultdict(list)
        }
        # field name -> formatted percentage string for significant changes
        self.final = {}
        # field name -> value returned by bm_speedup.speedup.
        # NOTE: this instance attribute shadows the ``speedup`` method below,
        # so on instances ``bm.speedup`` is this dict.
        self.speedup = {}

    def add_sample(self, track, data, new):
        """Record every tracked field present in ``data`` as a float sample.

        Args:
            track: iterable of field names to record.
            data: mapping of field name to a value convertible to float.
            new: True to file the sample under the new run, False for old.
        """
        for f in track:
            if f in data:
                self.samples[new][f].append(float(data[f]))

    def process(self, track, new_name, old_name):
        """Compute speedups for each tracked field that has both runs.

        A field is recorded in ``self.final`` when the absolute speedup
        exceeds 3 and the absolute median difference exceeds 0.5.

        Returns:
            A view of the field names recorded in ``self.final``.
        """
        for f in sorted(track):
            new = self.samples[True][f]
            old = self.samples[False][f]
            if not new or not old:
                continue
            mdn_diff = abs(_median(new) - _median(old))
            _maybe_print('%s: %s=%r %s=%r mdn_diff=%r' %
                         (f, new_name, new, old_name, old, mdn_diff))
            s = bm_speedup.speedup(new, old, 1e-5)
            self.speedup[f] = s
            if abs(s) > 3 and mdn_diff > 0.5:
                self.final[f] = '%+d%%' % s
        return self.final.keys()

    def skip(self):
        """Return True when no field changed significantly."""
        return not self.final

    def row(self, flds):
        """Return the table cells for ``flds`` ('' where nothing to report)."""
        return [self.final.get(f, '') for f in flds]

    def speedup(self, name):
        # NOTE: unreachable through instances because __init__ assigns an
        # attribute with the same name; retained so the class-level name
        # keeps existing.
        if name in self.speedup:
            return self.speedup[name]
        return None


def _read_json(filename, badjson_files, nonexistant_files):
    """Load a benchmark JSON result file.

    Failures are counted rather than raised. The counter key is ``filename``
    with its last two dot-separated components removed.

    Args:
        filename: path of the JSON file to read.
        badjson_files: dict updated in place with counts of unparsable files.
        nonexistant_files: dict updated in place with counts of unreadable
            files.

    Returns:
        The decoded JSON value, or None if the file could not be read or
        parsed.
    """
    stripped = ".".join(filename.split(".")[:-2])
    # Pre-bind so the ValueError handler can always print it, even when the
    # error (e.g. a decode error) is raised before the read completes.
    raw = ''
    try:
        with open(filename) as f:
            raw = f.read()
        return json.loads(raw)
    except IOError:
        nonexistant_files[stripped] = nonexistant_files.get(stripped, 0) + 1
        return None
    except ValueError:
        print(raw)
        badjson_files[stripped] = badjson_files.get(stripped, 0) + 1
        return None


def fmt_dict(d):
    """Format a dict as one indented ``key: value`` line per entry."""
    return ''.join(" " + k + ": " + str(d[k]) + "\n" for k in d)


def _list_tests(old, bm, regex):
    """Return the file-name-safe test names reported by a benchmark binary.

    Runs ``bm_diff_<old>/opt/<bm> --benchmark_list_tests`` and replaces the
    characters ``/``, ``<``, ``>`` and the sequence ``", "`` with ``_``.
    """
    listing = subprocess.check_output([
        'bm_diff_%s/opt/%s' % (old, bm), '--benchmark_list_tests',
        '--benchmark_filter=%s' % regex
    ])
    names = []
    for line in listing.splitlines():
        line = line.decode('UTF-8')
        names.append(line.strip().replace("/", "_").replace(
            "<", "_").replace(">", "_").replace(", ", "_"))
    return names


def _add_samples(benchmarks, js, track, new):
    """Feed every non-aggregate row of ``js`` into ``benchmarks``."""
    if not js:
        return
    for row in bm_json.expand_json(js):
        name = row['cpp_name']
        if name.endswith(('_mean', '_stddev')):
            continue
        benchmarks[name].add_sample(track, row, new)


def _significance(histogram):
    """Map a sorted list of speedups to a signed significance in [-3, 3].

    Takes the element at index ``int(len * 0.95)`` and applies hand chosen
    magnitude thresholds (2, 5, 10); the sign follows that element's sign.
    """
    if not histogram:
        return 0
    delta = histogram[int(len(histogram) * 0.95)]
    mul = 1
    if delta < 0:
        delta = -delta
        mul = -1
    if delta < 2:
        level = 0
    elif delta < 5:
        level = 1
    elif delta < 10:
        level = 2
    else:
        level = 3
    return level * mul


def diff(bms, loops, regex, track, old, new):
    """Compare the recorded results of two benchmark runs.

    Args:
        bms: benchmark binary names to compare.
        loops: number of per-test result files to read for each run.
        regex: filter passed to the binary as ``--benchmark_filter``.
        track: field names to compare.
        old: name of the old run (also selects the ``bm_diff_<old>`` binary
            directory used to list tests).
        new: name of the new run.

    Returns:
        A ``(table, note, significance)`` tuple. ``table`` is the tabulated
        text of significant changes, or None when there are none (in which
        case ``significance`` is 0). ``note`` describes corrupt or missing
        result files, or is None when there were none.
    """
    benchmarks = collections.defaultdict(Benchmark)
    badjson_files = {}
    nonexistant_files = {}
    for bm in bms:
        # The test listing does not depend on the loop index, so run the
        # binary once per benchmark (and not at all when loops <= 0).
        test_names = _list_tests(old, bm, regex) if loops > 0 else []
        for loop in range(0, loops):
            for stripped_line in test_names:
                js_new_opt = _read_json(
                    '%s.%s.opt.%s.%d.json' % (bm, stripped_line, new, loop),
                    badjson_files, nonexistant_files)
                js_old_opt = _read_json(
                    '%s.%s.opt.%s.%d.json' % (bm, stripped_line, old, loop),
                    badjson_files, nonexistant_files)
                _add_samples(benchmarks, js_new_opt, track, True)
                _add_samples(benchmarks, js_old_opt, track, False)

    really_interesting = set()
    for name, bm in benchmarks.items():
        _maybe_print(name)
        really_interesting.update(bm.process(track, new, old))
    fields = [f for f in track if f in really_interesting]

    # figure out the significance of the changes... right now we take the 95%-ile
    # benchmark delta %-age, and then apply some hand chosen thresholds
    histogram = []
    _NOISY = ["BM_WellFlushed"]
    for name, bm in benchmarks.items():
        if name in _NOISY:
            print("skipping noisy benchmark '%s' for labelling evaluation" %
                  name)
            continue
        if bm.skip():
            continue
        # ``speedup`` is the per-instance dict; cpu_time may be absent when
        # it is not tracked or lacks samples from one of the runs.
        d = bm.speedup.get('cpu_time')
        if d is None:
            continue
        histogram.append(d)
    histogram.sort()
    print("histogram of speedups: ", histogram)
    significance = _significance(histogram)

    headers = ['Benchmark'] + fields
    rows = []
    for name in sorted(benchmarks.keys()):
        if benchmarks[name].skip():
            continue
        rows.append([name] + benchmarks[name].row(fields))

    note = None
    if badjson_files:
        note = 'Corrupt JSON data (indicates timeout or crash): \n%s' % fmt_dict(
            badjson_files)
    if nonexistant_files:
        missing = '\n\nMissing files (indicates new benchmark): \n%s' % fmt_dict(
            nonexistant_files)
        note = note + missing if note else missing

    if rows:
        return tabulate.tabulate(rows, headers=headers,
                                 floatfmt='+.2f'), note, significance
    else:
        return None, note, 0


if __name__ == '__main__':
    args = _args()
    # Use distinct local names so the ``diff`` function is not rebound.
    diff_table, note, _unused_significance = diff(args.benchmarks, args.loops,
                                                  args.regex, args.track,
                                                  args.old, args.new)
    print('%s\n%s' %
          (note, diff_table if diff_table else "No performance differences"))