Label microbenchmark differences similarly to bloat (#27998)

* benchmark differences as a label

* debug

* Automated change: Fix sanity tests

Co-authored-by: ctiller <ctiller@users.noreply.github.com>
Branch: pull/28012/head
Author: Craig Tiller (committed by GitHub)
Parent: 05b695c95a
Commit: 6003710534
Changed files:
    tools/profiling/microbenchmarks/bm_diff/bm_diff.py (42 lines changed)
    tools/profiling/microbenchmarks/bm_diff/bm_main.py (6 lines changed)

tools/profiling/microbenchmarks/bm_diff/bm_diff.py

@@ -101,6 +101,7 @@ class Benchmark:
             False: collections.defaultdict(list)
         }
         self.final = {}
+        self.speedup = {}

     def add_sample(self, track, data, new):
         for f in track:
@@ -117,6 +118,7 @@ class Benchmark:
             _maybe_print('%s: %s=%r %s=%r mdn_diff=%r' %
                          (f, new_name, new, old_name, old, mdn_diff))
             s = bm_speedup.speedup(new, old, 1e-5)
+            self.speedup[f] = s
             if abs(s) > 3:
                 if mdn_diff > 0.5:
                     self.final[f] = '%+d%%' % s
@@ -128,6 +130,11 @@ class Benchmark:
     def row(self, flds):
         return [self.final[f] if f in self.final else '' for f in flds]

+    def speedup(self, name):
+        if name in self.speedup:
+            return self.speedup[name]
+        return None
+

 def _read_json(filename, badjson_files, nonexistant_files):
     stripped = ".".join(filename.split(".")[:-2])
@@ -204,6 +211,36 @@ def diff(bms, loops, regex, track, old, new, counters):
         really_interesting.update(bm.process(track, new, old))
     fields = [f for f in track if f in really_interesting]
+
+    # figure out the significance of the changes... right now we take the 95%-ile
+    # benchmark delta %-age, and then apply some hand chosen thresholds
+    histogram = []
+    for bm in benchmarks.values():
+        if bm.skip():
+            continue
+        d = bm.speedup('cpu_time')
+        if d is None:
+            continue
+        histogram.append(d)
+    histogram.sort()
+    print("histogram of speedups: ", histogram)
+    if len(histogram) == 0:
+        significance = 0
+    else:
+        delta = histogram[int(len(histogram) * 0.95)]
+        mul = 1
+        if delta < 0:
+            delta = -delta
+            mul = -1
+        if delta < 2:
+            significance = 0
+        elif delta < 5:
+            significance = 1
+        elif delta < 10:
+            significance = 2
+        else:
+            significance = 3
+        significance *= mul

     headers = ['Benchmark'] + fields
     rows = []
     for name in sorted(benchmarks.keys()):
@@ -222,9 +259,10 @@ def diff(bms, loops, regex, track, old, new, counters):
         note = '\n\nMissing files (indicates new benchmark): \n%s' % fmt_dict(
             nonexistant_files)
     if rows:
-        return tabulate.tabulate(rows, headers=headers, floatfmt='+.2f'), note
+        return tabulate.tabulate(rows, headers=headers,
+                                 floatfmt='+.2f'), note, significance
     else:
-        return None, note
+        return None, note, 0


 if __name__ == '__main__':
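
The heuristic added to diff() reduces to: collect the per-benchmark cpu_time speedups (one signed percentage per benchmark, sign convention inherited from bm_speedup.speedup), take the 95th-percentile delta, and bucket its magnitude into a 0-3 level, signed by the direction of the change. A minimal standalone sketch of that logic follows; the names are illustrative, not from the patch. (Note the patch gives Benchmark both a self.speedup dict and a speedup() method; since an instance attribute shadows a method in Python, the sketch keeps the two names distinct.)

# Sketch of the significance bucketing added to diff() above.
# `speedups` stands in for the per-benchmark cpu_time deltas, in percent.
def significance_of(speedups):
    if not speedups:
        return 0
    ordered = sorted(speedups)
    # 95th-percentile delta; remember the sign so widespread slowdowns
    # and widespread speedups land on opposite sides of zero.
    delta = ordered[int(len(ordered) * 0.95)]
    mul = 1
    if delta < 0:
        delta = -delta
        mul = -1
    # Hand-chosen thresholds, mirroring the patch: under 2% is noise,
    # then increasingly significant buckets at 5% and 10%.
    if delta < 2:
        level = 0
    elif delta < 5:
        level = 1
    elif delta < 10:
        level = 2
    else:
        level = 3
    return level * mul

# For example: significance_of([-1.0, 0.5, 3.0, 7.5]) returns 2,
# since the 95%-ile delta is +7.5, which falls in the 5-10% bucket.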

tools/profiling/microbenchmarks/bm_diff/bm_main.py

@@ -135,8 +135,9 @@ def main(args):
     random.shuffle(jobs_list, random.SystemRandom().random)
     jobset.run(jobs_list, maxjobs=args.jobs)

-    diff, note = bm_diff.diff(args.benchmarks, args.loops, args.regex,
-                              args.track, old, 'new', args.counters)
+    diff, note, significance = bm_diff.diff(args.benchmarks, args.loops,
+                                            args.regex, args.track, old, 'new',
+                                            args.counters)
     if diff:
         text = '[%s] Performance differences noted:\n%s' % (
             args.pr_comment_name, diff)
@@ -146,6 +147,7 @@ def main(args):
         text = note + '\n\n' + text
     print('%s' % text)
     check_on_pr.check_on_pr('Benchmark', '```\n%s\n```' % text)
+    check_on_pr.label_significance_on_pr('perf-change', significance)


 if __name__ == '__main__':
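
On the bm_main.py side the contract change is simply that bm_diff.diff() now returns a three-tuple, with the extra significance value handed to check_on_pr.label_significance_on_pr, whose implementation is outside this diff. A hypothetical sketch of such a labeler follows, assuming one PR label per category with the level encoded in its name; the label format and the _set_pr_label helper are stand-ins, not the real check_on_pr API.

# Hypothetical sketch only: the real label_significance_on_pr is not part
# of this diff. It assumes significance in -3..3, as produced by bm_diff.diff().
def _set_pr_label(prefix, label):
    # Stand-in for the GitHub API call the real script would make:
    # drop any stale '<prefix>...' label, then apply `label`.
    print('would replace labels starting with %r with %r' % (prefix, label))

def label_significance_on_pr(category, significance):
    significance = max(-3, min(3, significance))  # clamp to the known range
    if significance == 0:
        return  # changes below the noise thresholds get no label
    label = '%s/%+d' % (category, significance)  # e.g. 'perf-change/+2'
    _set_pr_label(category + '/', label)

# Usage mirroring the call site above: label_significance_on_pr('perf-change', 2)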
