// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Note: Blocks that begin with "$ benchstat" below will be tested by
// doc_test.go.

// Benchstat computes statistical summaries and A/B comparisons of Go
// benchmarks.
//
// Usage:
//
//	benchstat [flags] inputs...
//
// Each input file should be in the Go benchmark format
// (https://golang.org/design/14313-benchmark-format), such as the
// output of “go test -bench .”. Typically, there should be two (or
// more) input files for before and after some change (or series of
// changes) to be measured. Each benchmark should be run at least 10
// times to gather a statistically significant sample of results. For
// each benchmark, benchstat computes the median and the confidence
// interval for the median. By default, if there are two or more
// input files, it compares each benchmark in the first file to the
// same benchmark in each subsequent file and reports whether there
// was a statistically significant difference, though it can be
// configured to compare on other dimensions.
//
// # Example
//
// Suppose we collect results from running a set of benchmarks 10 times
// before a particular change:
//
//	go test -run='^$' -bench=. -count=10 > old.txt
//
// And the same benchmarks 10 times after:
//
//	go test -run='^$' -bench=. -count=10 > new.txt
//
// The file old.txt contains:
//
//	goos: linux
//	goarch: amd64
//	pkg: golang.org/x/perf/cmd/benchstat/testdata
//	BenchmarkEncode/format=json-48    690848    1726 ns/op
//	BenchmarkEncode/format=json-48    684861    1723 ns/op
//	BenchmarkEncode/format=json-48    693285    1707 ns/op
//	BenchmarkEncode/format=json-48    677692    1707 ns/op
//	BenchmarkEncode/format=json-48    692130    1713 ns/op
//	BenchmarkEncode/format=json-48    684164    1729 ns/op
//	BenchmarkEncode/format=json-48    682500    1736 ns/op
//	BenchmarkEncode/format=json-48    677509    1707 ns/op
//	BenchmarkEncode/format=json-48    687295    1705 ns/op
//	BenchmarkEncode/format=json-48    695533    1774 ns/op
//	BenchmarkEncode/format=gob-48     372699    3069 ns/op
//	BenchmarkEncode/format=gob-48     394740    3075 ns/op
//	BenchmarkEncode/format=gob-48     391335    3069 ns/op
//	BenchmarkEncode/format=gob-48     383588    3067 ns/op
//	BenchmarkEncode/format=gob-48     385885    3207 ns/op
//	BenchmarkEncode/format=gob-48     389970    3064 ns/op
//	BenchmarkEncode/format=gob-48     393361    3064 ns/op
//	BenchmarkEncode/format=gob-48     393882    3058 ns/op
//	BenchmarkEncode/format=gob-48     396171    3059 ns/op
//	BenchmarkEncode/format=gob-48     397812    3062 ns/op
//
// The file new.txt contains:
//
//	goos: linux
//	goarch: amd64
//	pkg: golang.org/x/perf/cmd/benchstat/testdata
//	BenchmarkEncode/format=json-48    714387    1423 ns/op
//	BenchmarkEncode/format=json-48    845445    1416 ns/op
//	BenchmarkEncode/format=json-48    815714    1411 ns/op
//	BenchmarkEncode/format=json-48    828824    1413 ns/op
//	BenchmarkEncode/format=json-48    834070    1412 ns/op
//	BenchmarkEncode/format=json-48    828123    1426 ns/op
//	BenchmarkEncode/format=json-48    834493    1422 ns/op
//	BenchmarkEncode/format=json-48    838406    1424 ns/op
//	BenchmarkEncode/format=json-48    836227    1447 ns/op
//	BenchmarkEncode/format=json-48    830835    1425 ns/op
//	BenchmarkEncode/format=gob-48     394441    3075 ns/op
//	BenchmarkEncode/format=gob-48     393207    3065 ns/op
//	BenchmarkEncode/format=gob-48     392374    3059 ns/op
//	BenchmarkEncode/format=gob-48     396037    3065 ns/op
//	BenchmarkEncode/format=gob-48     393255    3060 ns/op
//	BenchmarkEncode/format=gob-48     382629    3081 ns/op
//	BenchmarkEncode/format=gob-48     389558    3186 ns/op
//	BenchmarkEncode/format=gob-48     392668    3135 ns/op
//	BenchmarkEncode/format=gob-48     392313    3087 ns/op
//	BenchmarkEncode/format=gob-48     394274    3062 ns/op
//
// The order of the lines in the file does not matter, except that the
// output lists benchmarks in order of appearance.
//
// If we run “benchstat old.txt new.txt”, it will summarize the
// benchmarks and compare the before and after results:
//
//	$ benchstat old.txt new.txt
//	goos: linux
//	goarch: amd64
//	pkg: golang.org/x/perf/cmd/benchstat/testdata
//	                       │   old.txt   │               new.txt               │
//	                       │   sec/op    │   sec/op     vs base                │
//	Encode/format=json-48    1.718µ ± 1%   1.423µ ± 1%  -17.20% (p=0.000 n=10)
//	Encode/format=gob-48     3.066µ ± 0%   3.070µ ± 2%        ~ (p=0.446 n=10)
//	geomean                  2.295µ        2.090µ        -8.94%
//
// Before the comparison table, we see common file-level
// configuration. If there are benchmarks with different configuration
// (for example, from different packages), benchstat will print
// separate tables for each configuration.
//
// The table then compares the two input files for each benchmark. It
// shows the median and 95% confidence interval summaries for each
// benchmark before and after the change, and an A/B comparison under
// "vs base". The comparison shows that Encode/format=json got 17.20%
// faster with a p-value of 0.000 and 10 samples from each input file.
// The p-value measures how likely it is that any differences were due
// to random chance (i.e., noise). In this case, it's extremely
// unlikely the difference between the medians was due to chance. For
// Encode/format=gob, the "~" means benchstat did not detect a
// statistically significant difference between the two inputs. In
// this case, we see a p-value of 0.446, meaning it's very likely the
// differences for this benchmark are simply due to random chance.
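//
// The same summaries and comparisons are available programmatically
// from golang.org/x/perf/benchmath, the package this command uses
// internally. The following sketch assumes benchmath's NewSample,
// AssumeNothing, Summary, and Compare API, with made-up sample
// values:
//
//	// Summarize and compare two samples the way benchstat does by
//	// default (assume=nothing: median plus Mann-Whitney U-test).
//	before := benchmath.NewSample([]float64{1726, 1723, 1707}, &benchmath.DefaultThresholds)
//	after := benchmath.NewSample([]float64{1423, 1416, 1411}, &benchmath.DefaultThresholds)
//	sum := benchmath.AssumeNothing.Summary(before, 0.95)  // median and 95% CI
//	cmp := benchmath.AssumeNothing.Compare(before, after) // p-value of the U-test
//	fmt.Printf("median %v, p %v\n", sum.Center, cmp.P)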
//
// Note that "statistically significant" is not the same as "large":
// with enough low-noise data, even very small changes can be
// distinguished from noise and considered statistically significant.
// It is, of course, generally easier to distinguish large changes
// from noise.
//
// Finally, the last row of the table shows the geometric mean of each
// column, giving an overall picture of how the benchmarks changed.
// Proportional changes in the geomean reflect proportional changes in
// the benchmarks. For example, given n benchmarks, if sec/op for one
// of them increases by a factor of 2, then the sec/op geomean will
// increase by a factor of ⁿ√2.
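//
// As a concrete illustration (with invented numbers, not results from
// the files above): given n = 2 benchmarks at 1µs and 4µs, the
// geomean is √(1·4) = 2µs. Doubling the first benchmark to 2µs moves
// the geomean to √(2·4) ≈ 2.83µs, a factor of √2, matching the ⁿ√2
// rule for n = 2.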
//
// # Filtering
//
// benchstat has a flexible system for configuring exactly which
// benchmarks are summarized and compared. First, all inputs are
// filtered according to an expression provided as the -filter flag.
//
// Filters are built from key-value terms:
//
//	key:value            - Match if key equals value.
//	key:"value"          - Same, but value is a double-quoted Go string that
//	                       may contain spaces or other special characters.
//	"key":value          - Keys may also be double-quoted.
//	key:/regexp/         - Match if key matches a regular expression.
//	key:(val1 OR val2 OR ...)
//	                     - Short-hand for key:val1 OR key:val2. Values may be
//	                       double-quoted strings or regexps.
//	*                    - Match everything.
//
// These terms can be combined into larger expressions as follows:
//
//	x y ...              - Match if x, y, etc. all match.
//	x AND y              - Same as x y.
//	x OR y               - Match if x or y match.
//	-x                   - Match if x does not match.
//	(...)                - Subexpression.
//
// Each key is one of the following:
//
//	.name                - The base name of a benchmark
//	.fullname            - The full name of a benchmark (including configuration)
//	.file                - The name of the input file or user-provided file label
//	/{name-key}          - Per-benchmark sub-name configuration key
//	{file-key}           - File-level configuration key
//	.unit                - The name of a unit for a particular metric
//
// For example, the following matches benchmarks with "/format=json"
// in the sub-name keys with file-level configuration "goos" equal to
// "linux" and extracts the "ns/op" and "B/op" measurements:
//
//	$ benchstat -filter "/format:json goos:linux .unit:(ns/op OR B/op)" old.txt new.txt
//	goos: linux
//	goarch: amd64
//	pkg: golang.org/x/perf/cmd/benchstat/testdata
//	                       │   old.txt   │               new.txt               │
//	                       │   sec/op    │   sec/op     vs base                │
//	Encode/format=json-48    1.718µ ± 1%   1.423µ ± 1%  -17.20% (p=0.000 n=10)
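//
// The same filtering is also available programmatically through the
// benchproc and benchfmt packages this command is built on. A minimal
// sketch (the file name and error handling are illustrative):
//
//	// Parse a filter query and apply it to each parsed result.
//	filter, err := benchproc.NewFilter("/format:json goos:linux")
//	if err != nil {
//		// Handle the malformed query.
//	}
//	files := benchfmt.Files{Paths: []string{"new.txt"}}
//	for files.Scan() {
//		if res, ok := files.Result().(*benchfmt.Result); ok {
//			if match, _ := filter.Apply(res); match {
//				// Process the matching result.
//			}
//		}
//	}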
//
// # Configuring comparisons
//
// The way benchstat groups and compares results is configurable using
// a similar set of keys as used for filtering. By default, benchstat
// groups results into tables using all file-level configuration keys,
// then within each table, it groups results into rows by .fullname
// (the benchmark's full name) and compares across columns by .file
// (the name of each input file). This can be changed via the
// following flags:
//
//	-table KEYS          - Group results into tables by KEYS
//	-row KEYS            - Group results into table rows by KEYS
//	-col KEYS            - Compare across results with different values of KEYS
//
// Using these flags, benchstat "projects" each result into a
// particular table cell. Each KEYS argument is a comma- or
// space-separated list of keys, each of which can optionally also
// specify a sort order (described below).
//
// Each key is one of the following:
//
//	.name                - The base name of a benchmark
//	.fullname            - The full name of a benchmark (including configuration)
//	.file                - The name of the input file or user-provided file label
//	/{name-key}          - Per-benchmark sub-name configuration key
//	{file-key}           - File-level configuration key
//	.config              - All file-level configuration keys
//
// Some of these keys can overlap. For example, ".config" includes the
// file-level key "goos", and ".fullname" includes the sub-name key
// "/format". When keys overlap like this, benchstat omits the more
// specific key from the general key. For example, if -table is the
// full file-level configuration ".config", and -col is the specific
// file key "goos", benchstat will omit "goos" from ".config".
//
// Finally, the -ignore flag can list keys that benchstat should
// ignore when grouping results. Continuing the previous example, if
// -table is ".config" and -ignore is "goos", benchstat will omit
// "goos" from ".config", but also not use it for any grouping.
//
// For precise details of the filter syntax and supported keys, see
// https://pkg.go.dev/golang.org/x/perf/benchproc/syntax.
//
// # Projection examples
//
// Returning to our first example, we can now see how the default
// projection flags produce this output:
//
//	$ benchstat -table .config -row .fullname -col .file old.txt new.txt
//	goos: linux
//	goarch: amd64
//	pkg: golang.org/x/perf/cmd/benchstat/testdata
//	                       │   old.txt   │               new.txt               │
//	                       │   sec/op    │   sec/op     vs base                │
//	Encode/format=json-48    1.718µ ± 1%   1.423µ ± 1%  -17.20% (p=0.000 n=10)
//	Encode/format=gob-48     3.066µ ± 0%   3.070µ ± 2%        ~ (p=0.446 n=10)
//	geomean                  2.295µ        2.090µ        -8.94%
//
// In this example, all benchmarks have the same file-level
// configuration, consisting of "goos", "goarch", and "pkg", so
// ".config" groups them into just one table. Within this table,
// results are grouped into rows by their full name, including
// configuration, and grouped into columns by the name of each input
// file.
//
// Suppose we instead want to compare json encoding to gob encoding
// from new.txt.
//
//	$ benchstat -col /format new.txt
//	goos: linux
//	goarch: amd64
//	pkg: golang.org/x/perf/cmd/benchstat/testdata
//	           │    json     │                 gob                 │
//	           │   sec/op    │   sec/op     vs base                │
//	Encode-48    1.423µ ± 1%   3.070µ ± 2%  +115.82% (p=0.000 n=10)
//
// The columns are now labeled by the "/format" configuration from the
// benchmark name. benchstat still compares columns even though we've
// only provided a single input file. We also see that /format has
// been removed from the benchmark name to make a single row.
//
// We can simplify the output by grouping rows by just the benchmark name,
// rather than the full name:
//
//	$ benchstat -col /format -row .name new.txt
//	goos: linux
//	goarch: amd64
//	pkg: golang.org/x/perf/cmd/benchstat/testdata
//	        │    json     │                 gob                 │
//	        │   sec/op    │   sec/op     vs base                │
//	Encode    1.423µ ± 1%   3.070µ ± 2%  +115.82% (p=0.000 n=10)
//
// benchstat will attempt to detect and warn if projections strip away
// too much information. For example, here we group together json and
// gob results into a single row:
//
//	$ benchstat -row .name new.txt
//	goos: linux
//	goarch: amd64
//	pkg: golang.org/x/perf/cmd/benchstat/testdata
//	        │   new.txt    │
//	        │    sec/op    │
//	Encode    2.253µ ± 37% ¹
//	¹ benchmarks vary in .fullname
//
// Since this is probably not a meaningful comparison, benchstat warns
// that the benchmarks it grouped together vary in a hidden dimension.
// If this really were our intent, we could -ignore .fullname.
//
// # Sorting
//
// By default, benchstat sorts each dimension according to the order
// in which it first observes each value of that dimension. This can
// be overridden in each projection using the following syntax:
//
//	{key}@{order} - specifies one of the built-in named sort orders.
//	This can be "alpha" or "num" for alphabetic or numeric sorting.
//	"num" understands basic use of metric and IEC prefixes like "2k"
//	and "1Mi".
//
//	{key}@({value} {value} ...) - specifies a fixed value order for
//	key. It also specifies a filter: if key has a value that isn't any
//	of the specified values, the result is filtered out.
//
// For example, we can use a fixed order to compare the improvement of
// json over gob rather than the other way around:
//
// $ benchstat -col "/format@(gob json)" -row .name -ignore .file new.txt
// goos: linux
// goarch: amd64
// pkg: golang.org/x/perf/cmd/benchstat/testdata
// │ gob │ json │
// │ sec/op │ sec/op vs base │
// Encode 3.070µ ± 2% 1.423µ ± 1% -53.66% (p=0.000 n=10)
//
// # Overriding .file
//
// Often, you want to compare results from different files, but want
// to provide more meaningful (or perhaps shorter) column labels than
// raw file names. File name labels can be overridden by specifying an
// input argument of the form "label=path" instead of just "path".
// This provides a custom value for the .file key.
//
// For example, the following will perform the default comparison, but
// label the columns O and N instead of old.txt and new.txt:
//
//	$ benchstat O=old.txt N=new.txt
//	goos: linux
//	goarch: amd64
//	pkg: golang.org/x/perf/cmd/benchstat/testdata
//	                       │      O      │                  N                  │
//	                       │   sec/op    │   sec/op     vs base                │
//	Encode/format=json-48    1.718µ ± 1%   1.423µ ± 1%  -17.20% (p=0.000 n=10)
//	Encode/format=gob-48     3.066µ ± 0%   3.070µ ± 2%        ~ (p=0.446 n=10)
//	geomean                  2.295µ        2.090µ        -8.94%
//
// # Units
//
// benchstat normalizes the units "ns" to "sec" and "MB" to "B" to
// avoid creating nonsense units like "µns/op". These appear in the
// testing package's default metrics and are also common in custom
// metrics.
//
// benchstat supports custom unit metadata (see
// https://golang.org/design/14313-benchmark-format). In particular,
// "assume" metadata is useful for controlling the statistics used by
// benchstat. By default, units use "assume=nothing", so benchstat
// uses non-parametric statistics: median for summaries, and the
// Mann-Whitney U-test for A/B comparisons.
//
// Some benchmarks measure things that have no noise, such as the size
// of a binary produced by a compiler. These do not benefit from
// repeated measurements or non-parametric statistics. For these
// units, it's useful to set "assume=exact". This will cause benchstat
// to warn if there's any variation in the measured values, and to
// show A/B comparisons even if there's only one before and after
// measurement.
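//
// For example, a benchmark that measures binary size might report it
// as a custom metric and mark the unit as exact with a "Unit"
// metadata line. This sketch assumes a hypothetical buildBinary
// helper, and the metric name bin-bytes/op is made up for
// illustration:
//
//	func TestMain(m *testing.M) {
//		// Unit metadata lines may appear anywhere in the output.
//		fmt.Println("Unit bin-bytes/op assume=exact")
//		os.Exit(m.Run())
//	}
//
//	func BenchmarkBinarySize(b *testing.B) {
//		size := buildBinary() // hypothetical helper
//		b.ReportMetric(float64(size), "bin-bytes/op")
//	}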
//
// # Tips
//
// Reducing noise and/or increasing the number of benchmark runs will
// enable benchstat to discern smaller changes as "statistically
// significant". To reduce noise, make sure you run benchmarks on an
// otherwise idle machine, ideally one that isn't running on battery
// and isn't likely to be affected by thermal throttling.
// https://llvm.org/docs/Benchmarking.html has many good tips on
// reducing noise in benchmarks.
//
// It's also important that noise is evenly distributed across
// benchmark runs. The best way to do this is to interleave before and
// after runs, rather than running, say, 10 iterations of the before
// benchmark, and then 10 iterations of the after benchmark. For Go
// benchmarks, you can often speed up this process by using "go test
// -c" to pre-compile the benchmark binary.
//
// Pick a number of benchmark runs (at least 10, ideally 20) and stick
// to it. If benchstat reports no statistically significant change,
// avoid simply rerunning your benchmarks until it reports a
// significant change. This is known as "multiple testing" and is a
// common statistical error. By default, benchstat uses an α threshold
// of 0.05, which means it is *expected* to show a difference 5% of
// the time even if there is no difference. Hence, if you rerun
// benchmarks looking for a change, benchstat will probably eventually
// say there is a change, even if there isn't, which creates a
// statistical bias.
//
// As an extension of this, if you compare a large number of
// benchmarks, you should expect that about 5% of them will report a
// statistically significant change even if there is no difference
// between the before and after.
package main

import (
	"flag"
	"fmt"
	"io"
	"os"

	"golang.org/x/perf/benchfmt"
	"golang.org/x/perf/benchmath"
	"golang.org/x/perf/benchproc"
	"golang.org/x/perf/cmd/benchstat/internal/benchtab"
)

// TODO: Add a flag to perform Holm–Bonferroni correction for
// family-wise error rates. This can be done after-the-fact on a
// collection of benchstat.Comparison values.

// TODO: -unit flag.

// TODO: Support sorting by commit order.

// TODO: Add some quick usage examples to the -h output?

// TODO: If the projection results in a very sparse table, that's
// usually the result of correlated keys. Can we detect that and
// suggest fixes?

func main() {
	if err := benchstat(os.Stdout, os.Stderr, os.Args[1:]); err != nil {
		fmt.Fprintf(os.Stderr, "benchstat: %s\n", err)
		os.Exit(1)
	}
}

func benchstat(w, wErr io.Writer, args []string) error {
	flags := flag.NewFlagSet("", flag.ExitOnError)
	flags.Usage = func() {
		fmt.Fprintf(flags.Output(), `Usage: benchstat [flags] inputs...

benchstat computes statistical summaries and A/B comparisons of Go
benchmarks. It shows benchmark medians in a table with a row for each
benchmark and a column for each input file. If there is more than one
input file, it also shows A/B comparisons between the files. If a
difference is likely to be noise, it shows "~".

For details, see https://pkg.go.dev/golang.org/x/perf/cmd/benchstat.
`)
		flags.PrintDefaults()
	}
	thresholds := benchmath.DefaultThresholds
	flagTable := flags.String("table", ".config", "split results into tables by distinct values of `projection`")
	flagRow := flags.String("row", ".fullname", "split results into rows by distinct values of `projection`")
	flagCol := flags.String("col", ".file", "split results into columns by distinct values of `projection`")
	flagIgnore := flags.String("ignore", "", "ignore variations in `keys`")
	flagFilter := flags.String("filter", "*", "use only benchmarks matching benchfilter `query`")
	flags.Float64Var(&thresholds.CompareAlpha, "alpha", thresholds.CompareAlpha, "consider change significant if p < `α`")
	// TODO: Support -confidence none to disable CI column? This
	// would be equivalent to benchstat v1's -norange for CSV.
	flagConfidence := flags.Float64("confidence", 0.95, "confidence `level` for ranges")
	flagFormat := flags.String("format", "text", "print results in `format`:\n  text - plain text\n  csv  - comma-separated values (warnings will be written to stderr)\n")
	flags.Parse(args)

	if flags.NArg() == 0 {
		flags.Usage()
		os.Exit(2)
	}

	filter, err := benchproc.NewFilter(*flagFilter)
	if err != nil {
		return fmt.Errorf("parsing -filter: %s", err)
	}

	var parser benchproc.ProjectionParser
	var parseErr error
	mustParse := func(name, val string, unit bool) *benchproc.Projection {
		var proj *benchproc.Projection
		var err error
		if unit {
			proj, _, err = parser.ParseWithUnit(val, filter)
		} else {
			proj, err = parser.Parse(val, filter)
		}
		if err != nil && parseErr == nil {
			parseErr = fmt.Errorf("parsing %s: %s", name, err)
		}
		return proj
	}
	tableBy := mustParse("-table", *flagTable, true)
	rowBy := mustParse("-row", *flagRow, false)
	colBy := mustParse("-col", *flagCol, false)
	mustParse("-ignore", *flagIgnore, false)
	residue := parser.Residue()
	if parseErr != nil {
		return parseErr
	}

	if thresholds.CompareAlpha < 0 || thresholds.CompareAlpha > 1 {
		return fmt.Errorf("-alpha must be in range [0, 1]")
	}
	if *flagConfidence < 0 || *flagConfidence > 1 {
		return fmt.Errorf("-confidence must be in range [0, 1]")
	}
	var format func(t *benchtab.Tables) error
	switch *flagFormat {
	default:
		return fmt.Errorf("-format must be text or csv")
	case "text":
		format = func(t *benchtab.Tables) error { return t.ToText(w, false) }
	case "csv":
		format = func(t *benchtab.Tables) error { return t.ToCSV(w, wErr) }
	}

	stat := benchtab.NewBuilder(tableBy, rowBy, colBy, residue)
	files := benchfmt.Files{Paths: flags.Args(), AllowStdin: true, AllowLabels: true}
	for files.Scan() {
		switch rec := files.Result(); rec := rec.(type) {
		case *benchfmt.SyntaxError:
			// Non-fatal result parse error. Warn
			// but keep going.
			fmt.Fprintln(wErr, rec)
		case *benchfmt.Result:
			if ok, err := filter.Apply(rec); !ok {
				if err != nil {
					// Print the reason we rejected this result.
					fmt.Fprintln(wErr, err)
				}
				continue
			}
			stat.Add(rec)
		}
	}
	if err := files.Err(); err != nil {
		return err
	}

	tables := stat.ToTables(benchtab.TableOpts{
		Confidence: *flagConfidence,
		Thresholds: &thresholds,
		Units:      files.Units(),
	})
	return format(tables)
}