From 2e83110230b7e91b07835e9c718a1d6fbcb8b617 Mon Sep 17 00:00:00 2001 From: Josh Haberman Date: Wed, 27 Apr 2016 18:22:22 -0700 Subject: [PATCH] Added framework for generating/consuming benchmarking data sets. This takes the code that was sitting in benchmarks/ already and makes it easier for language-specific benchmarks to consume. Future PRs will enhance this so that the language-specific benchmarks can report metrics back that will be tracked over time in PerfKit. --- Makefile.am | 6 +- benchmarks/Makefile.am | 75 ++++++++++++ ....proto => benchmark_messages_proto2.proto} | 19 +-- benchmarks/benchmark_messages_proto3.proto | 76 ++++++++++++ benchmarks/benchmarks.proto | 102 ++++++++++++++++ benchmarks/generate_datasets.cc | 114 ++++++++++++++++++ configure.ac | 2 +- 7 files changed, 384 insertions(+), 10 deletions(-) create mode 100644 benchmarks/Makefile.am rename benchmarks/{google_speed.proto => benchmark_messages_proto2.proto} (91%) create mode 100644 benchmarks/benchmark_messages_proto3.proto create mode 100644 benchmarks/benchmarks.proto create mode 100644 benchmarks/generate_datasets.cc diff --git a/Makefile.am b/Makefile.am index a7a1f413b2..3e9888166f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -9,7 +9,7 @@ AUTOMAKE_OPTIONS = foreign SUBDIRS = . src # Always include gmock in distributions. -DIST_SUBDIRS = $(subdirs) src conformance +DIST_SUBDIRS = $(subdirs) src conformance benchmarks # Build gmock before we build protobuf tests. We don't add gmock to SUBDIRS # because then "make check" would also build and run all of gmock's own tests, @@ -36,6 +36,10 @@ clean-local: echo "Making clean in conformance"; \ cd conformance && $(MAKE) $(AM_MAKEFLAGS) clean; \ fi; \ + if test -e benchmarks/Makefile; then \ + echo "Making clean in benchmarks"; \ + cd benchmarks && $(MAKE) $(AM_MAKEFLAGS) clean; \ + fi; \ if test -e objectivec/DevTools; then \ echo "Cleaning any ObjC pyc files"; \ rm -f objectivec/DevTools/*.pyc; \ diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am new file mode 100644 index 0000000000..79581ee983 --- /dev/null +++ b/benchmarks/Makefile.am @@ -0,0 +1,75 @@ + +benchmarks_protoc_inputs = \ + benchmarks.proto \ + benchmark_messages_proto3.proto + +benchmarks_protoc_inputs_proto2 = \ + benchmark_messages_proto2.proto + +benchmarks_protoc_outputs = \ + benchmarks.pb.cc \ + benchmarks.pb.h \ + benchmark_messages_proto3.pb.cc \ + benchmark_messages_proto3.pb.h + +benchmarks_protoc_outputs_proto2 = \ + benchmark_messages_proto2.pb.cc \ + benchmark_messages_proto2.pb.h + +bin_PROGRAMS = generate-datasets + +generate_datasets_LDADD = $(top_srcdir)/src/libprotobuf.la +generate_datasets_SOURCES = generate_datasets.cc +generate_datasets_CPPFLAGS = -I$(top_srcdir)/src -I$(srcdir) +nodist_generate_datasets_SOURCES = \ + google_message1.h \ + google_message2.h \ + $(benchmarks_protoc_outputs) \ + $(benchmarks_protoc_outputs_proto2) + +# Explicit deps beacuse BUILT_SOURCES are only done before a "make all/check" +# so a direct "make test_cpp" could fail if parallel enough. +generate_datasets-generate_datasets.$(OBJEXT): benchmarks.pb.h google_message1.h google_message2.h + +$(benchmarks_protoc_outputs): protoc_middleman +$(benchmarks_protoc_outputs_proto2): protoc_middleman2 + +google_message1.h: google_message1.dat + xxd -i $< $@ + +google_message2.h: google_message2.dat + xxd -i $< $@ + +CLEANFILES = \ + $(benchmarks_protoc_outputs) \ + $(benchmarks_protoc_outputs_proto2) \ + google_message1.h \ + google_message2.h \ + protoc_middleman \ + protoc_middleman2 \ + dataset.* + +if USE_EXTERNAL_PROTOC + +protoc_middleman: $(benchmarks_protoc_inputs) + $(PROTOC) -I$(srcdir) -I$(top_srcdir) --cpp_out=. $(benchmarks_protoc_inputs) + touch protoc_middleman + +protoc_middleman2: $(benchmarks_protoc_inputs_proto2) + $(PROTOC) -I$(srcdir) -I$(top_srcdir) --cpp_out=. $(benchmarks_protoc_inputs_proto2) + touch protoc_middleman2 + +else + +# We have to cd to $(srcdir) before executing protoc because $(protoc_inputs) is +# relative to srcdir, which may not be the same as the current directory when +# building out-of-tree. +protoc_middleman: $(top_srcdir)/src/protoc$(EXEEXT) $(benchmarks_protoc_inputs) $(well_known_type_protoc_inputs) + oldpwd=`pwd` && ( cd $(srcdir) && $$oldpwd/../src/protoc$(EXEEXT) -I. -I$(top_srcdir)/src --cpp_out=$$oldpwd $(benchmarks_protoc_inputs) ) + touch protoc_middleman + +protoc_middleman2: $(top_srcdir)/src/protoc$(EXEEXT) $(benchmarks_protoc_inputs_proto2) $(well_known_type_protoc_inputs) + oldpwd=`pwd` && ( cd $(srcdir) && $$oldpwd/../src/protoc$(EXEEXT) -I. -I$(top_srcdir)/src --cpp_out=$$oldpwd $(benchmarks_protoc_inputs_proto2) ) + touch protoc_middleman + +endif diff --git a/benchmarks/google_speed.proto b/benchmarks/benchmark_messages_proto2.proto similarity index 91% rename from benchmarks/google_speed.proto rename to benchmarks/benchmark_messages_proto2.proto index 16f6d67804..c7103be5f6 100644 --- a/benchmarks/google_speed.proto +++ b/benchmarks/benchmark_messages_proto2.proto @@ -1,11 +1,14 @@ +// Benchmark messages for proto2. + syntax = "proto2"; -package benchmarks; +package benchmarks.p2; +option java_package = "com.google.protobuf.benchmarks"; -option java_outer_classname = "GoogleSpeed"; +// This is the default, but we specify it here explicitly. option optimize_for = SPEED; -message SpeedMessage1 { +message GoogleMessage1 { required string field1 = 1; optional string field9 = 9; optional string field18 = 18; @@ -40,7 +43,7 @@ message SpeedMessage1 { optional int32 field23 = 23 [default=0]; optional bool field24 = 24 [default=false]; optional int32 field25 = 25 [default=0]; - optional SpeedMessage1SubMessage field15 = 15; + optional GoogleMessage1SubMessage field15 = 15; optional bool field78 = 78; optional int32 field67 = 67 [default=0]; optional int32 field68 = 68; @@ -49,7 +52,7 @@ message SpeedMessage1 { optional int32 field131 = 131 [default=0]; } -message SpeedMessage1SubMessage { +message GoogleMessage1SubMessage { optional int32 field1 = 1 [default=0]; optional int32 field2 = 2 [default=0]; optional int32 field3 = 3 [default=0]; @@ -72,7 +75,7 @@ message SpeedMessage1SubMessage { optional uint64 field300 = 300; } -message SpeedMessage2 { +message GoogleMessage2 { optional string field1 = 1; optional int64 field3 = 3; optional int64 field4 = 4; @@ -112,7 +115,7 @@ message SpeedMessage2 { repeated int32 field73 = 73; optional int32 field20 = 20 [default=0]; optional string field24 = 24; - optional SpeedMessage2GroupedMessage field31 = 31; + optional GoogleMessage2GroupedMessage field31 = 31; } repeated string field128 = 128; optional int64 field131 = 131; @@ -123,7 +126,7 @@ message SpeedMessage2 { optional bool field206 = 206 [default=false]; } -message SpeedMessage2GroupedMessage { +message GoogleMessage2GroupedMessage { optional float field1 = 1; optional float field2 = 2; optional float field3 = 3 [default=0.0]; diff --git a/benchmarks/benchmark_messages_proto3.proto b/benchmarks/benchmark_messages_proto3.proto new file mode 100644 index 0000000000..4ea39c22f2 --- /dev/null +++ b/benchmarks/benchmark_messages_proto3.proto @@ -0,0 +1,76 @@ +// Benchmark messages for proto3. + +syntax = "proto3"; + +package benchmarks.p3; +option java_package = "com.google.protobuf.benchmarks"; + +// This is the default, but we specify it here explicitly. +option optimize_for = SPEED; + +message GoogleMessage1 { + string field1 = 1; + string field9 = 9; + string field18 = 18; + bool field80 = 80; + bool field81 = 81; + int32 field2 = 2; + int32 field3 = 3; + int32 field280 = 280; + int32 field6 = 6; + int64 field22 = 22; + string field4 = 4; + repeated fixed64 field5 = 5; + bool field59 = 59; + string field7 = 7; + int32 field16 = 16; + int32 field130 = 130; + bool field12 = 12; + bool field17 = 17; + bool field13 = 13; + bool field14 = 14; + int32 field104 = 104; + int32 field100 = 100; + int32 field101 = 101; + string field102 = 102; + string field103 = 103; + int32 field29 = 29; + bool field30 = 30; + int32 field60 = 60; + int32 field271 = 271; + int32 field272 = 272; + int32 field150 = 150; + int32 field23 = 23; + bool field24 = 24; + int32 field25 = 25; + GoogleMessage1SubMessage field15 = 15; + bool field78 = 78; + int32 field67 = 67; + int32 field68 = 68; + int32 field128 = 128; + string field129 = 129; + int32 field131 = 131; +} + +message GoogleMessage1SubMessage { + int32 field1 = 1; + int32 field2 = 2; + int32 field3 = 3; + string field15 = 15; + bool field12 = 12; + int64 field13 = 13; + int64 field14 = 14; + int32 field16 = 16; + int32 field19 = 19; + bool field20 = 20; + bool field28 = 28; + fixed64 field21 = 21; + int32 field22 = 22; + bool field23 = 23; + bool field206 = 206; + fixed32 field203 = 203; + int32 field204 = 204; + string field205 = 205; + uint64 field207 = 207; + uint64 field300 = 300; +} diff --git a/benchmarks/benchmarks.proto b/benchmarks/benchmarks.proto new file mode 100644 index 0000000000..a891eb9ec7 --- /dev/null +++ b/benchmarks/benchmarks.proto @@ -0,0 +1,102 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +syntax = "proto3"; +package benchmarks; +option java_package = "com.google.protobuf.benchmarks"; + +message BenchmarkDataset { + // Name of the benchmark dataset. This should be unique across all datasets. + // Should only contain word characters: [a-zA-Z0-9_] + string name = 1; + + // Fully-qualified name of the protobuf message for this dataset. + // It will be one of the messages defined benchmark_messages.proto. + // Implementations that do not support reflection can implement this with + // an explicit "if/else" chain that lists every possible message defined + // in this file. + string message_name = 2; + + // The payload(s) for this dataset. They should be parsed or serialized + // in sequence, in a loop, ie. + // + // while (!benchmarkDone) { // Benchmark runner decides when to exit. + // for (i = 0; i < benchmark.payload.length; i++) { + // parse(benchmark.payload[i]) + // } + // } + // + // This is intended to let datasets include a variety of data to provide + // potentially more realistic results than just parsing the same message + // over and over. A single message parsed repeatedly could yield unusually + // good branch prediction performance. + repeated bytes payload = 3; +} + +// A benchmark can write out metrics that we will then upload to our metrics +// database for tracking over time. +message Metric { + // A unique ID for these results. Used for de-duping. + string guid = 1; + + // The tags specify exactly what benchmark was run against the dataset. + // The specific benchmark suite can decide what these mean, but here are + // some common tags that have a predefined meaning: + // + // - "dataset": for tests that pertain to a specific dataset. + // + // For example: + // + // # Tests parsing from binary proto string using arenas. + // tags={ + // dataset: "testalltypes", + // op: "parse", + // format: "binaryproto", + // input: "string" + // arena: "true" + // } + // + // # Tests serializing to JSON string. + // tags={ + // dataset: "testalltypes", + // op: "serialize", + // format: "json", + // input: "string" + // } + map labels = 2; + + // Unit of measurement for the metric: + // - a speed test might be "mb_per_second" or "ops_per_second" + // - a size test might be "kb". + string unit = 3; + + // Metric value. + double value = 4; +} diff --git a/benchmarks/generate_datasets.cc b/benchmarks/generate_datasets.cc new file mode 100644 index 0000000000..f6f30cd85b --- /dev/null +++ b/benchmarks/generate_datasets.cc @@ -0,0 +1,114 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +const char *file_prefix = "dataset."; +const char *file_suffix = ".pb"; + +#include +#include +#include "benchmarks.pb.h" +#include "google_message1.h" +#include "google_message2.h" + +using benchmarks::BenchmarkDataset; +using google::protobuf::Descriptor; +using google::protobuf::DescriptorPool; +using google::protobuf::Message; +using google::protobuf::MessageFactory; + +#define ARRAY_TO_STRING(arr) std::string(arr, arr + sizeof(arr)) + +std::set names; + +void WriteFileWithPayloads(const std::string& name, + const std::string& message_name, + const std::vector& payload) { + if (!names.insert(name).second) { + std::cerr << "Duplicate test name: " << name << "\n"; + abort(); + } + + // First verify that this message name exists in our set of benchmark messages + // and that these payloads are valid for the given message. + const Descriptor* d = + DescriptorPool::generated_pool()->FindMessageTypeByName(message_name); + + if (!d) { + std::cerr << "For dataset " << name << ", no such message: " + << message_name << "\n"; + abort(); + } + + Message* m = MessageFactory::generated_factory()->GetPrototype(d)->New(); + + for (size_t i = 0; i < payload.size(); i++) { + if (!m->ParseFromString(payload[i])) { + std::cerr << "For dataset " << name << ", payload[" << i << "] fails " + << "to parse\n"; + abort(); + } + } + + BenchmarkDataset dataset; + dataset.set_name(name); + dataset.set_message_name(message_name); + for (size_t i = 0; i < payload.size(); i++) { + dataset.add_payload()->assign(payload[i]); + } + + std::string serialized; + dataset.SerializeToString(&serialized); + + std::ofstream writer; + std::string fname = file_prefix + name + file_suffix; + writer.open(fname); + writer << serialized; + writer.close(); + + std::cerr << "Wrote dataset: " << fname << "\n"; +} + +void WriteFile(const std::string& name, const std::string& message_name, + const std::string& payload) { + std::vector payloads; + payloads.push_back(payload); + WriteFileWithPayloads(name, message_name, payloads); +} + +int main() { + WriteFile("google_message1_proto3", "benchmarks.p3.GoogleMessage1", + ARRAY_TO_STRING(google_message1_dat)); + WriteFile("google_message1_proto2", "benchmarks.p2.GoogleMessage1", + ARRAY_TO_STRING(google_message1_dat)); + + // Not in proto3 because it has a group, which is not supported. + WriteFile("google_message2", "benchmarks.p2.GoogleMessage2", + ARRAY_TO_STRING(google_message2_dat)); +} diff --git a/configure.ac b/configure.ac index 33a6c64d6f..d56a704705 100644 --- a/configure.ac +++ b/configure.ac @@ -180,5 +180,5 @@ export CFLAGS export CXXFLAGS AC_CONFIG_SUBDIRS([gmock]) -AC_CONFIG_FILES([Makefile src/Makefile conformance/Makefile protobuf.pc protobuf-lite.pc]) +AC_CONFIG_FILES([Makefile src/Makefile benchmarks/Makefile conformance/Makefile protobuf.pc protobuf-lite.pc]) AC_OUTPUT