From b872ffe2f2b9806b4774e93be23cd07e7f1ec5b1 Mon Sep 17 00:00:00 2001 From: Alexander Polcyn Date: Wed, 6 May 2020 15:17:15 -0700 Subject: [PATCH] Make channel soak interop test more debuggable, add max_failures and max_acceptable_latency_ms options --- test/cpp/interop/client.cc | 20 +++-- test/cpp/interop/interop_client.cc | 124 ++++++++++++++++++++++++----- test/cpp/interop/interop_client.h | 16 +++- 3 files changed, 135 insertions(+), 25 deletions(-) diff --git a/test/cpp/interop/client.cc b/test/cpp/interop/client.cc index 45fb68df5b5..fc12d4b8ae6 100644 --- a/test/cpp/interop/client.cc +++ b/test/cpp/interop/client.cc @@ -88,8 +88,15 @@ DEFINE_bool(do_not_abort_on_transient_failures, false, "test is retried in case of transient failures (and currently the " "interop tests are not retried even if this flag is set to true)"); DEFINE_int32(soak_iterations, 1000, - "number of iterations to use for the two soak tests; rpc_soak and " - "channel_soak"); + "The number of iterations to use for the two soak tests; rpc_soak " + "and channel_soak."); +DEFINE_int32(soak_max_failures, 0, + "The number of iterations in soak tests that are allowed to fail " + "(either due to non-OK status code or exceeding the " + "per-iteration max acceptable latency)."); +DEFINE_int64(soak_per_iteration_max_acceptable_latency_ms, 0, + "The number of milliseconds a single iteration in the two soak " + "tests (rpc_soak and channel_soak) is allowed to take."); DEFINE_int32(iteration_interval, 10, "The interval in seconds between rpcs. 
This is used by " "long_connection test"); @@ -257,9 +264,12 @@ int main(int argc, char** argv) { std::bind(&grpc::testing::InteropClient::DoCacheableUnary, &client); actions["channel_soak"] = std::bind(&grpc::testing::InteropClient::DoChannelSoakTest, &client, - FLAGS_soak_iterations); - actions["rpc_soak"] = std::bind(&grpc::testing::InteropClient::DoRpcSoakTest, - &client, FLAGS_soak_iterations); + FLAGS_soak_iterations, FLAGS_soak_max_failures, + FLAGS_soak_per_iteration_max_acceptable_latency_ms); + actions["rpc_soak"] = + std::bind(&grpc::testing::InteropClient::DoRpcSoakTest, &client, + FLAGS_soak_iterations, FLAGS_soak_max_failures, + FLAGS_soak_per_iteration_max_acceptable_latency_ms); actions["long_lived_channel"] = std::bind(&grpc::testing::InteropClient::DoLongLivedChannelTest, &client, FLAGS_soak_iterations, FLAGS_iteration_interval); diff --git a/test/cpp/interop/interop_client.cc b/test/cpp/interop/interop_client.cc index 71ee79a6e96..63343ded620 100644 --- a/test/cpp/interop/interop_client.cc +++ b/test/cpp/interop/interop_client.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ #include #include +#include "src/core/lib/iomgr/exec_ctx.h" #include "src/proto/grpc/testing/empty.pb.h" #include "src/proto/grpc/testing/messages.pb.h" #include "src/proto/grpc/testing/test.grpc.pb.h" @@ -1065,36 +1067,122 @@ bool InteropClient::DoCustomMetadata() { return true; } -bool InteropClient::DoRpcSoakTest(int32_t soak_iterations) { - gpr_log(GPR_DEBUG, "Sending %d RPCs...", soak_iterations); - GPR_ASSERT(soak_iterations > 0); +std::tuple<bool, grpc_millis, std::string> +InteropClient::PerformOneSoakTestIteration( + const bool reset_channel, + const int64_t max_acceptable_per_iteration_latency_ms) { + gpr_timespec start = gpr_now(GPR_CLOCK_MONOTONIC); SimpleRequest request; SimpleResponse response; + // Don't set the deadline on the RPC, and instead just + // record how long the RPC took and compare. 
This makes + // debugging easier when looking at failure results. + ClientContext context; + InteropClientContextInspector inspector(context); + request.set_response_size(kLargeResponseSize); + grpc::string payload(kLargeRequestSize, '\0'); + request.mutable_payload()->set_body(payload.c_str(), kLargeRequestSize); + if (reset_channel) { + serviceStub_.ResetChannel(); + } + Status s = serviceStub_.Get()->UnaryCall(&context, request, &response); + gpr_timespec now = gpr_now(GPR_CLOCK_MONOTONIC); + grpc_millis elapsed_ms = + grpc_timespec_to_millis_round_down(gpr_time_sub(now, start)); + if (!s.ok()) { + return std::make_tuple(false, elapsed_ms, context.debug_error_string()); + } else if (elapsed_ms > max_acceptable_per_iteration_latency_ms) { + char* out; + GPR_ASSERT(gpr_asprintf( + &out, "%ld ms exceeds max acceptable latency: %ld ms.", + elapsed_ms, max_acceptable_per_iteration_latency_ms) != -1); + std::string debug_string(out); + gpr_free(out); + return std::make_tuple(false, elapsed_ms, debug_string); + } else { + return std::make_tuple(true, elapsed_ms, ""); + } +} + +void InteropClient::PerformSoakTest( + const bool reset_channel_per_iteration, const int32_t soak_iterations, + const int32_t max_failures, + const int64_t max_acceptable_per_iteration_latency_ms) { + std::vector<std::tuple<bool, grpc_millis, std::string>> results; + std::vector<grpc_millis> latencies_ms; for (int i = 0; i < soak_iterations; ++i) { - if (!PerformLargeUnary(&request, &response)) { - gpr_log(GPR_ERROR, "rpc_soak test failed on iteration %d", i); - return false; + auto result = PerformOneSoakTestIteration( + reset_channel_per_iteration, max_acceptable_per_iteration_latency_ms); + results.push_back(result); + latencies_ms.push_back(std::get<1>(result)); + } + int total_failures = 0; + for (size_t i = 0; i < results.size(); i++) { + bool success = std::get<0>(results[i]); + grpc_millis elapsed_ms = std::get<1>(results[i]); + std::string debug_string = std::get<2>(results[i]); + if (!success) { + gpr_log(GPR_DEBUG, "soak iteration:%ld 
elapsed_ms:%ld failed: %s", i, + elapsed_ms, debug_string.c_str()); + total_failures++; + } else { + gpr_log(GPR_DEBUG, "soak iteration:%ld elapsed_ms:%ld succeeded", i, + elapsed_ms); } } + std::sort(latencies_ms.begin(), latencies_ms.end()); + grpc_millis latency_ms_median = latencies_ms.size() >= 2 + ? latencies_ms[latencies_ms.size() / 2] + : latencies_ms.back(); + grpc_millis latency_ms_90th = + latencies_ms.size() >= 10 ? latencies_ms[(latencies_ms.size() / 10) * 9] + : latencies_ms.back(); + grpc_millis latency_ms_worst = latencies_ms.back(); + if (total_failures > max_failures) { + gpr_log(GPR_ERROR, + "soak test ran:%d iterations. total_failures:%d exceeds " + "max_failures_threshold:%d. " + "median_soak_iteration_latency:%ld ms. " + "90th_soak_iteration_latency:%ld ms. " + "worst_soak_iteration_latency:%ld ms. " + "See breakdown above for which iterations succeeded, failed, and " + "why for more info.", + soak_iterations, total_failures, max_failures, latency_ms_median, + latency_ms_90th, latency_ms_worst); + GPR_ASSERT(0); + } else { + gpr_log(GPR_INFO, + "soak test ran:%d iterations. total_failures:%d is within " + "max_failures_threshold:%d. " + "median_soak_iteration_latency:%ld ms. " + "90th_soak_iteration_latency:%ld ms. " + "worst_soak_iteration_latency:%ld ms. 
" + "See breakdown above for which iterations succeeded, failed, and " + "why for more info.", + soak_iterations, total_failures, max_failures, latency_ms_median, + latency_ms_90th, latency_ms_worst); + } +} + +bool InteropClient::DoRpcSoakTest( + int32_t soak_iterations, int32_t max_failures, + int64_t max_acceptable_per_iteration_latency_ms) { + gpr_log(GPR_DEBUG, "Sending %d RPCs...", soak_iterations); + GPR_ASSERT(soak_iterations > 0); + PerformSoakTest(false /* reset channel per iteration */, soak_iterations, + max_failures, max_acceptable_per_iteration_latency_ms); gpr_log(GPR_DEBUG, "rpc_soak test done."); return true; } -bool InteropClient::DoChannelSoakTest(int32_t soak_iterations) { +bool InteropClient::DoChannelSoakTest( + int32_t soak_iterations, int32_t max_failures, + int64_t max_acceptable_per_iteration_latency_ms) { gpr_log(GPR_DEBUG, "Sending %d RPCs, tearing down the channel each time...", soak_iterations); GPR_ASSERT(soak_iterations > 0); - SimpleRequest request; - SimpleResponse response; - for (int i = 0; i < soak_iterations; ++i) { - serviceStub_.ResetChannel(); - gpr_log(GPR_DEBUG, "Starting channel_soak iteration %d...", i); - if (!PerformLargeUnary(&request, &response)) { - gpr_log(GPR_ERROR, "channel_soak test failed on iteration %d", i); - return false; - } - gpr_log(GPR_DEBUG, "channel_soak iteration %d finished", i); - } + PerformSoakTest(true /* reset channel per iteration */, soak_iterations, + max_failures, max_acceptable_per_iteration_latency_ms); gpr_log(GPR_DEBUG, "channel_soak test done."); return true; } diff --git a/test/cpp/interop/interop_client.h b/test/cpp/interop/interop_client.h index 483d9becac2..42da648f336 100644 --- a/test/cpp/interop/interop_client.h +++ b/test/cpp/interop/interop_client.h @@ -23,6 +23,7 @@ #include #include +#include "src/core/lib/iomgr/exec_ctx.h" #include "src/proto/grpc/testing/messages.pb.h" #include "src/proto/grpc/testing/test.grpc.pb.h" @@ -76,8 +77,10 @@ class InteropClient { // not 
implemented cross-language. They are considered experimental for now, // but at some point in the future, might be codified and implemented in all // languages - bool DoChannelSoakTest(int32_t soak_iterations); - bool DoRpcSoakTest(int32_t soak_iterations); + bool DoChannelSoakTest(int32_t soak_iterations, int32_t max_failures, + int64_t max_acceptable_per_iteration_latency_ms); + bool DoRpcSoakTest(int32_t soak_iterations, int32_t max_failures, + int64_t max_acceptable_per_iteration_latency_ms); bool DoLongLivedChannelTest(int32_t soak_iterations, int32_t iteration_interval); @@ -128,6 +131,15 @@ class InteropClient { const grpc::string& optional_debug_string); bool TransientFailureOrAbort(); + std::tuple<bool, grpc_millis, std::string> PerformOneSoakTestIteration( + const bool reset_channel, + const int64_t max_acceptable_per_iteration_latency_ms); + + void PerformSoakTest(const bool reset_channel_per_iteration, + const int32_t soak_iterations, + const int32_t max_failures, + const int64_t max_acceptable_per_iteration_latency_ms); + ServiceStub serviceStub_; /// If true, abort() is not called for transient failures bool do_not_abort_on_transient_failures_;