diff --git a/src/android/test/interop/app/CMakeLists.txt b/src/android/test/interop/app/CMakeLists.txt index 2e23a7759c5..e288dd63cec 100644 --- a/src/android/test/interop/app/CMakeLists.txt +++ b/src/android/test/interop/app/CMakeLists.txt @@ -110,7 +110,9 @@ add_library(grpc-interop SHARED src/main/cpp/grpc-interop.cc ${GRPC_SRC_DIR}/test/cpp/interop/interop_client.h - ${GRPC_SRC_DIR}/test/cpp/interop/interop_client.cc) + ${GRPC_SRC_DIR}/test/cpp/interop/interop_client.cc + ${GRPC_SRC_DIR}/test/core/util/histogram.h + ${GRPC_SRC_DIR}/test/core/util/histogram.cc) target_link_libraries(grpc-interop messages_proto_lib diff --git a/test/cpp/interop/client.cc b/test/cpp/interop/client.cc index 45fb68df5b5..a23c5ddb6d5 100644 --- a/test/cpp/interop/client.cc +++ b/test/cpp/interop/client.cc @@ -88,8 +88,15 @@ DEFINE_bool(do_not_abort_on_transient_failures, false, "test is retried in case of transient failures (and currently the " "interop tests are not retried even if this flag is set to true)"); DEFINE_int32(soak_iterations, 1000, - "number of iterations to use for the two soak tests; rpc_soak and " - "channel_soak"); + "The number of iterations to use for the two soak tests; rpc_soak " + "and channel_soak."); +DEFINE_int32(soak_max_failures, 0, + "The number of iterations in soak tests that are allowed to fail " + "(either due to non-OK status code or exceeding the " + "per-iteration max acceptable latency)."); +DEFINE_int32(soak_per_iteration_max_acceptable_latency_ms, 0, + "The number of milliseconds a single iteration in the two soak " + "tests (rpc_soak and channel_soak) is allowed to take."); DEFINE_int32(iteration_interval, 10, "The interval in seconds between rpcs. This is used by " "long_connection test"); @@ -257,9 +264,12 @@ int main(int argc, char** argv) { std::bind(&grpc::testing::InteropClient::DoCacheableUnary, &client); actions["channel_soak"] = std::bind(&grpc::testing::InteropClient::DoChannelSoakTest, &client, - FLAGS_soak_iterations); - actions["rpc_soak"] = std::bind(&grpc::testing::InteropClient::DoRpcSoakTest, - &client, FLAGS_soak_iterations); + FLAGS_soak_iterations, FLAGS_soak_max_failures, + FLAGS_soak_per_iteration_max_acceptable_latency_ms); + actions["rpc_soak"] = + std::bind(&grpc::testing::InteropClient::DoRpcSoakTest, &client, + FLAGS_soak_iterations, FLAGS_soak_max_failures, + FLAGS_soak_per_iteration_max_acceptable_latency_ms); actions["long_lived_channel"] = std::bind(&grpc::testing::InteropClient::DoLongLivedChannelTest, &client, FLAGS_soak_iterations, FLAGS_iteration_interval); diff --git a/test/cpp/interop/interop_client.cc b/test/cpp/interop/interop_client.cc index 71ee79a6e96..e889d2424c5 100644 --- a/test/cpp/interop/interop_client.cc +++ b/test/cpp/interop/interop_client.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,7 @@ #include "src/proto/grpc/testing/empty.pb.h" #include "src/proto/grpc/testing/messages.pb.h" #include "src/proto/grpc/testing/test.grpc.pb.h" +#include "test/core/util/histogram.h" #include "test/cpp/interop/client_helper.h" #include "test/cpp/interop/interop_client.h" @@ -1065,36 +1067,121 @@ bool InteropClient::DoCustomMetadata() { return true; } -bool InteropClient::DoRpcSoakTest(int32_t soak_iterations) { - gpr_log(GPR_DEBUG, "Sending %d RPCs...", soak_iterations); - GPR_ASSERT(soak_iterations > 0); +std::tuple +InteropClient::PerformOneSoakTestIteration( + const bool reset_channel, + const int32_t max_acceptable_per_iteration_latency_ms) { + gpr_timespec start = gpr_now(GPR_CLOCK_MONOTONIC); SimpleRequest request; SimpleResponse response; + // Don't set the deadline on the RPC, and instead just + // record how long the RPC took and compare. This makes + // debugging easier when looking at failure results. + ClientContext context; + InteropClientContextInspector inspector(context); + request.set_response_size(kLargeResponseSize); + grpc::string payload(kLargeRequestSize, '\0'); + request.mutable_payload()->set_body(payload.c_str(), kLargeRequestSize); + if (reset_channel) { + serviceStub_.ResetChannel(); + } + Status s = serviceStub_.Get()->UnaryCall(&context, request, &response); + gpr_timespec now = gpr_now(GPR_CLOCK_MONOTONIC); + int32_t elapsed_ms = gpr_time_to_millis(gpr_time_sub(now, start)); + if (!s.ok()) { + return std::make_tuple(false, elapsed_ms, context.debug_error_string()); + } else if (elapsed_ms > max_acceptable_per_iteration_latency_ms) { + char* out; + GPR_ASSERT(gpr_asprintf( + &out, "%d ms exceeds max acceptable latency: %d ms.", + elapsed_ms, max_acceptable_per_iteration_latency_ms) != -1); + std::string debug_string(out); + gpr_free(out); + return std::make_tuple(false, elapsed_ms, debug_string); + } else { + return std::make_tuple(true, elapsed_ms, ""); + } +} + +void InteropClient::PerformSoakTest( + const bool reset_channel_per_iteration, const int32_t soak_iterations, + const int32_t max_failures, + const int32_t max_acceptable_per_iteration_latency_ms) { + std::vector> results; + grpc_histogram* latencies_ms_histogram = grpc_histogram_create( + 1 /* resolution */, + 500 * 1e3 /* largest bucket; 500 seconds is unlikely */); for (int i = 0; i < soak_iterations; ++i) { - if (!PerformLargeUnary(&request, &response)) { - gpr_log(GPR_ERROR, "rpc_soak test failed on iteration %d", i); - return false; + auto result = PerformOneSoakTestIteration( + reset_channel_per_iteration, max_acceptable_per_iteration_latency_ms); + results.push_back(result); + grpc_histogram_add(latencies_ms_histogram, std::get<1>(result)); + } + int total_failures = 0; + for (size_t i = 0; i < results.size(); i++) { + bool success = std::get<0>(results[i]); + int32_t elapsed_ms = std::get<1>(results[i]); + std::string debug_string = std::get<2>(results[i]); + if (!success) { + gpr_log(GPR_DEBUG, "soak iteration: %ld elapsed_ms: %d failed: %s", i, + elapsed_ms, debug_string.c_str()); + total_failures++; + } else { + gpr_log(GPR_DEBUG, "soak iteration: %ld elapsed_ms: %d succeeded", i, + elapsed_ms); } } + double latency_ms_median = + grpc_histogram_percentile(latencies_ms_histogram, 50); + double latency_ms_90th = + grpc_histogram_percentile(latencies_ms_histogram, 90); + double latency_ms_worst = grpc_histogram_maximum(latencies_ms_histogram); + grpc_histogram_destroy(latencies_ms_histogram); + if (total_failures > max_failures) { + gpr_log(GPR_ERROR, + "soak test ran: %d iterations. total_failures: %d exceeds " + "max_failures_threshold: %d. " + "median_soak_iteration_latency: %lf ms. " + "90th_soak_iteration_latency: %lf ms. " + "worst_soak_iteration_latency: %lf ms. " + "See breakdown above for which iterations succeeded, failed, and " + "why for more info.", + soak_iterations, total_failures, max_failures, latency_ms_median, + latency_ms_90th, latency_ms_worst); + GPR_ASSERT(0); + } else { + gpr_log(GPR_INFO, + "soak test ran: %d iterations. total_failures: %d is within " + "max_failures_threshold: %d. " + "median_soak_iteration_latency: %lf ms. " + "90th_soak_iteration_latency: %lf ms. " + "worst_soak_iteration_latency: %lf ms. " + "See breakdown above for which iterations succeeded, failed, and " + "why for more info.", + soak_iterations, total_failures, max_failures, latency_ms_median, + latency_ms_90th, latency_ms_worst); + } +} + +bool InteropClient::DoRpcSoakTest( + int32_t soak_iterations, int32_t max_failures, + int64_t max_acceptable_per_iteration_latency_ms) { + gpr_log(GPR_DEBUG, "Sending %d RPCs...", soak_iterations); + GPR_ASSERT(soak_iterations > 0); + PerformSoakTest(false /* reset channel per iteration */, soak_iterations, + max_failures, max_acceptable_per_iteration_latency_ms); gpr_log(GPR_DEBUG, "rpc_soak test done."); return true; } -bool InteropClient::DoChannelSoakTest(int32_t soak_iterations) { +bool InteropClient::DoChannelSoakTest( + int32_t soak_iterations, int32_t max_failures, + int64_t max_acceptable_per_iteration_latency_ms) { gpr_log(GPR_DEBUG, "Sending %d RPCs, tearing down the channel each time...", soak_iterations); GPR_ASSERT(soak_iterations > 0); - SimpleRequest request; - SimpleResponse response; - for (int i = 0; i < soak_iterations; ++i) { - serviceStub_.ResetChannel(); - gpr_log(GPR_DEBUG, "Starting channel_soak iteration %d...", i); - if (!PerformLargeUnary(&request, &response)) { - gpr_log(GPR_ERROR, "channel_soak test failed on iteration %d", i); - return false; - } - gpr_log(GPR_DEBUG, "channel_soak iteration %d finished", i); - } + PerformSoakTest(true /* reset channel per iteration */, soak_iterations, + max_failures, max_acceptable_per_iteration_latency_ms); gpr_log(GPR_DEBUG, "channel_soak test done."); return true; } diff --git a/test/cpp/interop/interop_client.h b/test/cpp/interop/interop_client.h index 483d9becac2..2dc0d8047ec 100644 --- a/test/cpp/interop/interop_client.h +++ b/test/cpp/interop/interop_client.h @@ -76,8 +76,10 @@ class InteropClient { // not implemented cross-language. They are considered experimental for now, // but at some point in the future, might be codified and implemented in all // languages - bool DoChannelSoakTest(int32_t soak_iterations); - bool DoRpcSoakTest(int32_t soak_iterations); + bool DoChannelSoakTest(int32_t soak_iterations, int32_t max_failures, + int64_t max_acceptable_per_iteration_latency_ms); + bool DoRpcSoakTest(int32_t soak_iterations, int32_t max_failures, + int64_t max_acceptable_per_iteration_latency_ms); bool DoLongLivedChannelTest(int32_t soak_iterations, int32_t iteration_interval); @@ -128,6 +130,15 @@ class InteropClient { const grpc::string& optional_debug_string); bool TransientFailureOrAbort(); + std::tuple PerformOneSoakTestIteration( + const bool reset_channel, + const int32_t max_acceptable_per_iteration_latency_ms); + + void PerformSoakTest(const bool reset_channel_per_iteration, + const int32_t soak_iterations, + const int32_t max_failures, + const int32_t max_acceptable_per_iteration_latency_ms); + ServiceStub serviceStub_; /// If true, abort() is not called for transient failures bool do_not_abort_on_transient_failures_;