client_lb_end2end_test: fix flake in RoundRobin.SingleReconnect test (#30229)

* client_lb_end2end_test: fix flake in RoundRobin.SingleReconnect test

* fix condition used to determine when client has seen backend 0 down

* remove duplicate counter reset

* Automated change: Fix sanity tests

Co-authored-by: markdroth <markdroth@users.noreply.github.com>
Mark D. Roth, committed by GitHub
commit 694e634a79
parent 38284a07a2
Changed file (59 lines):
  test/cpp/end2end/client_lb_end2end_test.cc

@@ -362,6 +362,25 @@ class ClientLbEnd2endTest : public ::testing::Test {
         << location.file() << ":" << location.line();
   }
 
+  void SendRpcsUntil(
+      const grpc_core::DebugLocation& debug_location,
+      const std::unique_ptr<grpc::testing::EchoTestService::Stub>& stub,
+      std::function<bool(const Status&)> continue_predicate,
+      int timeout_ms = 15000) {
+    absl::Time deadline = absl::InfiniteFuture();
+    if (timeout_ms != 0) {
+      deadline = absl::Now() +
+                 (absl::Milliseconds(timeout_ms) * grpc_test_slowdown_factor());
+    }
+    while (true) {
+      Status status = SendRpc(stub);
+      if (!continue_predicate(status)) return;
+      EXPECT_LE(absl::Now(), deadline)
+          << debug_location.file() << ":" << debug_location.line();
+      if (absl::Now() >= deadline) break;
+    }
+  }
+
   struct ServerData {
     const int port_;
     std::unique_ptr<Server> server_;
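
For readers skimming the diff, here is a minimal usage sketch of the new SendRpcsUntil helper. The predicate below is hypothetical (not from this commit); stub and DEBUG_LOCATION come from the surrounding test fixture. Returning true from the predicate keeps the loop sending; returning false stops it, and the deadline check fails the test if the predicate never gives up within the (slowdown-factor-scaled) timeout.

    // Hypothetical predicate, for illustration only: keep sending RPCs
    // until the first failure, bounded by the helper's default 15s
    // deadline (scaled by grpc_test_slowdown_factor()).
    SendRpcsUntil(DEBUG_LOCATION, stub,
                  [](const Status& status) { return status.ok(); });
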
@@ -415,6 +434,13 @@ class ClientLbEnd2endTest : public ::testing::Test {
       started_ = false;
     }
 
+    void StopListeningAndSendGoaways() {
+      grpc_core::ExecCtx exec_ctx;
+      auto* server = grpc_core::Server::FromC(server_->c_server());
+      server->StopListening();
+      server->SendGoaways();
+    }
+
     void SetServingStatus(const std::string& service, bool serving) {
       server_->GetHealthCheckService()->SetServingStatus(service, serving);
     }
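
A note on this helper, judging from how the test below uses it: a hard Shutdown() can race with the round_robin picker, so a few RPCs may fail with UNAVAILABLE before the client notices the dead backend. StopListeningAndSendGoaways() instead closes the listener and sends HTTP/2 GOAWAYs on existing connections, so the client drains off the backend without any failed RPCs. A minimal sketch of that expectation, assuming the ServerData fixture above:

    // Sketch (not part of this commit): drain backend 0 gracefully, then
    // verify that RPCs keep succeeding on the remaining backends.
    servers_[0]->StopListeningAndSendGoaways();
    for (int i = 0; i < 10; ++i) {
      CheckRpcSendOk(DEBUG_LOCATION, stub);  // no UNAVAILABLE expected
    }
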
@@ -1749,25 +1775,32 @@ TEST_F(RoundRobinTest, SingleReconnect) {
   for (size_t i = 0; i < servers_.size(); ++i) {
     EXPECT_EQ(1, servers_[i]->service_.request_count());
   }
-  const auto pre_death = servers_[0]->service_.request_count();
-  // Kill the first server.
-  servers_[0]->Shutdown();
-  // Client request still succeed. May need retrying if RR had returned a pick
-  // before noticing the change in the server's connectivity.
-  while (true) {
-    Status status = SendRpc(stub);
-    if (status.ok()) break;
-    EXPECT_EQ(StatusCode::UNAVAILABLE, status.error_code());
-    EXPECT_EQ("", status.error_message());
-  }
-  // Send a bunch of RPCs that should succeed.
+  servers_[0]->StopListeningAndSendGoaways();
+  // Wait for client to notice that the backend is down. We know that's
+  // happened when we see kNumServers RPCs that do not go to backend 0.
+  ResetCounters();
+  SendRpcsUntil(DEBUG_LOCATION, stub,
+                [&, num_rpcs_not_on_backend_0 = 0](Status status) mutable {
+                  EXPECT_TRUE(status.ok())
+                      << "code=" << status.error_code()
+                      << " message=" << status.error_message();
+                  if (servers_[0]->service_.request_count() == 1) {
+                    num_rpcs_not_on_backend_0 = 0;
+                  } else {
+                    ++num_rpcs_not_on_backend_0;
+                  }
+                  ResetCounters();
+                  return num_rpcs_not_on_backend_0 < kNumServers;
+                });
+  // Send a bunch of RPCs.
   for (int i = 0; i < 10 * kNumServers; ++i) {
     CheckRpcSendOk(DEBUG_LOCATION, stub);
   }
-  const auto post_death = servers_[0]->service_.request_count();
   // No requests have gone to the deceased server.
-  EXPECT_EQ(pre_death, post_death);
+  EXPECT_EQ(0UL, servers_[0]->service_.request_count());
   // Bring the first server back up.
+  servers_[0]->Shutdown();
   StartServer(0);
   // Requests should start arriving at the first server either right away (if
   // the server managed to start before the RR policy retried the subchannel)
   // or after the subchannel retry delay otherwise.
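
The counting predicate above relies on a C++14 init-capture plus mutable to carry state across calls. A standalone sketch of the same pattern (all names hypothetical, not from this commit), runnable outside the test:

    #include <iostream>

    int main() {
      const int kNumServers = 3;
      // Counts consecutive events that miss backend 0 and resets whenever
      // backend 0 is hit, mirroring the predicate passed to SendRpcsUntil.
      auto keep_going = [count = 0, kNumServers](bool hit_backend_0) mutable {
        if (hit_backend_0) {
          count = 0;
        } else {
          ++count;
        }
        return count < kNumServers;  // false after kNumServers misses in a row
      };
      for (bool hit : {false, true, false, false, false}) {
        std::cout << (keep_going(hit) ? "continue" : "stop") << "\n";
      }
    }
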
