Fix PickFirstTest.PendingUpdateAndSelectedSubchannelFails flake in client_lb_end2end_test (#30741)

* ConnectionAttemptInjector: fix tsan failures

* Add hold to test

* Add hold to test

* Address comments

* Address comments

* Address comments

* Fix typo

Co-authored-by: Mark D. Roth <roth@google.com>
pull/30767/head
Alisha Nanda 2 years ago committed by GitHub
parent 1837dae106
commit b94fb2b894
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 35
      test/cpp/end2end/client_lb_end2end_test.cc

@ -1318,24 +1318,25 @@ TEST_F(PickFirstTest, PendingUpdateAndSelectedSubchannelFails) {
auto channel = auto channel =
BuildChannel("", response_generator); // pick_first is the default. BuildChannel("", response_generator); // pick_first is the default.
auto stub = BuildStub(channel); auto stub = BuildStub(channel);
// Create a number of servers, but only start 1 of them. StartServers(2);
CreateServers(10);
StartServer(0);
// Initially resolve to first server and make sure it connects. // Initially resolve to first server and make sure it connects.
gpr_log(GPR_INFO, "Phase 1: Connect to first server."); gpr_log(GPR_INFO, "Phase 1: Connect to first server.");
response_generator.SetNextResolution({servers_[0]->port_}); response_generator.SetNextResolution({servers_[0]->port_});
CheckRpcSendOk(DEBUG_LOCATION, stub, true /* wait_for_ready */); CheckRpcSendOk(DEBUG_LOCATION, stub, true /* wait_for_ready */);
EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY); EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
ConnectionAttemptInjector injector;
auto hold = injector.AddHold(servers_[1]->port_);
// Send a resolution update with the remaining servers, none of which are // Send a resolution update with the remaining servers, none of which are
// running yet, so the update will stay pending. Note that it's important // running yet, so the update will stay pending.
// to have multiple servers here, or else the test will be flaky; with only
// one server, the pending subchannel list has already gone into
// TRANSIENT_FAILURE due to hitting the end of the list by the time we
// check the state.
gpr_log(GPR_INFO, gpr_log(GPR_INFO,
"Phase 2: Resolver update pointing to remaining " "Phase 2: Resolver update pointing to remaining "
"(not started) servers."); "(not started) servers.");
response_generator.SetNextResolution(GetServersPorts(1 /* start_index */)); response_generator.SetNextResolution(GetServersPorts(1 /* start_index */));
// Add hold before connection attempt to ensure RPCs will be sent to first
// server. Otherwise, pending subchannel list might already have gone into
// TRANSIENT_FAILURE due to hitting the end of the server list by the time
// we check the state.
hold->Wait();
// RPCs will continue to be sent to the first server. // RPCs will continue to be sent to the first server.
CheckRpcSendOk(DEBUG_LOCATION, stub); CheckRpcSendOk(DEBUG_LOCATION, stub);
// Now stop the first server, so that the current subchannel list // Now stop the first server, so that the current subchannel list
@ -1346,18 +1347,12 @@ TEST_F(PickFirstTest, PendingUpdateAndSelectedSubchannelFails) {
gpr_log(GPR_INFO, "Phase 3: Stopping first server."); gpr_log(GPR_INFO, "Phase 3: Stopping first server.");
servers_[0]->Shutdown(); servers_[0]->Shutdown();
WaitForChannelNotReady(channel.get()); WaitForChannelNotReady(channel.get());
// TODO(roth): This should always return CONNECTING, but it's flaky EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_CONNECTING);
// between that and TRANSIENT_FAILURE. I suspect that this problem // Resume connection attempt to second server now that first server is down.
// will go away once we move the backoff code out of the subchannel // The channel should go to READY state and RPCs should go to the second
// and into the LB policies. // server.
EXPECT_THAT(channel->GetState(false), gpr_log(GPR_INFO, "Phase 4: Resuming connection attempt to second server.");
::testing::AnyOf(GRPC_CHANNEL_CONNECTING, hold->Resume();
GRPC_CHANNEL_TRANSIENT_FAILURE));
// Now start the second server.
gpr_log(GPR_INFO, "Phase 4: Starting second server.");
StartServer(1);
// The channel should go to READY state and RPCs should go to the
// second server.
WaitForChannelReady(channel.get()); WaitForChannelReady(channel.get());
WaitForServer(DEBUG_LOCATION, stub, 1, [](const Status& status) { WaitForServer(DEBUG_LOCATION, stub, 1, [](const Status& status) {
EXPECT_EQ(StatusCode::UNAVAILABLE, status.error_code()); EXPECT_EQ(StatusCode::UNAVAILABLE, status.error_code());

Loading…
Cancel
Save