From b94fb2b894883689f2b5cfd603ed49ae754d7099 Mon Sep 17 00:00:00 2001 From: Alisha Nanda Date: Fri, 26 Aug 2022 14:26:54 -0700 Subject: [PATCH] Fix PickFirstTest.PendingUpdateAndSelectedSubchannelFails flake in client_lb_end2end_test (#30741) * ConnectionAttemptInjector: fix tsan failures * Add hold to test * Add hold to test * Address comments * Address comments * Address comments * Fix typo Co-authored-by: Mark D. Roth --- test/cpp/end2end/client_lb_end2end_test.cc | 35 ++++++++++------------ 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/test/cpp/end2end/client_lb_end2end_test.cc b/test/cpp/end2end/client_lb_end2end_test.cc index 13c5d4944b0..0014935e436 100644 --- a/test/cpp/end2end/client_lb_end2end_test.cc +++ b/test/cpp/end2end/client_lb_end2end_test.cc @@ -1318,24 +1318,25 @@ TEST_F(PickFirstTest, PendingUpdateAndSelectedSubchannelFails) { auto channel = BuildChannel("", response_generator); // pick_first is the default. auto stub = BuildStub(channel); - // Create a number of servers, but only start 1 of them. - CreateServers(10); - StartServer(0); + StartServers(2); // Initially resolve to first server and make sure it connects. gpr_log(GPR_INFO, "Phase 1: Connect to first server."); response_generator.SetNextResolution({servers_[0]->port_}); CheckRpcSendOk(DEBUG_LOCATION, stub, true /* wait_for_ready */); EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY); + ConnectionAttemptInjector injector; + auto hold = injector.AddHold(servers_[1]->port_); // Send a resolution update with the remaining servers, none of which are - // running yet, so the update will stay pending. Note that it's important - // to have multiple servers here, or else the test will be flaky; with only - // one server, the pending subchannel list has already gone into - // TRANSIENT_FAILURE due to hitting the end of the list by the time we - // check the state. + // running yet, so the update will stay pending. gpr_log(GPR_INFO, "Phase 2: Resolver update pointing to remaining " "(not started) servers."); response_generator.SetNextResolution(GetServersPorts(1 /* start_index */)); + // Add hold before connection attempt to ensure RPCs will be sent to first + // server. Otherwise, pending subchannel list might already have gone into + // TRANSIENT_FAILURE due to hitting the end of the server list by the time + // we check the state. + hold->Wait(); // RPCs will continue to be sent to the first server. CheckRpcSendOk(DEBUG_LOCATION, stub); // Now stop the first server, so that the current subchannel list @@ -1346,18 +1347,12 @@ TEST_F(PickFirstTest, PendingUpdateAndSelectedSubchannelFails) { gpr_log(GPR_INFO, "Phase 3: Stopping first server."); servers_[0]->Shutdown(); WaitForChannelNotReady(channel.get()); - // TODO(roth): This should always return CONNECTING, but it's flaky - // between that and TRANSIENT_FAILURE. I suspect that this problem - // will go away once we move the backoff code out of the subchannel - // and into the LB policies. - EXPECT_THAT(channel->GetState(false), - ::testing::AnyOf(GRPC_CHANNEL_CONNECTING, - GRPC_CHANNEL_TRANSIENT_FAILURE)); - // Now start the second server. - gpr_log(GPR_INFO, "Phase 4: Starting second server."); - StartServer(1); - // The channel should go to READY state and RPCs should go to the - // second server. + EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_CONNECTING); + // Resume connection attempt to second server now that first server is down. + // The channel should go to READY state and RPCs should go to the second + // server. + gpr_log(GPR_INFO, "Phase 4: Resuming connection attempt to second server."); + hold->Resume(); WaitForChannelReady(channel.get()); WaitForServer(DEBUG_LOCATION, stub, 1, [](const Status& status) { EXPECT_EQ(StatusCode::UNAVAILABLE, status.error_code());