From d19fd1c689b1d60cf329331da7fab5d1ca6063cc Mon Sep 17 00:00:00 2001 From: Juanli Shen Date: Thu, 16 Aug 2018 17:54:27 -0700 Subject: [PATCH] PF: Check connectivity state before watching --- .../lb_policy/pick_first/pick_first.cc | 112 ++++++++++++------ test/cpp/end2end/client_lb_end2end_test.cc | 55 +++++++++ 2 files changed, 128 insertions(+), 39 deletions(-) diff --git a/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc b/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc index bc51903ef56..ab33d933981 100644 --- a/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc +++ b/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc @@ -80,6 +80,11 @@ class PickFirst : public LoadBalancingPolicy { void ProcessConnectivityChangeLocked( grpc_connectivity_state connectivity_state, grpc_error* error) override; + + // Processes the connectivity change to READY for an unselected subchannel. + void ProcessUnselectedReadyLocked(); + + void CheckConnectivityStateAndStartWatchingLocked(); }; class PickFirstSubchannelList @@ -247,7 +252,8 @@ void PickFirst::StartPickingLocked() { if (subchannel_list_ != nullptr) { for (size_t i = 0; i < subchannel_list_->num_subchannels(); ++i) { if (subchannel_list_->subchannel(i)->subchannel() != nullptr) { - subchannel_list_->subchannel(i)->StartConnectivityWatchLocked(); + subchannel_list_->subchannel(i) + ->CheckConnectivityStateAndStartWatchingLocked(); break; } } @@ -386,7 +392,8 @@ void PickFirst::UpdateLocked(const grpc_channel_args& args) { // If we've started picking, start trying to connect to the first // subchannel in the new list. if (started_picking_) { - subchannel_list_->subchannel(0)->StartConnectivityWatchLocked(); + subchannel_list_->subchannel(0) + ->CheckConnectivityStateAndStartWatchingLocked(); } } else { // We do have a selected subchannel. @@ -440,7 +447,7 @@ void PickFirst::UpdateLocked(const grpc_channel_args& args) { // subchannel in the new list. if (started_picking_) { latest_pending_subchannel_list_->subchannel(0) - ->StartConnectivityWatchLocked(); + ->CheckConnectivityStateAndStartWatchingLocked(); } } } @@ -519,41 +526,7 @@ void PickFirst::PickFirstSubchannelData::ProcessConnectivityChangeLocked( // select in place of the current one. switch (connectivity_state) { case GRPC_CHANNEL_READY: { - // Case 2. Promote p->latest_pending_subchannel_list_ to - // p->subchannel_list_. - if (subchannel_list() == p->latest_pending_subchannel_list_.get()) { - if (grpc_lb_pick_first_trace.enabled()) { - gpr_log(GPR_INFO, - "Pick First %p promoting pending subchannel list %p to " - "replace %p", - p, p->latest_pending_subchannel_list_.get(), - p->subchannel_list_.get()); - } - p->subchannel_list_ = std::move(p->latest_pending_subchannel_list_); - } - // Cases 1 and 2. - grpc_connectivity_state_set(&p->state_tracker_, GRPC_CHANNEL_READY, - GRPC_ERROR_NONE, "connecting_ready"); - p->selected_ = this; - if (grpc_lb_pick_first_trace.enabled()) { - gpr_log(GPR_INFO, "Pick First %p selected subchannel %p", p, - subchannel()); - } - // Drop all other subchannels, since we are now connected. - p->DestroyUnselectedSubchannelsLocked(); - // Update any calls that were waiting for a pick. - PickState* pick; - while ((pick = p->pending_picks_)) { - p->pending_picks_ = pick->next; - pick->connected_subchannel = - p->selected_->connected_subchannel()->Ref(); - if (grpc_lb_pick_first_trace.enabled()) { - gpr_log(GPR_INFO, - "Servicing pending pick with selected subchannel %p", - p->selected_->subchannel()); - } - GRPC_CLOSURE_SCHED(pick->on_complete, GRPC_ERROR_NONE); - } + ProcessUnselectedReadyLocked(); // Renew notification. RenewConnectivityWatchLocked(); break; @@ -574,7 +547,7 @@ void PickFirst::PickFirstSubchannelData::ProcessConnectivityChangeLocked( &p->state_tracker_, GRPC_CHANNEL_TRANSIENT_FAILURE, GRPC_ERROR_REF(error), "exhausted_subchannels"); } - sd->StartConnectivityWatchLocked(); + sd->CheckConnectivityStateAndStartWatchingLocked(); break; } case GRPC_CHANNEL_CONNECTING: @@ -595,6 +568,67 @@ void PickFirst::PickFirstSubchannelData::ProcessConnectivityChangeLocked( GRPC_ERROR_UNREF(error); } +void PickFirst::PickFirstSubchannelData::ProcessUnselectedReadyLocked() { + PickFirst* p = static_cast(subchannel_list()->policy()); + // If we get here, there are two possible cases: + // 1. We do not currently have a selected subchannel, and the update is + // for a subchannel in p->subchannel_list_ that we're trying to + // connect to. The goal here is to find a subchannel that we can + // select. + // 2. We do currently have a selected subchannel, and the update is + // for a subchannel in p->latest_pending_subchannel_list_. The + // goal here is to find a subchannel from the update that we can + // select in place of the current one. + GPR_ASSERT(subchannel_list() == p->subchannel_list_.get() || + subchannel_list() == p->latest_pending_subchannel_list_.get()); + // Case 2. Promote p->latest_pending_subchannel_list_ to p->subchannel_list_. + if (subchannel_list() == p->latest_pending_subchannel_list_.get()) { + if (grpc_lb_pick_first_trace.enabled()) { + gpr_log(GPR_INFO, + "Pick First %p promoting pending subchannel list %p to " + "replace %p", + p, p->latest_pending_subchannel_list_.get(), + p->subchannel_list_.get()); + } + p->subchannel_list_ = std::move(p->latest_pending_subchannel_list_); + } + // Cases 1 and 2. + grpc_connectivity_state_set(&p->state_tracker_, GRPC_CHANNEL_READY, + GRPC_ERROR_NONE, "subchannel_ready"); + p->selected_ = this; + if (grpc_lb_pick_first_trace.enabled()) { + gpr_log(GPR_INFO, "Pick First %p selected subchannel %p", p, subchannel()); + } + // Drop all other subchannels, since we are now connected. + p->DestroyUnselectedSubchannelsLocked(); + // Update any calls that were waiting for a pick. + PickState* pick; + while ((pick = p->pending_picks_)) { + p->pending_picks_ = pick->next; + pick->connected_subchannel = p->selected_->connected_subchannel()->Ref(); + if (grpc_lb_pick_first_trace.enabled()) { + gpr_log(GPR_INFO, "Servicing pending pick with selected subchannel %p", + p->selected_->subchannel()); + } + GRPC_CLOSURE_SCHED(pick->on_complete, GRPC_ERROR_NONE); + } +} + +void PickFirst::PickFirstSubchannelData:: + CheckConnectivityStateAndStartWatchingLocked() { + PickFirst* p = static_cast(subchannel_list()->policy()); + grpc_error* error = GRPC_ERROR_NONE; + if (p->selected_ != this && + CheckConnectivityStateLocked(&error) == GRPC_CHANNEL_READY) { + // We must process the READY subchannel before we start watching it. + // Otherwise, we won't know it's READY because we will be waiting for its + // connectivity state to change from READY. + ProcessUnselectedReadyLocked(); + } + GRPC_ERROR_UNREF(error); + StartConnectivityWatchLocked(); +} + // // factory // diff --git a/test/cpp/end2end/client_lb_end2end_test.cc b/test/cpp/end2end/client_lb_end2end_test.cc index 26c241b74a2..68219c16dcb 100644 --- a/test/cpp/end2end/client_lb_end2end_test.cc +++ b/test/cpp/end2end/client_lb_end2end_test.cc @@ -291,6 +291,17 @@ class ClientLbEnd2endTest : public ::testing::Test { ResetCounters(); } + bool WaitForChannelNotReady(Channel* channel, int timeout_seconds = 5) { + const gpr_timespec deadline = + grpc_timeout_seconds_to_deadline(timeout_seconds); + grpc_connectivity_state state; + while ((state = channel->GetState(false /* try_to_connect */)) == + GRPC_CHANNEL_READY) { + if (!channel->WaitForStateChange(state, deadline)) return false; + } + return true; + } + bool SeenAllServers() { for (const auto& server : servers_) { if (server->service_.request_count() == 0) return false; @@ -590,6 +601,50 @@ TEST_F(ClientLbEnd2endTest, PickFirstReresolutionNoSelected) { EXPECT_EQ("pick_first", channel->GetLoadBalancingPolicyName()); } +TEST_F(ClientLbEnd2endTest, PickFirstCheckStateBeforeStartWatch) { + std::vector ports = {grpc_pick_unused_port_or_die()}; + StartServers(1, ports); + auto channel_1 = BuildChannel("pick_first"); + auto stub_1 = BuildStub(channel_1); + SetNextResolution(ports); + gpr_log(GPR_INFO, "****** RESOLUTION SET FOR CHANNEL 1 *******"); + WaitForServer(stub_1, 0, DEBUG_LOCATION); + gpr_log(GPR_INFO, "****** CHANNEL 1 CONNECTED *******"); + servers_[0]->Shutdown(); + // Channel 1 will receive a re-resolution containing the same server. It will + // create a new subchannel and hold a ref to it. + servers_.clear(); + StartServers(1, ports); + gpr_log(GPR_INFO, "****** SERVER RESTARTED *******"); + auto channel_2 = BuildChannel("pick_first"); + auto stub_2 = BuildStub(channel_2); + // TODO(juanlishen): This resolution result will only be visible to channel 2 + // since the response generator is only associated with channel 2 now. We + // should change the response generator to be able to deliver updates to + // multiple channels at once. + SetNextResolution(ports); + gpr_log(GPR_INFO, "****** RESOLUTION SET FOR CHANNEL 2 *******"); + WaitForServer(stub_2, 0, DEBUG_LOCATION, true); + gpr_log(GPR_INFO, "****** CHANNEL 2 CONNECTED *******"); + servers_[0]->Shutdown(); + // Wait until the disconnection has triggered the connectivity notification. + // Otherwise, the subchannel may be picked for next call but will fail soon. + EXPECT_TRUE(WaitForChannelNotReady(channel_2.get())); + // Channel 2 will also receive a re-resolution containing the same server. + // Both channels will ref the same subchannel that failed. + servers_.clear(); + StartServers(1, ports); + gpr_log(GPR_INFO, "****** SERVER RESTARTED AGAIN *******"); + gpr_log(GPR_INFO, "****** CHANNEL 2 STARTING A CALL *******"); + // The first call after the server restart will succeed. + CheckRpcSendOk(stub_2, DEBUG_LOCATION); + gpr_log(GPR_INFO, "****** CHANNEL 2 FINISHED A CALL *******"); + // Check LB policy name for the channel. + EXPECT_EQ("pick_first", channel_1->GetLoadBalancingPolicyName()); + // Check LB policy name for the channel. + EXPECT_EQ("pick_first", channel_2->GetLoadBalancingPolicyName()); +} + TEST_F(ClientLbEnd2endTest, RoundRobin) { // Start servers and send one RPC per server. const int kNumServers = 3;