[pick_first] fix race condition for detecting idleness (#34885)

This fixes a bug accidentally introduced in #33753. The symptom is that
if we exit idle and then get a new address list before any of the
subchannels in the old list can report their initial connectivity state,
we will incorrectly ignore the new address list.
pull/34806/head^2
Mark D. Roth 1 year ago committed by GitHub
parent 78825f79ca
commit 1324ce42ea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 15
      src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc
  2. 96
      test/core/client_channel/lb_policy/pick_first_test.cc

@ -321,6 +321,14 @@ class PickFirst : public LoadBalancingPolicy {
void UnsetSelectedSubchannel();
// When ExitIdleLocked() is called, we create a subchannel_list_ and start
// trying to connect, but we don't actually change state_ until the first
// subchannel reports CONNECTING. So in order to know if we're really
// idle, we need to check both state_ and subchannel_list_.
bool IsIdle() const {
return state_ == GRPC_CHANNEL_IDLE && subchannel_list_ == nullptr;
}
// Whether we should enable health watching.
const bool enable_health_watch_;
// Whether we should omit our status message prefix.
@ -388,10 +396,7 @@ void PickFirst::ShutdownLocked() {
void PickFirst::ExitIdleLocked() {
if (shutdown_) return;
// Need to check subchannel_list_ being null here, in case the
// application calls channel->GetState(true) again before we
// transition to state CONNECTING.
if (state_ == GRPC_CHANNEL_IDLE && subchannel_list_ == nullptr) {
if (IsIdle()) {
if (GRPC_TRACE_FLAG_ENABLED(grpc_lb_pick_first_trace)) {
gpr_log(GPR_INFO, "Pick First %p exiting idle", this);
}
@ -546,7 +551,7 @@ absl::Status PickFirst::UpdateLocked(UpdateArgs args) {
latest_update_args_ = std::move(args);
// If we are not in idle, start connection attempt immediately.
// Otherwise, we defer the attempt into ExitIdleLocked().
if (state_ != GRPC_CHANNEL_IDLE) {
if (!IsIdle()) {
AttemptToConnectUsingLatestUpdateArgsLocked();
}
return status;

@ -423,6 +423,102 @@ TEST_F(PickFirstTest, StaysInTransientFailureAfterAddressListUpdate) {
}
}
// This tests a real-world bug in which PF ignored a resolver update if
// it had just created the subchannels but had not yet seen their
// initial connectivity state notification.
TEST_F(PickFirstTest, ResolverUpdateBeforeLeavingIdle) {
constexpr std::array<absl::string_view, 2> kAddresses = {
"ipv4:127.0.0.1:443", "ipv4:127.0.0.1:444"};
constexpr std::array<absl::string_view, 2> kNewAddresses = {
"ipv4:127.0.0.1:445", "ipv4:127.0.0.1:446"};
// Send initial update containing two addresses.
absl::Status status = ApplyUpdate(
BuildUpdate(kAddresses, MakePickFirstConfig(false)), lb_policy());
EXPECT_TRUE(status.ok()) << status;
// LB policy should have created a subchannel for both addresses.
auto* subchannel = FindSubchannel(kAddresses[0]);
ASSERT_NE(subchannel, nullptr);
auto* subchannel2 = FindSubchannel(kAddresses[1]);
ASSERT_NE(subchannel2, nullptr);
// When the LB policy receives the first subchannel's initial connectivity
// state notification (IDLE), it will request a connection.
EXPECT_TRUE(subchannel->ConnectionRequested());
// This causes the subchannel to start to connect, so it reports CONNECTING.
subchannel->SetConnectivityState(GRPC_CHANNEL_CONNECTING);
// LB policy should have reported CONNECTING state.
ExpectConnectingUpdate();
// The second subchannel should not be connecting.
EXPECT_FALSE(subchannel2->ConnectionRequested());
// When the first subchannel becomes connected, it reports READY.
subchannel->SetConnectivityState(GRPC_CHANNEL_READY);
// The LB policy will report CONNECTING some number of times (doesn't
// matter how many) and then report READY.
auto picker = WaitForConnected();
ASSERT_NE(picker, nullptr);
// Picker should return the same subchannel repeatedly.
for (size_t i = 0; i < 3; ++i) {
EXPECT_EQ(ExpectPickComplete(picker.get()), kAddresses[0]);
}
// Now the connection is closed, so we go IDLE.
subchannel->SetConnectivityState(GRPC_CHANNEL_IDLE);
ExpectReresolutionRequest();
ExpectState(GRPC_CHANNEL_IDLE);
// Now we tell the LB policy to exit idle. This causes it to create a
// new subchannel list from the original update. However, before it
// can get the initial connectivity state notifications for those
// subchannels (i.e., before it can transition from IDLE to CONNECTING),
// we send a new update.
absl::Notification notification;
work_serializer_->Run(
[&]() {
// Inject second update into WorkSerializer queue before we
// exit idle, so that the second update gets run before the initial
// subchannel connectivity state notifications from the first update
// are delivered.
work_serializer_->Run(
[&]() {
// Second update.
absl::Status status = lb_policy()->UpdateLocked(
BuildUpdate(kNewAddresses, MakePickFirstConfig(false)));
EXPECT_TRUE(status.ok()) << status;
// Trigger notification once all connectivity state
// notifications have been delivered.
work_serializer_->Run([&]() { notification.Notify(); },
DEBUG_LOCATION);
},
DEBUG_LOCATION);
// Exit idle.
lb_policy()->ExitIdleLocked();
},
DEBUG_LOCATION);
notification.WaitForNotification();
// The LB policy should have created subchannels for the new addresses.
auto* subchannel3 = FindSubchannel(kNewAddresses[0]);
ASSERT_NE(subchannel3, nullptr);
auto* subchannel4 = FindSubchannel(kNewAddresses[1]);
ASSERT_NE(subchannel4, nullptr);
// The LB policy will request a connection on the first new subchannel,
// none of the others.
EXPECT_TRUE(subchannel3->ConnectionRequested());
EXPECT_FALSE(subchannel->ConnectionRequested());
EXPECT_FALSE(subchannel2->ConnectionRequested());
EXPECT_FALSE(subchannel4->ConnectionRequested());
// The subchannel starts a connection attempt.
subchannel3->SetConnectivityState(GRPC_CHANNEL_CONNECTING);
// The LB policy should now report CONNECTING.
ExpectConnectingUpdate();
// The connection attempt succeeds.
subchannel3->SetConnectivityState(GRPC_CHANNEL_READY);
// The LB policy will report CONNECTING some number of times (doesn't
// matter how many) and then report READY.
picker = WaitForConnected();
ASSERT_NE(picker, nullptr);
// Picker should return the same subchannel repeatedly.
for (size_t i = 0; i < 3; ++i) {
EXPECT_EQ(ExpectPickComplete(picker.get()), kNewAddresses[0]);
}
}
TEST_F(PickFirstTest, HappyEyeballs) {
if (!IsPickFirstHappyEyeballsEnabled()) return;
// Send an update containing three addresses.

Loading…
Cancel
Save