From ec39600872e9cc93307ddc2c30cfa3267f08dab4 Mon Sep 17 00:00:00 2001 From: "Mark D. Roth" Date: Thu, 13 Jul 2023 15:35:17 -0700 Subject: [PATCH] [WRR] fix bugs that caused us to re-enter blackout period upon updates (#33694) As per gRFC A58, when WRR sees a subchannel report READY, it reset the non_empty_since value, thus restarting the blackout period. However, there were two cases where we were incorrectly triggering this code: 1. When WRR got an updated address list that contained addresses that were already present on the old list and whose subchannels were already in READY state, the initial notification for those subchannels on the new list was READY, which incorrectly triggered resetting the non_empty_since value. 2. Due to a bug in the outlier_detection policy, whenever an update was propagated down through the OD policy without actually enabling OD, it would incorrectly send a duplicate connectivity state notification for the subchannels. This meant that a subchannel that was already in state READY would report READY again, which would also incorrectly trigger resetting the non_empty_since value. This PR makes two changes: 1. Fix the bug in outlier_detection that caused it to generate the spurious duplicate READY updates. 2. Fix WRR to reset the non_empty_since value when a subchannel goes READY only if the subchannel has seen a previous state update and only if that previous state was not READY. (The duplicate READY notifications should not actually happen anymore now that the OD policy has been fixed, but better to be defensive.) Fixes b/290983884. --- .../outlier_detection/outlier_detection.cc | 2 +- .../weighted_round_robin.cc | 11 ++- .../lb_policy/weighted_round_robin_test.cc | 42 +++++++++- test/cpp/end2end/client_lb_end2end_test.cc | 78 ++++++++++++++++++- 4 files changed, 128 insertions(+), 5 deletions(-) diff --git a/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc b/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc index 0fcad6bd9d8..8b52dbe8108 100644 --- a/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc +++ b/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc @@ -294,7 +294,7 @@ class OutlierDetectionLb : public LoadBalancingPolicy { } void DisableEjection() { - Uneject(); + if (ejection_time_.has_value()) Uneject(); multiplier_ = 0; } diff --git a/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc b/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc index 295c2a36231..fb169c725d9 100644 --- a/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc +++ b/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc @@ -793,12 +793,21 @@ void WeightedRoundRobin::WrrEndpointList::WrrEndpoint::OnStateUpdate( ExitIdleLocked(); } else if (new_state == GRPC_CHANNEL_READY) { // If we transition back to READY state, restart the blackout period. + // Skip this if this is the initial notification for this + // subchannel (which happens whenever we get updated addresses and + // create a new endpoint list). Also skip it if the previous state + // was READY (which should never happen in practice, but we've seen + // at least one bug that caused this in the outlier_detection + // policy, so let's be defensive here). + // // Note that we cannot guarantee that we will never receive // lingering callbacks for backend metric reports from the previous // connection after the new connection has been established, but they // should be masked by new backend metric reports from the new // connection by the time the blackout period ends. - weight_->ResetNonEmptySince(); + if (old_state.has_value() && old_state != GRPC_CHANNEL_READY) { + weight_->ResetNonEmptySince(); + } } // If state changed, update state counters. if (!old_state.has_value() || *old_state != new_state) { diff --git a/test/core/client_channel/lb_policy/weighted_round_robin_test.cc b/test/core/client_channel/lb_policy/weighted_round_robin_test.cc index 58b039bab67..536bb779069 100644 --- a/test/core/client_channel/lb_policy/weighted_round_robin_test.cc +++ b/test/core/client_channel/lb_policy/weighted_round_robin_test.cc @@ -252,6 +252,7 @@ class WeightedRoundRobinTest : public TimeAwareLoadBalancingPolicyTest { backend_metrics, std::map expected, absl::Duration timeout = absl::Seconds(5), + bool run_timer_callbacks = true, SourceLocation location = SourceLocation()) { gpr_log(GPR_INFO, "==> WaitForWeightedRoundRobinPicks(): Expecting %s", PickMapString(expected).c_str()); @@ -308,7 +309,7 @@ class WeightedRoundRobinTest : public TimeAwareLoadBalancingPolicyTest { EXPECT_NE(*picker, nullptr) << location.file() << ":" << location.line(); if (*picker == nullptr) return false; - } else { + } else if (run_timer_callbacks) { gpr_log(GPR_INFO, "running timer callback..."); RunTimerCallback(); } @@ -803,6 +804,45 @@ TEST_F(WeightedRoundRobinTest, BlackoutPeriodAfterDisconnect) { {{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}}); } +TEST_F(WeightedRoundRobinTest, BlackoutPeriodDoesNotGetResetAfterUpdate) { + // Send address list to LB policy. + const std::array kAddresses = { + "ipv4:127.0.0.1:441", "ipv4:127.0.0.1:442", "ipv4:127.0.0.1:443"}; + auto config_builder = + ConfigBuilder().SetWeightExpirationPeriod(Duration::Seconds(2)); + auto picker = + SendInitialUpdateAndWaitForConnected(kAddresses, config_builder); + ASSERT_NE(picker, nullptr); + // All backends report weights. + WaitForWeightedRoundRobinPicks( + &picker, + {{kAddresses[0], MakeBackendMetricData(/*app_utilization=*/0.9, + /*qps=*/100.0, /*eps=*/0.0)}, + {kAddresses[1], MakeBackendMetricData(/*app_utilization=*/0.3, + /*qps=*/100.0, /*eps=*/0.0)}, + {kAddresses[2], MakeBackendMetricData(/*app_utilization=*/0.3, + /*qps=*/100.0, /*eps=*/0.0)}}, + {{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}}); + // Send a duplicate update with the same addresses and config. + EXPECT_EQ(ApplyUpdate(BuildUpdate(kAddresses, config_builder.Build()), + lb_policy_.get()), + absl::OkStatus()); + // Note that we have not advanced time, so if the update incorrectly + // triggers resetting the blackout period, none of the weights will + // actually be used. + picker = ExpectState(GRPC_CHANNEL_READY, absl::OkStatus()); + WaitForWeightedRoundRobinPicks( + &picker, + {{kAddresses[0], MakeBackendMetricData(/*app_utilization=*/0.9, + /*qps=*/100.0, /*eps=*/0.0)}, + {kAddresses[1], MakeBackendMetricData(/*app_utilization=*/0.3, + /*qps=*/100.0, /*eps=*/0.0)}, + {kAddresses[2], MakeBackendMetricData(/*app_utilization=*/0.3, + /*qps=*/100.0, /*eps=*/0.0)}}, + {{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}}, + /*timeout=*/absl::Seconds(5), /*run_timer_callbacks=*/false); +} + TEST_F(WeightedRoundRobinTest, ZeroErrorUtilPenalty) { // Send address list to LB policy. const std::array kAddresses = { diff --git a/test/cpp/end2end/client_lb_end2end_test.cc b/test/cpp/end2end/client_lb_end2end_test.cc index 92b7b9f94cf..d363d5cc640 100644 --- a/test/cpp/end2end/client_lb_end2end_test.cc +++ b/test/cpp/end2end/client_lb_end2end_test.cc @@ -3138,13 +3138,27 @@ const char kServiceConfigOob[] = " ]\n" "}"; +const char kServiceConfigWithOutlierDetection[] = + "{\n" + " \"loadBalancingConfig\": [\n" + " {\"outlier_detection_experimental\": {\n" + " \"childPolicy\": [\n" + " {\"weighted_round_robin\": {\n" + " \"blackoutPeriod\": \"%ds\",\n" + " \"weightUpdatePeriod\": \"0.1s\"\n" + " }}\n" + " ]\n" + " }}\n" + " ]\n" + "}"; + class WeightedRoundRobinTest : public ClientLbEnd2endTest { protected: void ExpectWeightedRoundRobinPicks( const grpc_core::DebugLocation& location, const std::unique_ptr& stub, const std::vector& expected_weights, size_t total_passes = 3, - EchoRequest* request_ptr = nullptr) { + EchoRequest* request_ptr = nullptr, int timeout_ms = 15000) { GPR_ASSERT(expected_weights.size() == servers_.size()); size_t total_picks_per_pass = 0; for (size_t picks : expected_weights) { @@ -3174,7 +3188,7 @@ class WeightedRoundRobinTest : public ClientLbEnd2endTest { } return true; }, - request_ptr); + request_ptr, timeout_ms); } }; @@ -3218,6 +3232,66 @@ TEST_F(WeightedRoundRobinTest, CallAndServerMetric) { EXPECT_EQ("weighted_round_robin", channel->GetLoadBalancingPolicyName()); } +// This tests a bug seen in production where the outlier_detection +// policy would incorrectly generate a duplicate READY notification on +// all of its subchannels every time it saw an update, thus causing the +// WRR policy to re-enter the blackout period for that address. +TEST_F(WeightedRoundRobinTest, WithOutlierDetection) { + const int kBlackoutPeriodSeconds = 5; + const int kNumServers = 3; + StartServers(kNumServers); + // Report server metrics that should give 6:4:3 WRR picks. + // weights = qps / (util + (eps/qps)) = + // 1/(0.2+0.2) : 1/(0.3+0.3) : 2/(1.5+0.1) = 6:4:3 + // where util is app_util if set, or cpu_util. + servers_[0]->server_metric_recorder_->SetApplicationUtilization(0.2); + servers_[0]->server_metric_recorder_->SetEps(20); + servers_[0]->server_metric_recorder_->SetQps(100); + servers_[1]->server_metric_recorder_->SetApplicationUtilization(0.3); + servers_[1]->server_metric_recorder_->SetEps(30); + servers_[1]->server_metric_recorder_->SetQps(100); + servers_[2]->server_metric_recorder_->SetApplicationUtilization(1.5); + servers_[2]->server_metric_recorder_->SetEps(20); + servers_[2]->server_metric_recorder_->SetQps(200); + // Create channel. + // Initial blackout period is 0, so that we start seeing traffic in + // the right proportions right away. + auto response_generator = BuildResolverResponseGenerator(); + auto channel = BuildChannel("", response_generator); + auto stub = BuildStub(channel); + response_generator.SetNextResolution( + GetServersPorts(), + absl::StrFormat(kServiceConfigWithOutlierDetection, 0).c_str()); + // Send requests with per-call reported EPS/QPS set to 0/100. + // This should give 1/2:1/3:1/15 = 15:10:2 WRR picks. + // Keep sending RPCs long enough to go past the new blackout period + // that we're going to add later. + absl::Time deadline = + absl::Now() + + absl::Seconds(kBlackoutPeriodSeconds * grpc_test_slowdown_factor()); + EchoRequest request; + // We cannot override with 0 with proto3, so setting it to almost 0. + request.mutable_param()->mutable_backend_metrics()->set_eps( + std::numeric_limits::min()); + request.mutable_param()->mutable_backend_metrics()->set_rps_fractional(100); + do { + ExpectWeightedRoundRobinPicks(DEBUG_LOCATION, stub, + /*expected_weights=*/{15, 10, 2}, + /*total_passes=*/3, &request); + } while (absl::Now() < deadline); + // Send a new resolver response that increases blackout period. + response_generator.SetNextResolution( + GetServersPorts(), + absl::StrFormat(kServiceConfigWithOutlierDetection, + kBlackoutPeriodSeconds * grpc_test_slowdown_factor()) + .c_str()); + // Weights should be the same before the blackout period expires. + ExpectWeightedRoundRobinPicks( + DEBUG_LOCATION, stub, /*expected_weights=*/{15, 10, 2}, + /*total_passes=*/3, &request, + /*timeout_ms=*/(kBlackoutPeriodSeconds - 1) * 1000); +} + class WeightedRoundRobinParamTest : public WeightedRoundRobinTest, public ::testing::WithParamInterface {};