[WRR] fix bugs that caused us to re-enter blackout period upon updates (#33694)

As per gRFC A58, when WRR sees a subchannel report READY, it reset the non_empty_since value, thus restarting the blackout period. However, there were two cases where we were incorrectly triggering this code: 1. When WRR got an updated address list that contained addresses that were already present on the old list and whose subchannels were already in READY state, the initial notification for those subchannels on the new list was READY, which incorrectly triggered resetting the non_empty_since value. 2. Due to a bug in the outlier_detection policy, whenever an update was propagated down through the OD policy without actually enabling OD, it would incorrectly send a duplicate connectivity state notification for the subchannels. This meant that a subchannel that was already in state READY would report READY again, which would also incorrectly trigger resetting the non_empty_since value. This PR makes two changes: 1. Fix the bug in outlier_detection that caused it to generate the spurious duplicate READY updates. 2. Fix WRR to reset the non_empty_since value when a subchannel goes READY only if the subchannel has seen a previous state update and only if that previous state was not READY. (The duplicate READY notifications should not actually happen anymore now that the OD policy has been fixed, but better to be defensive.) Fixes b/290983884.
2 years ago · ec39600872
parent 82fb0b1676
commit ec39600872
4 changed files with 128 additions and 5 deletions
--- a/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc
+++ b/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc
@ -294,7 +294,7 @@ class OutlierDetectionLb : public LoadBalancingPolicy {
    }

    void DisableEjection() {
-      Uneject();
+      if (ejection_time_.has_value()) Uneject();
      multiplier_ = 0;
    }

--- a/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc
+++ b/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc
@ -793,12 +793,21 @@ void WeightedRoundRobin::WrrEndpointList::WrrEndpoint::OnStateUpdate(
    ExitIdleLocked();
  } else if (new_state == GRPC_CHANNEL_READY) {
    // If we transition back to READY state, restart the blackout period.
+    // Skip this if this is the initial notification for this
+    // subchannel (which happens whenever we get updated addresses and
+    // create a new endpoint list).  Also skip it if the previous state
+    // was READY (which should never happen in practice, but we've seen
+    // at least one bug that caused this in the outlier_detection
+    // policy, so let's be defensive here).
+    //
    // Note that we cannot guarantee that we will never receive
    // lingering callbacks for backend metric reports from the previous
    // connection after the new connection has been established, but they
    // should be masked by new backend metric reports from the new
    // connection by the time the blackout period ends.
-    weight_->ResetNonEmptySince();
+    if (old_state.has_value() && old_state != GRPC_CHANNEL_READY) {
+      weight_->ResetNonEmptySince();
+    }
  }
  // If state changed, update state counters.
  if (!old_state.has_value() || *old_state != new_state) {
--- a/test/core/client_channel/lb_policy/weighted_round_robin_test.cc
+++ b/test/core/client_channel/lb_policy/weighted_round_robin_test.cc
@ -252,6 +252,7 @@ class WeightedRoundRobinTest : public TimeAwareLoadBalancingPolicyTest {
          backend_metrics,
      std::map<absl::string_view /*address*/, size_t /*num_picks*/> expected,
      absl::Duration timeout = absl::Seconds(5),
+      bool run_timer_callbacks = true,
      SourceLocation location = SourceLocation()) {
    gpr_log(GPR_INFO, "==> WaitForWeightedRoundRobinPicks(): Expecting %s",
            PickMapString(expected).c_str());
@ -308,7 +309,7 @@ class WeightedRoundRobinTest : public TimeAwareLoadBalancingPolicyTest {
        EXPECT_NE(*picker, nullptr)
            << location.file() << ":" << location.line();
        if (*picker == nullptr) return false;
-      } else {
+      } else if (run_timer_callbacks) {
        gpr_log(GPR_INFO, "running timer callback...");
        RunTimerCallback();
      }
@ -803,6 +804,45 @@ TEST_F(WeightedRoundRobinTest, BlackoutPeriodAfterDisconnect) {
      {{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}});
 }

+TEST_F(WeightedRoundRobinTest, BlackoutPeriodDoesNotGetResetAfterUpdate) {
+  // Send address list to LB policy.
+  const std::array<absl::string_view, 3> kAddresses = {
+      "ipv4:127.0.0.1:441", "ipv4:127.0.0.1:442", "ipv4:127.0.0.1:443"};
+  auto config_builder =
+      ConfigBuilder().SetWeightExpirationPeriod(Duration::Seconds(2));
+  auto picker =
+      SendInitialUpdateAndWaitForConnected(kAddresses, config_builder);
+  ASSERT_NE(picker, nullptr);
+  // All backends report weights.
+  WaitForWeightedRoundRobinPicks(
+      &picker,
+      {{kAddresses[0], MakeBackendMetricData(/*app_utilization=*/0.9,
+                                             /*qps=*/100.0, /*eps=*/0.0)},
+       {kAddresses[1], MakeBackendMetricData(/*app_utilization=*/0.3,
+                                             /*qps=*/100.0, /*eps=*/0.0)},
+       {kAddresses[2], MakeBackendMetricData(/*app_utilization=*/0.3,
+                                             /*qps=*/100.0, /*eps=*/0.0)}},
+      {{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}});
+  // Send a duplicate update with the same addresses and config.
+  EXPECT_EQ(ApplyUpdate(BuildUpdate(kAddresses, config_builder.Build()),
+                        lb_policy_.get()),
+            absl::OkStatus());
+  // Note that we have not advanced time, so if the update incorrectly
+  // triggers resetting the blackout period, none of the weights will
+  // actually be used.
+  picker = ExpectState(GRPC_CHANNEL_READY, absl::OkStatus());
+  WaitForWeightedRoundRobinPicks(
+      &picker,
+      {{kAddresses[0], MakeBackendMetricData(/*app_utilization=*/0.9,
+                                             /*qps=*/100.0, /*eps=*/0.0)},
+       {kAddresses[1], MakeBackendMetricData(/*app_utilization=*/0.3,
+                                             /*qps=*/100.0, /*eps=*/0.0)},
+       {kAddresses[2], MakeBackendMetricData(/*app_utilization=*/0.3,
+                                             /*qps=*/100.0, /*eps=*/0.0)}},
+      {{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}},
+      /*timeout=*/absl::Seconds(5), /*run_timer_callbacks=*/false);
+}
+
 TEST_F(WeightedRoundRobinTest, ZeroErrorUtilPenalty) {
  // Send address list to LB policy.
  const std::array<absl::string_view, 3> kAddresses = {
--- a/test/cpp/end2end/client_lb_end2end_test.cc
+++ b/test/cpp/end2end/client_lb_end2end_test.cc
@ -3138,13 +3138,27 @@ const char kServiceConfigOob[] =
    "  ]\n"
    "}";

+const char kServiceConfigWithOutlierDetection[] =
+    "{\n"
+    "  \"loadBalancingConfig\": [\n"
+    "    {\"outlier_detection_experimental\": {\n"
+    "      \"childPolicy\": [\n"
+    "        {\"weighted_round_robin\": {\n"
+    "          \"blackoutPeriod\": \"%ds\",\n"
+    "          \"weightUpdatePeriod\": \"0.1s\"\n"
+    "        }}\n"
+    "      ]\n"
+    "    }}\n"
+    "  ]\n"
+    "}";
+
 class WeightedRoundRobinTest : public ClientLbEnd2endTest {
 protected:
  void ExpectWeightedRoundRobinPicks(
      const grpc_core::DebugLocation& location,
      const std::unique_ptr<grpc::testing::EchoTestService::Stub>& stub,
      const std::vector<size_t>& expected_weights, size_t total_passes = 3,
-      EchoRequest* request_ptr = nullptr) {
+      EchoRequest* request_ptr = nullptr, int timeout_ms = 15000) {
    GPR_ASSERT(expected_weights.size() == servers_.size());
    size_t total_picks_per_pass = 0;
    for (size_t picks : expected_weights) {
@ -3174,7 +3188,7 @@ class WeightedRoundRobinTest : public ClientLbEnd2endTest {
          }
          return true;
        },
-        request_ptr);
+        request_ptr, timeout_ms);
  }
 };

@ -3218,6 +3232,66 @@ TEST_F(WeightedRoundRobinTest, CallAndServerMetric) {
  EXPECT_EQ("weighted_round_robin", channel->GetLoadBalancingPolicyName());
 }

+// This tests a bug seen in production where the outlier_detection
+// policy would incorrectly generate a duplicate READY notification on
+// all of its subchannels every time it saw an update, thus causing the
+// WRR policy to re-enter the blackout period for that address.
+TEST_F(WeightedRoundRobinTest, WithOutlierDetection) {
+  const int kBlackoutPeriodSeconds = 5;
+  const int kNumServers = 3;
+  StartServers(kNumServers);
+  // Report server metrics that should give 6:4:3 WRR picks.
+  // weights = qps / (util + (eps/qps)) =
+  //   1/(0.2+0.2) : 1/(0.3+0.3) : 2/(1.5+0.1) = 6:4:3
+  // where util is app_util if set, or cpu_util.
+  servers_[0]->server_metric_recorder_->SetApplicationUtilization(0.2);
+  servers_[0]->server_metric_recorder_->SetEps(20);
+  servers_[0]->server_metric_recorder_->SetQps(100);
+  servers_[1]->server_metric_recorder_->SetApplicationUtilization(0.3);
+  servers_[1]->server_metric_recorder_->SetEps(30);
+  servers_[1]->server_metric_recorder_->SetQps(100);
+  servers_[2]->server_metric_recorder_->SetApplicationUtilization(1.5);
+  servers_[2]->server_metric_recorder_->SetEps(20);
+  servers_[2]->server_metric_recorder_->SetQps(200);
+  // Create channel.
+  // Initial blackout period is 0, so that we start seeing traffic in
+  // the right proportions right away.
+  auto response_generator = BuildResolverResponseGenerator();
+  auto channel = BuildChannel("", response_generator);
+  auto stub = BuildStub(channel);
+  response_generator.SetNextResolution(
+      GetServersPorts(),
+      absl::StrFormat(kServiceConfigWithOutlierDetection, 0).c_str());
+  // Send requests with per-call reported EPS/QPS set to 0/100.
+  // This should give 1/2:1/3:1/15 = 15:10:2 WRR picks.
+  // Keep sending RPCs long enough to go past the new blackout period
+  // that we're going to add later.
+  absl::Time deadline =
+      absl::Now() +
+      absl::Seconds(kBlackoutPeriodSeconds * grpc_test_slowdown_factor());
+  EchoRequest request;
+  // We cannot override with 0 with proto3, so setting it to almost 0.
+  request.mutable_param()->mutable_backend_metrics()->set_eps(
+      std::numeric_limits<double>::min());
+  request.mutable_param()->mutable_backend_metrics()->set_rps_fractional(100);
+  do {
+    ExpectWeightedRoundRobinPicks(DEBUG_LOCATION, stub,
+                                  /*expected_weights=*/{15, 10, 2},
+                                  /*total_passes=*/3, &request);
+  } while (absl::Now() < deadline);
+  // Send a new resolver response that increases blackout period.
+  response_generator.SetNextResolution(
+      GetServersPorts(),
+      absl::StrFormat(kServiceConfigWithOutlierDetection,
+                      kBlackoutPeriodSeconds * grpc_test_slowdown_factor())
+          .c_str());
+  // Weights should be the same before the blackout period expires.
+  ExpectWeightedRoundRobinPicks(
+      DEBUG_LOCATION, stub, /*expected_weights=*/{15, 10, 2},
+      /*total_passes=*/3, &request,
+      /*timeout_ms=*/(kBlackoutPeriodSeconds - 1) * 1000);
+}
+
 class WeightedRoundRobinParamTest
    : public WeightedRoundRobinTest,
      public ::testing::WithParamInterface<const char*> {};