diff --git a/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc b/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc
index 0fcad6bd9d8..8b52dbe8108 100644
--- a/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc
+++ b/src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc
@@ -294,7 +294,7 @@ class OutlierDetectionLb : public LoadBalancingPolicy {
}
void DisableEjection() {
- Uneject();
+ if (ejection_time_.has_value()) Uneject();
multiplier_ = 0;
}
diff --git a/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc b/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc
index 295c2a36231..fb169c725d9 100644
--- a/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc
+++ b/src/core/ext/filters/client_channel/lb_policy/weighted_round_robin/weighted_round_robin.cc
@@ -793,12 +793,21 @@ void WeightedRoundRobin::WrrEndpointList::WrrEndpoint::OnStateUpdate(
ExitIdleLocked();
} else if (new_state == GRPC_CHANNEL_READY) {
// If we transition back to READY state, restart the blackout period.
+ // Skip this if this is the initial notification for this
+ // subchannel (which happens whenever we get updated addresses and
+ // create a new endpoint list). Also skip it if the previous state
+ // was READY (which should never happen in practice, but we've seen
+ // at least one bug that caused this in the outlier_detection
+ // policy, so let's be defensive here).
+ //
// Note that we cannot guarantee that we will never receive
// lingering callbacks for backend metric reports from the previous
// connection after the new connection has been established, but they
// should be masked by new backend metric reports from the new
// connection by the time the blackout period ends.
- weight_->ResetNonEmptySince();
+ if (old_state.has_value() && old_state != GRPC_CHANNEL_READY) {
+ weight_->ResetNonEmptySince();
+ }
}
// If state changed, update state counters.
if (!old_state.has_value() || *old_state != new_state) {
diff --git a/test/core/client_channel/lb_policy/weighted_round_robin_test.cc b/test/core/client_channel/lb_policy/weighted_round_robin_test.cc
index 58b039bab67..536bb779069 100644
--- a/test/core/client_channel/lb_policy/weighted_round_robin_test.cc
+++ b/test/core/client_channel/lb_policy/weighted_round_robin_test.cc
@@ -252,6 +252,7 @@ class WeightedRoundRobinTest : public TimeAwareLoadBalancingPolicyTest {
backend_metrics,
std::map expected,
absl::Duration timeout = absl::Seconds(5),
+ bool run_timer_callbacks = true,
SourceLocation location = SourceLocation()) {
gpr_log(GPR_INFO, "==> WaitForWeightedRoundRobinPicks(): Expecting %s",
PickMapString(expected).c_str());
@@ -308,7 +309,7 @@ class WeightedRoundRobinTest : public TimeAwareLoadBalancingPolicyTest {
EXPECT_NE(*picker, nullptr)
<< location.file() << ":" << location.line();
if (*picker == nullptr) return false;
- } else {
+ } else if (run_timer_callbacks) {
gpr_log(GPR_INFO, "running timer callback...");
RunTimerCallback();
}
@@ -803,6 +804,45 @@ TEST_F(WeightedRoundRobinTest, BlackoutPeriodAfterDisconnect) {
{{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}});
}
+TEST_F(WeightedRoundRobinTest, BlackoutPeriodDoesNotGetResetAfterUpdate) {
+ // Send address list to LB policy.
+ const std::array kAddresses = {
+ "ipv4:127.0.0.1:441", "ipv4:127.0.0.1:442", "ipv4:127.0.0.1:443"};
+ auto config_builder =
+ ConfigBuilder().SetWeightExpirationPeriod(Duration::Seconds(2));
+ auto picker =
+ SendInitialUpdateAndWaitForConnected(kAddresses, config_builder);
+ ASSERT_NE(picker, nullptr);
+ // All backends report weights.
+ WaitForWeightedRoundRobinPicks(
+ &picker,
+ {{kAddresses[0], MakeBackendMetricData(/*app_utilization=*/0.9,
+ /*qps=*/100.0, /*eps=*/0.0)},
+ {kAddresses[1], MakeBackendMetricData(/*app_utilization=*/0.3,
+ /*qps=*/100.0, /*eps=*/0.0)},
+ {kAddresses[2], MakeBackendMetricData(/*app_utilization=*/0.3,
+ /*qps=*/100.0, /*eps=*/0.0)}},
+ {{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}});
+ // Send a duplicate update with the same addresses and config.
+ EXPECT_EQ(ApplyUpdate(BuildUpdate(kAddresses, config_builder.Build()),
+ lb_policy_.get()),
+ absl::OkStatus());
+ // Note that we have not advanced time, so if the update incorrectly
+ // triggers resetting the blackout period, none of the weights will
+ // actually be used.
+ picker = ExpectState(GRPC_CHANNEL_READY, absl::OkStatus());
+ WaitForWeightedRoundRobinPicks(
+ &picker,
+ {{kAddresses[0], MakeBackendMetricData(/*app_utilization=*/0.9,
+ /*qps=*/100.0, /*eps=*/0.0)},
+ {kAddresses[1], MakeBackendMetricData(/*app_utilization=*/0.3,
+ /*qps=*/100.0, /*eps=*/0.0)},
+ {kAddresses[2], MakeBackendMetricData(/*app_utilization=*/0.3,
+ /*qps=*/100.0, /*eps=*/0.0)}},
+ {{kAddresses[0], 1}, {kAddresses[1], 3}, {kAddresses[2], 3}},
+ /*timeout=*/absl::Seconds(5), /*run_timer_callbacks=*/false);
+}
+
TEST_F(WeightedRoundRobinTest, ZeroErrorUtilPenalty) {
// Send address list to LB policy.
const std::array kAddresses = {
diff --git a/test/cpp/end2end/client_lb_end2end_test.cc b/test/cpp/end2end/client_lb_end2end_test.cc
index 92b7b9f94cf..d363d5cc640 100644
--- a/test/cpp/end2end/client_lb_end2end_test.cc
+++ b/test/cpp/end2end/client_lb_end2end_test.cc
@@ -3138,13 +3138,27 @@ const char kServiceConfigOob[] =
" ]\n"
"}";
+const char kServiceConfigWithOutlierDetection[] =
+ "{\n"
+ " \"loadBalancingConfig\": [\n"
+ " {\"outlier_detection_experimental\": {\n"
+ " \"childPolicy\": [\n"
+ " {\"weighted_round_robin\": {\n"
+ " \"blackoutPeriod\": \"%ds\",\n"
+ " \"weightUpdatePeriod\": \"0.1s\"\n"
+ " }}\n"
+ " ]\n"
+ " }}\n"
+ " ]\n"
+ "}";
+
class WeightedRoundRobinTest : public ClientLbEnd2endTest {
protected:
void ExpectWeightedRoundRobinPicks(
const grpc_core::DebugLocation& location,
const std::unique_ptr& stub,
const std::vector& expected_weights, size_t total_passes = 3,
- EchoRequest* request_ptr = nullptr) {
+ EchoRequest* request_ptr = nullptr, int timeout_ms = 15000) {
GPR_ASSERT(expected_weights.size() == servers_.size());
size_t total_picks_per_pass = 0;
for (size_t picks : expected_weights) {
@@ -3174,7 +3188,7 @@ class WeightedRoundRobinTest : public ClientLbEnd2endTest {
}
return true;
},
- request_ptr);
+ request_ptr, timeout_ms);
}
};
@@ -3218,6 +3232,66 @@ TEST_F(WeightedRoundRobinTest, CallAndServerMetric) {
EXPECT_EQ("weighted_round_robin", channel->GetLoadBalancingPolicyName());
}
+// This tests a bug seen in production where the outlier_detection
+// policy would incorrectly generate a duplicate READY notification on
+// all of its subchannels every time it saw an update, thus causing the
+// WRR policy to re-enter the blackout period for that address.
+TEST_F(WeightedRoundRobinTest, WithOutlierDetection) {
+ const int kBlackoutPeriodSeconds = 5;
+ const int kNumServers = 3;
+ StartServers(kNumServers);
+ // Report server metrics that should give 6:4:3 WRR picks.
+ // weights = qps / (util + (eps/qps)) =
+ // 1/(0.2+0.2) : 1/(0.3+0.3) : 2/(1.5+0.1) = 6:4:3
+ // where util is app_util if set, or cpu_util.
+ servers_[0]->server_metric_recorder_->SetApplicationUtilization(0.2);
+ servers_[0]->server_metric_recorder_->SetEps(20);
+ servers_[0]->server_metric_recorder_->SetQps(100);
+ servers_[1]->server_metric_recorder_->SetApplicationUtilization(0.3);
+ servers_[1]->server_metric_recorder_->SetEps(30);
+ servers_[1]->server_metric_recorder_->SetQps(100);
+ servers_[2]->server_metric_recorder_->SetApplicationUtilization(1.5);
+ servers_[2]->server_metric_recorder_->SetEps(20);
+ servers_[2]->server_metric_recorder_->SetQps(200);
+ // Create channel.
+ // Initial blackout period is 0, so that we start seeing traffic in
+ // the right proportions right away.
+ auto response_generator = BuildResolverResponseGenerator();
+ auto channel = BuildChannel("", response_generator);
+ auto stub = BuildStub(channel);
+ response_generator.SetNextResolution(
+ GetServersPorts(),
+ absl::StrFormat(kServiceConfigWithOutlierDetection, 0).c_str());
+ // Send requests with per-call reported EPS/QPS set to 0/100.
+ // This should give 1/2:1/3:1/15 = 15:10:2 WRR picks.
+ // Keep sending RPCs long enough to go past the new blackout period
+ // that we're going to add later.
+ absl::Time deadline =
+ absl::Now() +
+ absl::Seconds(kBlackoutPeriodSeconds * grpc_test_slowdown_factor());
+ EchoRequest request;
+ // We cannot override with 0 with proto3, so setting it to almost 0.
+ request.mutable_param()->mutable_backend_metrics()->set_eps(
+ std::numeric_limits::min());
+ request.mutable_param()->mutable_backend_metrics()->set_rps_fractional(100);
+ do {
+ ExpectWeightedRoundRobinPicks(DEBUG_LOCATION, stub,
+ /*expected_weights=*/{15, 10, 2},
+ /*total_passes=*/3, &request);
+ } while (absl::Now() < deadline);
+ // Send a new resolver response that increases blackout period.
+ response_generator.SetNextResolution(
+ GetServersPorts(),
+ absl::StrFormat(kServiceConfigWithOutlierDetection,
+ kBlackoutPeriodSeconds * grpc_test_slowdown_factor())
+ .c_str());
+ // Weights should be the same before the blackout period expires.
+ ExpectWeightedRoundRobinPicks(
+ DEBUG_LOCATION, stub, /*expected_weights=*/{15, 10, 2},
+ /*total_passes=*/3, &request,
+ /*timeout_ms=*/(kBlackoutPeriodSeconds - 1) * 1000);
+}
+
class WeightedRoundRobinParamTest
: public WeightedRoundRobinTest,
public ::testing::WithParamInterface {};