outlier_detection LB: reset state when OD is disabled (#30418)

* outlier_detection LB: reset state when OD is disabled

* leave interval unset when no OD config is present

* Automated change: Fix sanity tests

* fix clang-tidy

* don't check for interval of infinite duration

Co-authored-by: markdroth <markdroth@users.noreply.github.com>
pull/30465/head
Mark D. Roth 3 years ago committed by GitHub
parent 3ab8b2ee62
commit 7fcb2e1229
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 1
      BUILD
  2. 16
      src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.cc
  3. 26
      src/core/ext/filters/client_channel/lb_policy/outlier_detection/outlier_detection.h
  4. 31
      src/core/ext/filters/client_channel/lb_policy/xds/xds_cluster_resolver.cc
  5. 3
      src/core/ext/xds/xds_cluster.h
  6. 110
      test/cpp/end2end/xds/xds_outlier_detection_end2end_test.cc

@ -4377,7 +4377,6 @@ grpc_cc_library(
"orphanable",
"ref_counted_ptr",
"server_address",
"time",
"work_serializer",
],
)

@ -101,10 +101,8 @@ class OutlierDetectionLbConfig : public LoadBalancingPolicy::Config {
absl::string_view name() const override { return kOutlierDetection; }
// Returns true iff at least one ejection algorithm (success-rate or
// failure-percentage) is configured.  When this returns false, outlier
// detection is effectively a no-op, and UpdateLocked() resets any
// previously ejected subchannels.  Note: the interval is deliberately
// NOT checked here; an absent interval simply takes its default value.
bool CountingEnabled() const {
  return outlier_detection_config_.success_rate_ejection.has_value() ||
         outlier_detection_config_.failure_percentage_ejection.has_value();
}
const OutlierDetectionConfig& outlier_detection_config() const {
@ -306,6 +304,11 @@ class OutlierDetectionLb : public LoadBalancingPolicy {
return false;
}
// Resets this subchannel's ejection state: unejects the address and
// zeroes multiplier_ (presumably the per-address ejection backoff
// multiplier — confirm against the ejection-duration logic).  Called
// from UpdateLocked() when a config update disables counting, so that
// no stale ejection survives the disabling of outlier detection.
void DisableEjection() {
  Uneject();
  multiplier_ = 0;
}
private:
std::unique_ptr<Bucket> current_bucket_ = absl::make_unique<Bucket>();
std::unique_ptr<Bucket> backup_bucket_ = absl::make_unique<Bucket>();
@ -644,6 +647,9 @@ void OutlierDetectionLb::UpdateLocked(UpdateArgs args) {
"[outlier_detection_lb %p] adding map entry for %s (%p)",
this, address_key.c_str(), subchannel_state.get());
}
} else if (!config_->CountingEnabled()) {
// If counting is not enabled, reset state.
subchannel_state->DisableEjection();
}
current_addresses.emplace(address_key);
}
@ -1074,7 +1080,7 @@ class OutlierDetectionLbFactory : public LoadBalancingPolicyFactory {
}
ParseJsonObjectFieldAsDuration(json.object_value(), "interval",
&outlier_detection_config.interval,
&error_list);
&error_list, /*required=*/false);
ParseJsonObjectFieldAsDuration(json.object_value(), "baseEjectionTime",
&outlier_detection_config.base_ejection_time,
&error_list, /*required=*/false);

@ -30,7 +30,7 @@ namespace grpc_core {
bool XdsOutlierDetectionEnabled();
struct OutlierDetectionConfig {
Duration interval = Duration::Infinity();
Duration interval = Duration::Seconds(10);
Duration base_ejection_time = Duration::Milliseconds(30000);
Duration max_ejection_time = Duration::Milliseconds(30000);
uint32_t max_ejection_percent = 10;
@ -39,16 +39,40 @@ struct OutlierDetectionConfig {
uint32_t enforcement_percentage = 0;
uint32_t minimum_hosts = 5;
uint32_t request_volume = 100;
// Field-wise equality: two success-rate configs are equal iff every
// parameter matches.
bool operator==(const SuccessRateEjection& other) const {
  if (stdev_factor != other.stdev_factor) return false;
  if (enforcement_percentage != other.enforcement_percentage) return false;
  if (minimum_hosts != other.minimum_hosts) return false;
  return request_volume == other.request_volume;
}
};
// Parameters for failure-percentage-based ejection.  Default values are
// used for any field not overridden by the parsed config.
struct FailurePercentageEjection {
  uint32_t threshold = 85;
  uint32_t enforcement_percentage = 0;
  uint32_t minimum_hosts = 5;
  uint32_t request_volume = 50;
  // Field-wise equality: equal iff every parameter matches.
  bool operator==(const FailurePercentageEjection& other) const {
    if (threshold != other.threshold) return false;
    if (enforcement_percentage != other.enforcement_percentage) return false;
    if (minimum_hosts != other.minimum_hosts) return false;
    return request_volume == other.request_volume;
  }
};
absl::optional<SuccessRateEjection> success_rate_ejection;
absl::optional<FailurePercentageEjection> failure_percentage_ejection;
// Field-wise equality over the whole outlier detection config,
// including the optional per-algorithm sub-configs (absl::optional
// compares engaged-state first, then contained values).
bool operator==(const OutlierDetectionConfig& other) const {
  const bool scalar_fields_match =
      interval == other.interval &&
      base_ejection_time == other.base_ejection_time &&
      max_ejection_time == other.max_ejection_time &&
      max_ejection_percent == other.max_ejection_percent;
  const bool ejection_configs_match =
      success_rate_ejection == other.success_rate_ejection &&
      failure_percentage_ejection == other.failure_percentage_ejection;
  return scalar_fields_match && ejection_configs_match;
}
};
} // namespace grpc_core
#endif // GRPC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_OUTLIER_DETECTION_OUTLIER_DETECTION_H

@ -63,7 +63,6 @@
#include "src/core/lib/gprpp/debug_location.h"
#include "src/core/lib/gprpp/orphanable.h"
#include "src/core/lib/gprpp/ref_counted_ptr.h"
#include "src/core/lib/gprpp/time.h"
#include "src/core/lib/gprpp/work_serializer.h"
#include "src/core/lib/iomgr/error.h"
#include "src/core/lib/iomgr/pollset_set.h"
@ -934,10 +933,6 @@ XdsClusterResolverLb::CreateChildPolicyConfigLocked() {
if (discovery_entry.config().outlier_detection_lb_config.has_value()) {
outlier_detection_config =
discovery_entry.config().outlier_detection_lb_config.value();
} else {
// outlier detection will be a no-op
outlier_detection_config["interval"] =
Duration::Infinity().ToJsonString();
}
outlier_detection_config["childPolicy"] = Json::Array{Json::Object{
{"xds_cluster_impl_experimental",
@ -1297,8 +1292,30 @@ class XdsClusterResolverLbFactory : public LoadBalancingPolicyFactory {
static_cast<XdsClusterResolverLbConfig*>(old_config);
XdsClusterResolverLbConfig* new_xds_cluster_resolver_config =
static_cast<XdsClusterResolverLbConfig*>(new_config);
return old_xds_cluster_resolver_config->discovery_mechanisms() !=
new_xds_cluster_resolver_config->discovery_mechanisms();
if (old_xds_cluster_resolver_config->discovery_mechanisms().size() !=
new_xds_cluster_resolver_config->discovery_mechanisms().size()) {
return true;
}
for (size_t i = 0;
i < old_xds_cluster_resolver_config->discovery_mechanisms().size();
++i) {
auto& old_discovery_mechanism =
old_xds_cluster_resolver_config->discovery_mechanisms()[i];
auto& new_discovery_mechanism =
new_xds_cluster_resolver_config->discovery_mechanisms()[i];
if (old_discovery_mechanism.type != new_discovery_mechanism.type ||
old_discovery_mechanism.cluster_name !=
new_discovery_mechanism.cluster_name ||
old_discovery_mechanism.eds_service_name !=
new_discovery_mechanism.eds_service_name ||
old_discovery_mechanism.dns_hostname !=
new_discovery_mechanism.dns_hostname ||
!(old_discovery_mechanism.lrs_load_reporting_server ==
new_discovery_mechanism.lrs_load_reporting_server)) {
return true;
}
}
return false;
}
OrphanablePtr<LoadBalancingPolicy> CreateLoadBalancingPolicy(

@ -84,7 +84,8 @@ struct XdsClusterResource {
lb_policy == other.lb_policy &&
min_ring_size == other.min_ring_size &&
max_ring_size == other.max_ring_size &&
max_concurrent_requests == other.max_concurrent_requests;
max_concurrent_requests == other.max_concurrent_requests &&
outlier_detection == other.outlier_detection;
}
std::string ToString() const;

@ -46,15 +46,14 @@ INSTANTIATE_TEST_SUITE_P(XdsTest, OutlierDetectionTest,
// test/cpp/end2end/outlier_detection_end2end_test.cc
// Tests SuccessRateEjectionAndUnejection:
// 1. Use ring hash policy that hashes using a
// header value to ensure rpcs go to all backends.
// 2. Cause a single error on 1
// backend and wait for 1 outlier detection interval to pass.
// 3. We should skip
// exactly 1 backend due to ejection and all the loads sticky to that backend
// should go to 1 other backend.
// 1. Use ring hash policy that hashes using a header value to ensure rpcs
// go to all backends.
// 2. Cause a single error on 1 backend and wait for 1 outlier detection
// interval to pass.
// 3. We should skip exactly 1 backend due to ejection and all the loads
// sticky to that backend should go to 1 other backend.
// 4. Let the ejection period pass and verify we can go back to both backends
// after the uneject.
// after the uneject.
TEST_P(OutlierDetectionTest, SuccessRateEjectionAndUnejection) {
ScopedExperimentalEnvVar env_var(
"GRPC_EXPERIMENTAL_ENABLE_OUTLIER_DETECTION");
@ -500,15 +499,14 @@ TEST_P(OutlierDetectionTest, SuccessRateRequestVolume) {
}
// Tests FailurePercentageEjectionAndUnejection:
// 1. Use ring hash policy that hashes using a
// header value to ensure rpcs go to all backends.
// 2. Cause a single error on 1
// backend and wait for 1 outlier detection interval to pass.
// 3. We should skip
// exactly 1 backend due to ejection and all the loads sticky to that backend
// should go to 1 other backend.
// 1. Use ring hash policy that hashes using a header value to ensure RPCs
// go to all backends.
// 2. Cause a single error on 1 backend and wait for 1 outlier detection
// interval to pass.
// 3. We should skip exactly 1 backend due to ejection and all the loads
// sticky to that backend should go to 1 other backend.
// 4. Let the ejection period pass and verify that traffic will again go both
// backends as we have unejected the backend.
// backends as we have unejected the backend.
TEST_P(OutlierDetectionTest, FailurePercentageEjectionAndUnejection) {
ScopedExperimentalEnvVar env_var(
"GRPC_EXPERIMENTAL_ENABLE_OUTLIER_DETECTION");
@ -1237,6 +1235,86 @@ TEST_P(OutlierDetectionTest,
EXPECT_EQ(100, backends_[3]->backend_service()->request_count());
}
// Tests that any ejected addresses are unejected (ejection state reset)
// when a config update removes the outlier_detection config, i.e. when
// the OD policy is disabled while an address is still ejected.
TEST_P(OutlierDetectionTest, DisableOutlierDetectionWhileAddressesAreEjected) {
  ScopedExperimentalEnvVar env_var(
      "GRPC_EXPERIMENTAL_ENABLE_OUTLIER_DETECTION");
  CreateAndStartBackends(2);
  auto cluster = default_cluster_;
  // Use ring hash so that a hash header can pin each RPC to a chosen backend.
  cluster.set_lb_policy(Cluster::RING_HASH);
  // Set up the outlier-detection failure-percentage parameters so that a
  // single failure causes a potential ejection with a probability of 100%
  // (to eliminate flakiness of the test).
  auto* interval = cluster.mutable_outlier_detection()->mutable_interval();
  interval->set_nanos(100000000 * grpc_test_slowdown_factor());
  auto* base_time =
      cluster.mutable_outlier_detection()->mutable_base_ejection_time();
  base_time->set_seconds(1 * grpc_test_slowdown_factor());
  cluster.mutable_outlier_detection()
      ->mutable_failure_percentage_threshold()
      ->set_value(0);
  cluster.mutable_outlier_detection()
      ->mutable_enforcing_failure_percentage()
      ->set_value(100);
  cluster.mutable_outlier_detection()
      ->mutable_failure_percentage_minimum_hosts()
      ->set_value(1);
  cluster.mutable_outlier_detection()
      ->mutable_failure_percentage_request_volume()
      ->set_value(1);
  balancer_->ads_service()->SetCdsResource(cluster);
  auto new_route_config = default_route_config_;
  auto* route = new_route_config.mutable_virtual_hosts(0)->mutable_routes(0);
  auto* hash_policy = route->mutable_route()->add_hash_policy();
  hash_policy->mutable_header()->set_header_name("address_hash");
  SetListenerAndRouteConfiguration(balancer_.get(), default_listener_,
                                   new_route_config);
  EdsResourceArgs args({{"locality0", CreateEndpointsForBackends()}});
  balancer_->ads_service()->SetEdsResource(BuildEdsResource(args));
  // Each RPC carries a header value that always hashes to a specific
  // backend, because the value matches the one used to create the entry
  // in the ring.
  std::vector<std::pair<std::string, std::string>> metadata = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(0)}};
  std::vector<std::pair<std::string, std::string>> metadata1 = {
      {"address_hash", CreateMetadataValueThatHashesToBackend(1)}};
  const auto rpc_options = RpcOptions().set_metadata(metadata);
  const auto rpc_options1 = RpcOptions().set_metadata(std::move(metadata1));
  WaitForBackend(DEBUG_LOCATION, 0, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options);
  WaitForBackend(DEBUG_LOCATION, 1, /*check_status=*/nullptr,
                 WaitForBackendOptions(), rpc_options1);
  // Cause an error on backend 0 and wait for one outlier-detection interval
  // to pass, so that the backend gets ejected.
  CheckRpcSendFailure(
      DEBUG_LOCATION, StatusCode::CANCELLED, "",
      RpcOptions().set_metadata(metadata).set_server_expected_error(
          StatusCode::CANCELLED));
  gpr_sleep_until(grpc_timeout_milliseconds_to_deadline(100));
  ResetBackendCounters();
  // With 1 backend ejected, all traffic that would have gone to the ejected
  // backend should now go to the other backend.  The failure-percentage
  // enforcement_percentage of 100% is honored, as this test consistently
  // ejects exactly 1 backend.
  CheckRpcSendOk(DEBUG_LOCATION, 1, rpc_options);
  EXPECT_EQ(1, backends_[1]->backend_service()->request_count());
  // Send an update that disables outlier detection entirely.
  cluster.clear_outlier_detection();
  balancer_->ads_service()->SetCdsResource(cluster);
  // Wait for the previously ejected backend to start receiving traffic
  // again, which demonstrates that its ejection state was reset when OD
  // was disabled.  (RPCs still hit the injected CANCELLED error, so the
  // predicate accepts that status.)
  WaitForBackend(
      DEBUG_LOCATION, 0,
      [](const RpcResult& result) {
        EXPECT_EQ(result.status.error_code(), StatusCode::CANCELLED)
            << "Error: " << result.status.error_message();
      },
      WaitForBackendOptions(),
      RpcOptions()
          .set_metadata(std::move(metadata))
          .set_server_expected_error(StatusCode::CANCELLED));
}
} // namespace
} // namespace testing
} // namespace grpc

Loading…
Cancel
Save