mirror of https://github.com/grpc/grpc.git
outlier detection: implement LB policy and xDS configuration (#29343)
* Initial skeleton for outlier detection * fixing code review comments (modifying child policy) * Skeleton and all tests passing except for 1 * small code review comments fix * Adding the parsing of the policy in cds and put it in discovery mechanism json format * Parsing outlier detection json policy from parent * Adding parsing of the updates * Added Subchannel wrapper and watcher wrapper: and all states pass through and all tests still pass * added framework to do eject and uneject * fixing code review comments * restore a test * fixing code review comments * taking care of code review comments * removing debug code and rebuild build files * fixing according to code review comments * fixing code review comments * Adding address to subchannel map * addressing code review comments * adding call counter * Refcount SubchannelState (in the map) and store them in Subchannel Wrapper * fixing counters * Call counter and tracker skeleton added * Call counter * addressing code review comments * addressing code review comments * Added CallCounter and timer * fixing sanity; but more importantly: taking out timer temporarily as it was causing test failures. * sanity * fixing according to code review comments * addressing code review comments * all algorithms implemented * addressing code review comment about starting the timer * protect private vars * small fix * Added one more corner case * fixing EjectionTimer * Fixing according to code review suggestions. 
* fixing according to code review comments * taking care of code review comments * fixing sanity issues * Adding proto to tests * First test * Fixing according to code review comments * Tests all working now * fixing a crash * fixing build files * fixing sanity * sanity * Simplifying tests * merge and update * format * sanity and format * Fixing asan error * fixing parsing logic and error handling * 6 more tests done * Added verifying unejection to tests * Added all the tests * fixing according to code review comments * fixing asan and ubsan * Fixing tests according to code review comments * Added both algorithm tests * added percentage enforcement tests * fixing tsan error * keeping debug, but fix warning * remove debugs * fixing IWYU and build errors after * test comments change only but very important * fixing code review comments * one more refactoring of util function * Removed debugs and added one more helper method * one more logic fix * Fixing last bit of code review comments and added disable tests * fixing code review comments * fixing IWYU * sanity format * protecting the feature with environment var: registering policy and generating policy * added a todo according to code review comments * fixing a clang finding at import time * build fix after synching to latest pull/29708/head
parent
c11f66faef
commit
03cf989610
28 changed files with 3136 additions and 4 deletions
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,54 @@ |
||||
//
|
||||
// Copyright 2022 gRPC authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
#ifndef GRPC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_OUTLIER_DETECTION_OUTLIER_DETECTION_H |
||||
#define GRPC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_OUTLIER_DETECTION_OUTLIER_DETECTION_H |
||||
|
||||
#include <grpc/support/port_platform.h> |
||||
|
||||
#include <stdint.h> // for uint32_t |
||||
|
||||
#include "absl/types/optional.h" |
||||
|
||||
#include "src/core/lib/gprpp/time.h" |
||||
|
||||
namespace grpc_core { |
||||
|
||||
// Reports whether xDS outlier detection support is enabled.
// NOTE(review): the definition is not visible in this header; presumably the
// feature is gated by an environment variable -- confirm against the .cc file.
bool XdsOutlierDetectionEnabled(); |
||||
|
||||
struct OutlierDetectionConfig { |
||||
Duration interval = Duration::Infinity(); |
||||
Duration base_ejection_time = Duration::Milliseconds(30000); |
||||
Duration max_ejection_time = Duration::Milliseconds(30000); |
||||
uint32_t max_ejection_percent = 10; |
||||
struct SuccessRateEjection { |
||||
uint32_t stdev_factor = 1900; |
||||
uint32_t enforcement_percentage = 0; |
||||
uint32_t minimum_hosts = 5; |
||||
uint32_t request_volume = 100; |
||||
}; |
||||
struct FailurePercentageEjection { |
||||
uint32_t threshold = 85; |
||||
uint32_t enforcement_percentage = 0; |
||||
uint32_t minimum_hosts = 5; |
||||
uint32_t request_volume = 50; |
||||
}; |
||||
absl::optional<SuccessRateEjection> success_rate_ejection; |
||||
absl::optional<FailurePercentageEjection> failure_percentage_ejection; |
||||
}; |
||||
} // namespace grpc_core
|
||||
|
||||
#endif // GRPC_CORE_EXT_FILTERS_CLIENT_CHANNEL_LB_POLICY_OUTLIER_DETECTION_OUTLIER_DETECTION_H
|
@ -0,0 +1,96 @@ |
||||
// Copyright 2020 The gRPC Authors |
||||
// |
||||
// Licensed under the Apache License, Version 2.0 (the "License"); |
||||
// you may not use this file except in compliance with the License. |
||||
// You may obtain a copy of the License at |
||||
// |
||||
// http://www.apache.org/licenses/LICENSE-2.0 |
||||
// |
||||
// Unless required by applicable law or agreed to in writing, software |
||||
// distributed under the License is distributed on an "AS IS" BASIS, |
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
// See the License for the specific language governing permissions and |
||||
// limitations under the License. |
||||
|
||||
// Local copy of Envoy xDS proto file, used for testing only. |
||||
|
||||
syntax = "proto3"; |
||||
|
||||
package envoy.config.cluster.v3; |
||||
|
||||
import "google/protobuf/duration.proto"; |
||||
import "google/protobuf/wrappers.proto"; |
||||
|
||||
// Outlier detection settings for a cluster: how often ejection analysis runs,
// how long and what share of hosts may be ejected, and the parameters of the
// success-rate and failure-percentage detection algorithms.
message OutlierDetection { |
||||
// The time interval between ejection analysis sweeps. This can result in |
||||
// both new ejections as well as hosts being returned to service. Defaults |
||||
// to 10000ms or 10s. |
||||
google.protobuf.Duration interval = 2; |
||||
|
||||
// The base time that a host is ejected for. The real time is equal to the |
||||
// base time multiplied by the number of times the host has been ejected and is |
||||
// capped by :ref:`max_ejection_time<envoy_v3_api_field_config.cluster.v3.OutlierDetection.max_ejection_time>`. |
||||
// Defaults to 30000ms or 30s. |
||||
google.protobuf.Duration base_ejection_time = 3; |
||||
|
||||
// The maximum % of an upstream cluster that can be ejected due to outlier |
||||
// detection. Defaults to 10% but will eject at least one host regardless of the value. |
||||
google.protobuf.UInt32Value max_ejection_percent = 4; |
||||
|
||||
// The % chance that a host will be actually ejected when an outlier status |
||||
// is detected through success rate statistics. This setting can be used to |
||||
// disable ejection or to ramp it up slowly. Defaults to 100. |
||||
google.protobuf.UInt32Value enforcing_success_rate = 6; |
||||
|
||||
// The number of hosts in a cluster that must have enough request volume to |
||||
// detect success rate outliers. If the number of hosts is less than this |
||||
// setting, outlier detection via success rate statistics is not performed |
||||
// for any host in the cluster. Defaults to 5. |
||||
google.protobuf.UInt32Value success_rate_minimum_hosts = 7; |
||||
|
||||
// The minimum number of total requests that must be collected in one |
||||
// interval (as defined by the interval duration above) to include this host |
||||
// in success rate based outlier detection. If the volume is lower than this |
||||
// setting, outlier detection via success rate statistics is not performed |
||||
// for that host. Defaults to 100. |
||||
google.protobuf.UInt32Value success_rate_request_volume = 8; |
||||
|
||||
// This factor is used to determine the ejection threshold for success rate |
||||
// outlier ejection. The ejection threshold is the difference between the |
||||
// mean success rate, and the product of this factor and the standard |
||||
// deviation of the mean success rate: mean - (stdev * |
||||
// success_rate_stdev_factor). This factor is divided by a thousand to get a |
||||
// double. That is, if the desired factor is 1.9, the runtime value should |
||||
// be 1900. Defaults to 1900. |
||||
google.protobuf.UInt32Value success_rate_stdev_factor = 9; |
||||
|
||||
// The failure percentage to use when determining failure percentage-based outlier detection. If |
||||
// the failure percentage of a given host is greater than or equal to this value, it will be |
||||
// ejected. Defaults to 85. |
||||
google.protobuf.UInt32Value failure_percentage_threshold = 16; |
||||
|
||||
// The % chance that a host will be actually ejected when an outlier status is detected through |
||||
// failure percentage statistics. This setting can be used to disable ejection or to ramp it up |
||||
// slowly. Defaults to 0. |
||||
// |
||||
// [#next-major-version: setting this without setting failure_percentage_threshold should be |
||||
// invalid in v4.] |
||||
google.protobuf.UInt32Value enforcing_failure_percentage = 17; |
||||
|
||||
// The minimum number of hosts in a cluster in order to perform failure percentage-based ejection. |
||||
// If the total number of hosts in the cluster is less than this value, failure percentage-based |
||||
// ejection will not be performed. Defaults to 5. |
||||
google.protobuf.UInt32Value failure_percentage_minimum_hosts = 19; |
||||
|
||||
// The minimum number of total requests that must be collected in one interval (as defined by the |
||||
// interval duration above) to perform failure percentage-based ejection for this host. If the |
||||
// volume is lower than this setting, failure percentage-based ejection will not be performed for |
||||
// this host. Defaults to 50. |
||||
google.protobuf.UInt32Value failure_percentage_request_volume = 20; |
||||
|
||||
// The maximum time that a host is ejected for. See :ref:`base_ejection_time<envoy_v3_api_field_config.cluster.v3.OutlierDetection.base_ejection_time>` |
||||
// for more information. If not specified, the default value (300000ms or 300s) or |
||||
// :ref:`base_ejection_time<envoy_v3_api_field_config.cluster.v3.OutlierDetection.base_ejection_time>` value is applied, whichever is larger. |
||||
google.protobuf.Duration max_ejection_time = 21; |
||||
|
||||
} |
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue