healthcheck: exclude hosts when receiving x-envoy-immediate-health-check-fail (#14772)

* Send x-envoy-immediate-health-check-fail on all responses that the
  health check filter processes, not just non-HC responses.
* Exclude hosts from load balancing when x-envoy-immediate-health-check-fail
  is received.
* Can be reverted via the envoy.reloadable_features.health_check.immediate_failure_exclude_from_cluster
  feature flag.

Fixes https://github.com/envoyproxy/envoy/issues/9246

Signed-off-by: Matt Klein <mklein@lyft.com>

Mirrored from https://github.com/envoyproxy/envoy @ deed328494064bf28e09055c1cda4e3a3cdd6b67
pull/624/head
data-plane-api(Azure Pipelines) 4 years ago
parent fd49a04d29
commit 232083e6f7
  1. 9
      envoy/admin/v3/clusters.proto
  2. 9
      envoy/admin/v4alpha/clusters.proto
  3. 22
      envoy/config/cluster/v3/cluster.proto
  4. 22
      envoy/config/cluster/v4alpha/cluster.proto
  5. 1
      envoy/data/core/v3/health_check_event.proto

@ -139,7 +139,7 @@ message HostStatus {
}
// Health status for a host.
// [#next-free-field: 7]
// [#next-free-field: 9]
message HostHealthStatus {
option (udpa.annotations.versioning).previous_message_type =
"envoy.admin.v2alpha.HostHealthStatus";
@ -160,6 +160,13 @@ message HostHealthStatus {
// The host has not yet been health checked.
bool pending_active_hc = 6;
// The host should be excluded from panic, spillover, etc. calculations because it was explicitly
// taken out of rotation via protocol signal and is not meant to be routed to.
bool excluded_via_immediate_hc_fail = 7;
// The host failed active HC due to timeout.
bool active_hc_timeout = 8;
// Health status as reported by EDS. Note: only HEALTHY and UNHEALTHY are currently supported
// here.
// [#comment:TODO(mrice32): pipe through remaining EDS health status possibilities.]

@ -139,7 +139,7 @@ message HostStatus {
}
// Health status for a host.
// [#next-free-field: 7]
// [#next-free-field: 9]
message HostHealthStatus {
option (udpa.annotations.versioning).previous_message_type = "envoy.admin.v3.HostHealthStatus";
@ -159,6 +159,13 @@ message HostHealthStatus {
// The host has not yet been health checked.
bool pending_active_hc = 6;
// The host should be excluded from panic, spillover, etc. calculations because it was explicitly
// taken out of rotation via protocol signal and is not meant to be routed to.
bool excluded_via_immediate_hc_fail = 7;
// The host failed active HC due to timeout.
bool active_hc_timeout = 8;
// Health status as reported by EDS. Note: only HEALTHY and UNHEALTHY are currently supported
// here.
// [#comment:TODO(mrice32): pipe through remaining EDS health status possibilities.]

@ -536,25 +536,9 @@ message Cluster {
// https://github.com/envoyproxy/envoy/pull/3941.
google.protobuf.Duration update_merge_window = 4;
// If set to true, Envoy will not consider new hosts when computing load balancing weights until
// they have been health checked for the first time. This will have no effect unless
// active health checking is also configured.
//
// Ignoring a host means that for any load balancing calculations that adjust weights based
// on the ratio of eligible hosts and total hosts (priority spillover, locality weighting and
// panic mode) Envoy will exclude these hosts in the denominator.
//
// For example, with hosts in two priorities P0 and P1, where P0 looks like
// {healthy, unhealthy (new), unhealthy (new)}
// and where P1 looks like
// {healthy, healthy}
// all traffic will still hit P0, as 1 / (3 - 2) = 1.
//
// Enabling this will allow scaling up the number of hosts for a given cluster without entering
// panic mode or triggering priority spillover, assuming the hosts pass the first health check.
//
// If panic mode is triggered, new hosts are still eligible for traffic; they simply do not
// contribute to the calculation when deciding whether panic mode is enabled or not.
// If set to true, Envoy will :ref:`exclude <arch_overview_load_balancing_excluded>` new hosts
// when computing load balancing weights until they have been health checked for the first time.
// This will have no effect unless active health checking is also configured.
bool ignore_new_hosts_until_first_hc = 5;
// If set to `true`, the cluster manager will drain all existing

@ -540,25 +540,9 @@ message Cluster {
// https://github.com/envoyproxy/envoy/pull/3941.
google.protobuf.Duration update_merge_window = 4;
// If set to true, Envoy will not consider new hosts when computing load balancing weights until
// they have been health checked for the first time. This will have no effect unless
// active health checking is also configured.
//
// Ignoring a host means that for any load balancing calculations that adjust weights based
// on the ratio of eligible hosts and total hosts (priority spillover, locality weighting and
// panic mode) Envoy will exclude these hosts in the denominator.
//
// For example, with hosts in two priorities P0 and P1, where P0 looks like
// {healthy, unhealthy (new), unhealthy (new)}
// and where P1 looks like
// {healthy, healthy}
// all traffic will still hit P0, as 1 / (3 - 2) = 1.
//
// Enabling this will allow scaling up the number of hosts for a given cluster without entering
// panic mode or triggering priority spillover, assuming the hosts pass the first health check.
//
// If panic mode is triggered, new hosts are still eligible for traffic; they simply do not
// contribute to the calculation when deciding whether panic mode is enabled or not.
// If set to true, Envoy will :ref:`exclude <arch_overview_load_balancing_excluded>` new hosts
// when computing load balancing weights until they have been health checked for the first time.
// This will have no effect unless active health checking is also configured.
bool ignore_new_hosts_until_first_hc = 5;
// If set to `true`, the cluster manager will drain all existing

@ -22,6 +22,7 @@ enum HealthCheckFailureType {
ACTIVE = 0;
PASSIVE = 1;
NETWORK = 2;
NETWORK_TIMEOUT = 3;
}
enum HealthCheckerType {

Loading…
Cancel
Save