From 328894a0d70a1a56b4722d9d1326b6c9258d38cf Mon Sep 17 00:00:00 2001 From: "data-plane-api(CircleCI)" Date: Thu, 15 Nov 2018 04:43:55 +0000 Subject: [PATCH] Add logging of health check failure events (#4965) Currently, health check failure events are only logged if the HealthFlag for a host transitions from non-FAILED_ACTIVE_HC to FAILED_ACTIVE_HC. However, since hosts are initialized in the FAILED_ACTIVE_HC state, hosts that never became healthy have no events associated with them. Since the current health check events only log transitions, we'll have to scan the entire log in order to find the hosts that are currently in a failing state. Then we'll still have to filter out the hosts permanently removed from the cluster by the discovery service. This makes the events very difficult to use in operations. Proposed solution: Both of these issues can be solved by emitting a health check failure event if either of these conditions is true: (1) If the active health check failed and it's the first health check for a host. This ensures we have events for hosts that never became healthy. (2) If the active health check failed and an AlwaysLogFailures configuration is set to true; by default this flag is set to false. This makes it very easy to find the hosts currently failing by looking at the last few seconds of logs. Signed-off-by: Henry Yang Mirrored from https://github.com/envoyproxy/envoy @ 11e196b67ee9124f33c45f5adf542841386e3c39 --- envoy/api/v2/core/health_check.proto | 5 +++++ envoy/data/core/v2alpha/health_check_event.proto | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/envoy/api/v2/core/health_check.proto b/envoy/api/v2/core/health_check.proto index 2ad6ed31..b2633d89 100644 --- a/envoy/api/v2/core/health_check.proto +++ b/envoy/api/v2/core/health_check.proto @@ -218,6 +218,11 @@ message HealthCheck { // Specifies the path to the :ref:`health check event log `. // If empty, no event log will be written. 
string event_log_path = 17; + + // If set to true, health check failure events will always be logged. If set to false, only the + // initial health check failure event will be logged. + // The default value is false. + bool always_log_health_check_failures = 19; } // Endpoint health status. diff --git a/envoy/data/core/v2alpha/health_check_event.proto b/envoy/data/core/v2alpha/health_check_event.proto index e9442cfb..dfb016e8 100644 --- a/envoy/data/core/v2alpha/health_check_event.proto +++ b/envoy/data/core/v2alpha/health_check_event.proto @@ -27,6 +27,9 @@ message HealthCheckEvent { // Host addition. HealthCheckAddHealthy add_healthy_event = 5; + + // Host failure. + HealthCheckFailure health_check_failure_event = 7; } // Timestamp for event. @@ -57,3 +60,10 @@ message HealthCheckAddHealthy { // is bypassed and the host is immediately added. bool first_check = 1; } + +message HealthCheckFailure { + // The type of failure that caused this event. + HealthCheckFailureType failure_type = 1 [(validate.rules).enum.defined_only = true]; + // Whether this event is the result of the first ever health check on a host. + bool first_check = 2; +}