From 104aef6ddf5b2f9636661f1d2d1c0b976d9e679b Mon Sep 17 00:00:00 2001 From: "Mark D. Roth" Date: Thu, 28 May 2020 10:55:18 -0700 Subject: [PATCH] Fix race condition in health checking client. --- .../health/health_check_client.cc | 29 +++++-------------- .../health/health_check_client.h | 6 ++-- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/src/core/ext/filters/client_channel/health/health_check_client.cc b/src/core/ext/filters/client_channel/health/health_check_client.cc index 5c4aa986e80..a17d49953bf 100644 --- a/src/core/ext/filters/client_channel/health/health_check_client.cc +++ b/src/core/ext/filters/client_channel/health/health_check_client.cc @@ -125,8 +125,7 @@ void HealthCheckClient::StartCallLocked() { call_state_->StartCall(); } -void HealthCheckClient::StartRetryTimer() { - MutexLock lock(&mu_); +void HealthCheckClient::StartRetryTimerLocked() { SetHealthStatusLocked(GRPC_CHANNEL_TRANSIENT_FAILURE, "health check call failed; will retry after backoff"); grpc_millis next_try = retry_backoff_.NextAttemptTime(); @@ -297,14 +296,7 @@ void HealthCheckClient::CallState::StartCall() { "checking call on subchannel (%s); will retry", health_check_client_.get(), this, grpc_error_string(error)); GRPC_ERROR_UNREF(error); - // Schedule instead of running directly, since we must not be - // holding health_check_client_->mu_ when CallEnded() is called. - call_->Ref(DEBUG_LOCATION, "call_end_closure").release(); - ExecCtx::Run( - DEBUG_LOCATION, - GRPC_CLOSURE_INIT(&batch_.handler_private.closure, CallEndedRetry, this, - grpc_schedule_on_exec_ctx), - GRPC_ERROR_NONE); + CallEndedLocked(/*retry=*/true); return; } // Initialize payload and batch. @@ -582,18 +574,11 @@ void HealthCheckClient::CallState::RecvTrailingMetadataReady( kErrorMessage); retry = false; } - self->CallEnded(retry); -} - -void HealthCheckClient::CallState::CallEndedRetry(void* arg, - grpc_error* /*error*/) { - HealthCheckClient::CallState* self = - static_cast(arg); - self->CallEnded(true /* retry */); - self->call_->Unref(DEBUG_LOCATION, "call_end_closure"); + MutexLock lock(&self->health_check_client_->mu_); + self->CallEndedLocked(retry); } -void HealthCheckClient::CallState::CallEnded(bool retry) { +void HealthCheckClient::CallState::CallEndedLocked(bool retry) { // If this CallState is still in use, this call ended because of a failure, // so we need to stop using it and optionally create a new one. // Otherwise, we have deliberately ended this call, and no further action @@ -606,10 +591,10 @@ void HealthCheckClient::CallState::CallEnded(bool retry) { // If the call fails after we've gotten a successful response, reset // the backoff and restart the call immediately. health_check_client_->retry_backoff_.Reset(); - health_check_client_->StartCall(); + health_check_client_->StartCallLocked(); } else { // If the call failed without receiving any messages, retry later. - health_check_client_->StartRetryTimer(); + health_check_client_->StartRetryTimerLocked(); } } } diff --git a/src/core/ext/filters/client_channel/health/health_check_client.h b/src/core/ext/filters/client_channel/health/health_check_client.h index f8b9ade5ab6..e9555b5f955 100644 --- a/src/core/ext/filters/client_channel/health/health_check_client.h +++ b/src/core/ext/filters/client_channel/health/health_check_client.h @@ -72,8 +72,8 @@ class HealthCheckClient : public InternallyRefCounted { void StartBatch(grpc_transport_stream_op_batch* batch); static void StartBatchInCallCombiner(void* arg, grpc_error* error); - static void CallEndedRetry(void* arg, grpc_error* error); - void CallEnded(bool retry); + // Requires holding health_check_client_->mu_. + void CallEndedLocked(bool retry); static void OnComplete(void* arg, grpc_error* error); static void RecvInitialMetadataReady(void* arg, grpc_error* error); @@ -143,7 +143,7 @@ class HealthCheckClient : public InternallyRefCounted { void StartCall(); void StartCallLocked(); // Requires holding mu_. - void StartRetryTimer(); + void StartRetryTimerLocked(); // Requires holding mu_. static void OnRetryTimer(void* arg, grpc_error* error); void SetHealthStatus(grpc_connectivity_state state, const char* reason);