From fc4ce88e3a5bf16dca25077e694e1d045125fb51 Mon Sep 17 00:00:00 2001 From: "Mark D. Roth" Date: Thu, 1 Sep 2022 08:41:30 -0700 Subject: [PATCH] retry: fix TSAN failure in BatchData dtor (#30811) * retry: fix TSAN failure in BatchData dtor * remove previous work-around * use std::exchange() --- .../filters/client_channel/retry_filter.cc | 42 +++++++++---------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/core/ext/filters/client_channel/retry_filter.cc b/src/core/ext/filters/client_channel/retry_filter.cc index 1ca8e5eec35..abb6f508018 100644 --- a/src/core/ext/filters/client_channel/retry_filter.cc +++ b/src/core/ext/filters/client_channel/retry_filter.cc @@ -350,7 +350,11 @@ class RetryFilter::CallData { // cancel_stream op. static void OnCompleteForCancelOp(void* arg, grpc_error_handle error); - RefCountedPtr call_attempt_; + // This DOES hold a ref, but it cannot be a RefCountedPtr<>, because + // our dtor unrefs the owning call, which may delete the arena in + // which we are allocated, which means that running the dtor of any + // data members after that would cause a crash. + CallAttempt* call_attempt_; // The batch to use in the LB call. // Its payload field points to CallAttempt::batch_payload_. grpc_transport_stream_op_batch batch_; @@ -1329,11 +1333,11 @@ RetryFilter::CallData::CallAttempt::BatchData::BatchData( : RefCounted( GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace) ? "BatchData" : nullptr, refcount), - call_attempt_(std::move(attempt)) { + call_attempt_(attempt.release()) { if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) { gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: creating batch %p", - call_attempt_->calld_->chand_, call_attempt_->calld_, - call_attempt_.get(), this); + call_attempt_->calld_->chand_, call_attempt_->calld_, call_attempt_, + this); } // We hold a ref to the call stack for every batch sent on a call attempt. // This is because some batches on the call attempt may not complete @@ -1353,11 +1357,12 @@ RetryFilter::CallData::CallAttempt::BatchData::BatchData( RetryFilter::CallData::CallAttempt::BatchData::~BatchData() { if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) { gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: destroying batch %p", - call_attempt_->calld_->chand_, call_attempt_->calld_, - call_attempt_.get(), this); + call_attempt_->calld_->chand_, call_attempt_->calld_, call_attempt_, + this); } - GRPC_CALL_STACK_UNREF(call_attempt_->calld_->owning_call_, "Retry BatchData"); - call_attempt_.reset(DEBUG_LOCATION, "~BatchData"); + CallAttempt* call_attempt = std::exchange(call_attempt_, nullptr); + GRPC_CALL_STACK_UNREF(call_attempt->calld_->owning_call_, "Retry BatchData"); + call_attempt->Unref(DEBUG_LOCATION, "~BatchData"); } void RetryFilter::CallData::CallAttempt::BatchData:: @@ -1421,7 +1426,7 @@ void RetryFilter::CallData::CallAttempt::BatchData:: void RetryFilter::CallData::CallAttempt::BatchData::RecvInitialMetadataReady( void* arg, grpc_error_handle error) { RefCountedPtr batch_data(static_cast(arg)); - CallAttempt* call_attempt = batch_data->call_attempt_.get(); + CallAttempt* call_attempt = batch_data->call_attempt_; CallData* calld = call_attempt->calld_; if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) { gpr_log(GPR_INFO, @@ -1523,7 +1528,7 @@ void RetryFilter::CallData::CallAttempt::BatchData:: void RetryFilter::CallData::CallAttempt::BatchData::RecvMessageReady( void* arg, grpc_error_handle error) { RefCountedPtr batch_data(static_cast(arg)); - CallAttempt* call_attempt = batch_data->call_attempt_.get(); + CallAttempt* call_attempt = batch_data->call_attempt_; CallData* calld = call_attempt->calld_; if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) { gpr_log(GPR_INFO, @@ -1721,15 +1726,8 @@ void RetryFilter::CallData::CallAttempt::BatchData::RunClosuresForCompletedCall( void RetryFilter::CallData::CallAttempt::BatchData::RecvTrailingMetadataReady( void* arg, grpc_error_handle error) { - // Add a ref to the outer call stack. This works around a TSAN reported - // failure that we do not understand, but since we expect the lifetime of - // this code to be relatively short we think it's OK to work around for now. - // TSAN failure: - // https://source.cloud.google.com/results/invocations/5b122974-4977-4862-beb1-dc1af9fbbd1d/targets/%2F%2Ftest%2Fcore%2Fend2end:h2_census_test@max_concurrent_streams@poller%3Dpoll/log - RefCountedPtr owning_call_stack = - static_cast(arg)->call_attempt_->calld_->owning_call_->Ref(); RefCountedPtr batch_data(static_cast(arg)); - CallAttempt* call_attempt = batch_data->call_attempt_.get(); + CallAttempt* call_attempt = batch_data->call_attempt_; CallData* calld = call_attempt->calld_; if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) { gpr_log(GPR_INFO, @@ -1885,7 +1883,7 @@ void RetryFilter::CallData::CallAttempt::BatchData:: gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: starting next batch for pending " "send op(s)", - calld->chand_, calld, call_attempt_.get()); + calld->chand_, calld, call_attempt_); } call_attempt_->AddRetriableBatches(closures); } @@ -1894,7 +1892,7 @@ void RetryFilter::CallData::CallAttempt::BatchData:: void RetryFilter::CallData::CallAttempt::BatchData::OnComplete( void* arg, grpc_error_handle error) { RefCountedPtr batch_data(static_cast(arg)); - CallAttempt* call_attempt = batch_data->call_attempt_.get(); + CallAttempt* call_attempt = batch_data->call_attempt_; CallData* calld = call_attempt->calld_; if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) { gpr_log(GPR_INFO, @@ -1970,7 +1968,7 @@ void RetryFilter::CallData::CallAttempt::BatchData::OnComplete( void RetryFilter::CallData::CallAttempt::BatchData::OnCompleteForCancelOp( void* arg, grpc_error_handle error) { RefCountedPtr batch_data(static_cast(arg)); - CallAttempt* call_attempt = batch_data->call_attempt_.get(); + CallAttempt* call_attempt = batch_data->call_attempt_; CallData* calld = call_attempt->calld_; if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) { gpr_log(GPR_INFO, @@ -2021,7 +2019,7 @@ void RetryFilter::CallData::CallAttempt::BatchData:: GPR_INFO, "chand=%p calld=%p attempt=%p: starting calld->send_messages[%" PRIuPTR "]", - calld->chand_, calld, call_attempt_.get(), + calld->chand_, calld, call_attempt_, call_attempt_->started_send_message_count_); } CachedSendMessage cache =