retry: fix TSAN failure in BatchData dtor (#30811)

* retry: fix TSAN failure in BatchData dtor

* remove previous work-around

* use std::exchange()
pull/30819/head
Mark D. Roth 2 years ago committed by GitHub
parent 09a5d2979e
commit fc4ce88e3a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 42
      src/core/ext/filters/client_channel/retry_filter.cc

@ -350,7 +350,11 @@ class RetryFilter::CallData {
// cancel_stream op.
static void OnCompleteForCancelOp(void* arg, grpc_error_handle error);
RefCountedPtr<CallAttempt> call_attempt_;
// This DOES hold a ref, but it cannot be a RefCountedPtr<>, because
// our dtor unrefs the owning call, which may delete the arena in
// which we are allocated, which means that running the dtor of any
// data members after that would cause a crash.
CallAttempt* call_attempt_;
// The batch to use in the LB call.
// Its payload field points to CallAttempt::batch_payload_.
grpc_transport_stream_op_batch batch_;
@ -1329,11 +1333,11 @@ RetryFilter::CallData::CallAttempt::BatchData::BatchData(
: RefCounted(
GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace) ? "BatchData" : nullptr,
refcount),
call_attempt_(std::move(attempt)) {
call_attempt_(attempt.release()) {
if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: creating batch %p",
call_attempt_->calld_->chand_, call_attempt_->calld_,
call_attempt_.get(), this);
call_attempt_->calld_->chand_, call_attempt_->calld_, call_attempt_,
this);
}
// We hold a ref to the call stack for every batch sent on a call attempt.
// This is because some batches on the call attempt may not complete
@ -1353,11 +1357,12 @@ RetryFilter::CallData::CallAttempt::BatchData::BatchData(
RetryFilter::CallData::CallAttempt::BatchData::~BatchData() {
if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: destroying batch %p",
call_attempt_->calld_->chand_, call_attempt_->calld_,
call_attempt_.get(), this);
call_attempt_->calld_->chand_, call_attempt_->calld_, call_attempt_,
this);
}
GRPC_CALL_STACK_UNREF(call_attempt_->calld_->owning_call_, "Retry BatchData");
call_attempt_.reset(DEBUG_LOCATION, "~BatchData");
CallAttempt* call_attempt = std::exchange(call_attempt_, nullptr);
GRPC_CALL_STACK_UNREF(call_attempt->calld_->owning_call_, "Retry BatchData");
call_attempt->Unref(DEBUG_LOCATION, "~BatchData");
}
void RetryFilter::CallData::CallAttempt::BatchData::
@ -1421,7 +1426,7 @@ void RetryFilter::CallData::CallAttempt::BatchData::
void RetryFilter::CallData::CallAttempt::BatchData::RecvInitialMetadataReady(
void* arg, grpc_error_handle error) {
RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
CallAttempt* call_attempt = batch_data->call_attempt_.get();
CallAttempt* call_attempt = batch_data->call_attempt_;
CallData* calld = call_attempt->calld_;
if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
gpr_log(GPR_INFO,
@ -1523,7 +1528,7 @@ void RetryFilter::CallData::CallAttempt::BatchData::
void RetryFilter::CallData::CallAttempt::BatchData::RecvMessageReady(
void* arg, grpc_error_handle error) {
RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
CallAttempt* call_attempt = batch_data->call_attempt_.get();
CallAttempt* call_attempt = batch_data->call_attempt_;
CallData* calld = call_attempt->calld_;
if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
gpr_log(GPR_INFO,
@ -1721,15 +1726,8 @@ void RetryFilter::CallData::CallAttempt::BatchData::RunClosuresForCompletedCall(
void RetryFilter::CallData::CallAttempt::BatchData::RecvTrailingMetadataReady(
void* arg, grpc_error_handle error) {
// Add a ref to the outer call stack. This works around a TSAN reported
// failure that we do not understand, but since we expect the lifetime of
// this code to be relatively short we think it's OK to work around for now.
// TSAN failure:
// https://source.cloud.google.com/results/invocations/5b122974-4977-4862-beb1-dc1af9fbbd1d/targets/%2F%2Ftest%2Fcore%2Fend2end:h2_census_test@max_concurrent_streams@poller%3Dpoll/log
RefCountedPtr<grpc_call_stack> owning_call_stack =
static_cast<BatchData*>(arg)->call_attempt_->calld_->owning_call_->Ref();
RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
CallAttempt* call_attempt = batch_data->call_attempt_.get();
CallAttempt* call_attempt = batch_data->call_attempt_;
CallData* calld = call_attempt->calld_;
if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
gpr_log(GPR_INFO,
@ -1885,7 +1883,7 @@ void RetryFilter::CallData::CallAttempt::BatchData::
gpr_log(GPR_INFO,
"chand=%p calld=%p attempt=%p: starting next batch for pending "
"send op(s)",
calld->chand_, calld, call_attempt_.get());
calld->chand_, calld, call_attempt_);
}
call_attempt_->AddRetriableBatches(closures);
}
@ -1894,7 +1892,7 @@ void RetryFilter::CallData::CallAttempt::BatchData::
void RetryFilter::CallData::CallAttempt::BatchData::OnComplete(
void* arg, grpc_error_handle error) {
RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
CallAttempt* call_attempt = batch_data->call_attempt_.get();
CallAttempt* call_attempt = batch_data->call_attempt_;
CallData* calld = call_attempt->calld_;
if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
gpr_log(GPR_INFO,
@ -1970,7 +1968,7 @@ void RetryFilter::CallData::CallAttempt::BatchData::OnComplete(
void RetryFilter::CallData::CallAttempt::BatchData::OnCompleteForCancelOp(
void* arg, grpc_error_handle error) {
RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
CallAttempt* call_attempt = batch_data->call_attempt_.get();
CallAttempt* call_attempt = batch_data->call_attempt_;
CallData* calld = call_attempt->calld_;
if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
gpr_log(GPR_INFO,
@ -2021,7 +2019,7 @@ void RetryFilter::CallData::CallAttempt::BatchData::
GPR_INFO,
"chand=%p calld=%p attempt=%p: starting calld->send_messages[%" PRIuPTR
"]",
calld->chand_, calld, call_attempt_.get(),
calld->chand_, calld, call_attempt_,
call_attempt_->started_send_message_count_);
}
CachedSendMessage cache =

Loading…
Cancel
Save