From 2dc32eabcbe4f5c4ef502d3b8d162ec57cfe46ae Mon Sep 17 00:00:00 2001 From: Craig Tiller Date: Tue, 31 Jan 2017 15:32:34 -0800 Subject: [PATCH 1/2] Fix race detected by TSAN --- src/core/lib/surface/call.c | 81 +++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 44 deletions(-) diff --git a/src/core/lib/surface/call.c b/src/core/lib/surface/call.c index 5519a1ec7e2..e9c623c2b10 100644 --- a/src/core/lib/surface/call.c +++ b/src/core/lib/surface/call.c @@ -86,8 +86,11 @@ typedef enum { /* Status came from 'the wire' - or somewhere below the surface layer */ STATUS_FROM_WIRE, - /* Status was created by some internal channel stack operation */ + /* Status was created by some internal channel stack operation: must come via + add_batch_error */ STATUS_FROM_CORE, + /* Status was created by some surface error */ + STATUS_FROM_SURFACE, /* Status came from the server sending status */ STATUS_FROM_SERVER_STATUS, STATUS_SOURCE_COUNT @@ -221,11 +224,11 @@ struct grpc_call { static void execute_op(grpc_exec_ctx *exec_ctx, grpc_call *call, grpc_transport_stream_op *op); -static grpc_call_error cancel_with_status(grpc_exec_ctx *exec_ctx, grpc_call *c, - grpc_status_code status, - const char *description); +static void cancel_with_status(grpc_exec_ctx *exec_ctx, grpc_call *c, + status_source source, grpc_status_code status, + const char *description); static void cancel_with_error(grpc_exec_ctx *exec_ctx, grpc_call *c, - grpc_error *error); + status_source source, grpc_error *error); static void destroy_call(grpc_exec_ctx *exec_ctx, void *call_stack, grpc_error *error); static void receiving_slice_ready(grpc_exec_ctx *exec_ctx, void *bctlp, @@ -337,7 +340,7 @@ grpc_error *grpc_call_create(grpc_exec_ctx *exec_ctx, args->server_transport_data, path, call->start_time, send_deadline, CALL_STACK_FROM_CALL(call)); if (error != GRPC_ERROR_NONE) { - cancel_with_error(exec_ctx, call, GRPC_ERROR_REF(error)); + cancel_with_error(exec_ctx, call, STATUS_FROM_SURFACE, GRPC_ERROR_REF(error)); } if (args->cq != NULL) { GPR_ASSERT( @@ -512,7 +515,6 @@ grpc_call_error grpc_call_cancel_with_status(grpc_call *c, grpc_status_code status, const char *description, void *reserved) { - grpc_call_error r; grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT; GRPC_API_TRACE( "grpc_call_cancel_with_status(" @@ -520,16 +522,16 @@ grpc_call_error grpc_call_cancel_with_status(grpc_call *c, 4, (c, (int)status, description, reserved)); GPR_ASSERT(reserved == NULL); gpr_mu_lock(&c->mu); - r = cancel_with_status(&exec_ctx, c, status, description); + cancel_with_status(&exec_ctx, c, STATUS_FROM_API_OVERRIDE, status, + description); gpr_mu_unlock(&c->mu); grpc_exec_ctx_finish(&exec_ctx); - return r; + return GRPC_CALL_OK; } typedef struct termination_closure { grpc_closure closure; grpc_call *call; - grpc_error *error; grpc_transport_stream_op op; } termination_closure; @@ -544,36 +546,27 @@ static void send_termination(grpc_exec_ctx *exec_ctx, void *tcp, grpc_error *error) { termination_closure *tc = tcp; memset(&tc->op, 0, sizeof(tc->op)); - tc->op.cancel_error = tc->error; + tc->op.cancel_error = GRPC_ERROR_REF(error); /* reuse closure to catch completion */ - grpc_closure_init(&tc->closure, done_termination, tc, - grpc_schedule_on_exec_ctx); - tc->op.on_complete = &tc->closure; + tc->op.on_complete = grpc_closure_init(&tc->closure, done_termination, tc, + grpc_schedule_on_exec_ctx); execute_op(exec_ctx, tc->call, &tc->op); } -static grpc_call_error terminate_with_status(grpc_exec_ctx *exec_ctx, - termination_closure *tc) { - set_status_from_error(exec_ctx, tc->call, STATUS_FROM_API_OVERRIDE, - GRPC_ERROR_REF(tc->error)); - grpc_closure_init(&tc->closure, send_termination, tc, - grpc_schedule_on_exec_ctx); - GRPC_CALL_INTERNAL_REF(tc->call, "termination"); - grpc_closure_sched(exec_ctx, &tc->closure, GRPC_ERROR_NONE); - return GRPC_CALL_OK; -} - -static grpc_call_error terminate_with_error(grpc_exec_ctx *exec_ctx, - grpc_call *c, grpc_error *error) { +static void terminate_with_error(grpc_exec_ctx *exec_ctx, grpc_call *c, + grpc_error *error) { termination_closure *tc = gpr_malloc(sizeof(*tc)); memset(tc, 0, sizeof(*tc)); tc->call = c; - tc->error = error; - return terminate_with_status(exec_ctx, tc); + GRPC_CALL_INTERNAL_REF(tc->call, "termination"); + grpc_closure_sched(exec_ctx, grpc_closure_init(&tc->closure, send_termination, + tc, grpc_schedule_on_exec_ctx), + error); } static void cancel_with_error(grpc_exec_ctx *exec_ctx, grpc_call *c, - grpc_error *error) { + status_source source, grpc_error *error) { + set_status_from_error(exec_ctx, c, source, GRPC_ERROR_REF(error)); terminate_with_error(exec_ctx, c, error); } @@ -585,11 +578,11 @@ static grpc_error *error_from_status(grpc_status_code status, GRPC_ERROR_INT_GRPC_STATUS, status); } -static grpc_call_error cancel_with_status(grpc_exec_ctx *exec_ctx, grpc_call *c, - grpc_status_code status, - const char *description) { - return terminate_with_error(exec_ctx, c, - error_from_status(status, description)); +static void cancel_with_status(grpc_exec_ctx *exec_ctx, grpc_call *c, + status_source source, grpc_status_code status, + const char *description) { + cancel_with_error(exec_ctx, c, source, + error_from_status(status, description)); } /******************************************************************************* @@ -1014,11 +1007,6 @@ static void post_batch_completion(grpc_exec_ctx *exec_ctx, gpr_mu_lock(&call->mu); - if (error != GRPC_ERROR_NONE) { - set_status_from_error(exec_ctx, call, STATUS_FROM_CORE, - GRPC_ERROR_REF(error)); - } - if (bctl->send_initial_metadata) { grpc_metadata_batch_destroy( exec_ctx, @@ -1161,7 +1149,7 @@ static void receiving_stream_ready(grpc_exec_ctx *exec_ctx, void *bctlp, grpc_call *call = bctl->call; gpr_mu_lock(&bctl->call->mu); if (error != GRPC_ERROR_NONE) { - cancel_with_error(exec_ctx, call, GRPC_ERROR_REF(error)); + cancel_with_error(exec_ctx, call, STATUS_FROM_SURFACE, GRPC_ERROR_REF(error)); } if (call->has_initial_md_been_received || error != GRPC_ERROR_NONE || call->receiving_stream == NULL) { @@ -1188,7 +1176,8 @@ static void validate_filtered_metadata(grpc_exec_ctx *exec_ctx, gpr_asprintf(&error_msg, "Invalid compression algorithm value '%d'.", algo); gpr_log(GPR_ERROR, "%s", error_msg); - cancel_with_status(exec_ctx, call, GRPC_STATUS_UNIMPLEMENTED, error_msg); + cancel_with_status(exec_ctx, call, STATUS_FROM_SURFACE, + GRPC_STATUS_UNIMPLEMENTED, error_msg); } else if (grpc_compression_options_is_algorithm_enabled( &compression_options, algo) == 0) { /* check if algorithm is supported by current channel config */ @@ -1197,7 +1186,8 @@ static void validate_filtered_metadata(grpc_exec_ctx *exec_ctx, gpr_asprintf(&error_msg, "Compression algorithm '%s' is disabled.", algo_name); gpr_log(GPR_ERROR, "%s", error_msg); - cancel_with_status(exec_ctx, call, GRPC_STATUS_UNIMPLEMENTED, error_msg); + cancel_with_status(exec_ctx, call, STATUS_FROM_SURFACE, + GRPC_STATUS_UNIMPLEMENTED, error_msg); } else { call->incoming_compression_algorithm = algo; } @@ -1227,7 +1217,10 @@ static void add_batch_error(grpc_exec_ctx *exec_ctx, batch_control *bctl, grpc_error *error) { if (error == GRPC_ERROR_NONE) return; int idx = (int)gpr_atm_no_barrier_fetch_add(&bctl->num_errors, 1); - if (idx == 0) cancel_with_error(exec_ctx, bctl->call, GRPC_ERROR_REF(error)); + if (idx == 0) { + cancel_with_error(exec_ctx, bctl->call, STATUS_FROM_CORE, + GRPC_ERROR_REF(error)); + } bctl->errors[idx] = error; } From 58b30cd4a1bf641f5b382879a9d6147e5d265c48 Mon Sep 17 00:00:00 2001 From: Craig Tiller Date: Tue, 31 Jan 2017 17:07:36 -0800 Subject: [PATCH 2/2] Refine error selection rules --- doc/environment_variables.md | 5 +-- src/core/lib/surface/call.c | 64 ++++++++++++++++++++++++------------ src/core/lib/surface/call.h | 2 ++ src/core/lib/surface/init.c | 1 + 4 files changed, 49 insertions(+), 23 deletions(-) diff --git a/doc/environment_variables.md b/doc/environment_variables.md index 832762a5f52..2c83972aa1b 100644 --- a/doc/environment_variables.md +++ b/doc/environment_variables.md @@ -35,6 +35,7 @@ some configuration as environment variables that can be set. A comma separated list of tracers that provide additional insight into how gRPC C core is processing requests via debug logs. Available tracers include: - api - traces api calls to the C core + - call_error - traces the possible errors contributing to final call status - channel - traces operations on the C core channel stack - combiner - traces combiner lock state - compression - traces compression operations @@ -55,10 +56,10 @@ some configuration as environment variables that can be set. - secure_endpoint - traces bytes flowing through encrypted channels - transport_security - traces metadata about secure channel establishment - tcp - traces bytes in and out of a channel - + 'all' can additionally be used to turn all traces on. Individual traces can be disabled by prefixing them with '-'. - + Example: export GRPC_TRACE=all,-pending_tags diff --git a/src/core/lib/surface/call.c b/src/core/lib/surface/call.c index e9c623c2b10..11f122f21d8 100644 --- a/src/core/lib/surface/call.c +++ b/src/core/lib/surface/call.c @@ -215,6 +215,8 @@ struct grpc_call { void *saved_receiving_stream_ready_bctlp; }; +int grpc_call_error_trace = 0; + #define CALL_STACK_FROM_CALL(call) ((grpc_call_stack *)((call) + 1)) #define CALL_FROM_CALL_STACK(call_stack) (((grpc_call *)(call_stack)) - 1) #define CALL_ELEM_FROM_CALL(call, idx) \ @@ -340,7 +342,8 @@ grpc_error *grpc_call_create(grpc_exec_ctx *exec_ctx, args->server_transport_data, path, call->start_time, send_deadline, CALL_STACK_FROM_CALL(call)); if (error != GRPC_ERROR_NONE) { - cancel_with_error(exec_ctx, call, STATUS_FROM_SURFACE, GRPC_ERROR_REF(error)); + cancel_with_error(exec_ctx, call, STATUS_FROM_SURFACE, + GRPC_ERROR_REF(error)); } if (args->cq != NULL) { GPR_ASSERT( @@ -589,21 +592,24 @@ static void cancel_with_status(grpc_exec_ctx *exec_ctx, grpc_call *c, * FINAL STATUS CODE MANIPULATION */ -static void get_final_status_from(grpc_call *call, status_source from_source, - void (*set_value)(grpc_status_code code, - void *user_data), - void *set_value_user_data, - grpc_slice *details) { +static bool get_final_status_from( + grpc_call *call, status_source from_source, bool allow_ok_status, + void (*set_value)(grpc_status_code code, void *user_data), + void *set_value_user_data, grpc_slice *details) { grpc_status_code code; const char *msg = NULL; grpc_error_get_status(call->status[from_source].error, call->send_deadline, &code, &msg, NULL); + if (code == GRPC_STATUS_OK && !allow_ok_status) { + return false; + } set_value(code, set_value_user_data); if (details != NULL) { *details = msg == NULL ? grpc_empty_slice() : grpc_slice_from_copied_string(msg); } + return true; } static void get_final_status(grpc_call *call, @@ -611,22 +617,37 @@ static void get_final_status(grpc_call *call, void *user_data), void *set_value_user_data, grpc_slice *details) { int i; - /* search for the best status we can present: ideally the error we use has a - clearly defined grpc-status, and we'll prefer that. */ - for (i = 0; i < STATUS_SOURCE_COUNT; i++) { - if (call->status[i].is_set && - grpc_error_has_clear_grpc_status(call->status[i].error)) { - get_final_status_from(call, (status_source)i, set_value, - set_value_user_data, details); - return; + if (grpc_call_error_trace) { + gpr_log(GPR_DEBUG, "get_final_status %s", call->is_client ? "CLI" : "SVR"); + for (i = 0; i < STATUS_SOURCE_COUNT; i++) { + if (call->status[i].is_set) { + gpr_log(GPR_DEBUG, " %d: %s", i, + grpc_error_string(call->status[i].error)); + } } } - /* If no clearly defined status exists, search for 'anything' */ - for (i = 0; i < STATUS_SOURCE_COUNT; i++) { - if (call->status[i].is_set) { - get_final_status_from(call, (status_source)i, set_value, - set_value_user_data, details); - return; + /* first search through ignoring "OK" statuses: if something went wrong, + * ensure we report it */ + for (int allow_ok_status = 0; allow_ok_status < 2; allow_ok_status++) { + /* search for the best status we can present: ideally the error we use has a + clearly defined grpc-status, and we'll prefer that. */ + for (i = 0; i < STATUS_SOURCE_COUNT; i++) { + if (call->status[i].is_set && + grpc_error_has_clear_grpc_status(call->status[i].error)) { + if (get_final_status_from(call, (status_source)i, allow_ok_status != 0, + set_value, set_value_user_data, details)) { + return; + } + } + } + /* If no clearly defined status exists, search for 'anything' */ + for (i = 0; i < STATUS_SOURCE_COUNT; i++) { + if (call->status[i].is_set) { + if (get_final_status_from(call, (status_source)i, allow_ok_status != 0, + set_value, set_value_user_data, details)) { + return; + } + } } } /* If nothing exists, set some default */ @@ -1149,7 +1170,8 @@ static void receiving_stream_ready(grpc_exec_ctx *exec_ctx, void *bctlp, grpc_call *call = bctl->call; gpr_mu_lock(&bctl->call->mu); if (error != GRPC_ERROR_NONE) { - cancel_with_error(exec_ctx, call, STATUS_FROM_SURFACE, GRPC_ERROR_REF(error)); + cancel_with_error(exec_ctx, call, STATUS_FROM_SURFACE, + GRPC_ERROR_REF(error)); } if (call->has_initial_md_been_received || error != GRPC_ERROR_NONE || call->receiving_stream == NULL) { diff --git a/src/core/lib/surface/call.h b/src/core/lib/surface/call.h index 8c46a83d427..b70343ddf18 100644 --- a/src/core/lib/surface/call.h +++ b/src/core/lib/surface/call.h @@ -125,6 +125,8 @@ uint8_t grpc_call_is_client(grpc_call *call); grpc_compression_algorithm grpc_call_compression_for_level( grpc_call *call, grpc_compression_level level); +extern int grpc_call_error_trace; + #ifdef __cplusplus } #endif diff --git a/src/core/lib/surface/init.c b/src/core/lib/surface/init.c index cfa1882775f..787e4d0dd25 100644 --- a/src/core/lib/surface/init.c +++ b/src/core/lib/surface/init.c @@ -199,6 +199,7 @@ void grpc_init(void) { grpc_cq_event_timeout_trace = 1; grpc_register_tracer("op_failure", &grpc_trace_operation_failures); grpc_register_tracer("resource_quota", &grpc_resource_quota_trace); + grpc_register_tracer("call_error", &grpc_call_error_trace); #ifndef NDEBUG grpc_register_tracer("pending_tags", &grpc_trace_pending_tags); #endif