From 2f2175c207fab465db52184eb9a9c492026ac471 Mon Sep 17 00:00:00 2001 From: Sree Kuchibhotla Date: Tue, 3 Oct 2017 14:25:08 -0700 Subject: [PATCH 1/2] Counter for number of failed trylocks in cq --- src/core/lib/debug/stats_data.c | 3 +++ src/core/lib/debug/stats_data.h | 4 ++++ src/core/lib/debug/stats_data.yaml | 4 ++++ src/core/lib/debug/stats_data_bq_schema.sql | 3 ++- src/core/lib/surface/completion_queue.c | 4 ++++ tools/run_tests/performance/massage_qps_stats.py | 1 + .../run_tests/performance/scenario_result_schema.json | 10 ++++++++++ 7 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/core/lib/debug/stats_data.c b/src/core/lib/debug/stats_data.c index c0aec63c1d3..41d9e95e294 100644 --- a/src/core/lib/debug/stats_data.c +++ b/src/core/lib/debug/stats_data.c @@ -111,6 +111,7 @@ const char *grpc_stats_counter_name[GRPC_STATS_COUNTER_COUNT] = { "executor_push_retries", "server_requested_calls", "server_slowpath_requests_queued", + "cq_failed_queue_trylocks", }; const char *grpc_stats_counter_doc[GRPC_STATS_COUNTER_COUNT] = { "Number of client side calls created by this process", @@ -220,6 +221,8 @@ const char *grpc_stats_counter_doc[GRPC_STATS_COUNTER_COUNT] = { "How many calls were requested (not necessarily received) by the server", "How many times was the server slow path taken (indicates too few " "outstanding requests)", + "Number of lock (trylock) acquisition failures on completion queue event " + "queue. High value here indicates high contention on completion queues", }; const char *grpc_stats_histogram_name[GRPC_STATS_HISTOGRAM_COUNT] = { "call_initial_size", diff --git a/src/core/lib/debug/stats_data.h b/src/core/lib/debug/stats_data.h index 28dab00117b..0a2b323bdfa 100644 --- a/src/core/lib/debug/stats_data.h +++ b/src/core/lib/debug/stats_data.h @@ -113,6 +113,7 @@ typedef enum { GRPC_STATS_COUNTER_EXECUTOR_PUSH_RETRIES, GRPC_STATS_COUNTER_SERVER_REQUESTED_CALLS, GRPC_STATS_COUNTER_SERVER_SLOWPATH_REQUESTS_QUEUED, + GRPC_STATS_COUNTER_CQ_FAILED_QUEUE_TRYLOCKS, GRPC_STATS_COUNTER_COUNT } grpc_stats_counters; extern const char *grpc_stats_counter_name[GRPC_STATS_COUNTER_COUNT]; @@ -417,6 +418,9 @@ typedef enum { #define GRPC_STATS_INC_SERVER_SLOWPATH_REQUESTS_QUEUED(exec_ctx) \ GRPC_STATS_INC_COUNTER((exec_ctx), \ GRPC_STATS_COUNTER_SERVER_SLOWPATH_REQUESTS_QUEUED) +#define GRPC_STATS_INC_CQ_FAILED_QUEUE_TRYLOCKS(exec_ctx) \ + GRPC_STATS_INC_COUNTER((exec_ctx), \ + GRPC_STATS_COUNTER_CQ_FAILED_QUEUE_TRYLOCKS) #define GRPC_STATS_INC_CALL_INITIAL_SIZE(exec_ctx, value) \ grpc_stats_inc_call_initial_size((exec_ctx), (int)(value)) void grpc_stats_inc_call_initial_size(grpc_exec_ctx *exec_ctx, int x); diff --git a/src/core/lib/debug/stats_data.yaml b/src/core/lib/debug/stats_data.yaml index b5c15ff55c0..5efe3532000 100644 --- a/src/core/lib/debug/stats_data.yaml +++ b/src/core/lib/debug/stats_data.yaml @@ -270,3 +270,7 @@ - counter: server_slowpath_requests_queued doc: How many times was the server slow path taken (indicates too few outstanding requests) +# cq +- counter: cq_failed_queue_trylocks + doc: Number of lock (trylock) acquisition failures on completion queue event + queue. High value here indicates high contention on completion queues diff --git a/src/core/lib/debug/stats_data_bq_schema.sql b/src/core/lib/debug/stats_data_bq_schema.sql index f96e40c00ef..6ad58e2530d 100644 --- a/src/core/lib/debug/stats_data_bq_schema.sql +++ b/src/core/lib/debug/stats_data_bq_schema.sql @@ -85,4 +85,5 @@ executor_wakeup_initiated_per_iteration:FLOAT, executor_queue_drained_per_iteration:FLOAT, executor_push_retries_per_iteration:FLOAT, server_requested_calls_per_iteration:FLOAT, -server_slowpath_requests_queued_per_iteration:FLOAT +server_slowpath_requests_queued_per_iteration:FLOAT, +cq_failed_queue_trylocks_per_iteration:FLOAT diff --git a/src/core/lib/surface/completion_queue.c b/src/core/lib/surface/completion_queue.c index fed66e3a209..037b10e8486 100644 --- a/src/core/lib/surface/completion_queue.c +++ b/src/core/lib/surface/completion_queue.c @@ -378,6 +378,10 @@ static grpc_cq_completion *cq_event_queue_pop(grpc_cq_event_queue *q) { if (gpr_spinlock_trylock(&q->queue_lock)) { c = (grpc_cq_completion *)gpr_mpscq_pop(&q->queue); gpr_spinlock_unlock(&q->queue_lock); + } else { + grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT; + GRPC_STATS_INC_CQ_FAILED_QUEUE_TRYLOCKS(&exec_ctx); + grpc_exec_ctx_finish(&exec_ctx); } if (c) { diff --git a/tools/run_tests/performance/massage_qps_stats.py b/tools/run_tests/performance/massage_qps_stats.py index 9b9355308a3..49d165d94f6 100644 --- a/tools/run_tests/performance/massage_qps_stats.py +++ b/tools/run_tests/performance/massage_qps_stats.py @@ -108,6 +108,7 @@ def massage_qps_stats(scenario_result): stats["core_executor_push_retries"] = massage_qps_stats_helpers.counter(core_stats, "executor_push_retries") stats["core_server_requested_calls"] = massage_qps_stats_helpers.counter(core_stats, "server_requested_calls") stats["core_server_slowpath_requests_queued"] = massage_qps_stats_helpers.counter(core_stats, "server_slowpath_requests_queued") + stats["core_cq_failed_queue_trylocks"] = massage_qps_stats_helpers.counter(core_stats, "cq_failed_queue_trylocks") h = massage_qps_stats_helpers.histogram(core_stats, "call_initial_size") stats["core_call_initial_size"] = ",".join("%f" % x for x in h.buckets) stats["core_call_initial_size_bkts"] = ",".join("%f" % x for x in h.boundaries) diff --git a/tools/run_tests/performance/scenario_result_schema.json b/tools/run_tests/performance/scenario_result_schema.json index 2f0fd916d4f..11d8dba5d3f 100644 --- a/tools/run_tests/performance/scenario_result_schema.json +++ b/tools/run_tests/performance/scenario_result_schema.json @@ -550,6 +550,11 @@ "name": "core_server_slowpath_requests_queued", "type": "INTEGER" }, + { + "mode": "NULLABLE", + "name": "core_cq_failed_queue_trylocks", + "type": "INTEGER" + }, { "mode": "NULLABLE", "name": "core_call_initial_size", @@ -1342,6 +1347,11 @@ "name": "core_server_slowpath_requests_queued", "type": "INTEGER" }, + { + "mode": "NULLABLE", + "name": "core_cq_failed_queue_trylocks", + "type": "INTEGER" + }, { "mode": "NULLABLE", "name": "core_call_initial_size", From 0d0fa06488e09bb404511af15523f2e75a2e86c8 Mon Sep 17 00:00:00 2001 From: Sree Kuchibhotla Date: Tue, 3 Oct 2017 16:04:42 -0700 Subject: [PATCH 2/2] Add more counters in cq --- src/core/lib/debug/stats_data.c | 8 ++++++- src/core/lib/debug/stats_data.h | 16 +++++++++---- src/core/lib/debug/stats_data.yaml | 8 ++++++- src/core/lib/debug/stats_data_bq_schema.sql | 4 +++- src/core/lib/surface/completion_queue.c | 17 +++++++++---- .../performance/massage_qps_stats.py | 4 +++- .../performance/scenario_result_schema.json | 24 +++++++++++++++++-- 7 files changed, 67 insertions(+), 14 deletions(-) diff --git a/src/core/lib/debug/stats_data.c b/src/core/lib/debug/stats_data.c index 41d9e95e294..a9ffaecfb9e 100644 --- a/src/core/lib/debug/stats_data.c +++ b/src/core/lib/debug/stats_data.c @@ -111,7 +111,9 @@ const char *grpc_stats_counter_name[GRPC_STATS_COUNTER_COUNT] = { "executor_push_retries", "server_requested_calls", "server_slowpath_requests_queued", - "cq_failed_queue_trylocks", + "cq_ev_queue_trylock_failures", + "cq_ev_queue_trylock_successes", + "cq_ev_queue_transient_pop_failures", }; const char *grpc_stats_counter_doc[GRPC_STATS_COUNTER_COUNT] = { "Number of client side calls created by this process", @@ -223,6 +225,10 @@ const char *grpc_stats_counter_doc[GRPC_STATS_COUNTER_COUNT] = { "outstanding requests)", "Number of lock (trylock) acquisition failures on completion queue event " "queue. High value here indicates high contention on completion queues", + "Number of lock (trylock) acquisition successes on completion queue event " + "queue.", + "Number of times NULL was popped out of completion queue's event queue " + "even though the event queue was not empty", }; const char *grpc_stats_histogram_name[GRPC_STATS_HISTOGRAM_COUNT] = { "call_initial_size", diff --git a/src/core/lib/debug/stats_data.h b/src/core/lib/debug/stats_data.h index 0a2b323bdfa..86696cc438b 100644 --- a/src/core/lib/debug/stats_data.h +++ b/src/core/lib/debug/stats_data.h @@ -113,7 +113,9 @@ typedef enum { GRPC_STATS_COUNTER_EXECUTOR_PUSH_RETRIES, GRPC_STATS_COUNTER_SERVER_REQUESTED_CALLS, GRPC_STATS_COUNTER_SERVER_SLOWPATH_REQUESTS_QUEUED, - GRPC_STATS_COUNTER_CQ_FAILED_QUEUE_TRYLOCKS, + GRPC_STATS_COUNTER_CQ_EV_QUEUE_TRYLOCK_FAILURES, + GRPC_STATS_COUNTER_CQ_EV_QUEUE_TRYLOCK_SUCCESSES, + GRPC_STATS_COUNTER_CQ_EV_QUEUE_TRANSIENT_POP_FAILURES, GRPC_STATS_COUNTER_COUNT } grpc_stats_counters; extern const char *grpc_stats_counter_name[GRPC_STATS_COUNTER_COUNT]; @@ -418,9 +420,15 @@ typedef enum { #define GRPC_STATS_INC_SERVER_SLOWPATH_REQUESTS_QUEUED(exec_ctx) \ GRPC_STATS_INC_COUNTER((exec_ctx), \ GRPC_STATS_COUNTER_SERVER_SLOWPATH_REQUESTS_QUEUED) -#define GRPC_STATS_INC_CQ_FAILED_QUEUE_TRYLOCKS(exec_ctx) \ - GRPC_STATS_INC_COUNTER((exec_ctx), \ - GRPC_STATS_COUNTER_CQ_FAILED_QUEUE_TRYLOCKS) +#define GRPC_STATS_INC_CQ_EV_QUEUE_TRYLOCK_FAILURES(exec_ctx) \ + GRPC_STATS_INC_COUNTER((exec_ctx), \ + GRPC_STATS_COUNTER_CQ_EV_QUEUE_TRYLOCK_FAILURES) +#define GRPC_STATS_INC_CQ_EV_QUEUE_TRYLOCK_SUCCESSES(exec_ctx) \ + GRPC_STATS_INC_COUNTER((exec_ctx), \ + GRPC_STATS_COUNTER_CQ_EV_QUEUE_TRYLOCK_SUCCESSES) +#define GRPC_STATS_INC_CQ_EV_QUEUE_TRANSIENT_POP_FAILURES(exec_ctx) \ + GRPC_STATS_INC_COUNTER( \ + (exec_ctx), GRPC_STATS_COUNTER_CQ_EV_QUEUE_TRANSIENT_POP_FAILURES) #define GRPC_STATS_INC_CALL_INITIAL_SIZE(exec_ctx, value) \ grpc_stats_inc_call_initial_size((exec_ctx), (int)(value)) void grpc_stats_inc_call_initial_size(grpc_exec_ctx *exec_ctx, int x); diff --git a/src/core/lib/debug/stats_data.yaml b/src/core/lib/debug/stats_data.yaml index 5efe3532000..e6602e13968 100644 --- a/src/core/lib/debug/stats_data.yaml +++ b/src/core/lib/debug/stats_data.yaml @@ -271,6 +271,12 @@ doc: How many times was the server slow path taken (indicates too few outstanding requests) # cq -- counter: cq_failed_queue_trylocks +- counter: cq_ev_queue_trylock_failures doc: Number of lock (trylock) acquisition failures on completion queue event queue. High value here indicates high contention on completion queues +- counter: cq_ev_queue_trylock_successes + doc: Number of lock (trylock) acquisition successes on completion queue event + queue. +- counter: cq_ev_queue_transient_pop_failures + doc: Number of times NULL was popped out of completion queue's event queue + even though the event queue was not empty diff --git a/src/core/lib/debug/stats_data_bq_schema.sql b/src/core/lib/debug/stats_data_bq_schema.sql index 6ad58e2530d..0f70a52a6cf 100644 --- a/src/core/lib/debug/stats_data_bq_schema.sql +++ b/src/core/lib/debug/stats_data_bq_schema.sql @@ -86,4 +86,6 @@ executor_queue_drained_per_iteration:FLOAT, executor_push_retries_per_iteration:FLOAT, server_requested_calls_per_iteration:FLOAT, server_slowpath_requests_queued_per_iteration:FLOAT, -cq_failed_queue_trylocks_per_iteration:FLOAT +cq_ev_queue_trylock_failures_per_iteration:FLOAT, +cq_ev_queue_trylock_successes_per_iteration:FLOAT, +cq_ev_queue_transient_pop_failures_per_iteration:FLOAT diff --git a/src/core/lib/surface/completion_queue.c b/src/core/lib/surface/completion_queue.c index 037b10e8486..689b51cfbe5 100644 --- a/src/core/lib/surface/completion_queue.c +++ b/src/core/lib/surface/completion_queue.c @@ -375,15 +375,24 @@ static bool cq_event_queue_push(grpc_cq_event_queue *q, grpc_cq_completion *c) { static grpc_cq_completion *cq_event_queue_pop(grpc_cq_event_queue *q) { grpc_cq_completion *c = NULL; + grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT; + if (gpr_spinlock_trylock(&q->queue_lock)) { - c = (grpc_cq_completion *)gpr_mpscq_pop(&q->queue); + GRPC_STATS_INC_CQ_EV_QUEUE_TRYLOCK_SUCCESSES(&exec_ctx); + + bool is_empty = false; + c = (grpc_cq_completion *)gpr_mpscq_pop_and_check_end(&q->queue, &is_empty); gpr_spinlock_unlock(&q->queue_lock); + + if (c == NULL && !is_empty) { + GRPC_STATS_INC_CQ_EV_QUEUE_TRANSIENT_POP_FAILURES(&exec_ctx); + } } else { - grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT; - GRPC_STATS_INC_CQ_FAILED_QUEUE_TRYLOCKS(&exec_ctx); - grpc_exec_ctx_finish(&exec_ctx); + GRPC_STATS_INC_CQ_EV_QUEUE_TRYLOCK_FAILURES(&exec_ctx); } + grpc_exec_ctx_finish(&exec_ctx); + if (c) { gpr_atm_no_barrier_fetch_add(&q->num_queue_items, -1); } diff --git a/tools/run_tests/performance/massage_qps_stats.py b/tools/run_tests/performance/massage_qps_stats.py index 49d165d94f6..ad8585c27e5 100644 --- a/tools/run_tests/performance/massage_qps_stats.py +++ b/tools/run_tests/performance/massage_qps_stats.py @@ -108,7 +108,9 @@ def massage_qps_stats(scenario_result): stats["core_executor_push_retries"] = massage_qps_stats_helpers.counter(core_stats, "executor_push_retries") stats["core_server_requested_calls"] = massage_qps_stats_helpers.counter(core_stats, "server_requested_calls") stats["core_server_slowpath_requests_queued"] = massage_qps_stats_helpers.counter(core_stats, "server_slowpath_requests_queued") - stats["core_cq_failed_queue_trylocks"] = massage_qps_stats_helpers.counter(core_stats, "cq_failed_queue_trylocks") + stats["core_cq_ev_queue_trylock_failures"] = massage_qps_stats_helpers.counter(core_stats, "cq_ev_queue_trylock_failures") + stats["core_cq_ev_queue_trylock_successes"] = massage_qps_stats_helpers.counter(core_stats, "cq_ev_queue_trylock_successes") + stats["core_cq_ev_queue_transient_pop_failures"] = massage_qps_stats_helpers.counter(core_stats, "cq_ev_queue_transient_pop_failures") h = massage_qps_stats_helpers.histogram(core_stats, "call_initial_size") stats["core_call_initial_size"] = ",".join("%f" % x for x in h.buckets) stats["core_call_initial_size_bkts"] = ",".join("%f" % x for x in h.boundaries) diff --git a/tools/run_tests/performance/scenario_result_schema.json b/tools/run_tests/performance/scenario_result_schema.json index 11d8dba5d3f..099048c2975 100644 --- a/tools/run_tests/performance/scenario_result_schema.json +++ b/tools/run_tests/performance/scenario_result_schema.json @@ -552,7 +552,17 @@ }, { "mode": "NULLABLE", - "name": "core_cq_failed_queue_trylocks", + "name": "core_cq_ev_queue_trylock_failures", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "core_cq_ev_queue_trylock_successes", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "core_cq_ev_queue_transient_pop_failures", "type": "INTEGER" }, { @@ -1349,7 +1359,17 @@ }, { "mode": "NULLABLE", - "name": "core_cq_failed_queue_trylocks", + "name": "core_cq_ev_queue_trylock_failures", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "core_cq_ev_queue_trylock_successes", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "core_cq_ev_queue_transient_pop_failures", "type": "INTEGER" }, {