Add stats to tcp_posix.cc to log commonly seen I/O errnos.

PiperOrigin-RevId: 610799993
pull/36004/head
Vignesh Babu 12 months ago committed by Copybara-Service
parent 9054988785
commit 338fc05c21
  1. 67
      src/core/lib/debug/stats_data.cc
  2. 68
      src/core/lib/debug/stats_data.h
  3. 22
      src/core/lib/debug/stats_data.yaml
  4. 47
      src/core/lib/iomgr/tcp_posix.cc

@ -116,6 +116,17 @@ const absl::string_view
"wrr_updates",
"work_serializer_items_enqueued",
"work_serializer_items_dequeued",
"econnaborted_count",
"econnreset_count",
"epipe_count",
"etimedout_count",
"econnrefused_count",
"enetunreach_count",
"enomsg_count",
"enotconn_count",
"enobufs_count",
"uncommon_io_error_count",
"msg_errqueue_error_count",
};
const absl::string_view GlobalStats::counter_doc[static_cast<int>(
Counter::COUNT)] = {
@ -146,6 +157,17 @@ const absl::string_view GlobalStats::counter_doc[static_cast<int>(
"Number of wrr updates that have been received",
"Number of items enqueued onto work serializers",
"Number of items dequeued from work serializers",
"Number of ECONNABORTED errors",
"Number of ECONNRESET errors",
"Number of EPIPE errors",
"Number of ETIMEDOUT errors",
"Number of ECONNREFUSED errors",
"Number of ENETUNREACH errors",
"Number of ENOMSG errors",
"Number of ENOTCONN errors",
"Number of ENOBUFS errors",
"Number of uncommon io errors",
"Number of uncommon errors returned by MSG_ERRQUEUE",
};
const absl::string_view
GlobalStats::histogram_name[static_cast<int>(Histogram::COUNT)] = {
@ -336,7 +358,18 @@ GlobalStats::GlobalStats()
cq_callback_creates{0},
wrr_updates{0},
work_serializer_items_enqueued{0},
work_serializer_items_dequeued{0} {}
work_serializer_items_dequeued{0},
econnaborted_count{0},
econnreset_count{0},
epipe_count{0},
etimedout_count{0},
econnrefused_count{0},
enetunreach_count{0},
enomsg_count{0},
enotconn_count{0},
enobufs_count{0},
uncommon_io_error_count{0},
msg_errqueue_error_count{0} {}
HistogramView GlobalStats::histogram(Histogram which) const {
switch (which) {
default:
@ -427,6 +460,25 @@ std::unique_ptr<GlobalStats> GlobalStatsCollector::Collect() const {
data.work_serializer_items_enqueued.load(std::memory_order_relaxed);
result->work_serializer_items_dequeued +=
data.work_serializer_items_dequeued.load(std::memory_order_relaxed);
result->econnaborted_count +=
data.econnaborted_count.load(std::memory_order_relaxed);
result->econnreset_count +=
data.econnreset_count.load(std::memory_order_relaxed);
result->epipe_count += data.epipe_count.load(std::memory_order_relaxed);
result->etimedout_count +=
data.etimedout_count.load(std::memory_order_relaxed);
result->econnrefused_count +=
data.econnrefused_count.load(std::memory_order_relaxed);
result->enetunreach_count +=
data.enetunreach_count.load(std::memory_order_relaxed);
result->enomsg_count += data.enomsg_count.load(std::memory_order_relaxed);
result->enotconn_count +=
data.enotconn_count.load(std::memory_order_relaxed);
result->enobufs_count += data.enobufs_count.load(std::memory_order_relaxed);
result->uncommon_io_error_count +=
data.uncommon_io_error_count.load(std::memory_order_relaxed);
result->msg_errqueue_error_count +=
data.msg_errqueue_error_count.load(std::memory_order_relaxed);
data.call_initial_size.Collect(&result->call_initial_size);
data.tcp_write_size.Collect(&result->tcp_write_size);
data.tcp_write_iov_size.Collect(&result->tcp_write_iov_size);
@ -481,6 +533,19 @@ std::unique_ptr<GlobalStats> GlobalStats::Diff(const GlobalStats& other) const {
work_serializer_items_enqueued - other.work_serializer_items_enqueued;
result->work_serializer_items_dequeued =
work_serializer_items_dequeued - other.work_serializer_items_dequeued;
result->econnaborted_count = econnaborted_count - other.econnaborted_count;
result->econnreset_count = econnreset_count - other.econnreset_count;
result->epipe_count = epipe_count - other.epipe_count;
result->etimedout_count = etimedout_count - other.etimedout_count;
result->econnrefused_count = econnrefused_count - other.econnrefused_count;
result->enetunreach_count = enetunreach_count - other.enetunreach_count;
result->enomsg_count = enomsg_count - other.enomsg_count;
result->enotconn_count = enotconn_count - other.enotconn_count;
result->enobufs_count = enobufs_count - other.enobufs_count;
result->uncommon_io_error_count =
uncommon_io_error_count - other.uncommon_io_error_count;
result->msg_errqueue_error_count =
msg_errqueue_error_count - other.msg_errqueue_error_count;
result->call_initial_size = call_initial_size - other.call_initial_size;
result->tcp_write_size = tcp_write_size - other.tcp_write_size;
result->tcp_write_iov_size = tcp_write_iov_size - other.tcp_write_iov_size;

@ -168,6 +168,17 @@ struct GlobalStats {
kWrrUpdates,
kWorkSerializerItemsEnqueued,
kWorkSerializerItemsDequeued,
kEconnabortedCount,
kEconnresetCount,
kEpipeCount,
kEtimedoutCount,
kEconnrefusedCount,
kEnetunreachCount,
kEnomsgCount,
kEnotconnCount,
kEnobufsCount,
kUncommonIoErrorCount,
kMsgErrqueueErrorCount,
COUNT
};
enum class Histogram {
@ -217,6 +228,17 @@ struct GlobalStats {
uint64_t wrr_updates;
uint64_t work_serializer_items_enqueued;
uint64_t work_serializer_items_dequeued;
uint64_t econnaborted_count;
uint64_t econnreset_count;
uint64_t epipe_count;
uint64_t etimedout_count;
uint64_t econnrefused_count;
uint64_t enetunreach_count;
uint64_t enomsg_count;
uint64_t enotconn_count;
uint64_t enobufs_count;
uint64_t uncommon_io_error_count;
uint64_t msg_errqueue_error_count;
};
uint64_t counters[static_cast<int>(Counter::COUNT)];
};
@ -315,6 +337,41 @@ class GlobalStatsCollector {
data_.this_cpu().work_serializer_items_dequeued.fetch_add(
1, std::memory_order_relaxed);
}
// Adds one to the econnaborted_count counter of the data_.this_cpu() entry,
// using relaxed memory ordering.
void IncrementEconnabortedCount() {
  auto&& shard = data_.this_cpu();
  shard.econnaborted_count.fetch_add(1, std::memory_order_relaxed);
}
// Adds one to the econnreset_count counter of the data_.this_cpu() entry,
// using relaxed memory ordering.
void IncrementEconnresetCount() {
  auto&& shard = data_.this_cpu();
  shard.econnreset_count.fetch_add(1, std::memory_order_relaxed);
}
// Adds one to the epipe_count counter of the data_.this_cpu() entry, using
// relaxed memory ordering.
void IncrementEpipeCount() {
  auto&& shard = data_.this_cpu();
  shard.epipe_count.fetch_add(1, std::memory_order_relaxed);
}
// Adds one to the etimedout_count counter of the data_.this_cpu() entry,
// using relaxed memory ordering.
void IncrementEtimedoutCount() {
  auto&& shard = data_.this_cpu();
  shard.etimedout_count.fetch_add(1, std::memory_order_relaxed);
}
// Adds one to the econnrefused_count counter of the data_.this_cpu() entry,
// using relaxed memory ordering.
void IncrementEconnrefusedCount() {
  auto&& shard = data_.this_cpu();
  shard.econnrefused_count.fetch_add(1, std::memory_order_relaxed);
}
// Adds one to the enetunreach_count counter of the data_.this_cpu() entry,
// using relaxed memory ordering.
void IncrementEnetunreachCount() {
  auto&& shard = data_.this_cpu();
  shard.enetunreach_count.fetch_add(1, std::memory_order_relaxed);
}
// Adds one to the enomsg_count counter of the data_.this_cpu() entry, using
// relaxed memory ordering.
void IncrementEnomsgCount() {
  auto&& shard = data_.this_cpu();
  shard.enomsg_count.fetch_add(1, std::memory_order_relaxed);
}
// Adds one to the enotconn_count counter of the data_.this_cpu() entry, using
// relaxed memory ordering.
void IncrementEnotconnCount() {
  auto&& shard = data_.this_cpu();
  shard.enotconn_count.fetch_add(1, std::memory_order_relaxed);
}
// Adds one to the enobufs_count counter of the data_.this_cpu() entry, using
// relaxed memory ordering.
void IncrementEnobufsCount() {
  auto&& shard = data_.this_cpu();
  shard.enobufs_count.fetch_add(1, std::memory_order_relaxed);
}
// Adds one to the uncommon_io_error_count counter of the data_.this_cpu()
// entry, using relaxed memory ordering.
void IncrementUncommonIoErrorCount() {
  auto&& shard = data_.this_cpu();
  shard.uncommon_io_error_count.fetch_add(1, std::memory_order_relaxed);
}
// Adds one to the msg_errqueue_error_count counter of the data_.this_cpu()
// entry, using relaxed memory ordering.
void IncrementMsgErrqueueErrorCount() {
  auto&& shard = data_.this_cpu();
  shard.msg_errqueue_error_count.fetch_add(1, std::memory_order_relaxed);
}
// Forwards `value` to Increment() on the call_initial_size histogram collector
// of the data_.this_cpu() entry.
void IncrementCallInitialSize(int value) {
  auto&& shard = data_.this_cpu();
  shard.call_initial_size.Increment(value);
}
@ -381,6 +438,17 @@ class GlobalStatsCollector {
std::atomic<uint64_t> wrr_updates{0};
std::atomic<uint64_t> work_serializer_items_enqueued{0};
std::atomic<uint64_t> work_serializer_items_dequeued{0};
std::atomic<uint64_t> econnaborted_count{0};
std::atomic<uint64_t> econnreset_count{0};
std::atomic<uint64_t> epipe_count{0};
std::atomic<uint64_t> etimedout_count{0};
std::atomic<uint64_t> econnrefused_count{0};
std::atomic<uint64_t> enetunreach_count{0};
std::atomic<uint64_t> enomsg_count{0};
std::atomic<uint64_t> enotconn_count{0};
std::atomic<uint64_t> enobufs_count{0};
std::atomic<uint64_t> uncommon_io_error_count{0};
std::atomic<uint64_t> msg_errqueue_error_count{0};
HistogramCollector_65536_26 call_initial_size;
HistogramCollector_16777216_20 tcp_write_size;
HistogramCollector_80_10 tcp_write_iov_size;

@ -119,3 +119,25 @@
doc: Number of items enqueued onto work serializers
- counter: work_serializer_items_dequeued
doc: Number of items dequeued from work serializers
- counter: econnaborted_count
doc: Number of ECONNABORTED errors
- counter: econnreset_count
doc: Number of ECONNRESET errors
- counter: epipe_count
doc: Number of EPIPE errors
- counter: etimedout_count
doc: Number of ETIMEDOUT errors
- counter: econnrefused_count
doc: Number of ECONNREFUSED errors
- counter: enetunreach_count
doc: Number of ENETUNREACH errors
- counter: enomsg_count
doc: Number of ENOMSG errors
- counter: enotconn_count
doc: Number of ENOTCONN errors
- counter: enobufs_count
doc: Number of ENOBUFS errors
- counter: uncommon_io_error_count
doc: Number of uncommon io errors
- counter: msg_errqueue_error_count
doc: Number of uncommon errors returned by MSG_ERRQUEUE

@ -64,6 +64,7 @@
#include "src/core/lib/gprpp/crash.h"
#include "src/core/lib/gprpp/strerror.h"
#include "src/core/lib/gprpp/sync.h"
#include "src/core/lib/gprpp/time.h"
#include "src/core/lib/iomgr/buffer_list.h"
#include "src/core/lib/iomgr/ev_posix.h"
#include "src/core/lib/iomgr/event_engine_shims/endpoint.h"
@ -562,6 +563,44 @@ struct backup_poller {
grpc_closure run_poller;
};
// Records a failed socket I/O call in the global stats, keyed by errno.
//
// prefix:   label for the failing call (callers pass e.g. "recvmsg" or
//           "sendmsg"); only used in the log line emitted for errnos that
//           have no dedicated counter.
// error_no: the errno value reported for the failed call.
//
// Errnos with a dedicated counter (ECONNABORTED, ECONNRESET, EPIPE,
// ETIMEDOUT, ECONNREFUSED, ENETUNREACH, ENOMSG, ENOTCONN, ENOBUFS) only bump
// that counter. Any other value bumps the uncommon-I/O-error counter and is
// additionally logged through GRPC_LOG_EVERY_N_SEC(1, ...).
void LogCommonIOErrors(absl::string_view prefix, int error_no) {
  switch (error_no) {
    case ECONNABORTED:
      grpc_core::global_stats().IncrementEconnabortedCount();
      return;
    case ECONNRESET:
      grpc_core::global_stats().IncrementEconnresetCount();
      return;
    case EPIPE:
      grpc_core::global_stats().IncrementEpipeCount();
      return;
    case ETIMEDOUT:
      grpc_core::global_stats().IncrementEtimedoutCount();
      return;
    case ECONNREFUSED:
      grpc_core::global_stats().IncrementEconnrefusedCount();
      return;
    case ENETUNREACH:
      grpc_core::global_stats().IncrementEnetunreachCount();
      return;
    case ENOMSG:
      grpc_core::global_stats().IncrementEnomsgCount();
      return;
    case ENOTCONN:
      grpc_core::global_stats().IncrementEnotconnCount();
      return;
    case ENOBUFS:
      grpc_core::global_stats().IncrementEnobufsCount();
      return;
    default:
      grpc_core::global_stats().IncrementUncommonIoErrorCount();
      // absl::string_view is not guaranteed to be NUL-terminated, so print it
      // with an explicit length instead of handing data() to a bare %s.
      GRPC_LOG_EVERY_N_SEC(1, GPR_ERROR, "%.*s encountered uncommon error: %s",
                           static_cast<int>(prefix.size()), prefix.data(),
                           grpc_core::StrError(error_no).c_str());
      return;
  }
}
} // namespace
static void ZerocopyDisableAndWaitForRemaining(grpc_tcp* tcp);
@ -957,6 +996,9 @@ static bool tcp_do_read(grpc_tcp* tcp, grpc_error_handle* error)
// We have read something in previous reads. We need to deliver those
// bytes to the upper layer.
if (read_bytes <= 0 && total_read_bytes >= 1) {
if (read_bytes < 0) {
LogCommonIOErrors("recvmsg", errno);
}
tcp->inq = 1;
break;
}
@ -1414,6 +1456,8 @@ static bool process_errors(grpc_tcp* tcp) {
return processed_err; // No more errors to process
}
if (r == -1) {
LogCommonIOErrors("recvmsg(MSG_ERRQUEUE)", saved_errno);
grpc_core::global_stats().IncrementMsgErrqueueErrorCount();
return processed_err;
}
if (GPR_UNLIKELY((msg.msg_flags & MSG_CTRUNC) != 0)) {
@ -1614,6 +1658,9 @@ static bool do_tcp_flush_zerocopy(grpc_tcp* tcp, TcpZerocopySendRecord* record,
grpc_fd_set_writable(tcp->em_fd);
}
if (sent_length < 0) {
if (saved_errno != EAGAIN) {
LogCommonIOErrors("sendmsg", saved_errno);
}
// If this particular send failed, drop ref taken earlier in this method.
tcp->tcp_zerocopy_send_ctx.UndoSend();
if (saved_errno == EAGAIN || saved_errno == ENOBUFS) {

Loading…
Cancel
Save