From 27ee3913d119be2f1594711a5681e2934763c5bd Mon Sep 17 00:00:00 2001 From: apolcyn Date: Fri, 3 Mar 2023 13:18:01 -0800 Subject: [PATCH] [iomgr and EE logging] Log ERROR if socket() returns EMFILE on posix (#32204) There are potentially surprising deployment bugs that can cause `EMFILE` to be hit. For example, file descriptor limits can be easily reached if - the round robin LB policy is used - the load balancer hands out an assignment with a lot of backends - using debian's default 1024 file descriptor limit. To make such problems more apparent, we can pay special attention to this error and log ERROR when it happens. Related: b/265199104 --- CMakeLists.txt | 42 +++++++++++++++ build_autogenerated.yaml | 14 +++++ src/core/BUILD | 1 + .../posix_engine/posix_endpoint.cc | 2 +- .../posix_engine/tcp_socket_utils.cc | 18 ++++++- src/core/lib/gprpp/time.cc | 1 + src/core/lib/gprpp/time.h | 8 +-- .../lib/iomgr/socket_utils_common_posix.cc | 19 +++++-- .../event_engine/event_engine_test_utils.cc | 3 +- test/core/event_engine/posix/BUILD | 19 +++++++ .../posix/log_too_many_open_files_test.cc | 53 +++++++++++++++++++ tools/run_tests/generated/tests.json | 22 ++++++++ 12 files changed, 191 insertions(+), 11 deletions(-) create mode 100644 test/core/event_engine/posix/log_too_many_open_files_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 72e0e55e7ae..8568ecd0f66 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1042,6 +1042,9 @@ if(gRPC_BUILD_TESTS) add_dependencies(buildtests_cxx lock_free_event_test) endif() add_dependencies(buildtests_cxx log_test) + if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX) + add_dependencies(buildtests_cxx log_too_many_open_files_test) + endif() add_dependencies(buildtests_cxx loop_test) add_dependencies(buildtests_cxx map_pipe_test) add_dependencies(buildtests_cxx match_test) @@ -14709,6 +14712,45 @@ target_link_libraries(log_test ) +endif() +if(gRPC_BUILD_TESTS) +if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX) + + add_executable(log_too_many_open_files_test + test/core/event_engine/posix/log_too_many_open_files_test.cc + third_party/googletest/googletest/src/gtest-all.cc + third_party/googletest/googlemock/src/gmock-all.cc + ) + target_compile_features(log_too_many_open_files_test PUBLIC cxx_std_14) + target_include_directories(log_too_many_open_files_test + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${_gRPC_ADDRESS_SORTING_INCLUDE_DIR} + ${_gRPC_RE2_INCLUDE_DIR} + ${_gRPC_SSL_INCLUDE_DIR} + ${_gRPC_UPB_GENERATED_DIR} + ${_gRPC_UPB_GRPC_GENERATED_DIR} + ${_gRPC_UPB_INCLUDE_DIR} + ${_gRPC_XXHASH_INCLUDE_DIR} + ${_gRPC_ZLIB_INCLUDE_DIR} + third_party/googletest/googletest/include + third_party/googletest/googletest + third_party/googletest/googlemock/include + third_party/googletest/googlemock + ${_gRPC_PROTO_GENS_DIR} + ) + + target_link_libraries(log_too_many_open_files_test + ${_gRPC_BASELIB_LIBRARIES} + ${_gRPC_PROTOBUF_LIBRARIES} + ${_gRPC_ZLIB_LIBRARIES} + ${_gRPC_ALLTARGETS_LIBRARIES} + grpc_test_util + ) + + +endif() endif() if(gRPC_BUILD_TESTS) diff --git a/build_autogenerated.yaml b/build_autogenerated.yaml index aed9439765d..67a66275876 100644 --- a/build_autogenerated.yaml +++ b/build_autogenerated.yaml @@ -9206,6 +9206,20 @@ targets: deps: - grpc_test_util uses_polling: false +- name: log_too_many_open_files_test + gtest: true + build: test + language: c++ + headers: [] + src: + - test/core/event_engine/posix/log_too_many_open_files_test.cc + deps: + - grpc_test_util + platforms: + - linux + - posix + - mac + uses_polling: false - name: loop_test gtest: true build: test diff --git a/src/core/BUILD b/src/core/BUILD index b9c7f9f2341..dd2c2f50f17 100644 --- a/src/core/BUILD +++ b/src/core/BUILD @@ -1849,6 +1849,7 @@ grpc_cc_library( "socket_mutator", "status_helper", "strerror", + "time", "useful", "//:event_engine_base_hdrs", "//:gpr", diff --git a/src/core/lib/event_engine/posix_engine/posix_endpoint.cc b/src/core/lib/event_engine/posix_engine/posix_endpoint.cc index e418e6b5c13..b99dde4bc4b 100644 --- a/src/core/lib/event_engine/posix_engine/posix_endpoint.cc +++ b/src/core/lib/event_engine/posix_engine/posix_endpoint.cc @@ -961,7 +961,7 @@ bool PosixEndpointImpl::DoFlushZerocopy(TcpZerocopySendRecord* record, } else { #ifdef GRPC_LINUX_ERRQUEUE GRPC_LOG_EVERY_N_SEC( - 1, + 1, GPR_INFO, "Tx0cp encountered an ENOBUFS error possibly because one or " "both of RLIMIT_MEMLOCK or hard memlock ulimit values are too " "small for the intended user. Current system value of " diff --git a/src/core/lib/event_engine/posix_engine/tcp_socket_utils.cc b/src/core/lib/event_engine/posix_engine/tcp_socket_utils.cc index 201a3964625..7c0f1775a59 100644 --- a/src/core/lib/event_engine/posix_engine/tcp_socket_utils.cc +++ b/src/core/lib/event_engine/posix_engine/tcp_socket_utils.cc @@ -30,6 +30,7 @@ #include "src/core/lib/gpr/useful.h" #include "src/core/lib/gprpp/crash.h" // IWYU pragma: keep +#include "src/core/lib/gprpp/time.h" #include "src/core/lib/iomgr/port.h" #ifdef GRPC_POSIX_SOCKET_UTILS_COMMON @@ -96,8 +97,21 @@ absl::Status ErrorForFd( int CreateSocket(std::function socket_factory, int family, int type, int protocol) { - return socket_factory != nullptr ? socket_factory(family, type, protocol) - : socket(family, type, protocol); + int res = socket_factory != nullptr ? socket_factory(family, type, protocol) + : socket(family, type, protocol); + if (res < 0 && errno == EMFILE) { + int saved_errno = errno; + GRPC_LOG_EVERY_N_SEC( + 10, GPR_ERROR, + "socket(%d, %d, %d) returned %d with error: |%s|. This process " + "might not have a sufficient file descriptor limit for the number " + "of connections grpc wants to open (which is generally a function of " + "the number of grpc channels, the lb policy of each channel, and the " + "number of backends each channel is load balancing across).", + family, type, protocol, res, grpc_core::StrError(errno).c_str()); + errno = saved_errno; + } + return res; } absl::Status PrepareTcpClientSocket(PosixSocketWrapper sock, diff --git a/src/core/lib/gprpp/time.cc b/src/core/lib/gprpp/time.cc index e11bfe197f2..2c9e96d4211 100644 --- a/src/core/lib/gprpp/time.cc +++ b/src/core/lib/gprpp/time.cc @@ -25,6 +25,7 @@ #include "absl/strings/str_format.h" +#include #include #include "src/core/lib/gprpp/no_destruct.h" diff --git a/src/core/lib/gprpp/time.h b/src/core/lib/gprpp/time.h index 995852e1c9f..557139eb46a 100644 --- a/src/core/lib/gprpp/time.h +++ b/src/core/lib/gprpp/time.h @@ -26,20 +26,20 @@ #include "absl/types/optional.h" #include -#include #include #include "src/core/lib/gpr/time_precise.h" #include "src/core/lib/gpr/useful.h" -#define GRPC_LOG_EVERY_N_SEC(n, format, ...) \ +#define GRPC_LOG_EVERY_N_SEC(n, severity, format, ...) \ do { \ static std::atomic prev{0}; \ uint64_t now = grpc_core::Timestamp::FromTimespecRoundDown( \ gpr_now(GPR_CLOCK_MONOTONIC)) \ .milliseconds_after_process_epoch(); \ - if ((now - prev.exchange(now)) > (n)*1000) { \ - gpr_log(GPR_INFO, format, __VA_ARGS__); \ + uint64_t prev_tsamp = prev.exchange(now); \ + if (prev_tsamp == 0 || now - prev_tsamp > (n)*1000) { \ + gpr_log(severity, format, __VA_ARGS__); \ } \ } while (0) diff --git a/src/core/lib/iomgr/socket_utils_common_posix.cc b/src/core/lib/iomgr/socket_utils_common_posix.cc index df5d9f05a11..2f99d35d50e 100644 --- a/src/core/lib/iomgr/socket_utils_common_posix.cc +++ b/src/core/lib/iomgr/socket_utils_common_posix.cc @@ -437,9 +437,22 @@ grpc_error_handle grpc_create_dualstack_socket( static int create_socket(grpc_socket_factory* factory, int domain, int type, int protocol) { - return (factory != nullptr) - ? grpc_socket_factory_socket(factory, domain, type, protocol) - : socket(domain, type, protocol); + int res = (factory != nullptr) + ? grpc_socket_factory_socket(factory, domain, type, protocol) + : socket(domain, type, protocol); + if (res < 0 && errno == EMFILE) { + int saved_errno = errno; + GRPC_LOG_EVERY_N_SEC( + 10, GPR_ERROR, + "socket(%d, %d, %d) returned %d with error: |%s|. This process " + "might not have a sufficient file descriptor limit for the number " + "of connections grpc wants to open (which is generally a function of " + "the number of grpc channels, the lb policy of each channel, and the " + "number of backends each channel is load balancing across).", + domain, type, protocol, res, grpc_core::StrError(errno).c_str()); + errno = saved_errno; + } + return res; } grpc_error_handle grpc_create_dualstack_socket_using_factory( diff --git a/test/core/event_engine/event_engine_test_utils.cc b/test/core/event_engine/event_engine_test_utils.cc index a9029e2613b..7a1c56f90da 100644 --- a/test/core/event_engine/event_engine_test_utils.cc +++ b/test/core/event_engine/event_engine_test_utils.cc @@ -77,7 +77,8 @@ std::string GetNextSendMessage() { void WaitForSingleOwner(std::shared_ptr&& engine) { while (engine.use_count() > 1) { - GRPC_LOG_EVERY_N_SEC(2, "engine.use_count() = %ld", engine.use_count()); + GRPC_LOG_EVERY_N_SEC(2, GPR_INFO, "engine.use_count() = %ld", + engine.use_count()); absl::SleepFor(absl::Milliseconds(100)); } } diff --git a/test/core/event_engine/posix/BUILD b/test/core/event_engine/posix/BUILD index a08236e0548..0d8ee42474b 100644 --- a/test/core/event_engine/posix/BUILD +++ b/test/core/event_engine/posix/BUILD @@ -241,3 +241,22 @@ grpc_cc_test( "//test/core/util:grpc_test_util", ], ) + +grpc_cc_test( + name = "log_too_many_open_files_test", + srcs = ["log_too_many_open_files_test.cc"], + external_deps = [ + "gtest", + ], + language = "C++", + tags = [ + "no_windows", + ], + uses_event_engine = False, + uses_polling = False, + deps = [ + "//:grpc", + "//src/core:posix_event_engine", + "//test/core/util:grpc_test_util", + ], +) diff --git a/test/core/event_engine/posix/log_too_many_open_files_test.cc b/test/core/event_engine/posix/log_too_many_open_files_test.cc new file mode 100644 index 00000000000..0f0ae6c1ec7 --- /dev/null +++ b/test/core/event_engine/posix/log_too_many_open_files_test.cc @@ -0,0 +1,53 @@ +// Copyright 2023 gRPC authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#include "src/core/lib/event_engine/posix_engine/tcp_socket_utils.h" +#include "src/core/lib/event_engine/tcp_socket_utils.h" +#include "test/core/util/test_config.h" + +using ::grpc_event_engine::experimental::PosixSocketWrapper; + +// There is a special code path in create_socket to log errors upon EMFILE. +// Goal of this test is just to exercise that code path and also make sure +// it doesn't mess up "errno", so that we get the right error message. +TEST(LogTooManyOpenFilesTest, MainTest) { + const auto mock_socket_factory = [](int, int, int) { + errno = EMFILE; + return -1; + }; + auto addr = grpc_event_engine::experimental::URIToResolvedAddress( + "ipv4:127.0.0.1:80"); + ASSERT_TRUE(addr.ok()); + PosixSocketWrapper::DSMode dsmode; + absl::StatusOr result = + PosixSocketWrapper::CreateDualStackSocket(mock_socket_factory, *addr, + SOCK_STREAM, AF_INET, dsmode); + EXPECT_FALSE(result.ok()); + EXPECT_THAT(result.status().message(), + ::testing::HasSubstr("Too many open files")); +} + +int main(int argc, char** argv) { + grpc::testing::TestEnvironment env(&argc, argv); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tools/run_tests/generated/tests.json b/tools/run_tests/generated/tests.json index 6de701327e4..fbeeba34bb5 100644 --- a/tools/run_tests/generated/tests.json +++ b/tools/run_tests/generated/tests.json @@ -4547,6 +4547,28 @@ ], "uses_polling": false }, + { + "args": [], + "benchmark": false, + "ci_platforms": [ + "linux", + "mac", + "posix" + ], + "cpu_cost": 1.0, + "exclude_configs": [], + "exclude_iomgrs": [], + "flaky": false, + "gtest": true, + "language": "c++", + "name": "log_too_many_open_files_test", + "platforms": [ + "linux", + "mac", + "posix" + ], + "uses_polling": false + }, { "args": [], "benchmark": false,