[iomgr and EE logging] Log ERROR if socket() returns EMFILE on posix (#32204)

There are potentially surprising deployment bugs that can cause `EMFILE`
to be hit. For example, file descriptor limits can be easily reached if
- the round robin LB policy is used
- the load balancer hands out an assignment with a lot of backends
- using debian's default 1024 file descriptor limit.

To make such problems more apparent, we can pay special attention to
this error and log ERROR when it happens.

Related: b/265199104
pull/32549/head
apolcyn 2 years ago committed by GitHub
parent 141592df55
commit 27ee3913d1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 42
      CMakeLists.txt
  2. 14
      build_autogenerated.yaml
  3. 1
      src/core/BUILD
  4. 2
      src/core/lib/event_engine/posix_engine/posix_endpoint.cc
  5. 16
      src/core/lib/event_engine/posix_engine/tcp_socket_utils.cc
  6. 1
      src/core/lib/gprpp/time.cc
  7. 8
      src/core/lib/gprpp/time.h
  8. 15
      src/core/lib/iomgr/socket_utils_common_posix.cc
  9. 3
      test/core/event_engine/event_engine_test_utils.cc
  10. 19
      test/core/event_engine/posix/BUILD
  11. 53
      test/core/event_engine/posix/log_too_many_open_files_test.cc
  12. 22
      tools/run_tests/generated/tests.json

42
CMakeLists.txt generated

@ -1042,6 +1042,9 @@ if(gRPC_BUILD_TESTS)
add_dependencies(buildtests_cxx lock_free_event_test) add_dependencies(buildtests_cxx lock_free_event_test)
endif() endif()
add_dependencies(buildtests_cxx log_test) add_dependencies(buildtests_cxx log_test)
if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
add_dependencies(buildtests_cxx log_too_many_open_files_test)
endif()
add_dependencies(buildtests_cxx loop_test) add_dependencies(buildtests_cxx loop_test)
add_dependencies(buildtests_cxx map_pipe_test) add_dependencies(buildtests_cxx map_pipe_test)
add_dependencies(buildtests_cxx match_test) add_dependencies(buildtests_cxx match_test)
@ -14709,6 +14712,45 @@ target_link_libraries(log_test
) )
endif()
if(gRPC_BUILD_TESTS)
if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
add_executable(log_too_many_open_files_test
test/core/event_engine/posix/log_too_many_open_files_test.cc
third_party/googletest/googletest/src/gtest-all.cc
third_party/googletest/googlemock/src/gmock-all.cc
)
target_compile_features(log_too_many_open_files_test PUBLIC cxx_std_14)
target_include_directories(log_too_many_open_files_test
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/include
${_gRPC_ADDRESS_SORTING_INCLUDE_DIR}
${_gRPC_RE2_INCLUDE_DIR}
${_gRPC_SSL_INCLUDE_DIR}
${_gRPC_UPB_GENERATED_DIR}
${_gRPC_UPB_GRPC_GENERATED_DIR}
${_gRPC_UPB_INCLUDE_DIR}
${_gRPC_XXHASH_INCLUDE_DIR}
${_gRPC_ZLIB_INCLUDE_DIR}
third_party/googletest/googletest/include
third_party/googletest/googletest
third_party/googletest/googlemock/include
third_party/googletest/googlemock
${_gRPC_PROTO_GENS_DIR}
)
target_link_libraries(log_too_many_open_files_test
${_gRPC_BASELIB_LIBRARIES}
${_gRPC_PROTOBUF_LIBRARIES}
${_gRPC_ZLIB_LIBRARIES}
${_gRPC_ALLTARGETS_LIBRARIES}
grpc_test_util
)
endif()
endif() endif()
if(gRPC_BUILD_TESTS) if(gRPC_BUILD_TESTS)

@ -9206,6 +9206,20 @@ targets:
deps: deps:
- grpc_test_util - grpc_test_util
uses_polling: false uses_polling: false
- name: log_too_many_open_files_test
gtest: true
build: test
language: c++
headers: []
src:
- test/core/event_engine/posix/log_too_many_open_files_test.cc
deps:
- grpc_test_util
platforms:
- linux
- posix
- mac
uses_polling: false
- name: loop_test - name: loop_test
gtest: true gtest: true
build: test build: test

@ -1849,6 +1849,7 @@ grpc_cc_library(
"socket_mutator", "socket_mutator",
"status_helper", "status_helper",
"strerror", "strerror",
"time",
"useful", "useful",
"//:event_engine_base_hdrs", "//:event_engine_base_hdrs",
"//:gpr", "//:gpr",

@ -961,7 +961,7 @@ bool PosixEndpointImpl::DoFlushZerocopy(TcpZerocopySendRecord* record,
} else { } else {
#ifdef GRPC_LINUX_ERRQUEUE #ifdef GRPC_LINUX_ERRQUEUE
GRPC_LOG_EVERY_N_SEC( GRPC_LOG_EVERY_N_SEC(
1, 1, GPR_INFO,
"Tx0cp encountered an ENOBUFS error possibly because one or " "Tx0cp encountered an ENOBUFS error possibly because one or "
"both of RLIMIT_MEMLOCK or hard memlock ulimit values are too " "both of RLIMIT_MEMLOCK or hard memlock ulimit values are too "
"small for the intended user. Current system value of " "small for the intended user. Current system value of "

@ -30,6 +30,7 @@
#include "src/core/lib/gpr/useful.h" #include "src/core/lib/gpr/useful.h"
#include "src/core/lib/gprpp/crash.h" // IWYU pragma: keep #include "src/core/lib/gprpp/crash.h" // IWYU pragma: keep
#include "src/core/lib/gprpp/time.h"
#include "src/core/lib/iomgr/port.h" #include "src/core/lib/iomgr/port.h"
#ifdef GRPC_POSIX_SOCKET_UTILS_COMMON #ifdef GRPC_POSIX_SOCKET_UTILS_COMMON
@ -96,8 +97,21 @@ absl::Status ErrorForFd(
int CreateSocket(std::function<int(int, int, int)> socket_factory, int family, int CreateSocket(std::function<int(int, int, int)> socket_factory, int family,
int type, int protocol) { int type, int protocol) {
return socket_factory != nullptr ? socket_factory(family, type, protocol) int res = socket_factory != nullptr ? socket_factory(family, type, protocol)
: socket(family, type, protocol); : socket(family, type, protocol);
if (res < 0 && errno == EMFILE) {
int saved_errno = errno;
GRPC_LOG_EVERY_N_SEC(
10, GPR_ERROR,
"socket(%d, %d, %d) returned %d with error: |%s|. This process "
"might not have a sufficient file descriptor limit for the number "
"of connections grpc wants to open (which is generally a function of "
"the number of grpc channels, the lb policy of each channel, and the "
"number of backends each channel is load balancing across).",
family, type, protocol, res, grpc_core::StrError(errno).c_str());
errno = saved_errno;
}
return res;
} }
absl::Status PrepareTcpClientSocket(PosixSocketWrapper sock, absl::Status PrepareTcpClientSocket(PosixSocketWrapper sock,

@ -25,6 +25,7 @@
#include "absl/strings/str_format.h" #include "absl/strings/str_format.h"
#include <grpc/support/log.h>
#include <grpc/support/time.h> #include <grpc/support/time.h>
#include "src/core/lib/gprpp/no_destruct.h" #include "src/core/lib/gprpp/no_destruct.h"

@ -26,20 +26,20 @@
#include "absl/types/optional.h" #include "absl/types/optional.h"
#include <grpc/event_engine/event_engine.h> #include <grpc/event_engine/event_engine.h>
#include <grpc/support/log.h>
#include <grpc/support/time.h> #include <grpc/support/time.h>
#include "src/core/lib/gpr/time_precise.h" #include "src/core/lib/gpr/time_precise.h"
#include "src/core/lib/gpr/useful.h" #include "src/core/lib/gpr/useful.h"
#define GRPC_LOG_EVERY_N_SEC(n, format, ...) \ #define GRPC_LOG_EVERY_N_SEC(n, severity, format, ...) \
do { \ do { \
static std::atomic<uint64_t> prev{0}; \ static std::atomic<uint64_t> prev{0}; \
uint64_t now = grpc_core::Timestamp::FromTimespecRoundDown( \ uint64_t now = grpc_core::Timestamp::FromTimespecRoundDown( \
gpr_now(GPR_CLOCK_MONOTONIC)) \ gpr_now(GPR_CLOCK_MONOTONIC)) \
.milliseconds_after_process_epoch(); \ .milliseconds_after_process_epoch(); \
if ((now - prev.exchange(now)) > (n)*1000) { \ uint64_t prev_tsamp = prev.exchange(now); \
gpr_log(GPR_INFO, format, __VA_ARGS__); \ if (prev_tsamp == 0 || now - prev_tsamp > (n)*1000) { \
gpr_log(severity, format, __VA_ARGS__); \
} \ } \
} while (0) } while (0)

@ -437,9 +437,22 @@ grpc_error_handle grpc_create_dualstack_socket(
static int create_socket(grpc_socket_factory* factory, int domain, int type, static int create_socket(grpc_socket_factory* factory, int domain, int type,
int protocol) { int protocol) {
return (factory != nullptr) int res = (factory != nullptr)
? grpc_socket_factory_socket(factory, domain, type, protocol) ? grpc_socket_factory_socket(factory, domain, type, protocol)
: socket(domain, type, protocol); : socket(domain, type, protocol);
if (res < 0 && errno == EMFILE) {
int saved_errno = errno;
GRPC_LOG_EVERY_N_SEC(
10, GPR_ERROR,
"socket(%d, %d, %d) returned %d with error: |%s|. This process "
"might not have a sufficient file descriptor limit for the number "
"of connections grpc wants to open (which is generally a function of "
"the number of grpc channels, the lb policy of each channel, and the "
"number of backends each channel is load balancing across).",
domain, type, protocol, res, grpc_core::StrError(errno).c_str());
errno = saved_errno;
}
return res;
} }
grpc_error_handle grpc_create_dualstack_socket_using_factory( grpc_error_handle grpc_create_dualstack_socket_using_factory(

@ -77,7 +77,8 @@ std::string GetNextSendMessage() {
void WaitForSingleOwner(std::shared_ptr<EventEngine>&& engine) { void WaitForSingleOwner(std::shared_ptr<EventEngine>&& engine) {
while (engine.use_count() > 1) { while (engine.use_count() > 1) {
GRPC_LOG_EVERY_N_SEC(2, "engine.use_count() = %ld", engine.use_count()); GRPC_LOG_EVERY_N_SEC(2, GPR_INFO, "engine.use_count() = %ld",
engine.use_count());
absl::SleepFor(absl::Milliseconds(100)); absl::SleepFor(absl::Milliseconds(100));
} }
} }

@ -241,3 +241,22 @@ grpc_cc_test(
"//test/core/util:grpc_test_util", "//test/core/util:grpc_test_util",
], ],
) )
grpc_cc_test(
name = "log_too_many_open_files_test",
srcs = ["log_too_many_open_files_test.cc"],
external_deps = [
"gtest",
],
language = "C++",
tags = [
"no_windows",
],
uses_event_engine = False,
uses_polling = False,
deps = [
"//:grpc",
"//src/core:posix_event_engine",
"//test/core/util:grpc_test_util",
],
)

@ -0,0 +1,53 @@
// Copyright 2023 gRPC authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <errno.h>
#include <sys/socket.h>
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "src/core/lib/event_engine/posix_engine/tcp_socket_utils.h"
#include "src/core/lib/event_engine/tcp_socket_utils.h"
#include "test/core/util/test_config.h"
using ::grpc_event_engine::experimental::PosixSocketWrapper;
// There is a special code path in create_socket to log errors upon EMFILE.
// Goal of this test is just to exercise that code path and also make sure
// it doesn't mess up "errno", so that we get the right error message.
TEST(LogTooManyOpenFilesTest, MainTest) {
const auto mock_socket_factory = [](int, int, int) {
errno = EMFILE;
return -1;
};
auto addr = grpc_event_engine::experimental::URIToResolvedAddress(
"ipv4:127.0.0.1:80");
ASSERT_TRUE(addr.ok());
PosixSocketWrapper::DSMode dsmode;
absl::StatusOr<PosixSocketWrapper> result =
PosixSocketWrapper::CreateDualStackSocket(mock_socket_factory, *addr,
SOCK_STREAM, AF_INET, dsmode);
EXPECT_FALSE(result.ok());
EXPECT_THAT(result.status().message(),
::testing::HasSubstr("Too many open files"));
}
int main(int argc, char** argv) {
grpc::testing::TestEnvironment env(&argc, argv);
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

@ -4547,6 +4547,28 @@
], ],
"uses_polling": false "uses_polling": false
}, },
{
"args": [],
"benchmark": false,
"ci_platforms": [
"linux",
"mac",
"posix"
],
"cpu_cost": 1.0,
"exclude_configs": [],
"exclude_iomgrs": [],
"flaky": false,
"gtest": true,
"language": "c++",
"name": "log_too_many_open_files_test",
"platforms": [
"linux",
"mac",
"posix"
],
"uses_polling": false
},
{ {
"args": [], "args": [],
"benchmark": false, "benchmark": false,

Loading…
Cancel
Save