|
|
|
@ -15,7 +15,6 @@ |
|
|
|
|
// limitations under the License.
|
|
|
|
|
//
|
|
|
|
|
//
|
|
|
|
|
|
|
|
|
|
#include <grpc/support/port_platform.h> |
|
|
|
|
|
|
|
|
|
#include "src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.h" |
|
|
|
@ -24,24 +23,36 @@ |
|
|
|
|
|
|
|
|
|
#include <atomic> |
|
|
|
|
#include <chrono> |
|
|
|
|
#include <cstddef> |
|
|
|
|
#include <memory> |
|
|
|
|
#include <utility> |
|
|
|
|
|
|
|
|
|
#include "absl/functional/any_invocable.h" |
|
|
|
|
#include "absl/time/clock.h" |
|
|
|
|
#include "absl/time/time.h" |
|
|
|
|
#include "absl/types/optional.h" |
|
|
|
|
|
|
|
|
|
#include <grpc/support/log.h> |
|
|
|
|
#include <grpc/support/thd_id.h> |
|
|
|
|
|
|
|
|
|
#include "src/core/lib/backoff/backoff.h" |
|
|
|
|
#include "src/core/lib/debug/trace.h" |
|
|
|
|
#include "src/core/lib/event_engine/common_closures.h" |
|
|
|
|
#include "src/core/lib/event_engine/thread_local.h" |
|
|
|
|
#include "src/core/lib/event_engine/trace.h" |
|
|
|
|
#include "src/core/lib/event_engine/work_queue/basic_work_queue.h" |
|
|
|
|
#include "src/core/lib/event_engine/work_queue/work_queue.h" |
|
|
|
|
#include "src/core/lib/gprpp/crash.h" |
|
|
|
|
#include "src/core/lib/gprpp/env.h" |
|
|
|
|
#include "src/core/lib/gprpp/examine_stack.h" |
|
|
|
|
#include "src/core/lib/gprpp/thd.h" |
|
|
|
|
#include "src/core/lib/gprpp/time.h" |
|
|
|
|
|
|
|
|
|
#ifdef GPR_POSIX_SYNC |
|
|
|
|
#include <csignal> |
|
|
|
|
#elif defined(GPR_WINDOWS) |
|
|
|
|
#include <signal.h> |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
// IWYU pragma: no_include <ratio>
|
|
|
|
|
|
|
|
|
|
// ## Thread Pool Fork-handling
|
|
|
|
@ -84,6 +95,13 @@ |
|
|
|
|
// non-existent Lifeguard thread to finish. Trying a simple
|
|
|
|
|
// `lifeguard_thread_.Join()` leads to memory access errors. This implementation
|
|
|
|
|
// uses Notifications to coordinate startup and shutdown states.
|
|
|
|
|
//
|
|
|
|
|
// ## Debugging
|
|
|
|
|
//
|
|
|
|
|
// Set the environment variable GRPC_THREAD_POOL_VERBOSE_FAILURES=anything to
|
|
|
|
|
// enable advanced debugging. When the pool takes too long to quiesce, a
|
|
|
|
|
// backtrace will be printed for every running thread, and the process will
|
|
|
|
|
// abort.
|
|
|
|
|
|
|
|
|
|
namespace grpc_event_engine { |
|
|
|
|
namespace experimental { |
|
|
|
@ -117,6 +135,37 @@ constexpr grpc_core::Duration kLifeguardMinSleepBetweenChecks{ |
|
|
|
|
// Maximum time the lifeguard thread should sleep between checking for new work.
|
|
|
|
|
constexpr grpc_core::Duration kLifeguardMaxSleepBetweenChecks{ |
|
|
|
|
grpc_core::Duration::Seconds(1)}; |
|
|
|
|
constexpr grpc_core::Duration kBlockUntilThreadCountTimeout{ |
|
|
|
|
grpc_core::Duration::Seconds(60)}; |
|
|
|
|
|
|
|
|
|
#ifdef GPR_POSIX_SYNC |
|
|
|
|
const bool g_log_verbose_failures = |
|
|
|
|
grpc_core::GetEnv("GRPC_THREAD_POOL_VERBOSE_FAILURES").has_value(); |
|
|
|
|
constexpr int kDumpStackSignal = SIGUSR1; |
|
|
|
|
#elif defined(GPR_WINDOWS) |
|
|
|
|
const bool g_log_verbose_failures = |
|
|
|
|
grpc_core::GetEnv("GRPC_THREAD_POOL_VERBOSE_FAILURES").has_value(); |
|
|
|
|
constexpr int kDumpStackSignal = SIGTERM; |
|
|
|
|
#else |
|
|
|
|
constexpr bool g_log_verbose_failures = false; |
|
|
|
|
constexpr int kDumpStackSignal = -1; |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
std::atomic<size_t> g_reported_dump_count{0}; |
|
|
|
|
|
|
|
|
|
void DumpSignalHandler(int /* sig */) { |
|
|
|
|
const auto trace = grpc_core::GetCurrentStackTrace(); |
|
|
|
|
if (!trace.has_value()) { |
|
|
|
|
gpr_log(GPR_ERROR, "DumpStack::%" PRIdPTR ": Stack trace not available", |
|
|
|
|
gpr_thd_currentid()); |
|
|
|
|
} else { |
|
|
|
|
gpr_log(GPR_ERROR, "DumpStack::%" PRIdPTR ": %s", gpr_thd_currentid(), |
|
|
|
|
trace->c_str()); |
|
|
|
|
} |
|
|
|
|
g_reported_dump_count.fetch_add(1); |
|
|
|
|
grpc_core::Thread::Kill(gpr_thd_currentid()); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
|
|
thread_local WorkQueue* g_local_queue = nullptr; |
|
|
|
@ -125,6 +174,10 @@ thread_local WorkQueue* g_local_queue = nullptr; |
|
|
|
|
|
|
|
|
|
WorkStealingThreadPool::WorkStealingThreadPool(size_t reserve_threads) |
|
|
|
|
: pool_{std::make_shared<WorkStealingThreadPoolImpl>(reserve_threads)} { |
|
|
|
|
if (g_log_verbose_failures) { |
|
|
|
|
GRPC_EVENT_ENGINE_TRACE( |
|
|
|
|
"%s", "WorkStealingThreadPool verbose failures are enabled"); |
|
|
|
|
} |
|
|
|
|
pool_->Start(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -221,8 +274,13 @@ void WorkStealingThreadPool::WorkStealingThreadPoolImpl::Quiesce() { |
|
|
|
|
// running instead of zero.
|
|
|
|
|
bool is_threadpool_thread = g_local_queue != nullptr; |
|
|
|
|
work_signal()->SignalAll(); |
|
|
|
|
living_thread_count_.BlockUntilThreadCount(is_threadpool_thread ? 1 : 0, |
|
|
|
|
"shutting down"); |
|
|
|
|
auto threads_were_shut_down = living_thread_count_.BlockUntilThreadCount( |
|
|
|
|
is_threadpool_thread ? 1 : 0, "shutting down", |
|
|
|
|
g_log_verbose_failures ? kBlockUntilThreadCountTimeout |
|
|
|
|
: grpc_core::Duration::Infinity()); |
|
|
|
|
if (!threads_were_shut_down.ok() && g_log_verbose_failures) { |
|
|
|
|
DumpStacksAndCrash(); |
|
|
|
|
} |
|
|
|
|
GPR_ASSERT(queue_.Empty()); |
|
|
|
|
quiesced_.store(true, std::memory_order_relaxed); |
|
|
|
|
lifeguard_.BlockUntilShutdownAndReset(); |
|
|
|
@ -262,7 +320,11 @@ void WorkStealingThreadPool::WorkStealingThreadPoolImpl::PrepareFork() { |
|
|
|
|
gpr_log(GPR_INFO, "WorkStealingThreadPoolImpl::PrepareFork"); |
|
|
|
|
SetForking(true); |
|
|
|
|
work_signal_.SignalAll(); |
|
|
|
|
living_thread_count_.BlockUntilThreadCount(0, "forking"); |
|
|
|
|
auto threads_were_shut_down = living_thread_count_.BlockUntilThreadCount( |
|
|
|
|
0, "forking", kBlockUntilThreadCountTimeout); |
|
|
|
|
if (!threads_were_shut_down.ok() && g_log_verbose_failures) { |
|
|
|
|
DumpStacksAndCrash(); |
|
|
|
|
} |
|
|
|
|
lifeguard_.BlockUntilShutdownAndReset(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -271,6 +333,37 @@ void WorkStealingThreadPool::WorkStealingThreadPoolImpl::Postfork() { |
|
|
|
|
Start(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void WorkStealingThreadPool::WorkStealingThreadPoolImpl::TrackThread( |
|
|
|
|
gpr_thd_id tid) { |
|
|
|
|
grpc_core::MutexLock lock(&thd_set_mu_); |
|
|
|
|
thds_.insert(tid); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void WorkStealingThreadPool::WorkStealingThreadPoolImpl::UntrackThread( |
|
|
|
|
gpr_thd_id tid) { |
|
|
|
|
grpc_core::MutexLock lock(&thd_set_mu_); |
|
|
|
|
thds_.erase(tid); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void WorkStealingThreadPool::WorkStealingThreadPoolImpl::DumpStacksAndCrash() { |
|
|
|
|
grpc_core::MutexLock lock(&thd_set_mu_); |
|
|
|
|
gpr_log(GPR_ERROR, |
|
|
|
|
"Pool did not quiesce in time, gRPC will not shut down cleanly. " |
|
|
|
|
"Dumping all %zu thread stacks.", |
|
|
|
|
thds_.size()); |
|
|
|
|
for (const auto tid : thds_) { |
|
|
|
|
grpc_core::Thread::Signal(tid, kDumpStackSignal); |
|
|
|
|
} |
|
|
|
|
// If this is a thread pool thread, wait for one fewer thread.
|
|
|
|
|
auto ignore_thread_count = g_local_queue != nullptr ? 1 : 0; |
|
|
|
|
while (living_thread_count_.count() - ignore_thread_count > |
|
|
|
|
g_reported_dump_count.load()) { |
|
|
|
|
absl::SleepFor(absl::Milliseconds(200)); |
|
|
|
|
} |
|
|
|
|
grpc_core::Crash( |
|
|
|
|
"Pool did not quiesce in time, gRPC will not shut down cleanly."); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// -------- WorkStealingThreadPool::WorkStealingThreadPoolImpl::Lifeguard -----
|
|
|
|
|
|
|
|
|
|
WorkStealingThreadPool::WorkStealingThreadPoolImpl::Lifeguard::Lifeguard( |
|
|
|
@ -388,6 +481,14 @@ WorkStealingThreadPool::ThreadState::ThreadState( |
|
|
|
|
busy_count_idx_(pool_->busy_thread_count()->NextIndex()) {} |
|
|
|
|
|
|
|
|
|
void WorkStealingThreadPool::ThreadState::ThreadBody() { |
|
|
|
|
if (g_log_verbose_failures) { |
|
|
|
|
#ifdef GPR_POSIX_SYNC |
|
|
|
|
std::signal(kDumpStackSignal, DumpSignalHandler); |
|
|
|
|
#elif defined(GPR_WINDOWS) |
|
|
|
|
signal(kDumpStackSignal, DumpSignalHandler); |
|
|
|
|
#endif |
|
|
|
|
pool_->TrackThread(gpr_thd_currentid()); |
|
|
|
|
} |
|
|
|
|
g_local_queue = new BasicWorkQueue(pool_.get()); |
|
|
|
|
pool_->theft_registry()->Enroll(g_local_queue); |
|
|
|
|
ThreadLocal::SetIsEventEngineThread(true); |
|
|
|
@ -410,6 +511,9 @@ void WorkStealingThreadPool::ThreadState::ThreadBody() { |
|
|
|
|
GPR_ASSERT(g_local_queue->Empty()); |
|
|
|
|
pool_->theft_registry()->Unenroll(g_local_queue); |
|
|
|
|
delete g_local_queue; |
|
|
|
|
if (g_log_verbose_failures) { |
|
|
|
|
pool_->UntrackThread(gpr_thd_currentid()); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void WorkStealingThreadPool::ThreadState::SleepIfRunning() { |
|
|
|
|