[EventEngine] Improve lock contention in WorkStealingThreadPool (alternative) (#34065)

Proposed alternative to https://github.com/grpc/grpc/pull/34024. This version has a simpler, faster busy-count implementation based on a sharded set of atomic counts: fast increment/decrement operations, relatively slower summation of total counts (which need to happen much less frequently).
1 year ago · 108af0a94f
parent b85b57fdc7
commit 108af0a94f
21 changed files with 489 additions and 153 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2179,6 +2179,7 @@ add_library(grpc
  src/core/lib/event_engine/slice_buffer.cc
  src/core/lib/event_engine/tcp_socket_utils.cc
  src/core/lib/event_engine/thread_pool/original_thread_pool.cc
  src/core/lib/event_engine/thread_pool/thread_count.cc
  src/core/lib/event_engine/thread_pool/thread_pool_factory.cc
  src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc
  src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc
@ -2883,6 +2884,7 @@ add_library(grpc_unsecure
  src/core/lib/event_engine/slice_buffer.cc
  src/core/lib/event_engine/tcp_socket_utils.cc
  src/core/lib/event_engine/thread_pool/original_thread_pool.cc
  src/core/lib/event_engine/thread_pool/thread_count.cc
  src/core/lib/event_engine/thread_pool/thread_pool_factory.cc
  src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc
  src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc
@ -4779,6 +4781,7 @@ add_library(grpc_authorization_provider
  src/core/lib/event_engine/slice_buffer.cc
  src/core/lib/event_engine/tcp_socket_utils.cc
  src/core/lib/event_engine/thread_pool/original_thread_pool.cc
  src/core/lib/event_engine/thread_pool/thread_count.cc
  src/core/lib/event_engine/thread_pool/thread_pool_factory.cc
  src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc
  src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc
@ -12071,6 +12074,7 @@ add_executable(frame_test
  src/core/lib/event_engine/slice_buffer.cc
  src/core/lib/event_engine/tcp_socket_utils.cc
  src/core/lib/event_engine/thread_pool/original_thread_pool.cc
  src/core/lib/event_engine/thread_pool/thread_count.cc
  src/core/lib/event_engine/thread_pool/thread_pool_factory.cc
  src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc
  src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc
--- a/2
+++ b/2
@ -1462,6 +1462,7 @@ LIBGRPC_SRC = \
    src/core/lib/event_engine/slice_buffer.cc \
    src/core/lib/event_engine/tcp_socket_utils.cc \
    src/core/lib/event_engine/thread_pool/original_thread_pool.cc \
    src/core/lib/event_engine/thread_pool/thread_count.cc \
    src/core/lib/event_engine/thread_pool/thread_pool_factory.cc \
    src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc \
    src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc \
@ -2027,6 +2028,7 @@ LIBGRPC_UNSECURE_SRC = \
    src/core/lib/event_engine/slice_buffer.cc \
    src/core/lib/event_engine/tcp_socket_utils.cc \
    src/core/lib/event_engine/thread_pool/original_thread_pool.cc \
    src/core/lib/event_engine/thread_pool/thread_count.cc \
    src/core/lib/event_engine/thread_pool/thread_pool_factory.cc \
    src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc \
    src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc \
--- a/Package.swift
+++ b/Package.swift
@ -1137,6 +1137,8 @@ let package = Package(
        "src/core/lib/event_engine/thread_local.h",
        "src/core/lib/event_engine/thread_pool/original_thread_pool.cc",
        "src/core/lib/event_engine/thread_pool/original_thread_pool.h",
        "src/core/lib/event_engine/thread_pool/thread_count.cc",
        "src/core/lib/event_engine/thread_pool/thread_count.h",
        "src/core/lib/event_engine/thread_pool/thread_pool.h",
        "src/core/lib/event_engine/thread_pool/thread_pool_factory.cc",
        "src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc",
--- a/build_autogenerated.yaml
+++ b/build_autogenerated.yaml
@ -723,6 +723,7 @@ libs:
  - src/core/lib/event_engine/shim.h
  - src/core/lib/event_engine/tcp_socket_utils.h
  - src/core/lib/event_engine/thread_pool/original_thread_pool.h
  - src/core/lib/event_engine/thread_pool/thread_count.h
  - src/core/lib/event_engine/thread_pool/thread_pool.h
  - src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.h
  - src/core/lib/event_engine/thready_event_engine/thready_event_engine.h
@ -1532,6 +1533,7 @@ libs:
  - src/core/lib/event_engine/slice_buffer.cc
  - src/core/lib/event_engine/tcp_socket_utils.cc
  - src/core/lib/event_engine/thread_pool/original_thread_pool.cc
  - src/core/lib/event_engine/thread_pool/thread_count.cc
  - src/core/lib/event_engine/thread_pool/thread_pool_factory.cc
  - src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc
  - src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc
@ -2126,6 +2128,7 @@ libs:
  - src/core/lib/event_engine/shim.h
  - src/core/lib/event_engine/tcp_socket_utils.h
  - src/core/lib/event_engine/thread_pool/original_thread_pool.h
  - src/core/lib/event_engine/thread_pool/thread_count.h
  - src/core/lib/event_engine/thread_pool/thread_pool.h
  - src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.h
  - src/core/lib/event_engine/thready_event_engine/thready_event_engine.h
@ -2542,6 +2545,7 @@ libs:
  - src/core/lib/event_engine/slice_buffer.cc
  - src/core/lib/event_engine/tcp_socket_utils.cc
  - src/core/lib/event_engine/thread_pool/original_thread_pool.cc
  - src/core/lib/event_engine/thread_pool/thread_count.cc
  - src/core/lib/event_engine/thread_pool/thread_pool_factory.cc
  - src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc
  - src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc
@ -4088,6 +4092,7 @@ libs:
  - src/core/lib/event_engine/shim.h
  - src/core/lib/event_engine/tcp_socket_utils.h
  - src/core/lib/event_engine/thread_pool/original_thread_pool.h
  - src/core/lib/event_engine/thread_pool/thread_count.h
  - src/core/lib/event_engine/thread_pool/thread_pool.h
  - src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.h
  - src/core/lib/event_engine/thready_event_engine/thready_event_engine.h
@ -4384,6 +4389,7 @@ libs:
  - src/core/lib/event_engine/slice_buffer.cc
  - src/core/lib/event_engine/tcp_socket_utils.cc
  - src/core/lib/event_engine/thread_pool/original_thread_pool.cc
  - src/core/lib/event_engine/thread_pool/thread_count.cc
  - src/core/lib/event_engine/thread_pool/thread_pool_factory.cc
  - src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc
  - src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc
@ -8731,6 +8737,7 @@ targets:
  - src/core/lib/event_engine/shim.h
  - src/core/lib/event_engine/tcp_socket_utils.h
  - src/core/lib/event_engine/thread_pool/original_thread_pool.h
  - src/core/lib/event_engine/thread_pool/thread_count.h
  - src/core/lib/event_engine/thread_pool/thread_pool.h
  - src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.h
  - src/core/lib/event_engine/thready_event_engine/thready_event_engine.h
@ -9009,6 +9016,7 @@ targets:
  - src/core/lib/event_engine/slice_buffer.cc
  - src/core/lib/event_engine/tcp_socket_utils.cc
  - src/core/lib/event_engine/thread_pool/original_thread_pool.cc
  - src/core/lib/event_engine/thread_pool/thread_count.cc
  - src/core/lib/event_engine/thread_pool/thread_pool_factory.cc
  - src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc
  - src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc
--- a/config.m4
+++ b/config.m4
@ -553,6 +553,7 @@ if test "$PHP_GRPC" != "no"; then
    src/core/lib/event_engine/tcp_socket_utils.cc \
    src/core/lib/event_engine/thread_local.cc \
    src/core/lib/event_engine/thread_pool/original_thread_pool.cc \
    src/core/lib/event_engine/thread_pool/thread_count.cc \
    src/core/lib/event_engine/thread_pool/thread_pool_factory.cc \
    src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc \
    src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc \
--- a/config.w32
+++ b/config.w32
@ -518,6 +518,7 @@ if (PHP_GRPC != "no") {
    "src\\core\\lib\\event_engine\\tcp_socket_utils.cc " +
    "src\\core\\lib\\event_engine\\thread_local.cc " +
    "src\\core\\lib\\event_engine\\thread_pool\\original_thread_pool.cc " +
    "src\\core\\lib\\event_engine\\thread_pool\\thread_count.cc " +
    "src\\core\\lib\\event_engine\\thread_pool\\thread_pool_factory.cc " +
    "src\\core\\lib\\event_engine\\thread_pool\\work_stealing_thread_pool.cc " +
    "src\\core\\lib\\event_engine\\thready_event_engine\\thready_event_engine.cc " +
--- a/gRPC-C++.podspec
+++ b/gRPC-C++.podspec
@ -795,6 +795,7 @@ Pod::Spec.new do |s|
                      'src/core/lib/event_engine/tcp_socket_utils.h',
                      'src/core/lib/event_engine/thread_local.h',
                      'src/core/lib/event_engine/thread_pool/original_thread_pool.h',
                      'src/core/lib/event_engine/thread_pool/thread_count.h',
                      'src/core/lib/event_engine/thread_pool/thread_pool.h',
                      'src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.h',
                      'src/core/lib/event_engine/thready_event_engine/thready_event_engine.h',
@ -1850,6 +1851,7 @@ Pod::Spec.new do |s|
                              'src/core/lib/event_engine/tcp_socket_utils.h',
                              'src/core/lib/event_engine/thread_local.h',
                              'src/core/lib/event_engine/thread_pool/original_thread_pool.h',
                              'src/core/lib/event_engine/thread_pool/thread_count.h',
                              'src/core/lib/event_engine/thread_pool/thread_pool.h',
                              'src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.h',
                              'src/core/lib/event_engine/thready_event_engine/thready_event_engine.h',
--- a/gRPC-Core.podspec
+++ b/gRPC-Core.podspec
@ -1238,6 +1238,8 @@ Pod::Spec.new do |s|
                      'src/core/lib/event_engine/thread_local.h',
                      'src/core/lib/event_engine/thread_pool/original_thread_pool.cc',
                      'src/core/lib/event_engine/thread_pool/original_thread_pool.h',
                      'src/core/lib/event_engine/thread_pool/thread_count.cc',
                      'src/core/lib/event_engine/thread_pool/thread_count.h',
                      'src/core/lib/event_engine/thread_pool/thread_pool.h',
                      'src/core/lib/event_engine/thread_pool/thread_pool_factory.cc',
                      'src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc',
@ -2596,6 +2598,7 @@ Pod::Spec.new do |s|
                              'src/core/lib/event_engine/tcp_socket_utils.h',
                              'src/core/lib/event_engine/thread_local.h',
                              'src/core/lib/event_engine/thread_pool/original_thread_pool.h',
                              'src/core/lib/event_engine/thread_pool/thread_count.h',
                              'src/core/lib/event_engine/thread_pool/thread_pool.h',
                              'src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.h',
                              'src/core/lib/event_engine/thready_event_engine/thready_event_engine.h',
--- a/grpc.gemspec
+++ b/grpc.gemspec
@ -1143,6 +1143,8 @@ Gem::Specification.new do |s|
  s.files += %w( src/core/lib/event_engine/thread_local.h )
  s.files += %w( src/core/lib/event_engine/thread_pool/original_thread_pool.cc )
  s.files += %w( src/core/lib/event_engine/thread_pool/original_thread_pool.h )
  s.files += %w( src/core/lib/event_engine/thread_pool/thread_count.cc )
  s.files += %w( src/core/lib/event_engine/thread_pool/thread_count.h )
  s.files += %w( src/core/lib/event_engine/thread_pool/thread_pool.h )
  s.files += %w( src/core/lib/event_engine/thread_pool/thread_pool_factory.cc )
  s.files += %w( src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc )
--- a/grpc.gyp
+++ b/grpc.gyp
@ -778,6 +778,7 @@
        'src/core/lib/event_engine/slice_buffer.cc',
        'src/core/lib/event_engine/tcp_socket_utils.cc',
        'src/core/lib/event_engine/thread_pool/original_thread_pool.cc',
        'src/core/lib/event_engine/thread_pool/thread_count.cc',
        'src/core/lib/event_engine/thread_pool/thread_pool_factory.cc',
        'src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc',
        'src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc',
@ -1283,6 +1284,7 @@
        'src/core/lib/event_engine/slice_buffer.cc',
        'src/core/lib/event_engine/tcp_socket_utils.cc',
        'src/core/lib/event_engine/thread_pool/original_thread_pool.cc',
        'src/core/lib/event_engine/thread_pool/thread_count.cc',
        'src/core/lib/event_engine/thread_pool/thread_pool_factory.cc',
        'src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc',
        'src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc',
@ -2008,6 +2010,7 @@
        'src/core/lib/event_engine/slice_buffer.cc',
        'src/core/lib/event_engine/tcp_socket_utils.cc',
        'src/core/lib/event_engine/thread_pool/original_thread_pool.cc',
        'src/core/lib/event_engine/thread_pool/thread_count.cc',
        'src/core/lib/event_engine/thread_pool/thread_pool_factory.cc',
        'src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc',
        'src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc',
--- a/package.xml
+++ b/package.xml
@ -1125,6 +1125,8 @@
    <file baseinstalldir="/" name="src/core/lib/event_engine/thread_local.h" role="src" />
    <file baseinstalldir="/" name="src/core/lib/event_engine/thread_pool/original_thread_pool.cc" role="src" />
    <file baseinstalldir="/" name="src/core/lib/event_engine/thread_pool/original_thread_pool.h" role="src" />
    <file baseinstalldir="/" name="src/core/lib/event_engine/thread_pool/thread_count.cc" role="src" />
    <file baseinstalldir="/" name="src/core/lib/event_engine/thread_pool/thread_count.h" role="src" />
    <file baseinstalldir="/" name="src/core/lib/event_engine/thread_pool/thread_pool.h" role="src" />
    <file baseinstalldir="/" name="src/core/lib/event_engine/thread_pool/thread_pool_factory.cc" role="src" />
    <file baseinstalldir="/" name="src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc" role="src" />
--- a/src/core/BUILD
+++ b/src/core/BUILD
@ -1508,6 +1508,23 @@ grpc_cc_library(
    deps = ["//:gpr_platform"],
 )
 grpc_cc_library(
    name = "event_engine_thread_count",
    srcs = [
        "lib/event_engine/thread_pool/thread_count.cc",
    ],
    hdrs = ["lib/event_engine/thread_pool/thread_count.h"],
    external_deps = [
        "absl/base:core_headers",
        "absl/time",
    ],
    deps = [
        "time",
        "useful",
        "//:gpr",
    ],
 )
 grpc_cc_library(
    name = "event_engine_thread_pool",
    srcs = [
@ -1529,6 +1546,7 @@ grpc_cc_library(
    deps = [
        "common_event_engine_closures",
        "event_engine_basic_work_queue",
        "event_engine_thread_count",
        "event_engine_thread_local",
        "event_engine_trace",
        "event_engine_work_queue",
--- a/src/core/lib/event_engine/thread_pool/thread_count.cc
+++ b/src/core/lib/event_engine/thread_pool/thread_count.cc
@ -0,0 +1,58 @@
 // Copyright 2023 The gRPC Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <grpc/support/port_platform.h>
 #include "src/core/lib/event_engine/thread_pool/thread_count.h"
 #include <inttypes.h>
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include <grpc/support/log.h>
 namespace grpc_event_engine {
 namespace experimental {
 // -------- LivingThreadCount --------
 void LivingThreadCount::BlockUntilThreadCount(size_t desired_threads,
                                              const char* why) {
  constexpr grpc_core::Duration log_rate = grpc_core::Duration::Seconds(3);
  while (true) {
    auto curr_threads = WaitForCountChange(desired_threads, log_rate);
    if (curr_threads == desired_threads) break;
    GRPC_LOG_EVERY_N_SEC_DELAYED(
        log_rate.seconds(), GPR_DEBUG,
        "Waiting for thread pool to idle before %s. (%" PRIdPTR " to %" PRIdPTR
        ")",
        why, curr_threads, desired_threads);
  }
 }
 size_t LivingThreadCount::WaitForCountChange(size_t desired_threads,
                                             grpc_core::Duration timeout) {
  size_t count;
  auto deadline = absl::Now() + absl::Milliseconds(timeout.millis());
  do {
    grpc_core::MutexLock lock(&mu_);
    count = CountLocked();
    if (count == desired_threads) break;
    cv_.WaitWithDeadline(&mu_, deadline);
  } while (absl::Now() < deadline);
  return count;
 }
 }  // namespace experimental
 }  // namespace grpc_event_engine
--- a/src/core/lib/event_engine/thread_pool/thread_count.h
+++ b/src/core/lib/event_engine/thread_pool/thread_count.h
@ -0,0 +1,161 @@
 // Copyright 2023 The gRPC Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #ifndef GRPC_SRC_CORE_LIB_EVENT_ENGINE_THREAD_POOL_THREAD_COUNT_H
 #define GRPC_SRC_CORE_LIB_EVENT_ENGINE_THREAD_POOL_THREAD_COUNT_H
 #include <grpc/support/port_platform.h>
 #include <atomic>
 #include <cstddef>
 #include <numeric>
 #include <utility>
 #include <vector>
 #include "absl/base/thread_annotations.h"
 #include <grpc/support/cpu.h>
 #include "src/core/lib/gpr/useful.h"
 #include "src/core/lib/gprpp/sync.h"
 #include "src/core/lib/gprpp/time.h"
 namespace grpc_event_engine {
 namespace experimental {
 // Tracks counts across some fixed number of shards.
 // It is intended for fast increment/decrement operations, but a slower overall
 // count operation.
 class BusyThreadCount {
 public:
  // Increments a per-shard counter on construction, decrements on destruction.
  class AutoThreadCounter {
   public:
    AutoThreadCounter(BusyThreadCount* counter, size_t idx)
        : counter_(counter), idx_(idx) {
      counter_->Increment(idx_);
    }
    ~AutoThreadCounter() {
      if (counter_ != nullptr) counter_->Decrement(idx_);
    }
    // not copyable
    AutoThreadCounter(const AutoThreadCounter&) = delete;
    AutoThreadCounter& operator=(const AutoThreadCounter&) = delete;
    // moveable
    AutoThreadCounter(AutoThreadCounter&& other) noexcept {
      counter_ = std::exchange(other.counter_, nullptr);
      idx_ = other.idx_;
    }
    AutoThreadCounter& operator=(AutoThreadCounter&& other) noexcept {
      counter_ = std::exchange(other.counter_, nullptr);
      idx_ = other.idx_;
      return *this;
    }
   private:
    BusyThreadCount* counter_;
    size_t idx_;
  };
  BusyThreadCount() : shards_(grpc_core::Clamp(gpr_cpu_num_cores(), 2u, 64u)) {}
  AutoThreadCounter MakeAutoThreadCounter(size_t idx) {
    return AutoThreadCounter(this, idx);
  };
  void Increment(size_t idx) {
    shards_[idx].busy_count.fetch_add(1, std::memory_order_relaxed);
  }
  void Decrement(size_t idx) {
    shards_[idx].busy_count.fetch_sub(1, std::memory_order_relaxed);
  }
  size_t count() {
    return std::accumulate(
        shards_.begin(), shards_.end(), 0, [](size_t running, ShardedData& d) {
          return running + d.busy_count.load(std::memory_order_relaxed);
        });
  }
  // Returns some valid index into the per-shard data, which is rotated on every
  // call to distribute load and reduce contention.
  size_t NextIndex() { return next_idx_.fetch_add(1) % shards_.size(); }
 private:
  struct ShardedData {
    std::atomic<size_t> busy_count{0};
  } GPR_ALIGN_STRUCT(GPR_CACHELINE_SIZE);
  std::vector<ShardedData> shards_;
  std::atomic<size_t> next_idx_{0};
 };
 // Tracks the number of living threads. It is intended for a fast count
 // operation, with relatively slower increment/decrement operations.
 class LivingThreadCount {
 public:
  // Increments the global counter on construction, decrements on destruction.
  class AutoThreadCounter {
   public:
    explicit AutoThreadCounter(LivingThreadCount* counter) : counter_(counter) {
      counter_->Increment();
    }
    ~AutoThreadCounter() {
      if (counter_ != nullptr) counter_->Decrement();
    }
    // not copyable
    AutoThreadCounter(const AutoThreadCounter&) = delete;
    AutoThreadCounter& operator=(const AutoThreadCounter&) = delete;
    // moveable
    AutoThreadCounter(AutoThreadCounter&& other) noexcept {
      counter_ = std::exchange(other.counter_, nullptr);
    }
    AutoThreadCounter& operator=(AutoThreadCounter&& other) noexcept {
      counter_ = std::exchange(other.counter_, nullptr);
      return *this;
    }
   private:
    LivingThreadCount* counter_;
  };
  AutoThreadCounter MakeAutoThreadCounter() { return AutoThreadCounter(this); };
  void Increment() ABSL_LOCKS_EXCLUDED(mu_) {
    grpc_core::MutexLock lock(&mu_);
    ++living_count_;
    cv_.SignalAll();
  }
  void Decrement() ABSL_LOCKS_EXCLUDED(mu_) {
    grpc_core::MutexLock lock(&mu_);
    --living_count_;
    cv_.SignalAll();
  }
  void BlockUntilThreadCount(size_t desired_threads, const char* why)
      ABSL_LOCKS_EXCLUDED(mu_);
  size_t count() ABSL_LOCKS_EXCLUDED(mu_) {
    grpc_core::MutexLock lock(&mu_);
    return CountLocked();
  }
 private:
  size_t CountLocked() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
    return living_count_;
  }
  size_t WaitForCountChange(size_t desired_threads,
                            grpc_core::Duration timeout);
  grpc_core::Mutex mu_;
  grpc_core::CondVar cv_ ABSL_GUARDED_BY(mu_);
  size_t living_count_ ABSL_GUARDED_BY(mu_) = 0;
 };
 }  // namespace experimental
 }  // namespace grpc_event_engine
 #endif  // GRPC_SRC_CORE_LIB_EVENT_ENGINE_THREAD_POOL_THREAD_COUNT_H
--- a/src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc
+++ b/src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc
@ -209,9 +209,9 @@ void WorkStealingThreadPool::WorkStealingThreadPoolImpl::Quiesce() {
  // until all other threads have exited, so we need to wait for just one thread
  // running instead of zero.
  bool is_threadpool_thread = g_local_queue != nullptr;
-  thread_count()->BlockUntilThreadCount(CounterType::kLivingThreadCount,
+  work_signal()->SignalAll();
-                                        is_threadpool_thread ? 1 : 0,
+  living_thread_count_.BlockUntilThreadCount(is_threadpool_thread ? 1 : 0,
-                                        "shutting down", work_signal());
+                                             "shutting down");
  GPR_ASSERT(queue_.Empty());
  quiesced_.store(true, std::memory_order_relaxed);
  lifeguard_.BlockUntilShutdownAndReset();
@ -249,8 +249,8 @@ bool WorkStealingThreadPool::WorkStealingThreadPoolImpl::IsQuiesced() {
 void WorkStealingThreadPool::WorkStealingThreadPoolImpl::PrepareFork() {
  SetForking(true);
-  thread_count()->BlockUntilThreadCount(CounterType::kLivingThreadCount, 0,
+  work_signal_.SignalAll();
-                                        "forking", &work_signal_);
+  living_thread_count_.BlockUntilThreadCount(0, "forking");
  lifeguard_.BlockUntilShutdownAndReset();
 }
@ -329,12 +329,9 @@ void WorkStealingThreadPool::WorkStealingThreadPoolImpl::Lifeguard::
  // No new threads are started when forking.
  // No new work is done when forking needs to begin.
  if (pool_->forking_.load()) return;
-  int busy_thread_count =
+  const auto living_thread_count = pool_->living_thread_count()->count();
      pool_->thread_count_.GetCount(CounterType::kBusyCount);
  int living_thread_count =
      pool_->thread_count_.GetCount(CounterType::kLivingThreadCount);
  // Wake an idle worker thread if there's global work to be had.
-  if (busy_thread_count < living_thread_count) {
+  if (pool_->busy_thread_count()->count() < living_thread_count) {
    if (!pool_->queue_.Empty()) {
      pool_->work_signal()->Signal();
      backoff_.Reset();
@ -357,7 +354,8 @@ void WorkStealingThreadPool::WorkStealingThreadPoolImpl::Lifeguard::
  // queue, nor any work to steal. Add more sophisticated logic about when to
  // start a thread.
  GRPC_EVENT_ENGINE_TRACE(
-      "Starting new ThreadPool thread due to backlog (total threads: %d)",
+      "Starting new ThreadPool thread due to backlog (total threads: %" PRIuPTR
      ")",
      living_thread_count + 1);
  pool_->StartThread();
  // Tell the lifeguard to monitor the pool more closely.
@ -369,12 +367,13 @@ void WorkStealingThreadPool::WorkStealingThreadPoolImpl::Lifeguard::
 WorkStealingThreadPool::ThreadState::ThreadState(
    std::shared_ptr<WorkStealingThreadPoolImpl> pool)
    : pool_(std::move(pool)),
-      auto_thread_count_(pool_->thread_count(),
+      auto_thread_counter_(
-                         CounterType::kLivingThreadCount),
+          pool_->living_thread_count()->MakeAutoThreadCounter()),
      backoff_(grpc_core::BackOff::Options()
                   .set_initial_backoff(kWorkerThreadMinSleepBetweenChecks)
                   .set_max_backoff(kWorkerThreadMaxSleepBetweenChecks)
-                   .set_multiplier(1.3)) {}
+                   .set_multiplier(1.3)),
      busy_count_idx_(pool_->busy_thread_count()->NextIndex()) {}
 void WorkStealingThreadPool::ThreadState::ThreadBody() {
  g_local_queue = new BasicWorkQueue();
@ -412,8 +411,8 @@ bool WorkStealingThreadPool::ThreadState::Step() {
  auto* closure = g_local_queue->PopMostRecent();
  // If local work is available, run it.
  if (closure != nullptr) {
-    ThreadCount::AutoThreadCount auto_busy{pool_->thread_count(),
+    auto busy =
-                                           CounterType::kBusyCount};
+        pool_->busy_thread_count()->MakeAutoThreadCounter(busy_count_idx_);
    closure->Run();
    return true;
  }
@ -450,8 +449,7 @@ bool WorkStealingThreadPool::ThreadState::Step() {
    // Quit a thread if the pool has more than it requires, and this thread
    // has been idle long enough.
    if (timed_out &&
-        pool_->thread_count()->GetCount(CounterType::kLivingThreadCount) >
+        pool_->living_thread_count()->count() > pool_->reserve_threads() &&
            pool_->reserve_threads() &&
        grpc_core::Timestamp::Now() - start_time > kIdleThreadLimit) {
      return false;
    }
@ -462,8 +460,8 @@ bool WorkStealingThreadPool::ThreadState::Step() {
    return false;
  }
  if (closure != nullptr) {
-    ThreadCount::AutoThreadCount auto_busy{pool_->thread_count(),
+    auto busy =
-                                           CounterType::kBusyCount};
+        pool_->busy_thread_count()->MakeAutoThreadCounter(busy_count_idx_);
    closure->Run();
  }
  backoff_.Reset();
@ -472,8 +470,8 @@ bool WorkStealingThreadPool::ThreadState::Step() {
 void WorkStealingThreadPool::ThreadState::FinishDraining() {
  // The thread is definitionally busy while draining
-  ThreadCount::AutoThreadCount auto_busy{pool_->thread_count(),
+  auto busy =
-                                         CounterType::kBusyCount};
+      pool_->busy_thread_count()->MakeAutoThreadCounter(busy_count_idx_);
  // If a fork occurs at any point during shutdown, quit draining. The post-fork
  // threads will finish draining the global queue.
  while (!pool_->IsForking()) {
@ -495,72 +493,6 @@ void WorkStealingThreadPool::ThreadState::FinishDraining() {
  }
 }
 // -------- WorkStealingThreadPool::ThreadCount --------
 void WorkStealingThreadPool::ThreadCount::Add(CounterType counter_type) {
  grpc_core::MutexLock lock(&wait_mu_[counter_type]);
  ++thread_counts_[counter_type];
  wait_cv_[counter_type].SignalAll();
 }
 void WorkStealingThreadPool::ThreadCount::Remove(CounterType counter_type) {
  grpc_core::MutexLock lock(&wait_mu_[counter_type]);
  --thread_counts_[counter_type];
  wait_cv_[counter_type].SignalAll();
 }
 void WorkStealingThreadPool::ThreadCount::BlockUntilThreadCount(
    CounterType counter_type, size_t desired_threads, const char* why,
    WorkSignal* work_signal) {
  // Wait for all threads to exit.
  work_signal->SignalAll();
  while (true) {
    auto curr_threads = WaitForCountChange(
        counter_type, desired_threads,
        grpc_core::Duration::Seconds(kBlockingQuiesceLogRateSeconds));
    if (curr_threads == desired_threads) break;
    GRPC_LOG_EVERY_N_SEC_DELAYED(
        kBlockingQuiesceLogRateSeconds, GPR_DEBUG,
        "Waiting for thread pool to idle before %s. (%" PRIdPTR " to %" PRIdPTR
        ")",
        why, curr_threads, desired_threads);
  }
 }
 size_t WorkStealingThreadPool::ThreadCount::WaitForCountChange(
    CounterType counter_type, size_t desired_threads,
    grpc_core::Duration timeout) {
  size_t count;
  auto deadline = absl::Now() + absl::Milliseconds(timeout.millis());
  do {
    grpc_core::MutexLock lock(&wait_mu_[counter_type]);
    count = GetCountLocked(counter_type);
    if (count == desired_threads) break;
    wait_cv_[counter_type].WaitWithDeadline(&wait_mu_[counter_type], deadline);
  } while (absl::Now() < deadline);
  return count;
 }
 size_t WorkStealingThreadPool::ThreadCount::GetCount(CounterType counter_type) {
  grpc_core::MutexLock lock(&wait_mu_[counter_type]);
  return GetCountLocked(counter_type);
 }
 size_t WorkStealingThreadPool::ThreadCount::GetCountLocked(
    CounterType counter_type) {
  return thread_counts_[counter_type];
 }
 WorkStealingThreadPool::ThreadCount::AutoThreadCount::AutoThreadCount(
    ThreadCount* counter, CounterType counter_type)
    : counter_(counter), counter_type_(counter_type) {
  counter_->Add(counter_type_);
 }
 WorkStealingThreadPool::ThreadCount::AutoThreadCount::~AutoThreadCount() {
  counter_->Remove(counter_type_);
 }
 // -------- WorkStealingThreadPool::WorkSignal --------
 void WorkStealingThreadPool::WorkSignal::Signal() {
--- a/src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.h
+++ b/src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.h
@ -33,6 +33,7 @@
 #include <grpc/event_engine/event_engine.h>
 #include "src/core/lib/backoff/backoff.h"
 #include "src/core/lib/event_engine/thread_pool/thread_count.h"
 #include "src/core/lib/event_engine/thread_pool/thread_pool.h"
 #include "src/core/lib/event_engine/work_queue/basic_work_queue.h"
 #include "src/core/lib/event_engine/work_queue/work_queue.h"
@ -76,56 +77,6 @@ class WorkStealingThreadPool final : public ThreadPool {
    grpc_core::CondVar cv_ ABSL_GUARDED_BY(mu_);
  };
  // Types of thread counts.
  // Note this is intentionally not an enum class, the keys are used as indexes
  // into the ThreadCount's private array.
  enum CounterType {
    kLivingThreadCount = 0,
    kBusyCount,
  };
  class ThreadCount {
   public:
    // Adds 1 to the thread count for that counter type.
    void Add(CounterType counter_type)
        ABSL_LOCKS_EXCLUDED(wait_mu_[counter_type]);
    // Subtracts 1 from the thread count for that counter type.
    void Remove(CounterType counter_type)
        ABSL_LOCKS_EXCLUDED(wait_mu_[counter_type]);
    // Blocks until the thread count for that type reaches `desired_threads`.
    void BlockUntilThreadCount(CounterType counter_type, size_t desired_threads,
                               const char* why, WorkSignal* work_signal)
        ABSL_LOCKS_EXCLUDED(wait_mu_[counter_type]);
    // Returns the current thread count for the tracked type.
    size_t GetCount(CounterType counter_type)
        ABSL_LOCKS_EXCLUDED(wait_mu_[counter_type]);
    // Returns the current thread count for the tracked type.
    size_t GetCountLocked(CounterType counter_type)
        ABSL_EXCLUSIVE_LOCKS_REQUIRED(wait_mu_[counter_type]);
    // Adds and removes thread counts on construction and destruction
    class AutoThreadCount {
     public:
      AutoThreadCount(ThreadCount* counter, CounterType counter_type);
      ~AutoThreadCount();
     private:
      ThreadCount* counter_;
      CounterType counter_type_;
    };
   private:
    // Wait for the desired count to be reached.
    // Returns the current thread count either when the desired count is
    // reached, or when the deadline has passed, whichever happens first.
    size_t WaitForCountChange(CounterType counter_type, size_t desired_threads,
                              grpc_core::Duration timeout);
    grpc_core::Mutex wait_mu_[2];
    grpc_core::CondVar wait_cv_[2];
    size_t thread_counts_[2]{0, 0};
  };
  // A pool of WorkQueues that participate in work stealing.
  //
  // Every worker thread registers and unregisters its thread-local thread pool
@ -186,7 +137,8 @@ class WorkStealingThreadPool final : public ThreadPool {
    bool IsForking();
    bool IsQuiesced();
    size_t reserve_threads() { return reserve_threads_; }
-    ThreadCount* thread_count() { return &thread_count_; }
+    BusyThreadCount* busy_thread_count() { return &busy_thread_count_; }
    LivingThreadCount* living_thread_count() { return &living_thread_count_; }
    TheftRegistry* theft_registry() { return &theft_registry_; }
    WorkQueue* queue() { return &queue_; }
    WorkSignal* work_signal() { return &work_signal_; }
@ -221,7 +173,8 @@ class WorkStealingThreadPool final : public ThreadPool {
    };
    const size_t reserve_threads_;
-    ThreadCount thread_count_;
+    BusyThreadCount busy_thread_count_;
    LivingThreadCount living_thread_count_;
    TheftRegistry theft_registry_;
    BasicWorkQueue queue_;
    // Track shutdown and fork bits separately.
@ -254,11 +207,11 @@ class WorkStealingThreadPool final : public ThreadPool {
    // is decremented at time of destruction. This is necessary when this thread
    // state holds the last shared_ptr keeping the pool alive.
    std::shared_ptr<WorkStealingThreadPoolImpl> pool_;
-    // auto_thread_count_ must be the second member declared, so that the thread
+    // auto_thread_counter_ must be declared after pool_, so that the thread
-    // count is decremented after all other state is cleaned up (preventing
+    // count is decremented after all other pool state is cleaned up.
-    // leaks).
+    LivingThreadCount::AutoThreadCounter auto_thread_counter_;
    ThreadCount::AutoThreadCount auto_thread_count_;
    grpc_core::BackOff backoff_;
    size_t busy_count_idx_;
  };
  const std::shared_ptr<WorkStealingThreadPoolImpl> pool_;
--- a/src/python/grpcio/grpc_core_dependencies.py
+++ b/src/python/grpcio/grpc_core_dependencies.py
@ -527,6 +527,7 @@ CORE_SOURCE_FILES = [
    'src/core/lib/event_engine/tcp_socket_utils.cc',
    'src/core/lib/event_engine/thread_local.cc',
    'src/core/lib/event_engine/thread_pool/original_thread_pool.cc',
    'src/core/lib/event_engine/thread_pool/thread_count.cc',
    'src/core/lib/event_engine/thread_pool/thread_pool_factory.cc',
    'src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc',
    'src/core/lib/event_engine/thready_event_engine/thready_event_engine.cc',
--- a/test/core/event_engine/BUILD
+++ b/test/core/event_engine/BUILD
@ -60,6 +60,7 @@ grpc_cc_test(
    deps = [
        "//:gpr",
        "//:grpc",
        "//src/core:event_engine_thread_count",
        "//src/core:event_engine_thread_pool",
        "//src/core:notification",
        "//test/core/util:grpc_test_util_unsecure",
--- a/test/core/event_engine/thread_pool_test.cc
+++ b/test/core/event_engine/thread_pool_test.cc
@ -14,11 +14,13 @@
 #include "src/core/lib/event_engine/thread_pool/thread_pool.h"
 #include <algorithm>
 #include <atomic>
 #include <chrono>
 #include <cmath>
 #include <functional>
 #include <thread>
 #include <vector>
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
@ -27,6 +29,7 @@
 #include <grpc/grpc.h>
 #include "src/core/lib/event_engine/thread_pool/original_thread_pool.h"
 #include "src/core/lib/event_engine/thread_pool/thread_count.h"
 #include "src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.h"
 #include "src/core/lib/gprpp/notification.h"
 #include "src/core/lib/gprpp/thd.h"
@ -112,7 +115,7 @@ TYPED_TEST(ThreadPoolTest, ForkStressTest) {
    }
    runcount.fetch_add(1, std::memory_order_relaxed);
  };
-  for (int i = 0; i < expected_runcount; i++) {
+  for (auto i = 0; i < expected_runcount; i++) {
    pool.Run(inner_fn);
  }
  // simulate multiple forks at a fixed frequency
@ -141,7 +144,7 @@ TYPED_TEST(ThreadPoolTest, StartQuiesceRaceStressTest) {
    std::unique_ptr<TypeParam> pool;
    int i;
  };
-  for (int i = 0; i < iter_count; i++) {
+  for (auto i = 0; i < iter_count; i++) {
    ThdState state{std::make_unique<TypeParam>(8), i};
    state.pool->PrepareFork();
    grpc_core::Thread t1(
@ -209,7 +212,7 @@ TEST_F(WorkStealingThreadPoolTest, ScalesWhenBackloggedFromGlobalQueue) {
  // Ensures the pool is saturated before signaling closures to continue.
  std::atomic<int> waiters{0};
  std::atomic<bool> signaled{false};
-  for (int i = 0; i < pool_thread_count; i++) {
+  for (auto i = 0; i < pool_thread_count; i++) {
    p.Run([&]() {
      waiters.fetch_add(1);
      while (!signaled.load()) {
@ -232,7 +235,7 @@ TEST_F(WorkStealingThreadPoolTest, ScalesWhenBackloggedFromGlobalQueue) {
 // is fixed, or the implementation is deleted, make this a typed test again.
 TEST_F(WorkStealingThreadPoolTest,
       ScalesWhenBackloggedFromSingleThreadLocalQueue) {
-  int pool_thread_count = 8;
+  constexpr int pool_thread_count = 8;
  WorkStealingThreadPool p(pool_thread_count);
  grpc_core::Notification signal;
  // Ensures the pool is saturated before signaling closures to continue.
@ -262,18 +265,193 @@ TEST_F(WorkStealingThreadPoolTest,
 // pool, it takes around 50s to run. When that is fixed, or the implementation
 // is deleted, make this a typed test again.
 TEST_F(WorkStealingThreadPoolTest, QuiesceRaceStressTest) {
-  int cycle_count = 333;
+  constexpr int cycle_count = 333;
-  int thread_count = 8;
+  constexpr int thread_count = 8;
-  int run_count = thread_count * 2;
+  constexpr int run_count = thread_count * 2;
-  for (int i = 0; i < cycle_count; i++) {
+  for (auto i = 0; i < cycle_count; i++) {
    WorkStealingThreadPool p(thread_count);
-    for (int j = 0; j < run_count; j++) {
+    for (auto j = 0; j < run_count; j++) {
      p.Run([]() {});
    }
    p.Quiesce();
  }
 }
 class BusyThreadCountTest : public testing::Test {};
 TEST_F(BusyThreadCountTest, StressTest) {
  // Spawns a large number of threads to concurrently increments/decrement the
  // counters, and request count totals. Magic numbers were tuned for tests to
  // run in a reasonable amount of time.
  constexpr size_t thread_count = 300;
  constexpr int run_count = 1000;
  constexpr int increment_by = 50;
  BusyThreadCount busy_thread_count;
  grpc_core::Notification stop_counting;
  std::thread counter_thread([&]() {
    while (!stop_counting.HasBeenNotified()) {
      busy_thread_count.count();
    }
  });
  std::vector<std::thread> threads;
  threads.reserve(thread_count);
  for (size_t i = 0; i < thread_count; i++) {
    threads.emplace_back([&]() {
      for (int j = 0; j < run_count; j++) {
        // Get a new index for every iteration.
        // This is not the intended use, but further stress tests the NextIndex
        // function.
        auto thread_idx = busy_thread_count.NextIndex();
        for (int inc = 0; inc < increment_by; inc++) {
          busy_thread_count.Increment(thread_idx);
        }
        for (int inc = 0; inc < increment_by; inc++) {
          busy_thread_count.Decrement(thread_idx);
        }
      }
    });
  }
  for (auto& thd : threads) thd.join();
  stop_counting.Notify();
  counter_thread.join();
  ASSERT_EQ(busy_thread_count.count(), 0);
 }
 TEST_F(BusyThreadCountTest, AutoCountStressTest) {
  // Spawns a large number of threads to concurrently increments/decrement the
  // counters, and request count totals. Magic numbers were tuned for tests to
  // run in a reasonable amount of time.
  constexpr size_t thread_count = 150;
  constexpr int run_count = 1000;
  constexpr int increment_by = 30;
  BusyThreadCount busy_thread_count;
  grpc_core::Notification stop_counting;
  std::thread counter_thread([&]() {
    while (!stop_counting.HasBeenNotified()) {
      busy_thread_count.count();
    }
  });
  std::vector<std::thread> threads;
  threads.reserve(thread_count);
  for (size_t i = 0; i < thread_count; i++) {
    threads.emplace_back([&]() {
      for (int j = 0; j < run_count; j++) {
        std::vector<BusyThreadCount::AutoThreadCounter> auto_counters;
        auto_counters.reserve(increment_by);
        for (int ctr_count = 0; ctr_count < increment_by; ctr_count++) {
          auto_counters.push_back(busy_thread_count.MakeAutoThreadCounter(
              busy_thread_count.NextIndex()));
        }
      }
    });
  }
  for (auto& thd : threads) thd.join();
  stop_counting.Notify();
  counter_thread.join();
  ASSERT_EQ(busy_thread_count.count(), 0);
 }
 class LivingThreadCountTest : public testing::Test {};
 TEST_F(LivingThreadCountTest, StressTest) {
  // Spawns a large number of threads to concurrently increments/decrement the
  // counters, and request count totals. Magic numbers were tuned for tests to
  // run in a reasonable amount of time.
  constexpr size_t thread_count = 50;
  constexpr int run_count = 1000;
  constexpr int increment_by = 10;
  LivingThreadCount living_thread_count;
  grpc_core::Notification stop_counting;
  std::thread counter_thread([&]() {
    while (!stop_counting.HasBeenNotified()) {
      living_thread_count.count();
    }
  });
  std::vector<std::thread> threads;
  threads.reserve(thread_count);
  for (size_t i = 0; i < thread_count; i++) {
    threads.emplace_back([&]() {
      for (int j = 0; j < run_count; j++) {
        // Get a new index for every iteration.
        // This is not the intended use, but further stress tests the NextIndex
        // function.
        for (int inc = 0; inc < increment_by; inc++) {
          living_thread_count.Increment();
        }
        for (int inc = 0; inc < increment_by; inc++) {
          living_thread_count.Decrement();
        }
      }
    });
  }
  for (auto& thd : threads) thd.join();
  stop_counting.Notify();
  counter_thread.join();
  ASSERT_EQ(living_thread_count.count(), 0);
 }
 TEST_F(LivingThreadCountTest, AutoCountStressTest) {
  // Spawns a large number of threads to concurrently increments/decrement the
  // counters, and request count totals. Magic numbers were tuned for tests to
  // run in a reasonable amount of time.
  constexpr size_t thread_count = 50;
  constexpr int run_count = 1000;
  constexpr int increment_by = 10;
  LivingThreadCount living_thread_count;
  grpc_core::Notification stop_counting;
  std::thread counter_thread([&]() {
    while (!stop_counting.HasBeenNotified()) {
      living_thread_count.count();
    }
  });
  std::vector<std::thread> threads;
  threads.reserve(thread_count);
  for (size_t i = 0; i < thread_count; i++) {
    threads.emplace_back([&]() {
      for (int j = 0; j < run_count; j++) {
        std::vector<LivingThreadCount::AutoThreadCounter> auto_counters;
        auto_counters.reserve(increment_by);
        for (int ctr_count = 0; ctr_count < increment_by; ctr_count++) {
          auto_counters.push_back(living_thread_count.MakeAutoThreadCounter());
        }
      }
    });
  }
  for (auto& thd : threads) thd.join();
  stop_counting.Notify();
  counter_thread.join();
  ASSERT_EQ(living_thread_count.count(), 0);
 }
 TEST_F(LivingThreadCountTest, BlockUntilThreadCountTest) {
  constexpr size_t thread_count = 100;
  grpc_core::Notification waiting;
  LivingThreadCount living_thread_count;
  std::vector<std::thread> threads;
  threads.reserve(thread_count);
  // Start N living threads
  for (size_t i = 0; i < thread_count; i++) {
    threads.emplace_back([&]() {
      auto alive = living_thread_count.MakeAutoThreadCounter();
      waiting.WaitForNotification();
    });
  }
  // Join in a separate thread
  std::thread joiner([&]() {
    waiting.Notify();
    for (auto& thd : threads) thd.join();
  });
  {
    auto alive = living_thread_count.MakeAutoThreadCounter();
    living_thread_count.BlockUntilThreadCount(1,
                                              "block until 1 thread remains");
  }
  living_thread_count.BlockUntilThreadCount(0,
                                            "block until all threads are gone");
  joiner.join();
  ASSERT_EQ(living_thread_count.count(), 0);
 }
 }  // namespace experimental
 }  // namespace grpc_event_engine
--- a/tools/doxygen/Doxyfile.c++.internal
+++ b/tools/doxygen/Doxyfile.c++.internal
@ -2140,6 +2140,8 @@ src/core/lib/event_engine/thread_local.cc \
 src/core/lib/event_engine/thread_local.h \
 src/core/lib/event_engine/thread_pool/original_thread_pool.cc \
 src/core/lib/event_engine/thread_pool/original_thread_pool.h \
 src/core/lib/event_engine/thread_pool/thread_count.cc \
 src/core/lib/event_engine/thread_pool/thread_count.h \
 src/core/lib/event_engine/thread_pool/thread_pool.h \
 src/core/lib/event_engine/thread_pool/thread_pool_factory.cc \
 src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc \
--- a/tools/doxygen/Doxyfile.core.internal
+++ b/tools/doxygen/Doxyfile.core.internal
@ -1918,6 +1918,8 @@ src/core/lib/event_engine/thread_local.cc \
 src/core/lib/event_engine/thread_local.h \
 src/core/lib/event_engine/thread_pool/original_thread_pool.cc \
 src/core/lib/event_engine/thread_pool/original_thread_pool.h \
 src/core/lib/event_engine/thread_pool/thread_count.cc \
 src/core/lib/event_engine/thread_pool/thread_count.h \
 src/core/lib/event_engine/thread_pool/thread_pool.h \
 src/core/lib/event_engine/thread_pool/thread_pool_factory.cc \
 src/core/lib/event_engine/thread_pool/work_stealing_thread_pool.cc \