// Copyright 2022 The gRPC Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <grpc/support/port_platform.h>

#include <cmath>
#include <deque>
#include <sstream>

// ensure assert() is enabled
#undef NDEBUG
#include <cassert>

#include <benchmark/benchmark.h>
#include <grpc/support/log.h>

#include "src/core/lib/event_engine/common_closures.h"
#include "src/core/lib/event_engine/work_queue.h"
#include "src/core/lib/gprpp/crash.h"
#include "test/core/util/test_config.h"
namespace {
using ::grpc_event_engine::experimental::AnyInvocableClosure;
using ::grpc_event_engine::experimental::EventEngine;
using ::grpc_event_engine::experimental::WorkQueue;
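// State shared across benchmark threads. The containers are (re)allocated in
// GlobalSetup and released in GlobalTeardown; per-thread slots are indexed by
// benchmark::State::thread_index().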
grpc_core::Mutex globalMu;
std::vector<WorkQueue*>* globalWorkQueueList;
std::vector<std::deque<EventEngine::Closure*>*>* globalDequeList;
std::vector<grpc_core::Mutex>* globalDequeMutexList;
void GlobalSetup(const benchmark::State& state) {
  // called before every benchmark run; (re)creates all shared state
  globalWorkQueueList = new std::vector<WorkQueue*>();
  // resize (not just reserve) so that per-thread slots can be written with
  // operator[] below
  globalWorkQueueList->resize(state.threads());
  globalDequeList = new std::vector<std::deque<EventEngine::Closure*>*>();
  globalDequeList->resize(state.threads());
  globalDequeMutexList = new std::vector<grpc_core::Mutex>(state.threads());
}
void GlobalTeardown(const benchmark::State& /* state */) {
  // called after every benchmark run; frees all shared state
delete globalWorkQueueList;
delete globalDequeList;
delete globalDequeMutexList;
}
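// Single-threaded baseline: enqueue `element_count` pointers to one reusable
// closure, then PopFront until they have all been popped.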
void BM_WorkQueueIntptrPopFront(benchmark::State& state) {
WorkQueue queue;
grpc_event_engine::experimental::AnyInvocableClosure closure([] {});
int element_count = state.range(0);
for (auto _ : state) {
int cnt = 0;
for (int i = 0; i < element_count; i++) queue.Add(&closure);
absl::optional<EventEngine::Closure*> popped;
cnt = 0;
do {
popped = queue.PopFront();
if (popped.has_value()) ++cnt;
} while (cnt < element_count);
}
state.counters["Added"] = element_count * state.iterations();
state.counters["Popped"] = state.counters["Added"];
state.counters["Steal Rate"] =
benchmark::Counter(state.counters["Popped"], benchmark::Counter::kIsRate);
}
BENCHMARK(BM_WorkQueueIntptrPopFront)
->Setup(GlobalSetup)
->Teardown(GlobalTeardown)
->Range(1, 512)
->UseRealTime()
->MeasureProcessCPUTime();
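// All threads contend on a single shared WorkQueue (slot 0, created and
// destroyed by thread 0): each thread adds `element_count` closures and then
// repeatedly calls PopBack until its local pop count reaches `element_count`.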
void BM_MultithreadedWorkQueuePopBack(benchmark::State& state) {
if (state.thread_index() == 0) (*globalWorkQueueList)[0] = new WorkQueue();
AnyInvocableClosure closure([] {});
int element_count = state.range(0);
for (auto _ : state) {
int cnt = 0;
auto* queue = (*globalWorkQueueList)[0];
for (int i = 0; i < element_count; i++) queue->Add(&closure);
absl::optional<EventEngine::Closure*> popped;
cnt = 0;
do {
popped = queue->PopBack();
if (popped.has_value()) ++cnt;
} while (cnt < element_count);
}
state.counters["Added"] = element_count * state.iterations();
state.counters["Popped"] = state.counters["Added"];
state.counters["Steal Rate"] =
benchmark::Counter(state.counters["Popped"], benchmark::Counter::kIsRate);
if (state.thread_index() == 0) {
delete (*globalWorkQueueList)[0];
}
}
BENCHMARK(BM_MultithreadedWorkQueuePopBack)
->Setup(GlobalSetup)
->Teardown(GlobalTeardown)
->Range(1, 512)
->UseRealTime()
->MeasureProcessCPUTime()
->Threads(1)
->Threads(4)
->ThreadPerCpu();
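// Measures the full Add / PopFront / Run cycle with a single preallocated
// Closure* whose callback only bumps a counter.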
void BM_WorkQueueClosureExecution(benchmark::State& state) {
WorkQueue queue;
int element_count = state.range(0);
int run_count = 0;
grpc_event_engine::experimental::AnyInvocableClosure closure(
[&run_count] { ++run_count; });
for (auto _ : state) {
for (int i = 0; i < element_count; i++) queue.Add(&closure);
do {
queue.PopFront()->Run();
} while (run_count < element_count);
run_count = 0;
}
state.counters["Added"] = element_count * state.iterations();
state.counters["Popped"] = state.counters["Added"];
state.counters["Steal Rate"] =
benchmark::Counter(state.counters["Popped"], benchmark::Counter::kIsRate);
}
BENCHMARK(BM_WorkQueueClosureExecution)
->Range(8, 128)
->UseRealTime()
->MeasureProcessCPUTime();
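// Same Add / PopFront / Run cycle, but each Add is given a fresh lambda
// rather than a preallocated Closure*, so any per-callable setup done by the
// queue is included in the measurement.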
void BM_WorkQueueAnyInvocableExecution(benchmark::State& state) {
WorkQueue queue;
int element_count = state.range(0);
int run_count = 0;
for (auto _ : state) {
for (int i = 0; i < element_count; i++) {
queue.Add([&run_count] { ++run_count; });
}
do {
queue.PopFront()->Run();
} while (run_count < element_count);
run_count = 0;
}
state.counters["Added"] = element_count * state.iterations();
state.counters["Popped"] = state.counters["Added"];
state.counters["Steal Rate"] =
benchmark::Counter(state.counters["Popped"], benchmark::Counter::kIsRate);
}
BENCHMARK(BM_WorkQueueAnyInvocableExecution)
->Range(8, 128)
->UseRealTime()
->MeasureProcessCPUTime();
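// Baseline for the shared-queue case: all threads push to and pop (LIFO) from
// a single std::deque in slot 0, guarded by the mutex in slot 0.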
void BM_StdDequeLIFO(benchmark::State& state) {
if (state.thread_index() == 0) {
(*globalDequeList)[0] = new std::deque<EventEngine::Closure*>();
}
auto& mu = (*globalDequeMutexList)[0];
int element_count = state.range(0);
AnyInvocableClosure closure([] {});
for (auto _ : state) {
auto* queue = (*globalDequeList)[0];
for (int i = 0; i < element_count; i++) {
grpc_core::MutexLock lock(&mu);
queue->emplace_back(&closure);
}
for (int i = 0; i < element_count; i++) {
grpc_core::MutexLock lock(&mu);
EventEngine::Closure* popped = queue->back();
queue->pop_back();
assert(popped != nullptr);
}
}
state.counters["Added"] = element_count * state.iterations();
state.counters["Popped"] = state.counters["Added"];
state.counters["Steal Rate"] =
benchmark::Counter(state.counters["Popped"], benchmark::Counter::kIsRate);
if (state.thread_index() == 0) {
delete (*globalDequeList)[0];
}
}
BENCHMARK(BM_StdDequeLIFO)
->Setup(GlobalSetup)
->Teardown(GlobalTeardown)
->Range(1, 512)
->UseRealTime()
->MeasureProcessCPUTime()
->Threads(1)
->Threads(4)
->ThreadPerCpu();
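// Argument matrix shared by the per-thread benchmarks below: arg 0 is the
// number of pop attempts per iteration, arg 1 the percentage of that count
// that each thread enqueues into its own queue.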
void PerThreadArguments(benchmark::internal::Benchmark* b) {
b->Setup(GlobalSetup)
->Teardown(GlobalTeardown)
->ArgsProduct({/*pop_attempts=*/{10, 50, 250},
/*pct_fill=*/{2, 10, 50}})
->UseRealTime()
->MeasureProcessCPUTime()
->Threads(10)
->ThreadPerCpu();
}
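// Work-stealing-style scenario: every thread owns a WorkQueue, partially
// fills it each iteration, then makes `element_count` PopBack attempts while
// rotating round-robin over all threads' queues.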
void BM_WorkQueuePerThread(benchmark::State& state) {
WorkQueue local_queue;
{
grpc_core::MutexLock lock(&globalMu);
(*globalWorkQueueList)[state.thread_index()] = &local_queue;
}
AnyInvocableClosure closure([] {});
int element_count = state.range(0);
float pct_fill = state.range(1) / 100.0;
for (auto _ : state) {
// sparsely populate a queue
for (int i = 0; i < std::ceil(element_count * pct_fill); i++) {
local_queue.Add(&closure);
}
    // attempt to pop from all thread queues `element_count` times
    int pop_attempts = 0;
    auto iq = globalWorkQueueList->begin();
    while (pop_attempts++ < element_count) {
      // may not get a value if the queue is being drained by another thread
      (*iq)->PopBack();
      // advance round-robin, wrapping before the next dereference so that
      // end() is never dereferenced
      ++iq;
      if (iq == globalWorkQueueList->end()) iq = globalWorkQueueList->begin();
    }
}
state.counters["Added"] =
std::ceil(element_count * pct_fill) * state.iterations();
state.counters["Steal Attempts"] = element_count * state.iterations();
state.counters["Steal Rate"] = benchmark::Counter(
state.counters["Steal Attempts"], benchmark::Counter::kIsRate);
if (state.thread_index() == 0) {
for (auto* queue : *globalWorkQueueList) {
assert(queue->Empty());
}
}
}
BENCHMARK(BM_WorkQueuePerThread)->Apply(PerThreadArguments);
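// The same per-thread stealing pattern as above, but against mutex-guarded
// std::deques, for a direct comparison with BM_WorkQueuePerThread.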
void BM_StdDequePerThread(benchmark::State& state) {
std::deque<EventEngine::Closure*> local_queue;
(*globalDequeList)[state.thread_index()] = &local_queue;
int element_count = state.range(0);
float pct_fill = state.range(1) / 100.0;
AnyInvocableClosure closure([] {});
auto& local_mu = (*globalDequeMutexList)[state.thread_index()];
for (auto _ : state) {
// sparsely populate a queue
for (int i = 0; i < std::ceil(element_count * pct_fill); i++) {
grpc_core::MutexLock lock(&local_mu);
local_queue.emplace_back(&closure);
}
    // attempt to pop from all thread deques `element_count` times
    int pop_attempts = 0;
    auto iq = globalDequeList->begin();
    auto mu = globalDequeMutexList->begin();
    while (pop_attempts++ < element_count) {
      {
        grpc_core::MutexLock lock(&*mu);
        if (!(*iq)->empty()) {
          assert((*iq)->back() != nullptr);
          (*iq)->pop_back();
        }
      }
      // advance both iterators in lockstep, wrapping before the next
      // dereference so that end() is never dereferenced
      ++iq;
      ++mu;
      if (iq == globalDequeList->end()) {
        iq = globalDequeList->begin();
        mu = globalDequeMutexList->begin();
      }
    }
}
state.counters["Added"] =
std::ceil(element_count * pct_fill) * state.iterations();
state.counters["Steal Attempts"] = element_count * state.iterations();
state.counters["Steal Rate"] = benchmark::Counter(
state.counters["Steal Attempts"], benchmark::Counter::kIsRate);
if (state.thread_index() == 0) {
for (auto* queue : *globalDequeList) {
assert(queue->empty());
}
}
}
BENCHMARK(BM_StdDequePerThread)->Apply(PerThreadArguments);
} // namespace
// Some distros have RunSpecifiedBenchmarks under the benchmark namespace,
// and others do not. This allows us to support both modes.
namespace benchmark {
void RunTheBenchmarksNamespaced() { RunSpecifiedBenchmarks(); }
} // namespace benchmark
int main(int argc, char** argv) {
grpc::testing::TestEnvironment env(&argc, argv);
::benchmark::Initialize(&argc, argv);
benchmark::RunTheBenchmarksNamespaced();
return 0;
}