abseil-cpp/absl/synchronization/mutex_benchmark.cc

// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstdint>
#include <mutex>  // NOLINT(build/c++11)
#include <vector>

#include "absl/base/config.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/spinlock.h"
#include "absl/synchronization/blocking_counter.h"
#include "absl/synchronization/internal/thread_pool.h"
#include "absl/synchronization/mutex.h"
#include "benchmark/benchmark.h"

namespace {

// Baseline benchmark: the cost of acquiring and releasing a single absl::Mutex
// through the scoped absl::MutexLock guard.
void BM_Mutex(benchmark::State& state) {
  // Allocated once on first use and never deleted; every thread running this
  // benchmark locks this same instance.
  static absl::Mutex* const shared_mu = new absl::Mutex;
  for (auto _ : state) {
    // The guard is destroyed at the end of each iteration, so one iteration is
    // exactly one lock/unlock pair.
    absl::MutexLock guard(shared_mu);
  }
}
BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();

static void DelayNs(int64_t ns, int* data) {
  int64_t end = absl::base_internal::CycleClock::Now() +
                ns * absl::base_internal::CycleClock::Frequency() / 1e9;
  while (absl::base_internal::CycleClock::Now() < end) {
    ++(*data);
    benchmark::DoNotOptimize(*data);
  }
}

// Scoped lock guard used by the contended benchmarks: locks `*mu` in the
// constructor and unlocks it in the destructor. `mu` must be non-null and must
// outlive the guard.
//
// The primary template is for mutex types that spell the operations
// Lock()/Unlock() (instantiated in this file with absl::Mutex and
// absl::base_internal::SpinLock).
//
// Copying is disabled: a copy would unlock the same mutex a second time when
// it is destroyed.
template <typename MutexType>
class RaiiLocker {
 public:
  explicit RaiiLocker(MutexType* mu) : mu_(mu) { mu_->Lock(); }
  ~RaiiLocker() { mu_->Unlock(); }

  RaiiLocker(const RaiiLocker&) = delete;
  RaiiLocker& operator=(const RaiiLocker&) = delete;

 private:
  MutexType* mu_;  // Not owned.
};

// Specialization for std::mutex, which spells the same operations
// lock()/unlock(). Non-copyable for the same reason as the primary template.
template <>
class RaiiLocker<std::mutex> {
 public:
  explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); }
  ~RaiiLocker() { mu_->unlock(); }

  RaiiLocker(const RaiiLocker&) = delete;
  RaiiLocker& operator=(const RaiiLocker&) = delete;

 private:
  std::mutex* mu_;  // Not owned.
};

// Benchmarks `MutexType` when every benchmark thread competes for one shared
// lock. state.range(0) is the number of nanoseconds of simulated work done
// while the lock is held.
template <typename MutexType>
void BM_Contended(benchmark::State& state) {
  // The lock under test together with the counter that is bumped while the
  // lock is held.
  struct LockedCounter {
    MutexType mu;
    int data = 0;
  };
  // Allocated once per MutexType on first use and never deleted.
  static auto* contended = new LockedCounter;
  int private_counter = 0;
  for (auto _ : state) {
    // Each iteration models some local work outside of the critical section
    // followed by some work inside of it. The goal is a more or less realistic
    // contention level:
    //  - If contention is too low, the benchmark won't measure anything
    //    useful.
    //  - If contention is unrealistically high, the benchmark will favor bad
    //    mutex implementations that block and otherwise distract threads from
    //    the mutex and shared state for as much as possible.
    // The amount of local work is therefore multiplied by the number of
    // threads, which keeps the ratio between local work and critical section
    // approximately equal regardless of the number of threads.
    const int64_t outside_ns = 100 * state.threads;
    DelayNs(outside_ns, &private_counter);

    RaiiLocker<MutexType> locker(&contended->mu);
    const int64_t inside_ns = state.range(0);
    DelayNs(inside_ns, &contended->data);
  }
}

BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
    ->UseRealTime()
    // ThreadPerCpu poorly handles non-power-of-two CPU counts.
    ->Threads(1)
    ->Threads(2)
    ->Threads(4)
    ->Threads(6)
    ->Threads(8)
    ->Threads(12)
    ->Threads(16)
    ->Threads(24)
    ->Threads(32)
    ->Threads(48)
    ->Threads(64)
    ->Threads(96)
    ->Threads(128)
    ->Threads(192)
    ->Threads(256)
    // Some empirically chosen amounts of work in critical section.
    // 1 is low contention, 200 is high contention and few values in between.
    ->Arg(1)
    ->Arg(20)
    ->Arg(50)
    ->Arg(200);

BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
    ->UseRealTime()
    // ThreadPerCpu poorly handles non-power-of-two CPU counts.
    ->Threads(1)
    ->Threads(2)
    ->Threads(4)
    ->Threads(6)
    ->Threads(8)
    ->Threads(12)
    ->Threads(16)
    ->Threads(24)
    ->Threads(32)
    ->Threads(48)
    ->Threads(64)
    ->Threads(96)
    ->Threads(128)
    ->Threads(192)
    ->Threads(256)
    // Some empirically chosen amounts of work in critical section.
    // 1 is low contention, 200 is high contention and few values in between.
    ->Arg(1)
    ->Arg(20)
    ->Arg(50)
    ->Arg(200);

BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
    ->UseRealTime()
    // ThreadPerCpu poorly handles non-power-of-two CPU counts.
    ->Threads(1)
    ->Threads(2)
    ->Threads(4)
    ->Threads(6)
    ->Threads(8)
    ->Threads(12)
    ->Threads(16)
    ->Threads(24)
    ->Threads(32)
    ->Threads(48)
    ->Threads(64)
    ->Threads(96)
    ->Threads(128)
    ->Threads(192)
    ->Threads(256)
    // Some empirically chosen amounts of work in critical section.
    // 1 is low contention, 200 is high contention and few values in between.
    ->Arg(1)
    ->Arg(20)
    ->Arg(50)
    ->Arg(200);

// Measures what Condition evaluation adds to the cost of releasing a Mutex
// while other threads are waiting on it with Conditions that must be
// evaluated.  Mutex has (some) support for equivalence classes, which allows
// Conditions with the same function/argument to potentially not be evaluated
// more than once.
//
// state.range(0) is the number of equivalence classes, where 0 is the special
// case of every waiter being distinct; state.range(1) is the number of
// waiters.
void BM_ConditionWaiters(benchmark::State& state) {
  int num_classes = state.range(0);
  int num_waiters = state.range(1);

  struct Helper {
    // Condition predicate: true once the flag has been cleared.
    static bool IsCleared(int* flag) { return *flag == 0; }

    // Work item run for each waiter: announce that it has started, then block
    // until `*p` becomes zero and release the lock again right away.
    static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {
      init->DecrementCount();
      m->LockWhen(absl::Condition(&Helper::IsCleared, p));
      m->Unlock();
    }
  };

  // No equivalence classes requested: every waiter gets a class of its own.
  if (num_classes == 0) num_classes = num_waiters;

  absl::BlockingCounter init(num_waiters);
  absl::Mutex mu;
  // One flag per equivalence class; waiters stay blocked while it is non-zero.
  std::vector<int> equivalence_classes(num_classes, 1);

  // Must be declared last to be destroyed first (presumably so the pool is
  // torn down before the objects its work items point at).
  absl::synchronization_internal::ThreadPool pool(num_waiters);

  for (int waiter = 0; waiter < num_waiters; ++waiter) {
    // Mutex considers Conditions with the same function and argument
    // to be equivalent, so waiters sharing a flag share a class.
    int* const flag = &equivalence_classes[waiter % num_classes];
    pool.Schedule([&init, &mu, flag] { Helper::Waiter(&init, &mu, flag); });
  }
  // Returns once every waiter has called DecrementCount().
  init.Wait();

  for (auto _ : state) {
    mu.Lock();
    mu.Unlock();  // Each unlock requires Condition evaluation for our waiters.
  }

  // Clear every flag under the lock so that the waiters' Conditions become
  // true.
  mu.Lock();
  for (int& flag : equivalence_classes) {
    flag = 0;
  }
  mu.Unlock();
}

// Largest number of waiters requested when registering BM_ConditionWaiters.
// Some configurations have higher thread limits than others: anything that is
// not Linux, and any build that defines ABSL_HAVE_THREAD_SANITIZER, gets the
// smaller bound.
#if !defined(__linux__) || defined(ABSL_HAVE_THREAD_SANITIZER)
constexpr int kMaxConditionWaiters = 1024;
#else
constexpr int kMaxConditionWaiters = 8192;
#endif
// Sweeps state.range(0) (num_classes) over [0, 2] and state.range(1)
// (num_waiters) over [1, kMaxConditionWaiters].
BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);

}  // namespace
Export of internal Abseil changes -- f012012ef78234a6a4585321b67d7b7c92ebc266 by Laramie Leavitt <lar@google.com>: Slight restructuring of absl/random/internal randen implementation. Convert round-keys.inc into randen_round_keys.cc file. Consistently use a 128-bit pointer type for internal method parameters. This allows simpler pointer arithmetic in C++ & permits removal of some constants and casts. Remove some redundancy in comments & constexpr variables. Specifically, all references to Randen algorithm parameters use RandenTraits; duplication in RandenSlow removed. PiperOrigin-RevId: 312190313 -- dc8b42e054046741e9ed65335bfdface997c6063 by Abseil Team <absl-team@google.com>: Internal change. PiperOrigin-RevId: 312167304 -- f13d248fafaf206492c1362c3574031aea3abaf7 by Matthew Brown <matthewbr@google.com>: Cleanup StrFormat extensions a little. PiperOrigin-RevId: 312166336 -- 9d9117589667afe2332bb7ad42bc967ca7c54502 by Derek Mauro <dmauro@google.com>: Internal change PiperOrigin-RevId: 312105213 -- 9a12b9b3aa0e59b8ee6cf9408ed0029045543a9b by Abseil Team <absl-team@google.com>: Complete IGNORE_TYPE macro renaming. PiperOrigin-RevId: 311999699 -- 64756f20d61021d999bd0d4c15e9ad3857382f57 by Gennadiy Rozental <rogeeff@google.com>: Switch to fixed bytes specific default value. This fixes the Abseil Flags for big endian platforms. PiperOrigin-RevId: 311844448 -- bdbe6b5b29791dbc3816ada1828458b3010ff1e9 by Laramie Leavitt <lar@google.com>: Change many distribution tests to use pcg_engine as a deterministic source of entropy. It's reasonable to test that the BitGen itself has good entropy, however when testing the cross product of all random distributions x all the architecture variations x all submitted changes results in a large number of tests. In order to account for these failures while still using good entropy requires that our allowed sigma need to account for all of these independent tests. 
Our current sigma values are too restrictive, and we see a lot of failures, so we have to either relax the sigma values or convert some of the statistical tests to use deterministic values. This changelist does the latter. PiperOrigin-RevId: 311840096 GitOrigin-RevId: f012012ef78234a6a4585321b67d7b7c92ebc266 Change-Id: Ic84886f38ff30d7d72c126e9b63c9a61eb729a1a 5 years ago			`// Copyright 2017 The Abseil Authors.`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// https://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`

			`#include <cstdint>`
			`#include <mutex> // NOLINT(build/c++11)`
			`#include <vector>`

Export of internal Abseil changes -- 587e6db882749fa7faa12815e614afab04d218b9 by Derek Mauro <dmauro@google.com>: Use attribute detection for other sanitizer related attributes PiperOrigin-RevId: 324077073 -- 3ee55e4935b4235516b1fcac3c55945e510f7afc by Evan Brown <ezb@google.com>: Simplify CordRepExternal allocation/deallocation. I think this can save some memory when `Releaser` is empty and when on platforms where alignof(CordRepExternal) < (default `::operator new` alignment). We no longer need the API requirement that alignof(Releaser) <= (default `::operator new` alignment). Also remove another static_assert from a TODO in cord_internal.h and fix some warnings about calling std::move on a forwarding reference. PiperOrigin-RevId: 324053720 -- 9fc78436565eb3b204d4aa425ee3773354392f45 by Derek Mauro <dmauro@google.com>: Use auto-detected sanitizer attributes for ASAN, MSAN, and TSAN builds PiperOrigin-RevId: 323831461 GitOrigin-RevId: 587e6db882749fa7faa12815e614afab04d218b9 Change-Id: Ie0e4a2846d7f66988a2d81a5e50721b62fdb3d6d 4 years ago			`#include "absl/base/config.h"`
Export of internal Abseil changes -- f012012ef78234a6a4585321b67d7b7c92ebc266 by Laramie Leavitt <lar@google.com>: Slight restructuring of absl/random/internal randen implementation. Convert round-keys.inc into randen_round_keys.cc file. Consistently use a 128-bit pointer type for internal method parameters. This allows simpler pointer arithmetic in C++ & permits removal of some constants and casts. Remove some redundancy in comments & constexpr variables. Specifically, all references to Randen algorithm parameters use RandenTraits; duplication in RandenSlow removed. PiperOrigin-RevId: 312190313 -- dc8b42e054046741e9ed65335bfdface997c6063 by Abseil Team <absl-team@google.com>: Internal change. PiperOrigin-RevId: 312167304 -- f13d248fafaf206492c1362c3574031aea3abaf7 by Matthew Brown <matthewbr@google.com>: Cleanup StrFormat extensions a little. PiperOrigin-RevId: 312166336 -- 9d9117589667afe2332bb7ad42bc967ca7c54502 by Derek Mauro <dmauro@google.com>: Internal change PiperOrigin-RevId: 312105213 -- 9a12b9b3aa0e59b8ee6cf9408ed0029045543a9b by Abseil Team <absl-team@google.com>: Complete IGNORE_TYPE macro renaming. PiperOrigin-RevId: 311999699 -- 64756f20d61021d999bd0d4c15e9ad3857382f57 by Gennadiy Rozental <rogeeff@google.com>: Switch to fixed bytes specific default value. This fixes the Abseil Flags for big endian platforms. PiperOrigin-RevId: 311844448 -- bdbe6b5b29791dbc3816ada1828458b3010ff1e9 by Laramie Leavitt <lar@google.com>: Change many distribution tests to use pcg_engine as a deterministic source of entropy. It's reasonable to test that the BitGen itself has good entropy, however when testing the cross product of all random distributions x all the architecture variations x all submitted changes results in a large number of tests. In order to account for these failures while still using good entropy requires that our allowed sigma need to account for all of these independent tests. 
Our current sigma values are too restrictive, and we see a lot of failures, so we have to either relax the sigma values or convert some of the statistical tests to use deterministic values. This changelist does the latter. PiperOrigin-RevId: 311840096 GitOrigin-RevId: f012012ef78234a6a4585321b67d7b7c92ebc266 Change-Id: Ic84886f38ff30d7d72c126e9b63c9a61eb729a1a 5 years ago			`#include "absl/base/internal/cycleclock.h"`
			`#include "absl/base/internal/spinlock.h"`
			`#include "absl/synchronization/blocking_counter.h"`
			`#include "absl/synchronization/internal/thread_pool.h"`
			`#include "absl/synchronization/mutex.h"`
			`#include "benchmark/benchmark.h"`

			`namespace {`

			`void BM_Mutex(benchmark::State& state) {`
			`static absl::Mutex* mu = new absl::Mutex;`
			`for (auto _ : state) {`
			`absl::MutexLock lock(mu);`
			`}`
			`}`
			`BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();`

			`static void DelayNs(int64_t ns, int* data) {`
			`int64_t end = absl::base_internal::CycleClock::Now() +`
			`ns * absl::base_internal::CycleClock::Frequency() / 1e9;`
			`while (absl::base_internal::CycleClock::Now() < end) {`
			`++(*data);`
			`benchmark::DoNotOptimize(*data);`
			`}`
			`}`

			`template <typename MutexType>`
			`class RaiiLocker {`
			`public:`
			`explicit RaiiLocker(MutexType* mu) : mu_(mu) { mu_->Lock(); }`
			`~RaiiLocker() { mu_->Unlock(); }`
			`private:`
			`MutexType* mu_;`
			`};`

			`template <>`
			`class RaiiLocker<std::mutex> {`
			`public:`
			`explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); }`
			`~RaiiLocker() { mu_->unlock(); }`
			`private:`
			`std::mutex* mu_;`
			`};`

			`template <typename MutexType>`
			`void BM_Contended(benchmark::State& state) {`
			`struct Shared {`
			`MutexType mu;`
			`int data = 0;`
			`};`
			`static auto* shared = new Shared;`
			`int local = 0;`
			`for (auto _ : state) {`
			`// Here we model both local work outside of the critical section as well as`
			`// some work inside of the critical section. The idea is to capture some`
			`// more or less realistic contention levels.`
			`// If contention is too low, the benchmark won't measure anything useful.`
			`// If contention is unrealistically high, the benchmark will favor`
			`// bad mutex implementations that block and otherwise distract threads`
			`// from the mutex and shared state for as much as possible.`
			`// To achieve this amount of local work is multiplied by number of threads`
			`// to keep ratio between local work and critical section approximately`
			`// equal regardless of number of threads.`
			`DelayNs(100 * state.threads, &local);`
			`RaiiLocker<MutexType> locker(&shared->mu);`
			`DelayNs(state.range(0), &shared->data);`
			`}`
			`}`

			`BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)`
			`->UseRealTime()`
			`// ThreadPerCpu poorly handles non-power-of-two CPU counts.`
			`->Threads(1)`
			`->Threads(2)`
			`->Threads(4)`
			`->Threads(6)`
			`->Threads(8)`
			`->Threads(12)`
			`->Threads(16)`
			`->Threads(24)`
			`->Threads(32)`
			`->Threads(48)`
			`->Threads(64)`
			`->Threads(96)`
			`->Threads(128)`
			`->Threads(192)`
			`->Threads(256)`
			`// Some empirically chosen amounts of work in critical section.`
			`// 1 is low contention, 200 is high contention and few values in between.`
			`->Arg(1)`
			`->Arg(20)`
			`->Arg(50)`
			`->Arg(200);`

			`BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)`
			`->UseRealTime()`
			`// ThreadPerCpu poorly handles non-power-of-two CPU counts.`
			`->Threads(1)`
			`->Threads(2)`
			`->Threads(4)`
			`->Threads(6)`
			`->Threads(8)`
			`->Threads(12)`
			`->Threads(16)`
			`->Threads(24)`
			`->Threads(32)`
			`->Threads(48)`
			`->Threads(64)`
			`->Threads(96)`
			`->Threads(128)`
			`->Threads(192)`
			`->Threads(256)`
			`// Some empirically chosen amounts of work in critical section.`
			`// 1 is low contention, 200 is high contention and few values in between.`
			`->Arg(1)`
			`->Arg(20)`
			`->Arg(50)`
			`->Arg(200);`

			`BENCHMARK_TEMPLATE(BM_Contended, std::mutex)`
			`->UseRealTime()`
			`// ThreadPerCpu poorly handles non-power-of-two CPU counts.`
			`->Threads(1)`
			`->Threads(2)`
			`->Threads(4)`
			`->Threads(6)`
			`->Threads(8)`
			`->Threads(12)`
			`->Threads(16)`
			`->Threads(24)`
			`->Threads(32)`
			`->Threads(48)`
			`->Threads(64)`
			`->Threads(96)`
			`->Threads(128)`
			`->Threads(192)`
			`->Threads(256)`
			`// Some empirically chosen amounts of work in critical section.`
			`// 1 is low contention, 200 is high contention and few values in between.`
			`->Arg(1)`
			`->Arg(20)`
			`->Arg(50)`
			`->Arg(200);`

			`// Measure the overhead of conditions on mutex release (when they must be`
			`// evaluated). Mutex has (some) support for equivalence classes allowing`
			`// Conditions with the same function/argument to potentially not be multiply`
			`// evaluated.`
			`//`
			`// num_classes==0 is used for the special case of every waiter being distinct.`
			`void BM_ConditionWaiters(benchmark::State& state) {`
			`int num_classes = state.range(0);`
			`int num_waiters = state.range(1);`

			`struct Helper {`
			`static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {`
			`init->DecrementCount();`
			`m->LockWhen(absl::Condition(`
			`static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p));`
			`m->Unlock();`
			`}`
			`};`

			`if (num_classes == 0) {`
			`// No equivalence classes.`
			`num_classes = num_waiters;`
			`}`

			`absl::BlockingCounter init(num_waiters);`
			`absl::Mutex mu;`
			`std::vector<int> equivalence_classes(num_classes, 1);`

			`// Must be declared last to be destroyed first.`
			`absl::synchronization_internal::ThreadPool pool(num_waiters);`

			`for (int i = 0; i < num_waiters; i++) {`
			`// Mutex considers Conditions with the same function and argument`
			`// to be equivalent.`
			`pool.Schedule([&, i] {`
			`Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]);`
			`});`
			`}`
			`init.Wait();`

			`for (auto _ : state) {`
			`mu.Lock();`
			`mu.Unlock(); // Each unlock requires Condition evaluation for our waiters.`
			`}`

			`mu.Lock();`
			`for (int i = 0; i < num_classes; i++) {`
			`equivalence_classes[i] = 0;`
			`}`
			`mu.Unlock();`
			`}`

			`// Some configurations have higher thread limits than others.`
Export of internal Abseil changes -- 587e6db882749fa7faa12815e614afab04d218b9 by Derek Mauro <dmauro@google.com>: Use attribute detection for other sanitizer related attributes PiperOrigin-RevId: 324077073 -- 3ee55e4935b4235516b1fcac3c55945e510f7afc by Evan Brown <ezb@google.com>: Simplify CordRepExternal allocation/deallocation. I think this can save some memory when `Releaser` is empty and when on platforms where alignof(CordRepExternal) < (default `::operator new` alignment). We no longer need the API requirement that alignof(Releaser) <= (default `::operator new` alignment). Also remove another static_assert from a TODO in cord_internal.h and fix some warnings about calling std::move on a forwarding reference. PiperOrigin-RevId: 324053720 -- 9fc78436565eb3b204d4aa425ee3773354392f45 by Derek Mauro <dmauro@google.com>: Use auto-detected sanitizer attributes for ASAN, MSAN, and TSAN builds PiperOrigin-RevId: 323831461 GitOrigin-RevId: 587e6db882749fa7faa12815e614afab04d218b9 Change-Id: Ie0e4a2846d7f66988a2d81a5e50721b62fdb3d6d 4 years ago			`#if defined(__linux__) && !defined(ABSL_HAVE_THREAD_SANITIZER)`
Export of internal Abseil changes -- f012012ef78234a6a4585321b67d7b7c92ebc266 by Laramie Leavitt <lar@google.com>: Slight restructuring of absl/random/internal randen implementation. Convert round-keys.inc into randen_round_keys.cc file. Consistently use a 128-bit pointer type for internal method parameters. This allows simpler pointer arithmetic in C++ & permits removal of some constants and casts. Remove some redundancy in comments & constexpr variables. Specifically, all references to Randen algorithm parameters use RandenTraits; duplication in RandenSlow removed. PiperOrigin-RevId: 312190313 -- dc8b42e054046741e9ed65335bfdface997c6063 by Abseil Team <absl-team@google.com>: Internal change. PiperOrigin-RevId: 312167304 -- f13d248fafaf206492c1362c3574031aea3abaf7 by Matthew Brown <matthewbr@google.com>: Cleanup StrFormat extensions a little. PiperOrigin-RevId: 312166336 -- 9d9117589667afe2332bb7ad42bc967ca7c54502 by Derek Mauro <dmauro@google.com>: Internal change PiperOrigin-RevId: 312105213 -- 9a12b9b3aa0e59b8ee6cf9408ed0029045543a9b by Abseil Team <absl-team@google.com>: Complete IGNORE_TYPE macro renaming. PiperOrigin-RevId: 311999699 -- 64756f20d61021d999bd0d4c15e9ad3857382f57 by Gennadiy Rozental <rogeeff@google.com>: Switch to fixed bytes specific default value. This fixes the Abseil Flags for big endian platforms. PiperOrigin-RevId: 311844448 -- bdbe6b5b29791dbc3816ada1828458b3010ff1e9 by Laramie Leavitt <lar@google.com>: Change many distribution tests to use pcg_engine as a deterministic source of entropy. It's reasonable to test that the BitGen itself has good entropy, however when testing the cross product of all random distributions x all the architecture variations x all submitted changes results in a large number of tests. In order to account for these failures while still using good entropy requires that our allowed sigma need to account for all of these independent tests. 
Our current sigma values are too restrictive, and we see a lot of failures, so we have to either relax the sigma values or convert some of the statistical tests to use deterministic values. This changelist does the latter. PiperOrigin-RevId: 311840096 GitOrigin-RevId: f012012ef78234a6a4585321b67d7b7c92ebc266 Change-Id: Ic84886f38ff30d7d72c126e9b63c9a61eb729a1a 5 years ago			`constexpr int kMaxConditionWaiters = 8192;`
			`#else`
			`constexpr int kMaxConditionWaiters = 1024;`
			`#endif`
			`BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);`

			`} // namespace`