Abseil Common Libraries (C++) (gRPC dependency)
https://abseil.io/
// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <atomic>
#include <cstdint>
#include <limits>
#include <mutex>  // NOLINT(build/c++11)
#include <vector>

#include "absl/base/config.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/raw_logging.h"      // ABSL_RAW_CHECK
#include "absl/base/internal/spinlock.h"
#include "absl/base/internal/thread_identity.h"  // ThreadIdentity
#include "absl/synchronization/blocking_counter.h"
#include "absl/synchronization/internal/create_thread_identity.h"
#include "absl/synchronization/internal/per_thread_sem.h"
#include "absl/synchronization/internal/thread_pool.h"
#include "absl/synchronization/mutex.h"
#include "benchmark/benchmark.h"

namespace {

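// Measures the cost of a bare lock/unlock pair on a shared absl::Mutex,
// both single-threaded (uncontended) and with one thread per CPU.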
void BM_Mutex(benchmark::State& state) {
  static absl::Mutex* mu = new absl::Mutex;
  for (auto _ : state) {
    absl::MutexLock lock(mu);
  }
}
BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();

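// Busy-spins for roughly `ns` nanoseconds, converting the delay to cycle
// counts via CycleClock and incrementing *data through DoNotOptimize so the
// loop cannot be optimized away.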
static void DelayNs(int64_t ns, int* data) {
  int64_t end = absl::base_internal::CycleClock::Now() +
                ns * absl::base_internal::CycleClock::Frequency() / 1e9;
  while (absl::base_internal::CycleClock::Now() < end) {
    ++(*data);
    benchmark::DoNotOptimize(*data);
  }
}

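// Minimal RAII lock guard so BM_Contended below can be templated over lock
// types with different interfaces: absl::Mutex and SpinLock expose
// Lock()/Unlock(), while std::mutex (handled by the specialization below)
// exposes lock()/unlock().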
template <typename MutexType>
class RaiiLocker {
 public:
  explicit RaiiLocker(MutexType* mu) : mu_(mu) { mu_->Lock(); }
  ~RaiiLocker() { mu_->Unlock(); }

 private:
  MutexType* mu_;
};

template <>
class RaiiLocker<std::mutex> {
 public:
  explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); }
  ~RaiiLocker() { mu_->unlock(); }

 private:
  std::mutex* mu_;
};

// RAII object to change the Mutex priority of the running thread.
class ScopedThreadMutexPriority {
 public:
  explicit ScopedThreadMutexPriority(int priority) {
    absl::base_internal::ThreadIdentity* identity =
        absl::synchronization_internal::GetOrCreateCurrentThreadIdentity();
    identity->per_thread_synch.priority = priority;
    // Bump next_priority_read_cycles to the infinite future so that the
    // implementation doesn't re-read the thread's actual scheduler priority
    // and replace our temporary scoped priority.
    identity->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::max();
  }
  ~ScopedThreadMutexPriority() {
    // Reset the "next priority read time" back to the infinite past so that
    // the next time the Mutex implementation wants to know this thread's
    // priority, it re-reads it from the OS instead of using our overridden
    // priority.
    absl::synchronization_internal::GetOrCreateCurrentThreadIdentity()
        ->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::min();
  }
};

void BM_MutexEnqueue(benchmark::State& state) {
  // In the "multiple priorities" variant of the benchmark, one of the
  // threads runs with Mutex priority 0 while the rest run at elevated
  // priority. This benchmarks the performance impact of the presence of a
  // low priority waiter when a higher priority waiter adds itself to the
  // queue (b/175224064).
  //
  // NOTE: The actual scheduler priority is not modified in this benchmark:
  // all of the threads get CPU slices with the same priority. Only the
  // Mutex queueing behavior is modified.
  const bool multiple_priorities = state.range(0);
  ScopedThreadMutexPriority priority_setter(
      (multiple_priorities && state.thread_index != 0) ? 1 : 0);

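  // State shared by all threads running this benchmark. Allocated with `new`
  // and intentionally leaked so it survives across benchmark repetitions in
  // the same process.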
  struct Shared {
    absl::Mutex mu;
    std::atomic<int> looping_threads{0};
    std::atomic<int> blocked_threads{0};
    std::atomic<bool> thread_has_mutex{false};
  };
  static Shared* shared = new Shared;

  // Set up 'blocked_threads' to count how many threads are currently blocked
  // in Abseil synchronization code.
  //
  // NOTE: Blocking done within the Google Benchmark library itself (e.g.
  // the barrier which synchronizes threads entering and exiting the benchmark
  // loop) does _not_ get registered in this counter. This is because Google
  // Benchmark uses its own synchronization primitives based on std::mutex,
  // not Abseil synchronization primitives. If at some point the benchmark
  // library merges into Abseil, this code may break.
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      &shared->blocked_threads);

  // The benchmark framework may run several iterations in the same process,
  // reusing the same static-initialized 'shared' object. Given the semantics
  // of the members here, we expect everything to be reset to zero by the
  // end of any iteration. Assert that's the case, just to be sure.
  ABSL_RAW_CHECK(
      shared->looping_threads.load(std::memory_order_relaxed) == 0 &&
          shared->blocked_threads.load(std::memory_order_relaxed) == 0 &&
          !shared->thread_has_mutex.load(std::memory_order_relaxed),
      "Shared state isn't zeroed at start of benchmark iteration");

  static constexpr int kBatchSize = 1000;
  while (state.KeepRunningBatch(kBatchSize)) {
    shared->looping_threads.fetch_add(1);
    for (int i = 0; i < kBatchSize; i++) {
      {
        absl::MutexLock l(&shared->mu);
        shared->thread_has_mutex.store(true, std::memory_order_relaxed);
        // Spin until all other threads are either out of the benchmark loop
        // or blocked on the mutex. This ensures that the mutex queue is kept
        // at its maximal length to benchmark the performance of queueing on
        // a highly contended mutex.
        while (shared->looping_threads.load(std::memory_order_relaxed) -
                   shared->blocked_threads.load(std::memory_order_relaxed) !=
               1) {
        }
        shared->thread_has_mutex.store(false);
      }
      // Spin until some other thread has acquired the mutex before we block
      // again. This ensures that we always go through the slow (queueing)
      // acquisition path rather than reacquiring the mutex we just released.
      while (!shared->thread_has_mutex.load(std::memory_order_relaxed) &&
             shared->looping_threads.load(std::memory_order_relaxed) > 1) {
      }
    }
    // The benchmark framework uses a barrier to ensure that all of the
    // threads complete their benchmark loop together before any of the
    // threads exit the loop. So, we need to remove ourselves from the
    // "looping threads" counter here before potentially blocking on that
    // barrier. Otherwise, another thread spinning above might wait forever
    // for this thread to block on the mutex while we in fact are waiting to
    // exit.
    shared->looping_threads.fetch_add(-1);
  }
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      nullptr);
}

BENCHMARK(BM_MutexEnqueue)
    ->Threads(4)
    ->Threads(64)
    ->Threads(128)
    ->Threads(512)
    ->ArgName("multiple_priorities")
    ->Arg(false)
    ->Arg(true);

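// Measures lock acquisition under tunable contention: each iteration does
// per-thread local work outside the lock, then holds the lock for
// state.range(0) nanoseconds. state.range(1) gives the number of distinct
// Mutex priorities spread across the threads.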
template <typename MutexType>
void BM_Contended(benchmark::State& state) {
  int priority = state.thread_index % state.range(1);
  ScopedThreadMutexPriority priority_setter(priority);

  struct Shared {
    MutexType mu;
    int data = 0;
  };
  static auto* shared = new Shared;
  int local = 0;
  for (auto _ : state) {
    // Here we model both local work outside of the critical section as well
    // as some work inside of the critical section. The idea is to capture
    // some more or less realistic contention levels.
    // If contention is too low, the benchmark won't measure anything useful.
    // If contention is unrealistically high, the benchmark will favor
    // bad mutex implementations that block and otherwise distract threads
    // from the mutex and shared state for as much as possible.
    // To achieve this, the amount of local work is multiplied by the number
    // of threads, keeping the ratio between local work and the critical
    // section approximately constant regardless of the thread count.
    DelayNs(100 * state.threads, &local);
    RaiiLocker<MutexType> locker(&shared->mu);
    DelayNs(state.range(0), &shared->data);
  }
}

void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm,
                        bool do_test_priorities) {
  const int max_num_priorities = do_test_priorities ? 2 : 1;
  bm->UseRealTime()
      // ThreadPerCpu poorly handles non-power-of-two CPU counts.
      ->Threads(1)
      ->Threads(2)
      ->Threads(4)
      ->Threads(6)
      ->Threads(8)
      ->Threads(12)
      ->Threads(16)
      ->Threads(24)
      ->Threads(32)
      ->Threads(48)
      ->Threads(64)
      ->Threads(96)
      ->Threads(128)
      ->Threads(192)
      ->Threads(256)
      ->ArgNames({"cs_ns", "num_prios"});
  // Some empirically chosen amounts of work in the critical section:
  // 1 is low contention, 2000 is high contention, with a few values between.
  for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
    for (int num_priorities = 1; num_priorities <= max_num_priorities;
         num_priorities++) {
      bm->ArgPair(critical_section_ns, num_priorities);
    }
  }
}

BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/true);
    });

BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

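// A sketch of how these benchmarks might be invoked (the Bazel target name
// is an assumption; adjust for your build setup):
//
//   bazel run -c opt //absl/synchronization:mutex_benchmark -- \
//       --benchmark_filter='BM_Contended<absl::Mutex>'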
// Measure the overhead of conditions on mutex release (when they must be
// evaluated). Mutex has (some) support for equivalence classes allowing
// Conditions with the same function/argument to potentially not be multiply
// evaluated.
//
// num_classes==0 is used for the special case of every waiter being distinct.
void BM_ConditionWaiters(benchmark::State& state) {
  int num_classes = state.range(0);
  int num_waiters = state.range(1);

  struct Helper {
    static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {
      init->DecrementCount();
      m->LockWhen(absl::Condition(
          static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p));
      m->Unlock();
    }
  };

  if (num_classes == 0) {
    // No equivalence classes.
    num_classes = num_waiters;
  }

  absl::BlockingCounter init(num_waiters);
  absl::Mutex mu;
  std::vector<int> equivalence_classes(num_classes, 1);

  // Must be declared last to be destroyed first.
  absl::synchronization_internal::ThreadPool pool(num_waiters);

  for (int i = 0; i < num_waiters; i++) {
    // Mutex considers Conditions with the same function and argument
    // to be equivalent.
    pool.Schedule([&, i] {
      Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]);
    });
  }
  init.Wait();

  for (auto _ : state) {
    mu.Lock();
    mu.Unlock();  // Each unlock requires Condition evaluation for our waiters.
  }

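  // Release the waiters: setting every class's value to zero makes each
  // waiter's Condition true, so the LockWhen calls return and the pool
  // threads can drain before `pool` is destroyed.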
  mu.Lock();
  for (int i = 0; i < num_classes; i++) {
    equivalence_classes[i] = 0;
  }
  mu.Unlock();
}

// Some configurations have higher thread limits than others.
#if defined(__linux__) && !defined(ABSL_HAVE_THREAD_SANITIZER)
constexpr int kMaxConditionWaiters = 8192;
#else
constexpr int kMaxConditionWaiters = 1024;
#endif
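// Assuming the benchmark library's default range multiplier of 8, RangePair
// sweeps num_classes over {0, 1, 2} and num_waiters over powers of 8 from 1
// up to kMaxConditionWaiters.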
BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);

}  // namespace