[party] Speed up iteration over wakeup bits (#37037)

Built on #37036, which should be merged first.

Before:
```
---------------------------------------------------------------
Benchmark                     Time             CPU   Iterations
---------------------------------------------------------------
BM_PartyCreate             37.6 ns         37.6 ns    111332125
BM_AddParticipant          40.1 ns         40.1 ns    104740937
BM_WakeupParticipant       17.3 ns         17.3 ns    242484270
```

After:
```
---------------------------------------------------------------
Benchmark                     Time             CPU   Iterations
---------------------------------------------------------------
BM_PartyCreate             36.7 ns         36.7 ns    111888436
BM_AddParticipant          39.0 ns         39.0 ns    107068935
BM_WakeupParticipant       17.0 ns         17.0 ns    244844476
```

Closes #37037

COPYBARA_INTEGRATE_REVIEW=https://github.com/grpc/grpc/pull/37037 from ctiller:no-after-2 c3caee98ea
PiperOrigin-RevId: 647429841
pull/37082/head
Craig Tiller 5 months ago committed by Copybara-Service
parent 87862222cb
commit 43bf1afd3d
  1. 6
      include/grpc/support/port_platform.h
  2. 2
      src/core/BUILD
  3. 7
      src/core/lib/promise/party.h
  4. 70
      src/core/util/useful.h
  5. 138
      test/core/util/useful_test.cc

@ -836,6 +836,12 @@ extern void gpr_unreachable_code(const char* reason, const char* file,
#endif /* __GPR_WINDOWS */
#endif /* GRPC_ALLOW_EXCEPTIONS */
/* GRPC_HAS_BUILTIN(a) expands to __has_builtin(a) when the preprocessor
 * defines __has_builtin, and to 0 otherwise, so it can be used in an #if
 * expression regardless of whether the compiler supports __has_builtin. */
#ifdef __has_builtin
#define GRPC_HAS_BUILTIN(a) __has_builtin(a)
#else
#define GRPC_HAS_BUILTIN(a) 0
#endif
/* Use GPR_LIKELY only in cases where you are sure that a certain outcome is the
* most likely. Ideally, also collect performance numbers to justify the claim.
*/

@ -242,6 +242,7 @@ grpc_cc_library(
name = "useful",
hdrs = ["util/useful.h"],
external_deps = [
"absl/log:check",
"absl/strings",
"absl/types:variant",
],
@ -583,6 +584,7 @@ grpc_cc_library(
"poll",
"promise_factory",
"ref_counted",
"useful",
"//:event_engine_base_hdrs",
"//:exec_ctx",
"//:gpr",

@ -42,6 +42,7 @@
#include "src/core/lib/promise/detail/promise_factory.h"
#include "src/core/lib/promise/poll.h"
#include "src/core/lib/resource_quota/arena.h"
#include "src/core/util/useful.h"
// Two implementations of party synchronization are provided: one using a single
// atomic, the other using a mutex and a set of state variables.
@ -121,9 +122,11 @@ class PartySyncUsingAtomics {
// Now update prev_state to be what we want the CAS to see below.
prev_state &= kRefMask | kLocked | kAllocatedMask;
// For each wakeup bit...
for (size_t i = 0; wakeups != 0; i++, wakeups >>= 1) {
while (wakeups != 0) {
uint64_t t = LowestOneBit(wakeups);
const int i = CountTrailingZeros(t);
wakeups ^= t;
// If the bit is not set, skip.
if ((wakeups & 1) == 0) continue;
if (poll_one_participant(i)) {
const uint64_t allocated_bit = (1u << i << kAllocatedShift);
prev_state &= ~allocated_bit;

@ -21,6 +21,8 @@
#include <grpc/support/port_platform.h>
#include "absl/log/check.h"
#include <cstddef>
#include "absl/strings/string_view.h"
@ -66,43 +68,76 @@ bool GetBit(T i, size_t n) {
}
namespace useful_detail {
// Replaces every 4-bit group (hex digit) of x with the number of set bits in
// that group. For a nibble n, n - (n >> 1) - (n >> 2) - (n >> 3) equals its
// population count; the masks discard bits that the shifts carry across a
// nibble boundary.
inline constexpr uint32_t HexdigitBitcount(uint32_t x) {
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t HexdigitBitcount(
uint32_t x) {
return (x - ((x >> 1) & 0x77777777) - ((x >> 2) & 0x33333333) -
((x >> 3) & 0x11111111));
}
} // namespace useful_detail
// Returns the number of set bits in i.
// Adjacent per-nibble counts from HexdigitBitcount are added and masked with
// 0x0f0f0f0f to give one count per byte. Reducing modulo 255 then sums the
// four bytes, because 256 is congruent to 1 (mod 255) and the total never
// exceeds 32.
inline constexpr uint32_t BitCount(uint32_t i) {
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t BitCount(
uint32_t i) {
return (((useful_detail::HexdigitBitcount(i) +
(useful_detail::HexdigitBitcount(i) >> 4)) &
0x0f0f0f0f) %
255);
}
// Returns the number of set bits in i by adding the counts of its low and
// high 32-bit halves.
inline constexpr uint32_t BitCount(uint64_t i) {
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t BitCount(
uint64_t i) {
return BitCount(static_cast<uint32_t>(i)) +
BitCount(static_cast<uint32_t>(i >> 32));
}
// Returns the number of set bits in i; widens to uint32_t and defers to that
// overload.
inline constexpr uint32_t BitCount(uint16_t i) {
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t BitCount(
uint16_t i) {
return BitCount(static_cast<uint32_t>(i));
}
// Returns the number of set bits in i; widens to uint32_t and defers to that
// overload.
inline constexpr uint32_t BitCount(uint8_t i) {
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t BitCount(
uint8_t i) {
return BitCount(static_cast<uint32_t>(i));
}
// Returns the number of set bits in the value obtained by converting i to
// uint64_t (conversion is modulo 2^64), using the uint64_t overload.
inline constexpr uint32_t BitCount(int64_t i) {
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t BitCount(
int64_t i) {
return BitCount(static_cast<uint64_t>(i));
}
// Returns the number of set bits in the value obtained by converting i to
// uint32_t (conversion is modulo 2^32), using the uint32_t overload.
inline constexpr uint32_t BitCount(int32_t i) {
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t BitCount(
int32_t i) {
return BitCount(static_cast<uint32_t>(i));
}
// Returns the number of set bits in the value obtained by converting i to
// uint16_t (conversion is modulo 2^16), using the uint16_t overload. Converting
// at the same width keeps a negative input from being sign-extended to 32 bits.
inline constexpr uint32_t BitCount(int16_t i) {
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t BitCount(
int16_t i) {
return BitCount(static_cast<uint16_t>(i));
}
// Returns the number of set bits in the value obtained by converting i to
// uint8_t (conversion is modulo 2^8), using the uint8_t overload. Converting at
// the same width keeps a negative input from being sign-extended to 32 bits.
inline constexpr uint32_t BitCount(int8_t i) {
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t BitCount(
int8_t i) {
return BitCount(static_cast<uint8_t>(i));
}
#if GRPC_HAS_BUILTIN(__builtin_ctz)
// Returns the number of consecutive zero bits in `i`, counted upward from the
// least significant bit. `i` must be non-zero (checked with DCHECK_NE) because
// the result of __builtin_ctz is undefined for 0.
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t
CountTrailingZeros(uint32_t i) {
  DCHECK_NE(i, 0);
  return static_cast<uint32_t>(__builtin_ctz(i));
}
// Returns the number of consecutive zero bits in `i`, counted upward from the
// least significant bit. `i` must be non-zero (checked with DCHECK_NE) because
// the result of __builtin_ctzll is undefined for 0.
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t
CountTrailingZeros(uint64_t i) {
  DCHECK_NE(i, 0);
  return static_cast<uint32_t>(__builtin_ctzll(i));
}
#else
// Portable fallback used when __builtin_ctz is unavailable: returns the number
// of zero bits below the lowest set bit of `i`.
// `~i & (i - 1)` is a mask of exactly those bits (it has the same value as
// `(i & -i) - 1` for every input), so its population count is the answer. The
// expression avoids unary minus on an unsigned operand, which MSVC reports as
// warning C4146 (presumably a compiler that takes this branch; confirm).
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t
CountTrailingZeros(uint32_t i) {
  // Zero has no set bit. Reject it so the contract matches the builtin-based
  // implementation, whose result is undefined for 0.
  DCHECK_NE(i, 0);
  return BitCount(~i & (i - 1));
}
// Portable fallback used when __builtin_ctz is unavailable: returns the number
// of zero bits below the lowest set bit of `i`.
// `~i & (i - 1)` is a mask of exactly those bits (it has the same value as
// `(i & -i) - 1` for every input), so its population count is the answer. The
// expression avoids unary minus on an unsigned operand, which MSVC reports as
// warning C4146 (presumably a compiler that takes this branch; confirm).
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t
CountTrailingZeros(uint64_t i) {
  // Zero has no set bit. Reject it so the contract matches the builtin-based
  // implementation, whose result is undefined for 0.
  DCHECK_NE(i, 0);
  return BitCount(~i & (i - 1));
}
#endif
// This function uses operator< to implement a qsort-style comparison, whereby:
// if a is smaller than b, a number smaller than 0 is returned.
// if a is bigger than b, a number greater than 0 is returned.
@ -177,6 +212,23 @@ inline uint32_t RoundUpToPowerOf2(uint32_t v) {
return v;
}
// Returns a value in which only the lowest set bit of `x` remains set, or 0
// when `x` is 0.
// `x & (~x + 1)` is the two's-complement spelling of `x & -x`. The operands
// are promoted to int, so the result is cast back explicitly instead of being
// narrowed implicitly on return. constexpr so it can be evaluated at compile
// time like the other bit helpers in this header.
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint8_t LowestOneBit(
    uint8_t x) {
  return static_cast<uint8_t>(x & (~x + 1));
}
// Returns a value in which only the lowest set bit of `x` remains set, or 0
// when `x` is 0.
// `x & (~x + 1)` is the two's-complement spelling of `x & -x`. The operands
// are promoted to int, so the result is cast back explicitly instead of being
// narrowed implicitly on return. constexpr so it can be evaluated at compile
// time like the other bit helpers in this header.
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint16_t LowestOneBit(
    uint16_t x) {
  return static_cast<uint16_t>(x & (~x + 1));
}
// Returns a value in which only the lowest set bit of `x` remains set, or 0
// when `x` is 0.
// `x & (~x + 1)` has the same value as `x & -x` for every input but avoids
// unary minus on an unsigned operand, which MSVC reports as warning C4146.
// constexpr so it can be evaluated at compile time like the other bit helpers
// in this header.
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint32_t LowestOneBit(
    uint32_t x) {
  return x & (~x + 1);
}
// Returns a value in which only the lowest set bit of `x` remains set, or 0
// when `x` is 0.
// `x & (~x + 1)` has the same value as `x & -x` for every input but avoids
// unary minus on an unsigned operand, which MSVC reports as warning C4146.
// constexpr so it can be evaluated at compile time like the other bit helpers
// in this header.
GPR_ATTRIBUTE_ALWAYS_INLINE_FUNCTION inline constexpr uint64_t LowestOneBit(
    uint64_t x) {
  return x & (~x + 1);
}
} // namespace grpc_core
#define GPR_ARRAY_SIZE(array) (sizeof(array) / sizeof(*(array)))

@ -93,6 +93,144 @@ TEST(UsefulTest, RoundUpToPowerOf2) {
EXPECT_EQ(RoundUpToPowerOf2(8), 8);
}
// Exercises the 32-bit CountTrailingZeros overload on every value from 1 to
// 16, on a few larger values, and on the highest single bit.
TEST(UsefulTest, CountTrailingZeros32) {
  // Pins the argument type so every call resolves to the uint32_t overload.
  const auto ctz32 = [](uint32_t value) { return CountTrailingZeros(value); };
  EXPECT_EQ(ctz32(1), 0);
  EXPECT_EQ(ctz32(2), 1);
  EXPECT_EQ(ctz32(3), 0);
  EXPECT_EQ(ctz32(4), 2);
  EXPECT_EQ(ctz32(5), 0);
  EXPECT_EQ(ctz32(6), 1);
  EXPECT_EQ(ctz32(7), 0);
  EXPECT_EQ(ctz32(8), 3);
  EXPECT_EQ(ctz32(9), 0);
  EXPECT_EQ(ctz32(10), 1);
  EXPECT_EQ(ctz32(11), 0);
  EXPECT_EQ(ctz32(12), 2);
  EXPECT_EQ(ctz32(13), 0);
  EXPECT_EQ(ctz32(14), 1);
  EXPECT_EQ(ctz32(15), 0);
  EXPECT_EQ(ctz32(16), 4);
  EXPECT_EQ(ctz32(256), 8);
  EXPECT_EQ(ctz32(65535), 0);
  EXPECT_EQ(ctz32(65536), 16);
  EXPECT_EQ(ctz32(0x80000000), 31);
}
// Exercises the 64-bit CountTrailingZeros overload on every value from 1 to
// 16, on a few larger values, and on single bits above the 32-bit range.
TEST(UsefulTest, CountTrailingZeros64) {
  // Pins the argument type so every call resolves to the uint64_t overload.
  const auto ctz64 = [](uint64_t value) { return CountTrailingZeros(value); };
  EXPECT_EQ(ctz64(1), 0);
  EXPECT_EQ(ctz64(2), 1);
  EXPECT_EQ(ctz64(3), 0);
  EXPECT_EQ(ctz64(4), 2);
  EXPECT_EQ(ctz64(5), 0);
  EXPECT_EQ(ctz64(6), 1);
  EXPECT_EQ(ctz64(7), 0);
  EXPECT_EQ(ctz64(8), 3);
  EXPECT_EQ(ctz64(9), 0);
  EXPECT_EQ(ctz64(10), 1);
  EXPECT_EQ(ctz64(11), 0);
  EXPECT_EQ(ctz64(12), 2);
  EXPECT_EQ(ctz64(13), 0);
  EXPECT_EQ(ctz64(14), 1);
  EXPECT_EQ(ctz64(15), 0);
  EXPECT_EQ(ctz64(16), 4);
  EXPECT_EQ(ctz64(256), 8);
  EXPECT_EQ(ctz64(65535), 0);
  EXPECT_EQ(ctz64(65536), 16);
  EXPECT_EQ(ctz64(0x80000000), 31);
  EXPECT_EQ(ctz64(0x100000000), 32);
  EXPECT_EQ(ctz64(0x1000000000000), 48);
  EXPECT_EQ(ctz64(0x8000000000000000), 63);
}
// Exercises the 8-bit LowestOneBit overload on 0 through 16, on the largest
// value with the top bit clear, and on the top bit alone.
TEST(UsefulTest, LowestOneBit8) {
  // Pins the argument type so every call resolves to the uint8_t overload.
  const auto lowest8 = [](uint8_t value) { return LowestOneBit(value); };
  EXPECT_EQ(lowest8(0), 0);
  EXPECT_EQ(lowest8(1), 1);
  EXPECT_EQ(lowest8(2), 2);
  EXPECT_EQ(lowest8(3), 1);
  EXPECT_EQ(lowest8(4), 4);
  EXPECT_EQ(lowest8(5), 1);
  EXPECT_EQ(lowest8(6), 2);
  EXPECT_EQ(lowest8(7), 1);
  EXPECT_EQ(lowest8(8), 8);
  EXPECT_EQ(lowest8(9), 1);
  EXPECT_EQ(lowest8(10), 2);
  EXPECT_EQ(lowest8(11), 1);
  EXPECT_EQ(lowest8(12), 4);
  EXPECT_EQ(lowest8(13), 1);
  EXPECT_EQ(lowest8(14), 2);
  EXPECT_EQ(lowest8(15), 1);
  EXPECT_EQ(lowest8(16), 16);
  EXPECT_EQ(lowest8(127), 1);
  EXPECT_EQ(lowest8(128), 128);
}
// Exercises the 16-bit LowestOneBit overload on 0 through 16, on the largest
// value with the top bit clear, and on the top bit alone.
TEST(UsefulTest, LowestOneBit16) {
  // Pins the argument type so every call resolves to the uint16_t overload.
  const auto lowest16 = [](uint16_t value) { return LowestOneBit(value); };
  EXPECT_EQ(lowest16(0), 0);
  EXPECT_EQ(lowest16(1), 1);
  EXPECT_EQ(lowest16(2), 2);
  EXPECT_EQ(lowest16(3), 1);
  EXPECT_EQ(lowest16(4), 4);
  EXPECT_EQ(lowest16(5), 1);
  EXPECT_EQ(lowest16(6), 2);
  EXPECT_EQ(lowest16(7), 1);
  EXPECT_EQ(lowest16(8), 8);
  EXPECT_EQ(lowest16(9), 1);
  EXPECT_EQ(lowest16(10), 2);
  EXPECT_EQ(lowest16(11), 1);
  EXPECT_EQ(lowest16(12), 4);
  EXPECT_EQ(lowest16(13), 1);
  EXPECT_EQ(lowest16(14), 2);
  EXPECT_EQ(lowest16(15), 1);
  EXPECT_EQ(lowest16(16), 16);
  EXPECT_EQ(lowest16(32767), 1);
  EXPECT_EQ(lowest16(32768), 32768);
}
// Exercises the 32-bit LowestOneBit overload on 0 through 16, on the largest
// value with the top bit clear, and on the top bit alone.
TEST(UsefulTest, LowestOneBit32) {
  // Pins the argument type so every call resolves to the uint32_t overload.
  const auto lowest32 = [](uint32_t value) { return LowestOneBit(value); };
  EXPECT_EQ(lowest32(0), 0);
  EXPECT_EQ(lowest32(1), 1);
  EXPECT_EQ(lowest32(2), 2);
  EXPECT_EQ(lowest32(3), 1);
  EXPECT_EQ(lowest32(4), 4);
  EXPECT_EQ(lowest32(5), 1);
  EXPECT_EQ(lowest32(6), 2);
  EXPECT_EQ(lowest32(7), 1);
  EXPECT_EQ(lowest32(8), 8);
  EXPECT_EQ(lowest32(9), 1);
  EXPECT_EQ(lowest32(10), 2);
  EXPECT_EQ(lowest32(11), 1);
  EXPECT_EQ(lowest32(12), 4);
  EXPECT_EQ(lowest32(13), 1);
  EXPECT_EQ(lowest32(14), 2);
  EXPECT_EQ(lowest32(15), 1);
  EXPECT_EQ(lowest32(16), 16);
  EXPECT_EQ(lowest32(2147483647), 1);
  EXPECT_EQ(lowest32(2147483648), 2147483648);
}
// Exercises the 64-bit LowestOneBit overload on 0 through 16, on the largest
// value with the top bit clear, and on the top bit alone.
TEST(UsefulTest, LowestOneBit64) {
  // Pins the argument type so every call resolves to the uint64_t overload.
  const auto lowest64 = [](uint64_t value) { return LowestOneBit(value); };
  EXPECT_EQ(lowest64(0), 0);
  EXPECT_EQ(lowest64(1), 1);
  EXPECT_EQ(lowest64(2), 2);
  EXPECT_EQ(lowest64(3), 1);
  EXPECT_EQ(lowest64(4), 4);
  EXPECT_EQ(lowest64(5), 1);
  EXPECT_EQ(lowest64(6), 2);
  EXPECT_EQ(lowest64(7), 1);
  EXPECT_EQ(lowest64(8), 8);
  EXPECT_EQ(lowest64(9), 1);
  EXPECT_EQ(lowest64(10), 2);
  EXPECT_EQ(lowest64(11), 1);
  EXPECT_EQ(lowest64(12), 4);
  EXPECT_EQ(lowest64(13), 1);
  EXPECT_EQ(lowest64(14), 2);
  EXPECT_EQ(lowest64(15), 1);
  EXPECT_EQ(lowest64(16), 16);
  EXPECT_EQ(lowest64(9223372036854775807), 1);
  EXPECT_EQ(lowest64(9223372036854775808U), 9223372036854775808U);
}
} // namespace grpc_core
int main(int argc, char** argv) {

Loading…
Cancel
Save