Benchmarks: https://pastebin.com/tZ7dr67W. It works especially well on smaller ranges.

After a week spent optimizing NEON SIMD, where I almost managed to make hash tables work with NEON SIMD without performance hits (one cycle still left to optimize, and I gave up for now), I found an interesting aarch64 optimization: the `cls` instruction (count leading sign bits).

The loop has a useful property: the `ctrl_` group is asked for `CountLeadingEmptyOrDeleted` only when its first slot is empty or deleted.

```
void skip_empty_or_deleted() {
  while (IsEmptyOrDeleted(*ctrl_)) {
    uint32_t shift = Group{ctrl_}.CountLeadingEmptyOrDeleted();
    ctrl_ += shift;
    slot_ += shift;
  }
  ...
}
```

`kEmpty` and `kDeleted` both have the format `1xxxxxx0`, so in that case `~ctrl & (ctrl >> 7)` always has its lowest bit set to 1. The naive implementation adds 1 so that it can count trailing zero bits; on aarch64 we can start counting the run of one bits immediately instead (see the sketch after this message). This saves 1 cycle and about 5% of iteration performance.

It then becomes hard to find a supported and sustainable C++ spelling of this. `__clsll` is not supported by GCC and has been supported by clang only since clang 8; `__builtin_clrsb` does not produce optimal codegen with clang. `__rbit` is not supported by GCC, which has no bit-reverse intrinsic at all; clang, however, has `__builtin_bitreverse{32,64}`. For now I decided to enable this only for clang, and only when the appropriate builtins are available.

PiperOrigin-RevId: 451168570
Change-Id: I7e9256a60aecdc88ced4e6eb15ebc257281b6664
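For reference, here is a minimal, self-contained sketch of the bit trick described above; it is not the committed code. It assumes an 8-byte control group packed little-endian into a `uint64_t` (as in the portable group implementation, with its `gaps` filler mask) and that the caller guarantees the first byte is `kEmpty` (0x80) or `kDeleted` (0xFE). The helper name `CountLeadingEmptyOrDeletedSketch` and the example control word in `main` are made up for illustration.

```
#include <cstdint>
#include <cstdio>

#if defined(__clang__) && defined(__aarch64__)
#include <arm_acle.h>  // __clsll (count leading sign bits), clang >= 8
#endif

// Hypothetical helper: counts how many of the leading control bytes in a
// packed little-endian 64-bit group are kEmpty/kDeleted. Precondition:
// byte 0 is kEmpty or kDeleted, i.e. it matches the pattern 1xxxxxx0.
inline uint32_t CountLeadingEmptyOrDeletedSketch(uint64_t ctrl) {
  // Bit 8*i of `~ctrl & (ctrl >> 7)` is 1 iff byte i has bit 7 set and
  // bit 0 clear, i.e. iff byte i is kEmpty or kDeleted. `gaps` fills the
  // in-between bits so the markers form one contiguous run of trailing ones.
  constexpr uint64_t gaps = 0x00FEFEFEFEFEFEFEULL;
  const uint64_t x = (~ctrl & (ctrl >> 7)) | gaps;
#if defined(__clang__) && defined(__aarch64__)
  // rbit turns the trailing run of ones into a leading run; cls counts the
  // bits after the sign bit that equal it, so a leading run of k ones
  // yields k - 1. The naive +1 folds into the rounding constant (+8).
  return (static_cast<uint32_t>(__clsll(__builtin_bitreverse64(x))) + 8) >> 3;
#else
  // Naive form: +1 turns the trailing ones into zeros plus one set bit,
  // then count trailing zeros and round up to whole bytes.
  return (static_cast<uint32_t>(__builtin_ctzll(x + 1)) + 7) >> 3;
#endif
}

int main() {
  // Bytes 0..1 = kEmpty (0x80), byte 2 = kDeleted (0xFE), bytes 3..7 full.
  const uint64_t ctrl = 0x1717171717FE8080ULL;
  printf("%u\n", CountLeadingEmptyOrDeletedSketch(ctrl));  // prints 3
  return 0;
}
```

On aarch64 with clang, the `cls` path compiles to roughly rbit + cls + add + lsr, whereas the naive path needs an extra add before the rbit + clz that `__builtin_ctzll` lowers to, which is where the one-cycle saving comes from.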