From 474152de0ff367e9db20a0d3daa4b2eefda97716 Mon Sep 17 00:00:00 2001 From: Martijn Vels Date: Mon, 5 Dec 2022 15:58:42 -0800 Subject: [PATCH] Optimize and simplify arena cleanup logic This CL changes the prefetch to NTA instead of T0 to reduce cache pollution with a simplified, yet more efficient cleanup prefetch loop which removes stores and loads on the local prefetch stack, and instead simply repeats the load and branch of the prefetch resulting in CPU savings. PiperOrigin-RevId: 493136574 --- src/google/protobuf/arena.cc | 35 ++----------------- src/google/protobuf/arena_cleanup.h | 53 ++++++++++++++++------------- 2 files changed, 32 insertions(+), 56 deletions(-) diff --git a/src/google/protobuf/arena.cc b/src/google/protobuf/arena.cc index 327c26eace..a977d77fd9 100644 --- a/src/google/protobuf/arena.cc +++ b/src/google/protobuf/arena.cc @@ -277,39 +277,8 @@ void SerialArena::CleanupList() { char* limit = b->Limit(); char* it = reinterpret_cast(b->cleanup_nodes); GOOGLE_DCHECK(!b->IsSentry() || it == limit); - if (it < limit) { - // A prefetch distance of 8 here was chosen arbitrarily. It makes the - // pending nodes fill a cacheline which seemed nice. - constexpr int kPrefetchDist = 8; - cleanup::Tag pending_type[kPrefetchDist]; - char* pending_node[kPrefetchDist]; - - int pos = 0; - for (; pos < kPrefetchDist && it < limit; ++pos) { - pending_type[pos] = cleanup::Type(it); - pending_node[pos] = it; - it += cleanup::Size(pending_type[pos]); - } - - if (pos < kPrefetchDist) { - for (int i = 0; i < pos; ++i) { - cleanup::DestroyNode(pending_type[i], pending_node[i]); - } - } else { - pos = 0; - while (it < limit) { - cleanup::PrefetchNode(it); - cleanup::DestroyNode(pending_type[pos], pending_node[pos]); - pending_type[pos] = cleanup::Type(it); - pending_node[pos] = it; - it += cleanup::Size(pending_type[pos]); - pos = (pos + 1) % kPrefetchDist; - } - for (int i = pos; i < pos + kPrefetchDist; ++i) { - cleanup::DestroyNode(pending_type[i % kPrefetchDist], - pending_node[i % kPrefetchDist]); - } - } + while (it < limit) { + it += cleanup::DestroyNode(it); } b = b->next; } while (b); diff --git a/src/google/protobuf/arena_cleanup.h b/src/google/protobuf/arena_cleanup.h index 659716bcd9..8416342b75 100644 --- a/src/google/protobuf/arena_cleanup.h +++ b/src/google/protobuf/arena_cleanup.h @@ -113,42 +113,49 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void CreateNode(Tag tag, void* pos, } // Optimization: performs a prefetch on `elem_address`. -inline ABSL_ATTRIBUTE_ALWAYS_INLINE void PrefetchNode( - const void* elem_address) { - (void)elem_address; +// Returns the size of the cleanup (meta) data at this address, allowing the +// caller to advance cleanup iterators without needing to examine or know +// anything about the underlying cleanup node or cleanup meta data / tags. +inline ABSL_ATTRIBUTE_ALWAYS_INLINE size_t +PrefetchNode(const void* elem_address) { + if (EnableSpecializedTags()) { + uintptr_t elem; + memcpy(&elem, elem_address, sizeof(elem)); + if (static_cast(elem & 3) != Tag::kDynamic) { + return sizeof(TaggedNode); + } + } + return sizeof(DynamicNode); } -// Destroys the node idenitfied by `tag` stored at memory location `pos`. -inline ABSL_ATTRIBUTE_ALWAYS_INLINE void DestroyNode(Tag tag, const void* pos) { +// Destroys the object referenced by the cleanup node at memory location `pos`. +// Returns the size of the cleanup (meta) data at this address, allowing the +// caller to advance cleanup iterators without needing to examine or know +// anything about the underlying cleanup node or cleanup meta data / tags. +inline ABSL_ATTRIBUTE_ALWAYS_INLINE size_t DestroyNode(const void* pos) { + uintptr_t elem; + memcpy(&elem, pos, sizeof(elem)); if (EnableSpecializedTags()) { - switch (tag) { + switch (static_cast(elem & 3)) { case Tag::kString: { - TaggedNode n; - memcpy(&n, pos, sizeof(n)); - auto* s = reinterpret_cast(n.elem & ~0x7ULL); // Some compilers don't like fully qualified explicit dtor calls, // so use an alias to avoid having to type `::`. - using string_type = std::string; - s->~string_type(); - return; + using T = std::string; + reinterpret_cast(elem - static_cast(Tag::kString))->~T(); + return sizeof(TaggedNode); } case Tag::kCord: { - TaggedNode n; - memcpy(&n, pos, sizeof(n)); - auto* s = reinterpret_cast(n.elem & ~0x7ULL); - // Some compilers don't like fully qualified explicit dtor calls, - // so use an alias to avoid having to type `::`. - using cord_type = absl::Cord; - s->~cord_type(); - return; + using T = absl::Cord; + reinterpret_cast(elem - static_cast(Tag::kCord))->~T(); + return sizeof(TaggedNode); } default: break; } } - DynamicNode n; - memcpy(&n, pos, sizeof(n)); - n.destructor(reinterpret_cast(n.elem)); + static_cast(pos)->destructor( + reinterpret_cast(elem - static_cast(Tag::kDynamic))); + return sizeof(DynamicNode); } // Returns the `tag` identifying the type of object for `destructor` or