Optimize and simplify arena cleanup logic

This CL changes the prefetch hint from T0 to NTA to reduce cache pollution. It also replaces the cleanup prefetch loop with a simpler yet more efficient one: instead of storing and loading pending nodes on a local prefetch stack, the loop simply repeats the load and branch of the prefetch, resulting in CPU savings.

PiperOrigin-RevId: 493136574
pull/11160/head
Martijn Vels 2 years ago committed by Copybara-Service
parent 3f36a91442
commit 474152de0f
  1. 35
      src/google/protobuf/arena.cc
  2. 53
      src/google/protobuf/arena_cleanup.h

@ -277,39 +277,8 @@ void SerialArena::CleanupList() {
char* limit = b->Limit(); char* limit = b->Limit();
char* it = reinterpret_cast<char*>(b->cleanup_nodes); char* it = reinterpret_cast<char*>(b->cleanup_nodes);
GOOGLE_DCHECK(!b->IsSentry() || it == limit); GOOGLE_DCHECK(!b->IsSentry() || it == limit);
if (it < limit) { while (it < limit) {
// A prefetch distance of 8 here was chosen arbitrarily. It makes the it += cleanup::DestroyNode(it);
// pending nodes fill a cacheline which seemed nice.
constexpr int kPrefetchDist = 8;
cleanup::Tag pending_type[kPrefetchDist];
char* pending_node[kPrefetchDist];
int pos = 0;
for (; pos < kPrefetchDist && it < limit; ++pos) {
pending_type[pos] = cleanup::Type(it);
pending_node[pos] = it;
it += cleanup::Size(pending_type[pos]);
}
if (pos < kPrefetchDist) {
for (int i = 0; i < pos; ++i) {
cleanup::DestroyNode(pending_type[i], pending_node[i]);
}
} else {
pos = 0;
while (it < limit) {
cleanup::PrefetchNode(it);
cleanup::DestroyNode(pending_type[pos], pending_node[pos]);
pending_type[pos] = cleanup::Type(it);
pending_node[pos] = it;
it += cleanup::Size(pending_type[pos]);
pos = (pos + 1) % kPrefetchDist;
}
for (int i = pos; i < pos + kPrefetchDist; ++i) {
cleanup::DestroyNode(pending_type[i % kPrefetchDist],
pending_node[i % kPrefetchDist]);
}
}
} }
b = b->next; b = b->next;
} while (b); } while (b);

@ -113,42 +113,49 @@ inline ABSL_ATTRIBUTE_ALWAYS_INLINE void CreateNode(Tag tag, void* pos,
} }
// Optimization: performs a prefetch on `elem_address`. // Optimization: performs a prefetch on `elem_address`.
inline ABSL_ATTRIBUTE_ALWAYS_INLINE void PrefetchNode( // Returns the size of the cleanup (meta) data at this address, allowing the
const void* elem_address) { // caller to advance cleanup iterators without needing to examine or know
(void)elem_address; // anything about the underlying cleanup node or cleanup meta data / tags.
inline ABSL_ATTRIBUTE_ALWAYS_INLINE size_t
PrefetchNode(const void* elem_address) {
if (EnableSpecializedTags()) {
uintptr_t elem;
memcpy(&elem, elem_address, sizeof(elem));
if (static_cast<Tag>(elem & 3) != Tag::kDynamic) {
return sizeof(TaggedNode);
}
}
return sizeof(DynamicNode);
} }
// Destroys the node idenitfied by `tag` stored at memory location `pos`. // Destroys the object referenced by the cleanup node at memory location `pos`.
inline ABSL_ATTRIBUTE_ALWAYS_INLINE void DestroyNode(Tag tag, const void* pos) { // Returns the size of the cleanup (meta) data at this address, allowing the
// caller to advance cleanup iterators without needing to examine or know
// anything about the underlying cleanup node or cleanup meta data / tags.
inline ABSL_ATTRIBUTE_ALWAYS_INLINE size_t DestroyNode(const void* pos) {
uintptr_t elem;
memcpy(&elem, pos, sizeof(elem));
if (EnableSpecializedTags()) { if (EnableSpecializedTags()) {
switch (tag) { switch (static_cast<Tag>(elem & 3)) {
case Tag::kString: { case Tag::kString: {
TaggedNode n;
memcpy(&n, pos, sizeof(n));
auto* s = reinterpret_cast<std::string*>(n.elem & ~0x7ULL);
// Some compilers don't like fully qualified explicit dtor calls, // Some compilers don't like fully qualified explicit dtor calls,
// so use an alias to avoid having to type `::`. // so use an alias to avoid having to type `::`.
using string_type = std::string; using T = std::string;
s->~string_type(); reinterpret_cast<T*>(elem - static_cast<uintptr_t>(Tag::kString))->~T();
return; return sizeof(TaggedNode);
} }
case Tag::kCord: { case Tag::kCord: {
TaggedNode n; using T = absl::Cord;
memcpy(&n, pos, sizeof(n)); reinterpret_cast<T*>(elem - static_cast<uintptr_t>(Tag::kCord))->~T();
auto* s = reinterpret_cast<absl::Cord*>(n.elem & ~0x7ULL); return sizeof(TaggedNode);
// Some compilers don't like fully qualified explicit dtor calls,
// so use an alias to avoid having to type `::`.
using cord_type = absl::Cord;
s->~cord_type();
return;
} }
default: default:
break; break;
} }
} }
DynamicNode n; static_cast<const DynamicNode*>(pos)->destructor(
memcpy(&n, pos, sizeof(n)); reinterpret_cast<void*>(elem - static_cast<uintptr_t>(Tag::kDynamic)));
n.destructor(reinterpret_cast<void*>(n.elem)); return sizeof(DynamicNode);
} }
// Returns the `tag` identifying the type of object for `destructor` or // Returns the `tag` identifying the type of object for `destructor` or

Loading…
Cancel
Save