Switch upb_Arena_Fuse from a CAS-based list insertion to an exchange-based one

Second attempt, with improved testing.

(Generated by http://go/benchy. Settings: --runs 20 --reference "srcfs" --perflab)

```
name                                           old cpu/op   new cpu/op   delta
BM_ArenaOneAlloc                                 18.2ns ± 2%  18.1ns ± 1%  -0.72%  (p=0.002 n=18+17)
BM_ArenaInitialBlockOneAlloc                     5.31ns ± 0%  5.30ns ± 1%    ~     (p=0.345 n=16+19)
BM_ArenaFuseUnbalanced/2                         67.8ns ± 1%  68.0ns ± 0%  +0.35%  (p=0.011 n=16+17)
BM_ArenaFuseUnbalanced/8                          526ns ± 2%   524ns ± 1%    ~     (p=0.708 n=18+17)
BM_ArenaFuseUnbalanced/64                        4.82µs ± 1%  4.84µs ± 1%  +0.31%  (p=0.049 n=16+17)
BM_ArenaFuseUnbalanced/128                       9.78µs ± 1%  9.82µs ± 1%  +0.46%  (p=0.001 n=17+17)
BM_ArenaFuseBalanced/2                           66.9ns ± 1%  67.2ns ± 1%  +0.36%  (p=0.025 n=17+16)
BM_ArenaFuseBalanced/8                            527ns ± 2%   529ns ± 1%    ~     (p=0.081 n=17+19)
BM_ArenaFuseBalanced/64                          4.92µs ± 4%  4.88µs ± 2%    ~     (p=0.184 n=18+17)
BM_ArenaFuseBalanced/128                         9.92µs ± 1%  9.91µs ± 1%    ~     (p=0.883 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout>               5.89ms ± 2%  5.94ms ± 1%  +0.88%  (p=0.005 n=18+17)
BM_LoadAdsDescriptor_Upb<WithLayout>             6.55ms ± 2%  6.55ms ± 1%    ~     (p=0.961 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout>            12.3ms ± 2%  12.4ms ± 1%    ~     (p=0.226 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout>          12.5ms ± 1%  12.6ms ± 1%  +0.61%  (p=0.005 n=17+19)
BM_Parse_Upb_FileDesc<UseArena, Copy>            12.6µs ± 1%  12.7µs ± 2%    ~     (p=0.219 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias>           11.6µs ± 2%  11.6µs ± 3%    ~     (p=0.721 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy>           12.4µs ± 1%  12.5µs ± 1%    ~     (p=0.118 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias>          11.3µs ± 2%  11.4µs ± 1%    ~     (p=0.327 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy>         25.2µs ± 2%  25.3µs ± 1%    ~     (p=0.301 n=16+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy>        12.1µs ± 3%  12.1µs ± 2%    ~     (p=0.869 n=18+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy>       11.8µs ± 3%  11.8µs ± 3%    ~     (p=0.462 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias>    13.2µs ± 1%  13.2µs ± 1%    ~     (p=0.333 n=16+19)
BM_SerializeDescriptor_Proto2                    5.83µs ± 3%  5.86µs ± 4%    ~     (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb                       10.5µs ± 2%  10.4µs ± 1%  -1.20%  (p=0.000 n=18+16)

name                                           old time/op             new time/op             delta
BM_ArenaOneAlloc                                 18.2ns ± 2%             18.1ns ± 0%  -0.73%  (p=0.010 n=18+17)
BM_ArenaInitialBlockOneAlloc                     5.32ns ± 0%             5.31ns ± 1%    ~     (p=0.106 n=15+18)
BM_ArenaFuseUnbalanced/2                         67.9ns ± 1%             68.1ns ± 0%  +0.31%  (p=0.044 n=16+16)
BM_ArenaFuseUnbalanced/8                          527ns ± 2%              526ns ± 1%    ~     (p=0.772 n=18+16)
BM_ArenaFuseUnbalanced/64                        4.83µs ± 1%             4.84µs ± 2%    ~     (p=0.144 n=16+18)
BM_ArenaFuseUnbalanced/128                       9.79µs ± 1%             9.84µs ± 1%  +0.52%  (p=0.001 n=17+18)
BM_ArenaFuseBalanced/2                           67.0ns ± 1%             67.3ns ± 3%  +0.41%  (p=0.019 n=15+16)
BM_ArenaFuseBalanced/8                            528ns ± 2%              530ns ± 1%    ~     (p=0.121 n=17+19)
BM_ArenaFuseBalanced/64                          4.93µs ± 4%             4.89µs ± 2%    ~     (p=0.103 n=18+17)
BM_ArenaFuseBalanced/128                         9.93µs ± 1%             9.93µs ± 1%    ~     (p=0.806 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout>               5.91ms ± 2%             5.96ms ± 1%  +0.93%  (p=0.002 n=18+16)
BM_LoadAdsDescriptor_Upb<WithLayout>             6.57ms ± 2%             6.57ms ± 1%    ~     (p=0.935 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout>            12.4ms ± 2%             12.4ms ± 1%    ~     (p=0.239 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout>          12.5ms ± 2%             12.6ms ± 1%  +0.43%  (p=0.024 n=18+19)
BM_Parse_Upb_FileDesc<UseArena, Copy>            12.7µs ± 2%             12.7µs ± 2%    ~     (p=0.245 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias>           11.6µs ± 2%             11.6µs ± 2%    ~     (p=0.772 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy>           12.5µs ± 1%             12.5µs ± 1%    ~     (p=0.136 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias>          11.4µs ± 2%             11.4µs ± 1%    ~     (p=0.391 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy>         25.3µs ± 2%             25.4µs ± 1%    ~     (p=0.403 n=16+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy>        12.1µs ± 2%             12.1µs ± 2%    ~     (p=0.731 n=17+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy>       11.9µs ± 3%             11.8µs ± 3%    ~     (p=0.424 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias>    13.2µs ± 2%             13.3µs ± 1%    ~     (p=0.683 n=16+19)
BM_SerializeDescriptor_Proto2                    5.84µs ± 3%             5.86µs ± 4%    ~     (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb                       10.5µs ± 2%             10.4µs ± 1%  -1.27%  (p=0.000 n=18+16)

name                                           old speed               new speed               delta
BM_LoadAdsDescriptor_Upb<NoLayout>              133MB/s ± 2%            132MB/s ± 1%  -0.97%  (p=0.002 n=18+16)
BM_LoadAdsDescriptor_Upb<WithLayout>            120MB/s ± 2%            120MB/s ± 1%    ~     (p=0.961 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout>          63.5MB/s ± 2%           63.3MB/s ± 1%    ~     (p=0.226 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout>        62.7MB/s ± 1%           62.4MB/s ± 1%  -0.60%  (p=0.005 n=17+19)
BM_Parse_Upb_FileDesc<UseArena, Copy>           596MB/s ± 1%            594MB/s ± 2%    ~     (p=0.219 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias>          650MB/s ± 2%            649MB/s ± 3%    ~     (p=0.721 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy>          605MB/s ± 1%            603MB/s ± 1%    ~     (p=0.118 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias>         663MB/s ± 2%            661MB/s ± 1%    ~     (p=0.327 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy>        298MB/s ± 2%            297MB/s ± 1%    ~     (p=0.490 n=17+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy>       623MB/s ± 3%            624MB/s ± 2%    ~     (p=0.869 n=18+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy>      636MB/s ± 3%            637MB/s ± 3%    ~     (p=0.462 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias>   570MB/s ± 1%            568MB/s ± 1%    ~     (p=0.333 n=16+19)
BM_SerializeDescriptor_Proto2                  1.29GB/s ± 3%           1.29GB/s ± 4%    ~     (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb                      716MB/s ± 2%            725MB/s ± 1%  +1.20%  (p=0.000 n=18+16)
```

PiperOrigin-RevId: 525132431
pull/13171/head
Matt Kulukundis 2 years ago committed by Copybara-Service
parent 433e737c0e
commit fa1c2a6d29
  1. 43
      upb/mem/arena.c
  2. 13
      upb/port/atomic.h

@ -251,26 +251,29 @@ retry:
goto retry;
}
static void _upb_Arena_DoFuseArenaLists(upb_Arena* r1, upb_Arena* r2) {
// Find the region for `r2`'s linked list.
upb_Arena* r1_tail = upb_Atomic_Load(&r1->tail, memory_order_relaxed);
while (true) {
upb_Arena* r1_next = upb_Atomic_Load(&r1_tail->next, memory_order_relaxed);
while (r1_next != NULL) {
// r1->tail was stale. This can happen, but tail should always converge
// on the true tail.
r1_tail = r1_next;
r1_next = upb_Atomic_Load(&r1_tail->next, memory_order_relaxed);
}
if (upb_Atomic_CompareExchangeStrong(&r1_tail->next, &r1_next, r2,
memory_order_relaxed,
memory_order_relaxed)) {
break;
}
}
upb_Arena* r2_tail = upb_Atomic_Load(&r2->tail, memory_order_relaxed);
upb_Atomic_Store(&r1->tail, r2_tail, memory_order_relaxed);
// Appends the linked list that starts at `child` to the end of the linked list
// that `parent` belongs to, then records the resulting tail in `parent->tail`.
//
// The append is done with an atomic exchange on the tail node's `next` field
// rather than a compare-and-swap. If the exchange returns a non-NULL pointer,
// that pointer (the list that was already linked there) is re-appended after
// the tail of the list just installed, and the loop repeats until an exchange
// returns NULL.
//
// All atomic operations in this function use memory_order_relaxed.
static void _upb_Arena_DoFuseArenaLists(upb_Arena* const parent,
upb_Arena* child) {
// Starting point for the tail search; the loop below walks forward from here.
upb_Arena* parent_tail = upb_Atomic_Load(&parent->tail, memory_order_relaxed);
do {
// Our tail might be stale, but it will always converge to the true tail.
upb_Arena* parent_tail_next =
upb_Atomic_Load(&parent_tail->next, memory_order_relaxed);
// Follow `next` pointers until a node whose `next` was observed as NULL.
while (parent_tail_next != NULL) {
parent_tail = parent_tail_next;
parent_tail_next =
upb_Atomic_Load(&parent_tail->next, memory_order_relaxed);
}
// Unconditionally link `child` after the node we found; `displaced` is
// whatever `next` held at the moment of the exchange (NULL if nothing was
// linked there in the meantime).
upb_Arena* displaced =
upb_Atomic_Exchange(&parent_tail->next, child, memory_order_relaxed);
// Continue from the tail recorded on the list we just linked in. This must
// read `child->tail` before `child` is overwritten below.
parent_tail = upb_Atomic_Load(&child->tail, memory_order_relaxed);
// If we displaced something that got installed racily, we can simply
// reinstall it on our new tail.
child = displaced;
} while (child != NULL);
// Publish the last tail we reached as the new tail hint for `parent`.
upb_Atomic_Store(&parent->tail, parent_tail, memory_order_relaxed);
}
static upb_Arena* _upb_Arena_DoFuse(upb_Arena* a1, upb_Arena* a2,

@ -42,7 +42,9 @@
#define upb_Atomic_Add(addr, val, order) \
atomic_fetch_add_explicit(addr, val, order)
#define upb_Atomic_Sub(addr, val, order) \
atomic_fetch_sub_explicit(addr, val, memory_order_release);
atomic_fetch_sub_explicit(addr, val, order)
#define upb_Atomic_Exchange(addr, val, order) \
atomic_exchange_explicit(addr, val, order)
#define upb_Atomic_CompareExchangeStrong(addr, expected, desired, \
success_order, failure_order) \
atomic_compare_exchange_strong_explicit(addr, expected, desired, \
@ -62,6 +64,15 @@
#define upb_Atomic_Add(addr, val, order) (*(addr) += val)
#define upb_Atomic_Sub(addr, val, order) (*(addr) -= val)
// Plain (non-atomic) pointer exchange: copies the pointer-sized value currently
// stored at `addr` out, writes `value` into `addr`, and returns the value that
// was there before. `addr` is passed as `void*` and accessed through memcpy.
UPB_INLINE void* _upb_NonAtomic_Exchange(void* addr, void* value) {
  void* previous;
  // Save the current contents before they are overwritten.
  memcpy(&previous, addr, sizeof(void*));
  // Install the replacement pointer.
  memcpy(addr, &value, sizeof(void*));
  return previous;
}
#define upb_Atomic_Exchange(addr, val, order) _upb_NonAtomic_Exchange(addr, val)
// `addr` and `expected` are logically double pointers.
UPB_INLINE bool _upb_NonAtomic_CompareExchangeStrongP(void* addr,
void* expected,

Loading…
Cancel
Save