|
|
|
/*
|
|
|
|
* Copyright (c) 2009-2021, Google LLC
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
* * Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* * Neither the name of Google LLC nor the
|
|
|
|
* names of its contributors may be used to endorse or promote products
|
|
|
|
* derived from this software without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
|
|
|
|
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
|
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
|
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
|
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "upb/mem/arena_internal.h"
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
#include "upb/port/atomic.h"
|
|
|
|
|
|
|
|
// Must be last.
|
|
|
|
#include "upb/port/def.inc"
|
|
|
|
|
|
|
|
static uintptr_t upb_cleanup_metadata(uint32_t* cleanup,
|
|
|
|
bool has_initial_block) {
|
|
|
|
return (uintptr_t)cleanup | has_initial_block;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct _upb_MemBlock {
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
// Atomic only for the benefit of SpaceAllocated().
|
|
|
|
UPB_ATOMIC(_upb_MemBlock*) next;
|
|
|
|
uint32_t size;
|
|
|
|
// Data follows.
|
|
|
|
};
|
|
|
|
|
|
|
|
static const size_t memblock_reserve =
|
|
|
|
UPB_ALIGN_UP(sizeof(_upb_MemBlock), UPB_MALLOC_ALIGN);
|
|
|
|
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
static upb_Arena* _upb_Arena_FindRoot(upb_Arena* a) {
|
|
|
|
uintptr_t poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
while (_upb_Arena_IsTaggedPointer(poc)) {
|
|
|
|
upb_Arena* next = _upb_Arena_PointerFromTagged(poc);
|
|
|
|
uintptr_t next_poc =
|
|
|
|
upb_Atomic_Load(&next->parent_or_count, memory_order_acquire);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
|
|
|
|
if (_upb_Arena_IsTaggedPointer(next_poc)) {
|
|
|
|
// To keep complexity down, we lazily collapse levels of the tree. This
|
|
|
|
// keeps it flat in the final case, but doesn't cost much incrementally.
|
|
|
|
//
|
|
|
|
// Path splitting keeps time complexity down, see:
|
|
|
|
// https://en.wikipedia.org/wiki/Disjoint-set_data_structure
|
|
|
|
//
|
|
|
|
// We can safely use a relaxed atomic here because all threads doing this
|
|
|
|
// will converge on the same value and we don't need memory orderings to
|
|
|
|
// be visible.
|
|
|
|
//
|
|
|
|
// This is true because:
|
|
|
|
// - If no fuses occur, this will eventually become the root.
|
|
|
|
// - If fuses are actively occuring, the root may change, but the
|
|
|
|
// invariant is that `parent_or_count` merely points to *a* parent.
|
|
|
|
//
|
|
|
|
// In other words, it is moving towards "the" root, and that root may move
|
|
|
|
// further away over time, but the path towards that root will continue to
|
|
|
|
// be valid and the creation of the path carries all the memory orderings
|
|
|
|
// required.
|
|
|
|
upb_Atomic_Store(&a->parent_or_count, next_poc, memory_order_relaxed);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
}
|
|
|
|
a = next;
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
poc = next_poc;
|
|
|
|
}
|
|
|
|
return a;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t upb_Arena_SpaceAllocated(upb_Arena* arena) {
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
arena = _upb_Arena_FindRoot(arena);
|
|
|
|
size_t memsize = 0;
|
|
|
|
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
while (arena != NULL) {
|
|
|
|
_upb_MemBlock* block = arena->blocks;
|
|
|
|
while (block != NULL) {
|
|
|
|
memsize += sizeof(_upb_MemBlock) + block->size;
|
|
|
|
block = upb_Atomic_Load(&block->next, memory_order_relaxed);
|
|
|
|
}
|
|
|
|
arena = upb_Atomic_Load(&arena->next, memory_order_relaxed);
|
|
|
|
}
|
|
|
|
|
|
|
|
return memsize;
|
|
|
|
}
|
|
|
|
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
uint32_t upb_Arena_DebugRefCount(upb_Arena* a) {
|
|
|
|
// These loads could probably be relaxed, but given that this is debug-only,
|
|
|
|
// it's not worth introducing a new variant for it.
|
|
|
|
uintptr_t poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
while (_upb_Arena_IsTaggedPointer(poc)) {
|
|
|
|
a = _upb_Arena_PointerFromTagged(poc);
|
|
|
|
poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
}
|
|
|
|
return _upb_Arena_RefCountFromTagged(poc);
|
|
|
|
}
|
|
|
|
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
static void upb_Arena_AddBlock(upb_Arena* a, void* ptr, size_t size) {
|
|
|
|
_upb_MemBlock* block = ptr;
|
|
|
|
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
// Insert into linked list.
|
|
|
|
upb_Atomic_Init(&block->next, a->blocks);
|
|
|
|
block->size = (uint32_t)size;
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
upb_Atomic_Store(&a->blocks, block, memory_order_relaxed);
|
|
|
|
|
|
|
|
a->head.ptr = UPB_PTR_AT(block, memblock_reserve, char);
|
|
|
|
a->head.end = UPB_PTR_AT(block, size, char);
|
|
|
|
|
|
|
|
UPB_POISON_MEMORY_REGION(a->head.ptr, a->head.end - a->head.ptr);
|
|
|
|
}
|
|
|
|
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
static bool upb_Arena_AllocBlock(upb_Arena* a, size_t size) {
|
|
|
|
if (!a->block_alloc) return false;
|
|
|
|
_upb_MemBlock* last_block = upb_Atomic_Load(&a->blocks, memory_order_relaxed);
|
|
|
|
size_t last_size = last_block != NULL ? last_block->size : 128;
|
|
|
|
size_t block_size = UPB_MAX(size, last_size * 2) + memblock_reserve;
|
|
|
|
_upb_MemBlock* block = upb_malloc(upb_Arena_BlockAlloc(a), block_size);
|
|
|
|
|
|
|
|
if (!block) return false;
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
upb_Arena_AddBlock(a, block, block_size);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void* _upb_Arena_SlowMalloc(upb_Arena* a, size_t size) {
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
if (!upb_Arena_AllocBlock(a, size)) return NULL; /* Out of memory. */
|
|
|
|
UPB_ASSERT(_upb_ArenaHas(a) >= size);
|
|
|
|
return upb_Arena_Malloc(a, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Public Arena API ***********************************************************/
|
|
|
|
|
|
|
|
static upb_Arena* upb_Arena_InitSlow(upb_alloc* alloc) {
|
|
|
|
const size_t first_block_overhead = sizeof(upb_Arena) + memblock_reserve;
|
|
|
|
upb_Arena* a;
|
|
|
|
|
|
|
|
/* We need to malloc the initial block. */
|
|
|
|
char* mem;
|
|
|
|
size_t n = first_block_overhead + 256;
|
|
|
|
if (!alloc || !(mem = upb_malloc(alloc, n))) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
a = UPB_PTR_AT(mem, n - sizeof(*a), upb_Arena);
|
|
|
|
n -= sizeof(*a);
|
|
|
|
|
|
|
|
a->block_alloc = upb_Arena_MakeBlockAlloc(alloc, 0);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
upb_Atomic_Init(&a->parent_or_count, _upb_Arena_TaggedFromRefcount(1));
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
upb_Atomic_Init(&a->next, NULL);
|
|
|
|
upb_Atomic_Init(&a->tail, a);
|
|
|
|
upb_Atomic_Init(&a->blocks, NULL);
|
|
|
|
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
upb_Arena_AddBlock(a, mem, n);
|
|
|
|
|
|
|
|
return a;
|
|
|
|
}
|
|
|
|
|
|
|
|
upb_Arena* upb_Arena_Init(void* mem, size_t n, upb_alloc* alloc) {
|
|
|
|
upb_Arena* a;
|
|
|
|
|
|
|
|
if (n) {
|
|
|
|
/* Align initial pointer up so that we return properly-aligned pointers. */
|
|
|
|
void* aligned = (void*)UPB_ALIGN_UP((uintptr_t)mem, UPB_MALLOC_ALIGN);
|
|
|
|
size_t delta = (uintptr_t)aligned - (uintptr_t)mem;
|
|
|
|
n = delta <= n ? n - delta : 0;
|
|
|
|
mem = aligned;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Round block size down to alignof(*a) since we will allocate the arena
|
|
|
|
* itself at the end. */
|
|
|
|
n = UPB_ALIGN_DOWN(n, UPB_ALIGN_OF(upb_Arena));
|
|
|
|
|
|
|
|
if (UPB_UNLIKELY(n < sizeof(upb_Arena))) {
|
|
|
|
return upb_Arena_InitSlow(alloc);
|
|
|
|
}
|
|
|
|
|
|
|
|
a = UPB_PTR_AT(mem, n - sizeof(*a), upb_Arena);
|
|
|
|
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
upb_Atomic_Init(&a->parent_or_count, _upb_Arena_TaggedFromRefcount(1));
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
upb_Atomic_Init(&a->next, NULL);
|
|
|
|
upb_Atomic_Init(&a->tail, a);
|
|
|
|
upb_Atomic_Init(&a->blocks, NULL);
|
|
|
|
a->block_alloc = upb_Arena_MakeBlockAlloc(alloc, 1);
|
|
|
|
a->head.ptr = mem;
|
|
|
|
a->head.end = UPB_PTR_AT(mem, n - sizeof(*a), char);
|
|
|
|
|
|
|
|
return a;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void arena_dofree(upb_Arena* a) {
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
UPB_ASSERT(_upb_Arena_RefCountFromTagged(a->parent_or_count) == 1);
|
|
|
|
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
while (a != NULL) {
|
|
|
|
// Load first since arena itself is likely from one of its blocks.
|
|
|
|
upb_Arena* next_arena =
|
|
|
|
(upb_Arena*)upb_Atomic_Load(&a->next, memory_order_acquire);
|
|
|
|
upb_alloc* block_alloc = upb_Arena_BlockAlloc(a);
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
_upb_MemBlock* block = upb_Atomic_Load(&a->blocks, memory_order_relaxed);
|
|
|
|
while (block != NULL) {
|
|
|
|
// Load first since we are deleting block.
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
_upb_MemBlock* next_block =
|
|
|
|
upb_Atomic_Load(&block->next, memory_order_relaxed);
|
|
|
|
upb_free(block_alloc, block);
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
block = next_block;
|
|
|
|
}
|
|
|
|
a = next_arena;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void upb_Arena_Free(upb_Arena* a) {
|
|
|
|
uintptr_t poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
retry:
|
|
|
|
while (_upb_Arena_IsTaggedPointer(poc)) {
|
|
|
|
a = _upb_Arena_PointerFromTagged(poc);
|
|
|
|
poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
}
|
|
|
|
|
|
|
|
// compare_exchange or fetch_sub are RMW operations, which are more
|
|
|
|
// expensive then direct loads. As an optimization, we only do RMW ops
|
|
|
|
// when we need to update things for other threads to see.
|
|
|
|
if (poc == _upb_Arena_TaggedFromRefcount(1)) {
|
|
|
|
arena_dofree(a);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (upb_Atomic_CompareExchangeStrong(
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
&a->parent_or_count, &poc,
|
|
|
|
_upb_Arena_TaggedFromRefcount(_upb_Arena_RefCountFromTagged(poc) - 1),
|
|
|
|
memory_order_release, memory_order_acquire)) {
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
// We were >1 and we decremented it successfully, so we are done.
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// We failed our update, so someone has done something, retry the whole
|
|
|
|
// process, but the failed exchange reloaded `poc` for us.
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool upb_Arena_Fuse(upb_Arena* a1, upb_Arena* a2) {
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
// SAFE IN THE PRESENCE OF FUSE/FREE RACES BUT NOT IN THE
|
|
|
|
// PRESENCE OF FUSE/FUSE RACES!!!
|
|
|
|
//
|
|
|
|
// `parent_or_count` has two disctint modes
|
|
|
|
// - parent pointer mode
|
|
|
|
// - refcount mode
|
|
|
|
//
|
|
|
|
// In parent pointer mode, it may change what pointer it refers to in the
|
|
|
|
// tree, but it will always approach a root. Any operation that walks the
|
|
|
|
// tree to the root may collapse levels of the tree concurrently.
|
|
|
|
//
|
|
|
|
// In refcount mode, any free operation may lower the refcount.
|
|
|
|
//
|
|
|
|
// Only a fuse operation may increase the refcount.
|
|
|
|
// Only a fuse operation may switch `parent_or_count` from parent mode to
|
|
|
|
// refcount mode.
|
|
|
|
//
|
|
|
|
// Given that we do not allow fuse/fuse races, we may rely on the invariant
|
|
|
|
// that only refcounts can change once we have found the root. Because the
|
|
|
|
// threads doing the fuse must hold references, we can guarantee that no
|
|
|
|
// refcounts will reach zero concurrently.
|
|
|
|
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
upb_Arena* r1 = _upb_Arena_FindRoot(a1);
|
|
|
|
upb_Arena* r2 = _upb_Arena_FindRoot(a2);
|
|
|
|
|
|
|
|
if (r1 == r2) return true; // Already fused.
|
|
|
|
|
|
|
|
// Do not fuse initial blocks since we cannot lifetime extend them.
|
|
|
|
// Any other fuse scenario is allowed.
|
|
|
|
if (upb_Arena_HasInitialBlock(r1)) return false;
|
|
|
|
if (upb_Arena_HasInitialBlock(r2)) return false;
|
|
|
|
|
|
|
|
uintptr_t r1_poc =
|
|
|
|
upb_Atomic_Load(&r1->parent_or_count, memory_order_acquire);
|
|
|
|
uintptr_t r2_poc =
|
|
|
|
upb_Atomic_Load(&r2->parent_or_count, memory_order_acquire);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
UPB_ASSERT(_upb_Arena_IsTaggedRefcount(r1_poc));
|
|
|
|
UPB_ASSERT(_upb_Arena_IsTaggedRefcount(r2_poc));
|
|
|
|
|
|
|
|
// Keep the tree shallow by joining the smaller tree to the larger.
|
|
|
|
if (_upb_Arena_RefCountFromTagged(r1_poc) <
|
|
|
|
_upb_Arena_RefCountFromTagged(r2_poc)) {
|
|
|
|
upb_Arena* tmp = r1;
|
|
|
|
r1 = r2;
|
|
|
|
r2 = tmp;
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
|
|
|
|
uintptr_t tmp_poc = r1_poc;
|
|
|
|
r1_poc = r2_poc;
|
|
|
|
r2_poc = tmp_poc;
|
|
|
|
}
|
|
|
|
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
// The moment we install `r1` as the parent for `r2` all racing frees may
|
|
|
|
// immediately begin decrementing `r1`'s refcount. So we must install all the
|
|
|
|
// refcounts that we know about first to prevent a premature unref to zero.
|
|
|
|
uint32_t r2_refcount = _upb_Arena_RefCountFromTagged(r2_poc);
|
|
|
|
upb_Atomic_Add(&r1->parent_or_count, ((uintptr_t)r2_refcount) << 1,
|
|
|
|
memory_order_release);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
|
|
|
|
// When installing `r1` as the parent for `r2` racing frees may have changed
|
|
|
|
// the refcount for `r2` so we need to capture the old value to fix up `r1`'s
|
|
|
|
// refcount based on the delta from what we saw the first time.
|
|
|
|
r2_poc = upb_Atomic_Exchange(&r2->parent_or_count,
|
|
|
|
_upb_Arena_TaggedFromPointer(r1),
|
|
|
|
memory_order_acq_rel);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
UPB_ASSERT(_upb_Arena_IsTaggedRefcount(r2_poc));
|
|
|
|
uint32_t delta_refcount = r2_refcount - _upb_Arena_RefCountFromTagged(r2_poc);
|
|
|
|
if (delta_refcount != 0) {
|
|
|
|
upb_Atomic_Sub(&r1->parent_or_count, ((uintptr_t)delta_refcount) << 1,
|
|
|
|
memory_order_release);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
}
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
|
|
|
|
// Now append r2's linked list of arenas to r1's.
|
|
|
|
upb_Arena* r2_tail = upb_Atomic_Load(&r2->tail, memory_order_relaxed);
|
|
|
|
upb_Arena* r1_tail = upb_Atomic_Load(&r1->tail, memory_order_relaxed);
|
|
|
|
upb_Arena* r1_next = upb_Atomic_Load(&r1_tail->next, memory_order_relaxed);
|
|
|
|
while (r1_next != NULL) {
|
|
|
|
// r1->tail was stale. This can happen, but tail should always converge on
|
|
|
|
// the true tail.
|
|
|
|
r1_tail = r1_next;
|
|
|
|
r1_next = upb_Atomic_Load(&r1_tail->next, memory_order_relaxed);
|
|
|
|
}
|
|
|
|
|
|
|
|
upb_Arena* old_next =
|
|
|
|
upb_Atomic_Exchange(&r1_tail->next, r2, memory_order_relaxed);
|
|
|
|
|
|
|
|
// Once fuse/fuse races are allowed, it will need to be a CAS instead that
|
|
|
|
// handles this mismatch gracefully.
|
|
|
|
UPB_ASSERT(old_next == NULL);
|
|
|
|
|
|
|
|
upb_Atomic_Store(&r1->tail, r2_tail, memory_order_relaxed);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|