|
|
|
// Protocol Buffers - Google's data interchange format
|
|
|
|
// Copyright 2023 Google LLC. All rights reserved.
|
|
|
|
//
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file or at
|
|
|
|
// https://developers.google.com/open-source/licenses/bsd
|
|
|
|
|
|
|
|
#ifndef UPB_MEM_INTERNAL_ARENA_H_
|
|
|
|
#define UPB_MEM_INTERNAL_ARENA_H_
|
|
|
|
|
|
|
|
#include "upb/mem/arena.h"
|
|
|
|
|
|
|
|
// Must be last.
|
|
|
|
#include "upb/port/def.inc"
|
|
|
|
|
|
|
|
typedef struct _upb_MemBlock _upb_MemBlock;
|
|
|
|
|
|
|
|
struct upb_Arena {
|
|
|
|
_upb_ArenaHead head;
|
|
|
|
|
|
|
|
// upb_alloc* together with a low bit which signals if there is an initial
|
|
|
|
// block.
|
|
|
|
uintptr_t block_alloc;
|
|
|
|
|
|
|
|
// When multiple arenas are fused together, each arena points to a parent
|
|
|
|
// arena (root points to itself). The root tracks how many live arenas
|
|
|
|
// reference it.
|
|
|
|
|
|
|
|
// The low bit is tagged:
|
|
|
|
// 0: pointer to parent
|
|
|
|
// 1: count, left shifted by one
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
UPB_ATOMIC(uintptr_t) parent_or_count;
|
|
|
|
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
// All nodes that are fused together are in a singly-linked list.
|
|
|
|
UPB_ATOMIC(upb_Arena*) next; // NULL at end of list.
|
|
|
|
|
|
|
|
// The last element of the linked list. This is present only as an
|
|
|
|
// optimization, so that we do not have to iterate over all members for every
|
|
|
|
// fuse. Only significant for an arena root. In other cases it is ignored.
|
|
|
|
UPB_ATOMIC(upb_Arena*) tail; // == self when no other list members.
|
|
|
|
|
|
|
|
// Linked list of blocks to free/cleanup. Atomic only for the benefit of
|
|
|
|
// upb_Arena_SpaceAllocated().
|
|
|
|
UPB_ATOMIC(_upb_MemBlock*) blocks;
|
|
|
|
};
|
|
|
|
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
UPB_INLINE bool _upb_Arena_IsTaggedRefcount(uintptr_t parent_or_count) {
|
|
|
|
return (parent_or_count & 1) == 1;
|
|
|
|
}
|
|
|
|
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
UPB_INLINE bool _upb_Arena_IsTaggedPointer(uintptr_t parent_or_count) {
|
|
|
|
return (parent_or_count & 1) == 0;
|
|
|
|
}
|
|
|
|
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixed that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
UPB_INLINE uintptr_t _upb_Arena_RefCountFromTagged(uintptr_t parent_or_count) {
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
UPB_ASSERT(_upb_Arena_IsTaggedRefcount(parent_or_count));
|
|
|
|
return parent_or_count >> 1;
|
|
|
|
}
|
|
|
|
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixed that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
UPB_INLINE uintptr_t _upb_Arena_TaggedFromRefcount(uintptr_t refcount) {
|
|
|
|
uintptr_t parent_or_count = (refcount << 1) | 1;
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
UPB_ASSERT(_upb_Arena_IsTaggedRefcount(parent_or_count));
|
|
|
|
return parent_or_count;
|
|
|
|
}
|
|
|
|
|
|
|
|
UPB_INLINE upb_Arena* _upb_Arena_PointerFromTagged(uintptr_t parent_or_count) {
|
|
|
|
UPB_ASSERT(_upb_Arena_IsTaggedPointer(parent_or_count));
|
|
|
|
return (upb_Arena*)parent_or_count;
|
|
|
|
}
|
|
|
|
|
|
|
|
UPB_INLINE uintptr_t _upb_Arena_TaggedFromPointer(upb_Arena* a) {
|
|
|
|
uintptr_t parent_or_count = (uintptr_t)a;
|
|
|
|
UPB_ASSERT(_upb_Arena_IsTaggedPointer(parent_or_count));
|
|
|
|
return parent_or_count;
|
|
|
|
}
|
|
|
|
|
|
|
|
UPB_INLINE upb_alloc* upb_Arena_BlockAlloc(upb_Arena* arena) {
|
|
|
|
return (upb_alloc*)(arena->block_alloc & ~0x1);
|
|
|
|
}
|
|
|
|
|
|
|
|
UPB_INLINE uintptr_t upb_Arena_MakeBlockAlloc(upb_alloc* alloc,
|
|
|
|
bool has_initial) {
|
|
|
|
uintptr_t alloc_uint = (uintptr_t)alloc;
|
|
|
|
UPB_ASSERT((alloc_uint & 1) == 0);
|
|
|
|
return alloc_uint | (has_initial ? 1 : 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
UPB_INLINE bool upb_Arena_HasInitialBlock(upb_Arena* arena) {
|
|
|
|
return arena->block_alloc & 0x1;
|
|
|
|
}
|
|
|
|
|
|
|
|
#include "upb/port/undef.inc"
|
|
|
|
|
|
|
|
#endif /* UPB_MEM_INTERNAL_ARENA_H_ */
|