|
|
|
// Protocol Buffers - Google's data interchange format
|
|
|
|
// Copyright 2023 Google LLC. All rights reserved.
|
|
|
|
//
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file or at
|
|
|
|
// https://developers.google.com/open-source/licenses/bsd
|
|
|
|
|
|
|
|
#include "upb/mem/arena.h"
|
|
|
|
|
|
|
|
#ifdef UPB_TRACING_ENABLED
|
|
|
|
#include <stdatomic.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <stdint.h>
|
|
|
|
|
|
|
|
#include "upb/mem/alloc.h"
|
|
|
|
#include "upb/mem/internal/arena.h"
|
|
|
|
#include "upb/port/atomic.h"
|
|
|
|
|
|
|
|
// Must be last.
|
|
|
|
#include "upb/port/def.inc"
|
|
|
|
|
|
|
|
// Process-wide upper bound on arena block size (presumably in bytes; the
// readers of this value are outside this excerpt). Starts at 32 << 10 (32768)
// and is overwritten by upb_Arena_SetMaxBlockSize().
static UPB_ATOMIC(size_t) g_max_block_size = 32 << 10;
|
|
|
|
|
|
|
|
void upb_Arena_SetMaxBlockSize(size_t max) {
|
|
|
|
upb_Atomic_Store(&g_max_block_size, max, memory_order_relaxed);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Header of one arena memory block. Blocks are chained through `next`, and the
// block's payload follows the header in memory.
//
// `next` is deliberately a plain (non-atomic) pointer: the block list is no
// longer traversed to compute allocated space, so the list (and its head) do
// not need atomics. The earlier atomic version had a race in which a
// concurrent SpaceAllocated reader could observe a block before its `size`
// and `next` fields had been initialized.
typedef struct upb_MemBlock {
  struct upb_MemBlock* next;
  size_t size;
  // Data follows.
} upb_MemBlock;
|
|
|
|
|
|
|
|
typedef struct upb_ArenaInternal {
|
|
|
|
// upb_alloc* together with a low bit which signals if there is an initial
|
|
|
|
// block.
|
|
|
|
uintptr_t block_alloc;
|
|
|
|
|
|
|
|
// The cleanup for the allocator. This is called after all the blocks are
|
|
|
|
// freed in an arena.
|
|
|
|
upb_AllocCleanupFunc* upb_alloc_cleanup;
|
|
|
|
|
|
|
|
// When multiple arenas are fused together, each arena points to a parent
|
|
|
|
// arena (root points to itself). The root tracks how many live arenas
|
|
|
|
// reference it.
|
|
|
|
|
|
|
|
// The low bit is tagged:
|
|
|
|
// 0: pointer to parent
|
|
|
|
// 1: count, left shifted by one
|
|
|
|
UPB_ATOMIC(uintptr_t) parent_or_count;
|
|
|
|
|
|
|
|
// All nodes that are fused together are in a singly-linked list.
|
|
|
|
// == NULL at end of list.
|
|
|
|
UPB_ATOMIC(struct upb_ArenaInternal*) next;
|
|
|
|
|
|
|
|
// If the low bit is set, is a pointer to the tail of the list (populated for
|
|
|
|
// roots, set to self for roots with no fused arenas). If the low bit is not
|
|
|
|
// set, is a pointer to the previous node in the list, such that
|
|
|
|
// a->previous_or_tail->next == a.
|
|
|
|
UPB_ATOMIC(uintptr_t) previous_or_tail;
|
|
|
|
|
Remove atomics from linked list of blocks
We no longer need to traverse the linked list of blocks to check allocated space, which means we also no longer need atomics in the linked list or even its head. This is especially beneficial as the previous implementation contained a race where we could dereference uninitialized memory; because the setting of the `next` pointers did not use release semantics and the reading of them in `SpaceAllocated` reads with relaxed order, there's no guarantee that `size` has actually been initialized - but worse, *there is also no guarantee that `next` has been!*. Simplified:
```
AddBlock:
1 ptr = malloc();
2 ptr->size = 123;
3 ptr->next = ai->blocks;
4 ai->blocks = ptr (release order);
```
```
SpaceAllocated:
5 block = ai->blocks (relaxed order)
6 block->size (acquire, but probably by accident)
7 block = block->next (relaxed order)
```
So I think a second thread calling SpaceAllocated could see the order 1, 4, 5, 6, 7, 2, 3 and read uninitialized memory - there is no data-dependency relationship or happens-before edge that this order violates, and so it would be valid for a compiler+hardware to produce.
In reality, operation 4 will produce an `stlr` on arm (forcing an order of 1, 2, 3 before 4), and `block->next` has a data dependency on `ai->blocks` which would force an ordering in the hardware between 5->6 and 5->7 even for regular `ldr` instructions.
Delete arena contains, it's private and the only user is its own test.
PiperOrigin-RevId: 709918443
2 months ago
|
|
|
// Linked list of blocks to free/cleanup.
|
|
|
|
upb_MemBlock* blocks;
|
|
|
|
|
|
|
|
// Total space allocated in blocks, atomic only for SpaceAllocated
|
|
|
|
UPB_ATOMIC(uintptr_t) space_allocated;
|
|
|
|
|
|
|
|
UPB_TSAN_PUBLISHED_MEMBER
|
|
|
|
} upb_ArenaInternal;
|
|
|
|
|
|
|
|
// All public + private state for an arena.
|
|
|
|
typedef struct {
|
|
|
|
upb_Arena head;
|
|
|
|
upb_ArenaInternal body;
|
|
|
|
} upb_ArenaState;
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
upb_ArenaInternal* root;
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixed that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
uintptr_t tagged_count;
|
|
|
|
} upb_ArenaRoot;
|
|
|
|
|
|
|
|
static const size_t kUpb_MemblockReserve =
|
|
|
|
UPB_ALIGN_MALLOC(sizeof(upb_MemBlock));
|
|
|
|
|
|
|
|
// Extracts the (upb_ArenaInternal*) from a (upb_Arena*)
|
|
|
|
static upb_ArenaInternal* upb_Arena_Internal(const upb_Arena* a) {
|
|
|
|
return &((upb_ArenaState*)a)->body;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns true if `parent_or_count` holds a refcount, i.e. its low tag bit is
// set (see the encoding described on upb_ArenaInternal::parent_or_count).
static bool _upb_Arena_IsTaggedRefcount(uintptr_t parent_or_count) {
  return (parent_or_count & 1) == 1;
}
|
|
|
|
|
|
|
|
// Returns true if `parent_or_count` holds a parent pointer, i.e. its low tag
// bit is clear (see the encoding described on
// upb_ArenaInternal::parent_or_count).
static bool _upb_Arena_IsTaggedPointer(uintptr_t parent_or_count) {
  return (parent_or_count & 1) == 0;
}
|
|
|
|
|
|
|
|
// Decodes a tagged refcount word: drops the tag bit by shifting right by one.
// Asserts that `parent_or_count` really is a tagged refcount (low bit set).
static uintptr_t _upb_Arena_RefCountFromTagged(uintptr_t parent_or_count) {
  UPB_ASSERT(_upb_Arena_IsTaggedRefcount(parent_or_count));
  return parent_or_count >> 1;
}
|
|
|
|
|
|
|
|
// Encodes `refcount` as a tagged word: the count is shifted left by one and
// the low tag bit is set. The most significant bit of `refcount` is shifted
// out, so the count must fit in (bits of uintptr_t - 1) bits to round-trip.
static uintptr_t _upb_Arena_TaggedFromRefcount(uintptr_t refcount) {
  uintptr_t parent_or_count = (refcount << 1) | 1;
  UPB_ASSERT(_upb_Arena_IsTaggedRefcount(parent_or_count));
  return parent_or_count;
}
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixed that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
|
|
|
|
static upb_ArenaInternal* _upb_Arena_PointerFromTagged(
|
|
|
|
uintptr_t parent_or_count) {
|
|
|
|
UPB_ASSERT(_upb_Arena_IsTaggedPointer(parent_or_count));
|
|
|
|
return (upb_ArenaInternal*)parent_or_count;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Encodes a parent pointer as a tagged word. The pointer value is used as-is;
// the assertion checks that its low bit is clear so it cannot be confused
// with a tagged refcount.
static uintptr_t _upb_Arena_TaggedFromPointer(upb_ArenaInternal* ai) {
  uintptr_t parent_or_count = (uintptr_t)ai;
  UPB_ASSERT(_upb_Arena_IsTaggedPointer(parent_or_count));
  return parent_or_count;
}
|
|
|
|
|
|
|
|
// Returns true if `previous_or_tail` holds a tail pointer, i.e. its low tag
// bit is set (see upb_ArenaInternal::previous_or_tail).
static bool _upb_Arena_IsTaggedTail(uintptr_t previous_or_tail) {
  return (previous_or_tail & 1) == 1;
}
|
|
|
|
|
|
|
|
// Returns true if `previous_or_tail` holds a previous-node pointer, i.e. its
// low tag bit is clear (see upb_ArenaInternal::previous_or_tail).
static bool _upb_Arena_IsTaggedPrevious(uintptr_t previous_or_tail) {
  return (previous_or_tail & 1) == 0;
}
|
|
|
|
|
|
|
|
static upb_ArenaInternal* _upb_Arena_TailFromTagged(
|
|
|
|
uintptr_t previous_or_tail) {
|
|
|
|
UPB_ASSERT(_upb_Arena_IsTaggedTail(previous_or_tail));
|
|
|
|
return (upb_ArenaInternal*)(previous_or_tail ^ 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Encodes a tail pointer as a tagged word by setting the low tag bit.
static uintptr_t _upb_Arena_TaggedFromTail(upb_ArenaInternal* tail) {
  uintptr_t previous_or_tail = (uintptr_t)tail | 1;
  UPB_ASSERT(_upb_Arena_IsTaggedTail(previous_or_tail));
  return previous_or_tail;
}
|
|
|
|
|
|
|
|
static upb_ArenaInternal* _upb_Arena_PreviousFromTagged(
|
|
|
|
uintptr_t previous_or_tail) {
|
|
|
|
UPB_ASSERT(_upb_Arena_IsTaggedPrevious(previous_or_tail));
|
|
|
|
return (upb_ArenaInternal*)previous_or_tail;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Encodes a previous-node pointer as a tagged word. The pointer value is used
// as-is; the assertion checks that its low bit is clear so it cannot be
// confused with a tagged tail.
static uintptr_t _upb_Arena_TaggedFromPrevious(upb_ArenaInternal* ai) {
  uintptr_t previous = (uintptr_t)ai;
  UPB_ASSERT(_upb_Arena_IsTaggedPrevious(previous));
  return previous;
}
|
|
|
|
|
|
|
|
// Returns the upb_alloc* stored in `ai->block_alloc`, with the low
// "has initial block" flag bit masked off.
static upb_alloc* _upb_ArenaInternal_BlockAlloc(upb_ArenaInternal* ai) {
  // Build the mask at uintptr_t width rather than relying on sign-extension
  // of the int expression ~0x1; the resulting value is the same.
  return (upb_alloc*)(ai->block_alloc & ~(uintptr_t)1);
}
|
|
|
|
|
|
|
|
// Packs an allocator pointer and the "has initial block" flag into one word:
// the pointer occupies the word and the flag is stored in its low bit.
// Asserts that `alloc` has its low bit clear so the flag cannot collide with
// pointer bits.
static uintptr_t _upb_Arena_MakeBlockAlloc(upb_alloc* alloc, bool has_initial) {
  uintptr_t alloc_uint = (uintptr_t)alloc;
  UPB_ASSERT((alloc_uint & 1) == 0);
  return alloc_uint | (has_initial ? 1 : 0);
}
|
|
|
|
|
|
|
|
// Returns true if the low flag bit of `ai->block_alloc` is set, which signals
// that the arena has an initial block.
static bool _upb_ArenaInternal_HasInitialBlock(upb_ArenaInternal* ai) {
  return (ai->block_alloc & 1) != 0;
}
|
|
|
|
|
|
|
|
#ifdef UPB_TRACING_ENABLED
|
|
|
|
// Optional tracing callbacks, installed by upb_Arena_SetTraceHandler(). Each
// one is NULL until a handler is installed; the upb_Arena_Log*() functions
// skip the call while it is NULL.
static void (*_init_arena_trace_handler)(const upb_Arena*, size_t size) = NULL;
static void (*_fuse_arena_trace_handler)(const upb_Arena*,
                                         const upb_Arena*) = NULL;
static void (*_free_arena_trace_handler)(const upb_Arena*) = NULL;
|
|
|
|
|
|
|
|
void upb_Arena_SetTraceHandler(
|
|
|
|
void (*initArenaTraceHandler)(const upb_Arena*, size_t size),
|
|
|
|
void (*fuseArenaTraceHandler)(const upb_Arena*, const upb_Arena*),
|
|
|
|
void (*freeArenaTraceHandler)(const upb_Arena*)) {
|
|
|
|
_init_arena_trace_handler = initArenaTraceHandler;
|
|
|
|
_fuse_arena_trace_handler = fuseArenaTraceHandler;
|
|
|
|
_free_arena_trace_handler = freeArenaTraceHandler;
|
|
|
|
}
|
|
|
|
|
|
|
|
void upb_Arena_LogInit(const upb_Arena* arena, size_t size) {
|
|
|
|
if (_init_arena_trace_handler) {
|
|
|
|
_init_arena_trace_handler(arena, size);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
void upb_Arena_LogFuse(const upb_Arena* arena1, const upb_Arena* arena2) {
|
|
|
|
if (_fuse_arena_trace_handler) {
|
|
|
|
_fuse_arena_trace_handler(arena1, arena2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
void upb_Arena_LogFree(const upb_Arena* arena) {
|
|
|
|
if (_free_arena_trace_handler) {
|
|
|
|
_free_arena_trace_handler(arena);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif // UPB_TRACING_ENABLED
|
|
|
|
|
|
|
|
// If the param a is already the root, provides no memory order of refcount.
|
|
|
|
// If it has a parent, then acquire memory order is provided for both the root
|
|
|
|
// and the refcount. Thread safe.
|
|
|
|
static upb_ArenaRoot _upb_Arena_FindRoot(upb_ArenaInternal* ai) {
|
|
|
|
uintptr_t poc = upb_Atomic_Load(&ai->parent_or_count, memory_order_relaxed);
|
|
|
|
if (_upb_Arena_IsTaggedRefcount(poc)) {
|
|
|
|
// Fast, relaxed path - arenas that have never been fused to a parent only
|
|
|
|
// need relaxed memory order, since they're returning themselves and the
|
|
|
|
// refcount.
|
|
|
|
return (upb_ArenaRoot){.root = ai, .tagged_count = poc};
|
|
|
|
}
|
|
|
|
// Slow path needs acquire order; reloading is cheaper than a fence on ARM
|
|
|
|
// (LDA vs DMB ISH). Even though this is a reread, we know it must be a tagged
|
|
|
|
// pointer because if this Arena isn't a root, it can't ever become one.
|
|
|
|
poc = upb_Atomic_Load(&ai->parent_or_count, memory_order_acquire);
|
|
|
|
do {
|
|
|
|
upb_ArenaInternal* next = _upb_Arena_PointerFromTagged(poc);
|
|
|
|
UPB_TSAN_CHECK_PUBLISHED(next);
|
|
|
|
UPB_ASSERT(ai != next);
|
|
|
|
poc = upb_Atomic_Load(&next->parent_or_count, memory_order_acquire);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
|
|
|
|
if (_upb_Arena_IsTaggedPointer(poc)) {
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
// To keep complexity down, we lazily collapse levels of the tree. This
|
|
|
|
// keeps it flat in the final case, but doesn't cost much incrementally.
|
|
|
|
//
|
|
|
|
// Path splitting keeps time complexity down, see:
|
|
|
|
// https://en.wikipedia.org/wiki/Disjoint-set_data_structure
|
|
|
|
UPB_ASSERT(ai != _upb_Arena_PointerFromTagged(poc));
|
|
|
|
upb_Atomic_Store(&ai->parent_or_count, poc, memory_order_release);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
}
|
|
|
|
ai = next;
|
|
|
|
} while (_upb_Arena_IsTaggedPointer(poc));
|
|
|
|
return (upb_ArenaRoot){.root = ai, .tagged_count = poc};
|
|
|
|
}
|
|
|
|
|
|
|
|
uintptr_t upb_Arena_SpaceAllocated(const upb_Arena* arena,
|
|
|
|
size_t* fused_count) {
|
|
|
|
upb_ArenaInternal* ai = upb_Arena_Internal(arena);
|
|
|
|
uintptr_t memsize = 0;
|
|
|
|
size_t local_fused_count = 0;
|
|
|
|
// Our root would get updated by any racing fuses before our target arena
|
|
|
|
// became reachable from the root via the linked list; in order to preserve
|
|
|
|
// monotonic output (any arena counted by a previous invocation is counted by
|
|
|
|
// this one), we instead iterate forwards and backwards so that we only see
|
|
|
|
// the results of completed fuses.
|
|
|
|
uintptr_t previous_or_tail =
|
|
|
|
upb_Atomic_Load(&ai->previous_or_tail, memory_order_acquire);
|
|
|
|
while (_upb_Arena_IsTaggedPrevious(previous_or_tail)) {
|
|
|
|
upb_ArenaInternal* previous =
|
|
|
|
_upb_Arena_PreviousFromTagged(previous_or_tail);
|
|
|
|
UPB_ASSERT(previous != ai);
|
|
|
|
UPB_TSAN_CHECK_PUBLISHED(previous);
|
|
|
|
// Unfortunate macro behavior; prior to C11 when using nonstandard atomics
|
|
|
|
// this returns a void* and can't be used with += without an intermediate
|
|
|
|
// conversion to an integer.
|
|
|
|
// Relaxed is safe - no subsequent reads depend this one
|
|
|
|
uintptr_t allocated =
|
|
|
|
upb_Atomic_Load(&previous->space_allocated, memory_order_relaxed);
|
|
|
|
memsize += allocated;
|
|
|
|
previous_or_tail =
|
|
|
|
upb_Atomic_Load(&previous->previous_or_tail, memory_order_acquire);
|
|
|
|
local_fused_count++;
|
|
|
|
}
|
|
|
|
while (ai != NULL) {
|
|
|
|
UPB_TSAN_CHECK_PUBLISHED(ai);
|
|
|
|
// Unfortunate macro behavior; prior to C11 when using nonstandard atomics
|
|
|
|
// this returns a void* and can't be used with += without an intermediate
|
|
|
|
// conversion to an integer.
|
|
|
|
// Relaxed is safe - no subsequent reads depend this one
|
|
|
|
uintptr_t allocated =
|
|
|
|
upb_Atomic_Load(&ai->space_allocated, memory_order_relaxed);
|
|
|
|
memsize += allocated;
|
|
|
|
ai = upb_Atomic_Load(&ai->next, memory_order_acquire);
|
|
|
|
local_fused_count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fused_count) *fused_count = local_fused_count;
|
|
|
|
return memsize;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t upb_Arena_DebugRefCount(const upb_Arena* a) {
|
|
|
|
uintptr_t tagged = _upb_Arena_FindRoot(upb_Arena_Internal(a)).tagged_count;
|
|
|
|
return (uint32_t)_upb_Arena_RefCountFromTagged(tagged);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Registers the memory at `ptr` (of `block_size` bytes) as the arena's newest
// block and points the arena's bump region at it.
//
//   offset:     byte offset from `ptr` at which allocatable space begins; must
//               be at least kUpb_MemblockReserve.
//   block_size: total size of the block in bytes, header included.
static void _upb_Arena_AddBlock(upb_Arena* a, void* ptr, size_t offset,
                                size_t block_size) {
  UPB_ASSERT(offset >= kUpb_MemblockReserve);

  upb_ArenaInternal* ai = upb_Arena_Internal(a);
  upb_MemBlock* block = ptr;
  block->size = block_size;

  // Push onto the head of the arena's block list.
  block->next = ai->blocks;
  ai->blocks = block;

  // The allocatable region is [block + offset, block + block_size).
  char* region_begin = UPB_PTR_AT(block, offset, char);
  char* region_end = UPB_PTR_AT(block, block_size, char);
  a->UPB_PRIVATE(ptr) = region_begin;
  a->UPB_PRIVATE(end) = region_end;

  // Mark the not-yet-allocated region as poisoned.
  UPB_POISON_MEMORY_REGION(a->UPB_PRIVATE(ptr),
                           a->UPB_PRIVATE(end) - a->UPB_PRIVATE(ptr));
}
|
|
|
|
|
|
|
|
// Obtains a new block from the arena's block allocator, large enough for a
// request of `size` bytes, and installs it as the arena's current block.
//
// Returns false when the arena has no block allocator, when `size` is so large
// that adding the block header would overflow size_t, or when the allocator
// fails. Returns true once the block is installed and the arena's
// `space_allocated` counter has been increased by the block size.
static bool _upb_Arena_AllocBlock(upb_Arena* a, size_t size) {
  upb_ArenaInternal* ai = upb_Arena_Internal(a);
  if (!ai->block_alloc) return false;

  // Guard against size_t wrap-around in `kUpb_MemblockReserve + size` below.
  // Without this, a huge request would wrap to a small value, a block smaller
  // than `size` would be installed, and success would be reported.
  if (size > SIZE_MAX - kUpb_MemblockReserve) return false;

  // Distance from the start of the list-head block to the arena's `end`
  // pointer, or a 128-byte seed when the arena has no blocks in its list yet.
  size_t last_size = 128;
  upb_MemBlock* last_block = ai->blocks;
  if (last_block) {
    last_size = a->UPB_PRIVATE(end) - (char*)last_block;
  }

  // Relaxed order is safe here as we don't need any ordering with the setter.
  size_t max_block_size =
      upb_Atomic_Load(&g_max_block_size, memory_order_relaxed);

  // Don't naturally grow beyond the max block size.
  size_t clamped_size = UPB_MIN(last_size * 2, max_block_size);

  // We may need to exceed the max block size if the user requested a large
  // allocation.
  size_t block_size = UPB_MAX(kUpb_MemblockReserve + size, clamped_size);

  upb_MemBlock* block =
      upb_malloc(_upb_ArenaInternal_BlockAlloc(ai), block_size);

  if (!block) return false;
  _upb_Arena_AddBlock(a, block, kUpb_MemblockReserve, block_size);

  // Atomic add not required here, as threads won't race allocating blocks, plus
  // atomic fetch-add is slower than load/add/store on arm devices compiled
  // targeting pre-v8.1. Relaxed order is safe as nothing depends on order of
  // size allocated.
  uintptr_t old_space_allocated =
      upb_Atomic_Load(&ai->space_allocated, memory_order_relaxed);
  upb_Atomic_Store(&ai->space_allocated, old_space_allocated + block_size,
                   memory_order_relaxed);
  UPB_ASSERT(UPB_PRIVATE(_upb_ArenaHas)(a) >= size);
  return true;
}
|
|
|
|
|
|
|
|
// Slow allocation path: adds a new block able to hold `size` bytes, then
// retries the allocation through upb_Arena_Malloc().
//
// Returns NULL when no block could be allocated.
// NOTE(review): the retry passes `size - UPB_ASAN_GUARD_SIZE`, which suggests
// the caller already included the guard bytes in `size` -- confirm against the
// inline upb_Arena_Malloc().
void* UPB_PRIVATE(_upb_Arena_SlowMalloc)(upb_Arena* a, size_t size) {
  if (_upb_Arena_AllocBlock(a, size)) {
    return upb_Arena_Malloc(a, size - UPB_ASAN_GUARD_SIZE);
  }
  return NULL;  // OOM
}
|
|
|
|
|
|
|
|
// Creates an arena whose state is stored inside a first block obtained from
// `alloc`. The block leaves room for at least `first_size` bytes (aligned,
// plus the ASAN guard), with a floor of 256 bytes, after the arena state.
//
// Returns NULL when `alloc` is NULL or the allocation fails.
static upb_Arena* _upb_Arena_InitSlow(upb_alloc* alloc, size_t first_size) {
  if (!alloc) return NULL;

  // Bytes at the front of the first block taken by the block header and the
  // arena state, rounded up to malloc alignment.
  const size_t first_block_overhead =
      UPB_ALIGN_MALLOC(kUpb_MemblockReserve + sizeof(upb_ArenaState));
  const size_t block_size =
      first_block_overhead +
      UPB_MAX(256, UPB_ALIGN_MALLOC(first_size) + UPB_ASAN_GUARD_SIZE);

  // We need to malloc the initial block.
  char* mem = upb_malloc(alloc, block_size);
  if (!mem) return NULL;

  // The arena state sits immediately after the block header.
  upb_ArenaState* state =
      UPB_PTR_AT(mem, kUpb_MemblockReserve, upb_ArenaState);

  // NOTE(review): the second argument (0 here, 1 in upb_Arena_Init) presumably
  // flags whether the arena uses a caller-provided initial block -- confirm.
  state->body.block_alloc = _upb_Arena_MakeBlockAlloc(alloc, 0);
  upb_Atomic_Init(&state->body.parent_or_count,
                  _upb_Arena_TaggedFromRefcount(1));
  upb_Atomic_Init(&state->body.next, NULL);
  upb_Atomic_Init(&state->body.previous_or_tail,
                  _upb_Arena_TaggedFromTail(&state->body));
  // The first block counts toward the arena's allocated space.
  upb_Atomic_Init(&state->body.space_allocated, block_size);
  state->body.blocks = NULL;
  state->body.upb_alloc_cleanup = NULL;
  UPB_TSAN_INIT_PUBLISHED(&state->body);

  // Register the block; allocatable space begins after the arena state.
  _upb_Arena_AddBlock(&state->head, mem, first_block_overhead, block_size);

  return &state->head;
}
|
|
|
|
|
|
|
|
// Initializes an arena.
//
//   mem, n: optional caller-provided buffer and its size in bytes. When `mem`
//           is non-NULL and, after alignment, holds at least
//           sizeof(upb_ArenaState) bytes, the arena state is placed at its
//           start and the rest of the buffer becomes the initial allocatable
//           region.
//   alloc:  allocator used for further blocks.
//
// When the buffer is missing or too small, the arena is created by
// _upb_Arena_InitSlow() instead, and the result may be NULL. With
// UPB_TRACING_ENABLED defined, the result is reported via upb_Arena_LogInit().
upb_Arena* upb_Arena_Init(void* mem, size_t n, upb_alloc* alloc) {
  UPB_ASSERT(sizeof(void*) * UPB_ARENA_SIZE_HACK >= sizeof(upb_ArenaState));

  if (mem != NULL) {
    // Round the buffer start up so that returned pointers are properly
    // aligned, and shrink `n` by the bytes skipped (clamping at zero).
    const uintptr_t raw_addr = (uintptr_t)mem;
    const uintptr_t aligned_addr = UPB_ALIGN_MALLOC(raw_addr);
    const size_t skipped = aligned_addr - raw_addr;
    n = (skipped <= n) ? n - skipped : 0;
    mem = (void*)aligned_addr;
  }

  if (UPB_UNLIKELY(n < sizeof(upb_ArenaState) || !mem)) {
    // No usable initial buffer. `n` is only forwarded as a size hint when the
    // caller passed no buffer at all.
    upb_Arena* ret = _upb_Arena_InitSlow(alloc, mem ? 0 : n);
#ifdef UPB_TRACING_ENABLED
    upb_Arena_LogInit(ret, n);
#endif
    return ret;
  }

  // Place the arena state at the start of the caller's buffer.
  upb_ArenaState* state = mem;

  upb_Atomic_Init(&state->body.parent_or_count,
                  _upb_Arena_TaggedFromRefcount(1));
  upb_Atomic_Init(&state->body.next, NULL);
  upb_Atomic_Init(&state->body.previous_or_tail,
                  _upb_Arena_TaggedFromTail(&state->body));
  // The caller-provided buffer is not counted as allocated space.
  upb_Atomic_Init(&state->body.space_allocated, 0);
  state->body.blocks = NULL;
  state->body.upb_alloc_cleanup = NULL;
  state->body.block_alloc = _upb_Arena_MakeBlockAlloc(alloc, 1);

  // Allocatable region: from just past the state (aligned) to buffer end.
  state->head.UPB_PRIVATE(ptr) =
      (void*)UPB_ALIGN_MALLOC((uintptr_t)(state + 1));
  state->head.UPB_PRIVATE(end) = UPB_PTR_AT(mem, n, char);
  UPB_TSAN_INIT_PUBLISHED(&state->body);
#ifdef UPB_TRACING_ENABLED
  upb_Arena_LogInit(&state->head, n);
#endif
  return &state->head;
}
|
|
|
|
|
|
|
|
// Releases every arena reachable from `ai` through the `next` links, freeing
// each arena's memory blocks and then running its allocator cleanup hook, if
// one is set.
//
// Precondition (asserted): the reference count encoded in
// `ai->parent_or_count` is exactly 1.
static void _upb_Arena_DoFree(upb_ArenaInternal* ai) {
  UPB_ASSERT(_upb_Arena_RefCountFromTagged(ai->parent_or_count) == 1);
  while (ai != NULL) {
    UPB_TSAN_CHECK_PUBLISHED(ai);
    // Load first since arena itself is likely from one of its blocks.
    upb_ArenaInternal* next_arena =
        (upb_ArenaInternal*)upb_Atomic_Load(&ai->next, memory_order_acquire);
    // Freeing may have memory barriers that confuse tsan, so assert
    // immediately after load here.
    if (next_arena) {
      UPB_TSAN_CHECK_PUBLISHED(next_arena);
    }
    // Snapshot everything we need from `ai` before any block is released;
    // `ai` must not be touched once the block loop below has started.
    upb_alloc* block_alloc = _upb_ArenaInternal_BlockAlloc(ai);
    upb_MemBlock* block = ai->blocks;
    // Copy the cleanup hook as a plain pointer. It may be NULL, so it must not
    // be passed through unary `*` before the NULL check below.
    upb_AllocCleanupFunc* alloc_cleanup = ai->upb_alloc_cleanup;
    while (block != NULL) {
      // Load first since we are deleting block.
      upb_MemBlock* next_block = block->next;
      upb_free_sized(block_alloc, block, block->size);
      block = next_block;
    }
    // Run the allocator's cleanup hook only after all of its blocks are freed.
    if (alloc_cleanup != NULL) {
      alloc_cleanup(block_alloc);
    }
    ai = next_arena;
  }
}
|
|
|
|
|
|
|
|
void upb_Arena_Free(upb_Arena* a) {
|
|
|
|
upb_ArenaInternal* ai = upb_Arena_Internal(a);
|
|
|
|
// Cannot be replaced with _upb_Arena_FindRoot, as that provides only a
|
|
|
|
// relaxed read of the refcount if ai is already the root.
|
|
|
|
uintptr_t poc = upb_Atomic_Load(&ai->parent_or_count, memory_order_acquire);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that calling `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` in parallel is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
retry:
|
|
|
|
while (_upb_Arena_IsTaggedPointer(poc)) {
|
|
|
|
ai = _upb_Arena_PointerFromTagged(poc);
|
|
|
|
UPB_TSAN_CHECK_PUBLISHED(ai);
|
|
|
|
poc = upb_Atomic_Load(&ai->parent_or_count, memory_order_acquire);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that calling `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` in parallel is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
}
|
|
|
|
|
|
|
|
// compare_exchange or fetch_sub are RMW operations, which are more
|
|
|
|
// expensive than direct loads. As an optimization, we only do RMW ops
|
|
|
|
// when we need to update things for other threads to see.
|
|
|
|
if (poc == _upb_Arena_TaggedFromRefcount(1)) {
|
|
|
|
#ifdef UPB_TRACING_ENABLED
|
|
|
|
upb_Arena_LogFree(a);
|
|
|
|
#endif
|
|
|
|
_upb_Arena_DoFree(ai);
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that calling `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` in parallel is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
if (upb_Atomic_CompareExchangeWeak(
|
|
|
|
&ai->parent_or_count, &poc,
|
|
|
|
_upb_Arena_TaggedFromRefcount(_upb_Arena_RefCountFromTagged(poc) - 1),
|
|
|
|
memory_order_release, memory_order_acquire)) {
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that calling `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` in parallel is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
// We were >1 and we decremented it successfully, so we are done.
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// We failed our update, so someone has done something, retry the whole
|
|
|
|
// process, but the failed exchange reloaded `poc` for us.
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void _upb_Arena_DoFuseArenaLists(upb_ArenaInternal* const parent,
|
|
|
|
upb_ArenaInternal* child) {
|
|
|
|
UPB_TSAN_CHECK_PUBLISHED(parent);
|
|
|
|
uintptr_t parent_previous_or_tail =
|
|
|
|
upb_Atomic_Load(&parent->previous_or_tail, memory_order_acquire);
|
|
|
|
upb_ArenaInternal* parent_tail = parent;
|
|
|
|
if (_upb_Arena_IsTaggedTail(parent_previous_or_tail)) {
|
|
|
|
// Our tail might be stale, but it will always converge to the true tail.
|
|
|
|
parent_tail = _upb_Arena_TailFromTagged(parent_previous_or_tail);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Link parent to child going forwards
|
|
|
|
while (true) {
|
|
|
|
UPB_TSAN_CHECK_PUBLISHED(parent_tail);
|
|
|
|
upb_ArenaInternal* parent_tail_next =
|
|
|
|
upb_Atomic_Load(&parent_tail->next, memory_order_acquire);
|
Switch upb_Arena_Fuse from a CAS-based list insertion to an exchange-based one.
Second attempt, with improved testing.
(Generated by http://go/benchy. Settings: --runs 20 --reference "srcfs" --perflab)
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.2ns ± 2% 18.1ns ± 1% -0.72% (p=0.002 n=18+17)
BM_ArenaInitialBlockOneAlloc 5.31ns ± 0% 5.30ns ± 1% ~ (p=0.345 n=16+19)
BM_ArenaFuseUnbalanced/2 67.8ns ± 1% 68.0ns ± 0% +0.35% (p=0.011 n=16+17)
BM_ArenaFuseUnbalanced/8 526ns ± 2% 524ns ± 1% ~ (p=0.708 n=18+17)
BM_ArenaFuseUnbalanced/64 4.82µs ± 1% 4.84µs ± 1% +0.31% (p=0.049 n=16+17)
BM_ArenaFuseUnbalanced/128 9.78µs ± 1% 9.82µs ± 1% +0.46% (p=0.001 n=17+17)
BM_ArenaFuseBalanced/2 66.9ns ± 1% 67.2ns ± 1% +0.36% (p=0.025 n=17+16)
BM_ArenaFuseBalanced/8 527ns ± 2% 529ns ± 1% ~ (p=0.081 n=17+19)
BM_ArenaFuseBalanced/64 4.92µs ± 4% 4.88µs ± 2% ~ (p=0.184 n=18+17)
BM_ArenaFuseBalanced/128 9.92µs ± 1% 9.91µs ± 1% ~ (p=0.883 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.89ms ± 2% 5.94ms ± 1% +0.88% (p=0.005 n=18+17)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.55ms ± 2% 6.55ms ± 1% ~ (p=0.961 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.3ms ± 2% 12.4ms ± 1% ~ (p=0.226 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.5ms ± 1% 12.6ms ± 1% +0.61% (p=0.005 n=17+19)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.6µs ± 1% 12.7µs ± 2% ~ (p=0.219 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 2% 11.6µs ± 3% ~ (p=0.721 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.4µs ± 1% 12.5µs ± 1% ~ (p=0.118 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.3µs ± 2% 11.4µs ± 1% ~ (p=0.327 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.2µs ± 2% 25.3µs ± 1% ~ (p=0.301 n=16+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 3% 12.1µs ± 2% ~ (p=0.869 n=18+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.8µs ± 3% 11.8µs ± 3% ~ (p=0.462 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.333 n=16+19)
BM_SerializeDescriptor_Proto2 5.83µs ± 3% 5.86µs ± 4% ~ (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb 10.5µs ± 2% 10.4µs ± 1% -1.20% (p=0.000 n=18+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.2ns ± 2% 18.1ns ± 0% -0.73% (p=0.010 n=18+17)
BM_ArenaInitialBlockOneAlloc 5.32ns ± 0% 5.31ns ± 1% ~ (p=0.106 n=15+18)
BM_ArenaFuseUnbalanced/2 67.9ns ± 1% 68.1ns ± 0% +0.31% (p=0.044 n=16+16)
BM_ArenaFuseUnbalanced/8 527ns ± 2% 526ns ± 1% ~ (p=0.772 n=18+16)
BM_ArenaFuseUnbalanced/64 4.83µs ± 1% 4.84µs ± 2% ~ (p=0.144 n=16+18)
BM_ArenaFuseUnbalanced/128 9.79µs ± 1% 9.84µs ± 1% +0.52% (p=0.001 n=17+18)
BM_ArenaFuseBalanced/2 67.0ns ± 1% 67.3ns ± 3% +0.41% (p=0.019 n=15+16)
BM_ArenaFuseBalanced/8 528ns ± 2% 530ns ± 1% ~ (p=0.121 n=17+19)
BM_ArenaFuseBalanced/64 4.93µs ± 4% 4.89µs ± 2% ~ (p=0.103 n=18+17)
BM_ArenaFuseBalanced/128 9.93µs ± 1% 9.93µs ± 1% ~ (p=0.806 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.91ms ± 2% 5.96ms ± 1% +0.93% (p=0.002 n=18+16)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.57ms ± 2% 6.57ms ± 1% ~ (p=0.935 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.4ms ± 2% 12.4ms ± 1% ~ (p=0.239 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.5ms ± 2% 12.6ms ± 1% +0.43% (p=0.024 n=18+19)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 2% 12.7µs ± 2% ~ (p=0.245 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 2% 11.6µs ± 2% ~ (p=0.772 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.136 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 2% 11.4µs ± 1% ~ (p=0.391 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.3µs ± 2% 25.4µs ± 1% ~ (p=0.403 n=16+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 2% ~ (p=0.731 n=17+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.8µs ± 3% ~ (p=0.424 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 2% 13.3µs ± 1% ~ (p=0.683 n=16+19)
BM_SerializeDescriptor_Proto2 5.84µs ± 3% 5.86µs ± 4% ~ (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb 10.5µs ± 2% 10.4µs ± 1% -1.27% (p=0.000 n=18+16)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 133MB/s ± 2% 132MB/s ± 1% -0.97% (p=0.002 n=18+16)
BM_LoadAdsDescriptor_Upb<WithLayout> 120MB/s ± 2% 120MB/s ± 1% ~ (p=0.961 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.5MB/s ± 2% 63.3MB/s ± 1% ~ (p=0.226 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 62.7MB/s ± 1% 62.4MB/s ± 1% -0.60% (p=0.005 n=17+19)
BM_Parse_Upb_FileDesc<UseArena, Copy> 596MB/s ± 1% 594MB/s ± 2% ~ (p=0.219 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 650MB/s ± 2% 649MB/s ± 3% ~ (p=0.721 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 605MB/s ± 1% 603MB/s ± 1% ~ (p=0.118 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 663MB/s ± 2% 661MB/s ± 1% ~ (p=0.327 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 298MB/s ± 2% 297MB/s ± 1% ~ (p=0.490 n=17+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 623MB/s ± 3% 624MB/s ± 2% ~ (p=0.869 n=18+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 636MB/s ± 3% 637MB/s ± 3% ~ (p=0.462 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 570MB/s ± 1% 568MB/s ± 1% ~ (p=0.333 n=16+19)
BM_SerializeDescriptor_Proto2 1.29GB/s ± 3% 1.29GB/s ± 4% ~ (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb 716MB/s ± 2% 725MB/s ± 1% +1.20% (p=0.000 n=18+16)
```
PiperOrigin-RevId: 525132431
2 years ago
|
|
|
while (parent_tail_next != NULL) {
|
|
|
|
parent_tail = parent_tail_next;
|
|
|
|
UPB_TSAN_CHECK_PUBLISHED(parent_tail);
|
Switch upb_Arena_Fuse from a CAS-based list insertion to an exchange-based one.
Second attempt, with improved testing.
(Generated by http://go/benchy. Settings: --runs 20 --reference "srcfs" --perflab)
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.2ns ± 2% 18.1ns ± 1% -0.72% (p=0.002 n=18+17)
BM_ArenaInitialBlockOneAlloc 5.31ns ± 0% 5.30ns ± 1% ~ (p=0.345 n=16+19)
BM_ArenaFuseUnbalanced/2 67.8ns ± 1% 68.0ns ± 0% +0.35% (p=0.011 n=16+17)
BM_ArenaFuseUnbalanced/8 526ns ± 2% 524ns ± 1% ~ (p=0.708 n=18+17)
BM_ArenaFuseUnbalanced/64 4.82µs ± 1% 4.84µs ± 1% +0.31% (p=0.049 n=16+17)
BM_ArenaFuseUnbalanced/128 9.78µs ± 1% 9.82µs ± 1% +0.46% (p=0.001 n=17+17)
BM_ArenaFuseBalanced/2 66.9ns ± 1% 67.2ns ± 1% +0.36% (p=0.025 n=17+16)
BM_ArenaFuseBalanced/8 527ns ± 2% 529ns ± 1% ~ (p=0.081 n=17+19)
BM_ArenaFuseBalanced/64 4.92µs ± 4% 4.88µs ± 2% ~ (p=0.184 n=18+17)
BM_ArenaFuseBalanced/128 9.92µs ± 1% 9.91µs ± 1% ~ (p=0.883 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.89ms ± 2% 5.94ms ± 1% +0.88% (p=0.005 n=18+17)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.55ms ± 2% 6.55ms ± 1% ~ (p=0.961 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.3ms ± 2% 12.4ms ± 1% ~ (p=0.226 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.5ms ± 1% 12.6ms ± 1% +0.61% (p=0.005 n=17+19)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.6µs ± 1% 12.7µs ± 2% ~ (p=0.219 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 2% 11.6µs ± 3% ~ (p=0.721 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.4µs ± 1% 12.5µs ± 1% ~ (p=0.118 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.3µs ± 2% 11.4µs ± 1% ~ (p=0.327 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.2µs ± 2% 25.3µs ± 1% ~ (p=0.301 n=16+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 3% 12.1µs ± 2% ~ (p=0.869 n=18+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.8µs ± 3% 11.8µs ± 3% ~ (p=0.462 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.333 n=16+19)
BM_SerializeDescriptor_Proto2 5.83µs ± 3% 5.86µs ± 4% ~ (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb 10.5µs ± 2% 10.4µs ± 1% -1.20% (p=0.000 n=18+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.2ns ± 2% 18.1ns ± 0% -0.73% (p=0.010 n=18+17)
BM_ArenaInitialBlockOneAlloc 5.32ns ± 0% 5.31ns ± 1% ~ (p=0.106 n=15+18)
BM_ArenaFuseUnbalanced/2 67.9ns ± 1% 68.1ns ± 0% +0.31% (p=0.044 n=16+16)
BM_ArenaFuseUnbalanced/8 527ns ± 2% 526ns ± 1% ~ (p=0.772 n=18+16)
BM_ArenaFuseUnbalanced/64 4.83µs ± 1% 4.84µs ± 2% ~ (p=0.144 n=16+18)
BM_ArenaFuseUnbalanced/128 9.79µs ± 1% 9.84µs ± 1% +0.52% (p=0.001 n=17+18)
BM_ArenaFuseBalanced/2 67.0ns ± 1% 67.3ns ± 3% +0.41% (p=0.019 n=15+16)
BM_ArenaFuseBalanced/8 528ns ± 2% 530ns ± 1% ~ (p=0.121 n=17+19)
BM_ArenaFuseBalanced/64 4.93µs ± 4% 4.89µs ± 2% ~ (p=0.103 n=18+17)
BM_ArenaFuseBalanced/128 9.93µs ± 1% 9.93µs ± 1% ~ (p=0.806 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.91ms ± 2% 5.96ms ± 1% +0.93% (p=0.002 n=18+16)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.57ms ± 2% 6.57ms ± 1% ~ (p=0.935 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.4ms ± 2% 12.4ms ± 1% ~ (p=0.239 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.5ms ± 2% 12.6ms ± 1% +0.43% (p=0.024 n=18+19)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 2% 12.7µs ± 2% ~ (p=0.245 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 2% 11.6µs ± 2% ~ (p=0.772 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.136 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 2% 11.4µs ± 1% ~ (p=0.391 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.3µs ± 2% 25.4µs ± 1% ~ (p=0.403 n=16+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 2% ~ (p=0.731 n=17+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.8µs ± 3% ~ (p=0.424 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 2% 13.3µs ± 1% ~ (p=0.683 n=16+19)
BM_SerializeDescriptor_Proto2 5.84µs ± 3% 5.86µs ± 4% ~ (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb 10.5µs ± 2% 10.4µs ± 1% -1.27% (p=0.000 n=18+16)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 133MB/s ± 2% 132MB/s ± 1% -0.97% (p=0.002 n=18+16)
BM_LoadAdsDescriptor_Upb<WithLayout> 120MB/s ± 2% 120MB/s ± 1% ~ (p=0.961 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.5MB/s ± 2% 63.3MB/s ± 1% ~ (p=0.226 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 62.7MB/s ± 1% 62.4MB/s ± 1% -0.60% (p=0.005 n=17+19)
BM_Parse_Upb_FileDesc<UseArena, Copy> 596MB/s ± 1% 594MB/s ± 2% ~ (p=0.219 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 650MB/s ± 2% 649MB/s ± 3% ~ (p=0.721 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 605MB/s ± 1% 603MB/s ± 1% ~ (p=0.118 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 663MB/s ± 2% 661MB/s ± 1% ~ (p=0.327 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 298MB/s ± 2% 297MB/s ± 1% ~ (p=0.490 n=17+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 623MB/s ± 3% 624MB/s ± 2% ~ (p=0.869 n=18+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 636MB/s ± 3% 637MB/s ± 3% ~ (p=0.462 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 570MB/s ± 1% 568MB/s ± 1% ~ (p=0.333 n=16+19)
BM_SerializeDescriptor_Proto2 1.29GB/s ± 3% 1.29GB/s ± 4% ~ (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb 716MB/s ± 2% 725MB/s ± 1% +1.20% (p=0.000 n=18+16)
```
PiperOrigin-RevId: 525132431
2 years ago
|
|
|
parent_tail_next =
|
|
|
|
upb_Atomic_Load(&parent_tail->next, memory_order_acquire);
|
|
|
|
}
|
|
|
|
if (upb_Atomic_CompareExchangeWeak(&parent_tail->next, &parent_tail_next,
|
|
|
|
child, memory_order_release,
|
|
|
|
memory_order_acquire)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (parent_tail_next != NULL) {
|
|
|
|
parent_tail = parent_tail_next;
|
|
|
|
}
|
|
|
|
}
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously, upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
|
|
|
|
// Update parent's tail (may be stale).
|
|
|
|
uintptr_t child_previous_or_tail =
|
|
|
|
upb_Atomic_Load(&child->previous_or_tail, memory_order_acquire);
|
|
|
|
upb_ArenaInternal* new_parent_tail =
|
|
|
|
_upb_Arena_TailFromTagged(child_previous_or_tail);
|
|
|
|
UPB_TSAN_CHECK_PUBLISHED(new_parent_tail);
|
|
|
|
|
|
|
|
// If another thread fused with us, don't overwrite their previous pointer
|
|
|
|
// with our tail. Relaxed order is fine here as we only inspect the tag bit
|
|
|
|
parent_previous_or_tail =
|
|
|
|
upb_Atomic_Load(&parent->previous_or_tail, memory_order_relaxed);
|
|
|
|
if (_upb_Arena_IsTaggedTail(parent_previous_or_tail)) {
|
|
|
|
upb_Atomic_CompareExchangeStrong(
|
|
|
|
&parent->previous_or_tail, &parent_previous_or_tail,
|
|
|
|
_upb_Arena_TaggedFromTail(new_parent_tail), memory_order_release,
|
|
|
|
memory_order_relaxed);
|
|
|
|
}
|
Switch upb_Arena_Fuse from a CAS-based list insertion to an exchange-based one.
Second attempt, with improved testing.
(Generated by http://go/benchy. Settings: --runs 20 --reference "srcfs" --perflab)
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.2ns ± 2% 18.1ns ± 1% -0.72% (p=0.002 n=18+17)
BM_ArenaInitialBlockOneAlloc 5.31ns ± 0% 5.30ns ± 1% ~ (p=0.345 n=16+19)
BM_ArenaFuseUnbalanced/2 67.8ns ± 1% 68.0ns ± 0% +0.35% (p=0.011 n=16+17)
BM_ArenaFuseUnbalanced/8 526ns ± 2% 524ns ± 1% ~ (p=0.708 n=18+17)
BM_ArenaFuseUnbalanced/64 4.82µs ± 1% 4.84µs ± 1% +0.31% (p=0.049 n=16+17)
BM_ArenaFuseUnbalanced/128 9.78µs ± 1% 9.82µs ± 1% +0.46% (p=0.001 n=17+17)
BM_ArenaFuseBalanced/2 66.9ns ± 1% 67.2ns ± 1% +0.36% (p=0.025 n=17+16)
BM_ArenaFuseBalanced/8 527ns ± 2% 529ns ± 1% ~ (p=0.081 n=17+19)
BM_ArenaFuseBalanced/64 4.92µs ± 4% 4.88µs ± 2% ~ (p=0.184 n=18+17)
BM_ArenaFuseBalanced/128 9.92µs ± 1% 9.91µs ± 1% ~ (p=0.883 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.89ms ± 2% 5.94ms ± 1% +0.88% (p=0.005 n=18+17)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.55ms ± 2% 6.55ms ± 1% ~ (p=0.961 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.3ms ± 2% 12.4ms ± 1% ~ (p=0.226 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.5ms ± 1% 12.6ms ± 1% +0.61% (p=0.005 n=17+19)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.6µs ± 1% 12.7µs ± 2% ~ (p=0.219 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 2% 11.6µs ± 3% ~ (p=0.721 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.4µs ± 1% 12.5µs ± 1% ~ (p=0.118 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.3µs ± 2% 11.4µs ± 1% ~ (p=0.327 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.2µs ± 2% 25.3µs ± 1% ~ (p=0.301 n=16+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 3% 12.1µs ± 2% ~ (p=0.869 n=18+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.8µs ± 3% 11.8µs ± 3% ~ (p=0.462 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.333 n=16+19)
BM_SerializeDescriptor_Proto2 5.83µs ± 3% 5.86µs ± 4% ~ (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb 10.5µs ± 2% 10.4µs ± 1% -1.20% (p=0.000 n=18+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.2ns ± 2% 18.1ns ± 0% -0.73% (p=0.010 n=18+17)
BM_ArenaInitialBlockOneAlloc 5.32ns ± 0% 5.31ns ± 1% ~ (p=0.106 n=15+18)
BM_ArenaFuseUnbalanced/2 67.9ns ± 1% 68.1ns ± 0% +0.31% (p=0.044 n=16+16)
BM_ArenaFuseUnbalanced/8 527ns ± 2% 526ns ± 1% ~ (p=0.772 n=18+16)
BM_ArenaFuseUnbalanced/64 4.83µs ± 1% 4.84µs ± 2% ~ (p=0.144 n=16+18)
BM_ArenaFuseUnbalanced/128 9.79µs ± 1% 9.84µs ± 1% +0.52% (p=0.001 n=17+18)
BM_ArenaFuseBalanced/2 67.0ns ± 1% 67.3ns ± 3% +0.41% (p=0.019 n=15+16)
BM_ArenaFuseBalanced/8 528ns ± 2% 530ns ± 1% ~ (p=0.121 n=17+19)
BM_ArenaFuseBalanced/64 4.93µs ± 4% 4.89µs ± 2% ~ (p=0.103 n=18+17)
BM_ArenaFuseBalanced/128 9.93µs ± 1% 9.93µs ± 1% ~ (p=0.806 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.91ms ± 2% 5.96ms ± 1% +0.93% (p=0.002 n=18+16)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.57ms ± 2% 6.57ms ± 1% ~ (p=0.935 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.4ms ± 2% 12.4ms ± 1% ~ (p=0.239 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.5ms ± 2% 12.6ms ± 1% +0.43% (p=0.024 n=18+19)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 2% 12.7µs ± 2% ~ (p=0.245 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 2% 11.6µs ± 2% ~ (p=0.772 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.136 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 2% 11.4µs ± 1% ~ (p=0.391 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.3µs ± 2% 25.4µs ± 1% ~ (p=0.403 n=16+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 2% ~ (p=0.731 n=17+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.8µs ± 3% ~ (p=0.424 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 2% 13.3µs ± 1% ~ (p=0.683 n=16+19)
BM_SerializeDescriptor_Proto2 5.84µs ± 3% 5.86µs ± 4% ~ (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb 10.5µs ± 2% 10.4µs ± 1% -1.27% (p=0.000 n=18+16)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 133MB/s ± 2% 132MB/s ± 1% -0.97% (p=0.002 n=18+16)
BM_LoadAdsDescriptor_Upb<WithLayout> 120MB/s ± 2% 120MB/s ± 1% ~ (p=0.961 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.5MB/s ± 2% 63.3MB/s ± 1% ~ (p=0.226 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 62.7MB/s ± 1% 62.4MB/s ± 1% -0.60% (p=0.005 n=17+19)
BM_Parse_Upb_FileDesc<UseArena, Copy> 596MB/s ± 1% 594MB/s ± 2% ~ (p=0.219 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 650MB/s ± 2% 649MB/s ± 3% ~ (p=0.721 n=16+18)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 605MB/s ± 1% 603MB/s ± 1% ~ (p=0.118 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 663MB/s ± 2% 661MB/s ± 1% ~ (p=0.327 n=18+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 298MB/s ± 2% 297MB/s ± 1% ~ (p=0.490 n=17+19)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 623MB/s ± 3% 624MB/s ± 2% ~ (p=0.869 n=18+19)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 636MB/s ± 3% 637MB/s ± 3% ~ (p=0.462 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 570MB/s ± 1% 568MB/s ± 1% ~ (p=0.333 n=16+19)
BM_SerializeDescriptor_Proto2 1.29GB/s ± 3% 1.29GB/s ± 4% ~ (p=0.496 n=18+20)
BM_SerializeDescriptor_Upb 716MB/s ± 2% 725MB/s ± 1% +1.20% (p=0.000 n=18+16)
```
PiperOrigin-RevId: 525132431
2 years ago
|
|
|
|
|
|
|
// Link child to parent going backwards, for SpaceAllocated
|
|
|
|
upb_Atomic_Store(&child->previous_or_tail,
|
|
|
|
_upb_Arena_TaggedFromPrevious(parent_tail),
|
|
|
|
memory_order_release);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Installs `func` as the alloc-cleanup callback stored in the arena's internal
// state.
//
// Asserts (via UPB_ASSERT) that no callback is currently set on this arena
// before storing the new one.
void upb_Arena_SetAllocCleanup(upb_Arena* a, upb_AllocCleanupFunc* func) {
  // NOTE(review): presumably a ThreadSanitizer read annotation on the arena's
  // allocation pointer -- confirm against the macro definition.
  UPB_TSAN_CHECK_READ(a->UPB_ONLYBITS(ptr));

  upb_ArenaInternal* internal = upb_Arena_Internal(a);

  // Refuse to overwrite an already-registered callback.
  UPB_ASSERT(internal->upb_alloc_cleanup == NULL);
  internal->upb_alloc_cleanup = func;
}
|
|
|
|
|
|
|
|
// Thread safe.
|
|
|
|
static upb_ArenaInternal* _upb_Arena_DoFuse(upb_ArenaInternal** ai1,
|
|
|
|
upb_ArenaInternal** ai2,
|
|
|
|
uintptr_t* ref_delta) {
|
|
|
|
// `parent_or_count` has two distinct modes
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@; I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that calling `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` in parallel is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
// - parent pointer mode
|
|
|
|
// - refcount mode
|
|
|
|
//
|
|
|
|
// In parent pointer mode, it may change what pointer it refers to in the
|
|
|
|
// tree, but it will always approach a root. Any operation that walks the
|
|
|
|
// tree to the root may collapse levels of the tree concurrently.
|
|
|
|
upb_ArenaRoot r1 = _upb_Arena_FindRoot(*ai1);
|
|
|
|
upb_ArenaRoot r2 = _upb_Arena_FindRoot(*ai2);
|
|
|
|
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously, upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
if (r1.root == r2.root) return r1.root; // Already fused.
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However, you can now at least call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that calling `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` in parallel is still not allowed if `b` and `c` were previously fused. In practice, this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
|
|
|
|
*ai1 = r1.root;
|
|
|
|
*ai2 = r2.root;
|
|
|
|
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously, upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
// Avoid cycles by always fusing into the root with the lower address.
|
|
|
|
if ((uintptr_t)r1.root > (uintptr_t)r2.root) {
|
|
|
|
upb_ArenaRoot tmp = r1;
|
|
|
|
r1 = r2;
|
|
|
|
r2 = tmp;
|
|
|
|
}
|
|
|
|
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@, I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However, you can now at least call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that calling `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` in parallel is still not allowed if `b` and `c` were previously fused. In practice, this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
// The moment we install `r1` as the parent for `r2` all racing frees may
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously, upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
// immediately begin decrementing `r1`'s refcount (including pending
|
|
|
|
// increments to that refcount and their frees!). We need to add `r2`'s refs
|
|
|
|
// now, so that `r1` can withstand any unrefs that come from r2.
|
|
|
|
//
|
|
|
|
// Note that while it is possible for `r2`'s refcount to increase
|
|
|
|
// asynchronously, we will not actually do the reparenting operation below
|
|
|
|
// unless `r2`'s refcount is unchanged from when we read it.
|
|
|
|
//
|
|
|
|
// Note that we may have done this previously, either to this node or a
|
|
|
|
// different node, during a previous and failed DoFuse() attempt. But we will
|
|
|
|
// not lose track of these refs because we always add them to our overall
|
|
|
|
// delta.
|
|
|
|
uintptr_t r2_untagged_count = r2.tagged_count & ~1;
|
|
|
|
uintptr_t with_r2_refs = r1.tagged_count + r2_untagged_count;
|
|
|
|
if (!upb_Atomic_CompareExchangeStrong(
|
|
|
|
&r1.root->parent_or_count, &r1.tagged_count, with_r2_refs,
|
|
|
|
memory_order_release, memory_order_acquire)) {
|
|
|
|
return NULL;
|
Allow for fuse/free races in `upb_Arena`.
Implementation is by kfm@; I only added the portability code around it.
`upb_Arena` was designed to be only thread-compatible. However, fusing of arenas muddies the waters somewhat, because two distinct `upb_Arena` objects will end up sharing state when fused. This causes a `upb_Arena_Free(a)` to interfere with `upb_Arena_Fuse(b, c)` if `a` and `b` were previously fused.
It turns out that we can use atomics to fix this with about a 35% regression in fuse performance (see below). Arena create+free does not regress, thanks to special-case logic in Free().
`upb_Arena` is still a thread-compatible type, and it is still never safe to call `upb_Arena_xxx(a)` and `upb_Arena_yyy(a)` in parallel. However you can at least now call `upb_Arena_Free(a)` and `upb_Arena_Fuse(b, c)` in parallel, even if `a` and `b` were previously fused.
Note that calling `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` in parallel is still not allowed if `b` and `c` were previously fused. In practice this means that fuses must still be single-threaded within a single fused group.
Performance results:
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.6ns ± 1% 18.6ns ± 1% ~ (p=0.726 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.28ns ± 1% 5.73ns ± 1% -8.68% (p=0.000 n=17+20)
BM_ArenaFuseUnbalanced/2 44.1ns ± 2% 60.4ns ± 1% +37.05% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/8 370ns ± 2% 500ns ± 1% +35.12% (p=0.000 n=19+20)
BM_ArenaFuseUnbalanced/64 3.52µs ± 1% 4.71µs ± 1% +33.80% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.20µs ± 1% 9.72µs ± 2% +34.93% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.4ns ± 2% 61.4ns ± 1% +38.23% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 509ns ± 1% +36.57% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/64 3.55µs ± 2% 4.79µs ± 1% +34.80% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.26µs ± 1% 9.76µs ± 1% +34.45% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.66ms ± 1% 5.69ms ± 1% +0.57% (p=0.013 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.30ms ± 1% 6.36ms ± 1% +0.90% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.1ms ± 1% ~ (p=0.118 n=18+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.50% (p=0.006 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.194 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.192 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 0% ~ (p=0.750 n=18+14)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% -0.34% (p=0.046 n=19+19)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.7µs ± 2% +1.37% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.143 n=18+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.9µs ± 3% 11.9µs ± 1% ~ (p=0.076 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.053 n=19+19)
BM_SerializeDescriptor_Proto2 5.97µs ± 4% 5.90µs ± 4% ~ (p=0.093 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.909 n=17+18)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.7ns ± 2% 18.6ns ± 0% ~ (p=0.607 n=18+17)
BM_ArenaInitialBlockOneAlloc 6.29ns ± 1% 5.74ns ± 1% -8.71% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/2 44.1ns ± 1% 60.6ns ± 1% +37.21% (p=0.000 n=17+19)
BM_ArenaFuseUnbalanced/8 371ns ± 2% 500ns ± 1% +35.02% (p=0.000 n=19+16)
BM_ArenaFuseUnbalanced/64 3.53µs ± 1% 4.72µs ± 1% +33.85% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/128 7.22µs ± 1% 9.73µs ± 2% +34.87% (p=0.000 n=16+19)
BM_ArenaFuseBalanced/2 44.5ns ± 2% 61.5ns ± 1% +38.22% (p=0.000 n=20+17)
BM_ArenaFuseBalanced/8 373ns ± 2% 510ns ± 1% +36.58% (p=0.000 n=19+16)
BM_ArenaFuseBalanced/64 3.56µs ± 2% 4.80µs ± 1% +34.87% (p=0.000 n=19+19)
BM_ArenaFuseBalanced/128 7.27µs ± 1% 9.77µs ± 1% +34.40% (p=0.000 n=17+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.67ms ± 1% 5.71ms ± 1% +0.60% (p=0.011 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.32ms ± 1% 6.37ms ± 1% +0.87% (p=0.000 n=19+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 12.1ms ± 1% 12.2ms ± 1% ~ (p=0.126 n=18+19)
BM_LoadAdsDescriptor_Proto2<WithLayout> 12.2ms ± 1% 12.3ms ± 1% +0.51% (p=0.002 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.7µs ± 1% 12.7µs ± 1% ~ (p=0.149 n=20+19)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.6µs ± 1% 11.6µs ± 1% ~ (p=0.211 n=20+20)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.5µs ± 1% 12.5µs ± 1% ~ (p=0.986 n=18+15)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.4µs ± 1% 11.3µs ± 1% ~ (p=0.081 n=19+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 25.4µs ± 1% 25.8µs ± 2% +1.41% (p=0.000 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 12.1µs ± 2% 12.1µs ± 1% ~ (p=0.558 n=19+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 12.0µs ± 3% 11.9µs ± 1% ~ (p=0.165 n=17+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.070 n=19+19)
BM_SerializeDescriptor_Proto2 5.98µs ± 4% 5.92µs ± 3% ~ (p=0.138 n=17+19)
BM_SerializeDescriptor_Upb 10.4µs ± 1% 10.4µs ± 1% ~ (p=0.858 n=17+18)
```
PiperOrigin-RevId: 518573683
2 years ago
|
|
|
}
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously, upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` had previously been fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
// Perform the actual fuse by removing the refs from `r2` and swapping in the
|
|
|
|
// parent pointer.
|
|
|
|
if (!upb_Atomic_CompareExchangeStrong(
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously, upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` had previously been fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
&r2.root->parent_or_count, &r2.tagged_count,
|
|
|
|
_upb_Arena_TaggedFromPointer(r1.root), memory_order_release,
|
|
|
|
memory_order_acquire)) {
|
|
|
|
// We'll need to remove the excess refs we added to r1 previously.
|
|
|
|
*ref_delta += r2_untagged_count;
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously, upb_Arena was not thread-compatible: `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` could not safely execute in parallel if `b` and `c` had previously been fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
return NULL;
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously, when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
}
|
|
|
|
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously, upb_Arena was not thread-compatible: `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` could not safely execute in parallel if `b` and `c` had previously been fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
// Now that the fuse has been performed (and can no longer fail) we need to
|
|
|
|
// append `r2` to `r1`'s linked list.
|
|
|
|
_upb_Arena_DoFuseArenaLists(r1.root, r2.root);
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously, upb_Arena was not thread-compatible: running `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` in parallel was unsafe if `b` and `c` had already been fused. This CL fixes that by allowing `upb_Arena_Fuse()` calls to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
return r1.root;
|
|
|
|
}
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
|
|
|
|
// Thread safe.
|
|
|
|
static bool _upb_Arena_FixupRefs(upb_ArenaInternal* new_root,
|
|
|
|
uintptr_t ref_delta) {
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously, upb_Arena was not thread-compatible: running `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` in parallel was unsafe if `b` and `c` had already been fused. This CL fixes that by allowing `upb_Arena_Fuse()` calls to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
if (ref_delta == 0) return true; // No fixup required.
|
|
|
|
// Relaxed order is safe here as if the value is a pointer, we don't deref it
|
|
|
|
// or publish it anywhere else. The refcount does provide memory order
|
|
|
|
// between allocations on arenas and the eventual free and thus normally
|
|
|
|
// requires acquire/release; but in this case any edges provided by the refs
|
|
|
|
// we are cleaning up were already provided by the fuse operation itself. It's
|
|
|
|
// not valid for a decrement that could cause the overall fused arena to reach
|
|
|
|
// a zero refcount to race with this function, as that could result in a
|
|
|
|
// use-after-free anyway.
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
uintptr_t poc =
|
|
|
|
upb_Atomic_Load(&new_root->parent_or_count, memory_order_relaxed);
|
|
|
|
if (_upb_Arena_IsTaggedPointer(poc)) return false;
|
|
|
|
uintptr_t with_refs = poc - ref_delta;
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
UPB_ASSERT(!_upb_Arena_IsTaggedPointer(with_refs));
|
|
|
|
// Relaxed order on success is safe here, for the same reasons as the relaxed
|
|
|
|
// read above. Relaxed order is safe on failure because the updated value is
|
|
|
|
// stored in a local variable which goes immediately out of scope; the retry
|
|
|
|
// loop will reread what it needs with proper memory order.
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixes that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
return upb_Atomic_CompareExchangeStrong(&new_root->parent_or_count, &poc,
|
|
|
|
with_refs, memory_order_relaxed,
|
|
|
|
memory_order_relaxed);
|
|
|
|
}
|
Changed Arena representation so that fusing links arenas together instead of blocks.
Previously when fusing, we would concatenate all blocks into a single list that lived in the arena root. From then on, all arenas would add their blocks to this single unified list.
After this CL, arenas keep their distinct list of blocks even after being fused. Instead of unifying the block list, fuse now puts the arenas themselves into a list, so all arenas in the fused group can be iterated over at any time.
This design makes it easier to keep each individual arena thread-compatible, because fuse and free are now the only mutating operations that touch state that is shared with the entire group. Read-only operations like `SpaceAllocated()` also iterate the list of arenas, but in a read-only fashion.
(Note: we need tests for SpaceAllocated(), both single-threaded for correctness and multi-threaded for resilience to crashes and data races).
Performance of fuse regresses by 5-20%. This is somewhat expected as we are performing more atomic operations during a fuse.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.7ns ± 4% +2.00% (p=0.016 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.50ns ± 4% 6.57ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.3ns ±10% 68.7ns ± 4% +15.85% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 479ns ± 5% 540ns ± 8% +12.57% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.50µs ± 4% 4.93µs ± 8% +9.59% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.24µs ± 3% 9.96µs ± 3% +7.81% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.3ns ±18% 71.0ns ± 4% +12.14% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 484ns ± 9% 543ns ±10% +12.11% (p=0.000 n=17+16)
BM_ArenaFuseBalanced/64 4.50µs ± 6% 4.94µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.20µs ± 4% 9.95µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.50ms ± 8% 5.69ms ±17% ~ (p=0.189 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.10ms ± 5% 6.05ms ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.8ms ± 5% 12.4ms ±17% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.1µs ± 8% 12.1µs ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 10.9µs ± 7% 11.0µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 4% 24.4µs ± 7% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 5% 11.6µs ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.77µs ± 5% 5.71µs ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.102 n=19+16)
name old time/op new time/op delta
BM_ArenaOneAlloc 18.4ns ± 6% 18.8ns ± 4% +1.98% (p=0.019 n=18+18)
BM_ArenaInitialBlockOneAlloc 5.51ns ± 4% 6.58ns ± 4% +19.42% (p=0.000 n=16+17)
BM_ArenaFuseUnbalanced/2 59.5ns ±10% 68.9ns ± 4% +15.83% (p=0.000 n=19+19)
BM_ArenaFuseUnbalanced/8 481ns ± 5% 541ns ± 8% +12.54% (p=0.000 n=18+19)
BM_ArenaFuseUnbalanced/64 4.51µs ± 4% 4.94µs ± 8% +9.53% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 9.26µs ± 3% 9.98µs ± 3% +7.79% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/2 63.5ns ±19% 71.1ns ± 3% +12.07% (p=0.000 n=19+18)
BM_ArenaFuseBalanced/8 485ns ± 9% 551ns ±20% +13.47% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/64 4.51µs ± 6% 4.95µs ± 4% +9.62% (p=0.000 n=19+17)
BM_ArenaFuseBalanced/128 9.22µs ± 4% 9.97µs ± 4% +8.12% (p=0.000 n=16+19)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 8% 5.72ms ±18% ~ (p=0.199 n=18+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.12ms ± 5% 6.07ms ± 4% ~ (p=0.273 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.9ms ±15% 11.6ms ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 5% 12.5ms ±18% ~ (p=0.582 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 8% 12.1µs ± 3% ~ (p=0.963 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.8µs ±17% 11.1µs ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.0µs ± 5% 11.9µs ± 4% ~ (p=0.126 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.0µs ± 6% 11.1µs ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.3µs ± 4% 24.5µs ± 6% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.7µs ± 5% 11.6µs ± 4% ~ (p=0.574 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.3µs ± 3% 11.3µs ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.7µs ± 8% 12.7µs ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 5.78µs ± 5% 5.73µs ± 5% ~ (p=0.357 n=17+17)
BM_SerializeDescriptor_Upb 10.0µs ± 5% 10.1µs ± 7% ~ (p=0.117 n=19+16)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.08k ± 0% 6.05k ± 0% -0.54% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.39k ± 0% 6.36k ± 0% -0.55% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 336 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 672 ± 0% 672 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.69k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.5k ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 43.0k ± 0% 43.0k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 9.89M ± 0% 9.95M ± 0% +0.65% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 9.95M ± 0% 10.02M ± 0% +0.70% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (p=0.800 n=20+20)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% ~ (p=0.752 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 138MB/s ± 7% 132MB/s ±15% ~ (p=0.126 n=18+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 124MB/s ± 5% 125MB/s ± 4% ~ (p=0.258 n=17+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 63.9MB/s ±13% 65.2MB/s ± 5% ~ (p=0.589 n=19+16)
BM_LoadAdsDescriptor_Proto2<WithLayout> 64.0MB/s ± 5% 61.3MB/s ±15% ~ (p=0.604 n=16+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 620MB/s ± 8% 622MB/s ± 4% ~ (p=1.000 n=18+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 644MB/s ±15% 679MB/s ± 4% ~ (p=0.104 n=20+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 627MB/s ± 4% 633MB/s ± 4% ~ (p=0.134 n=18+19)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 688MB/s ± 6% 682MB/s ± 4% ~ (p=0.195 n=17+18)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 310MB/s ± 4% 309MB/s ± 6% ~ (p=0.767 n=18+18)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 646MB/s ± 4% 649MB/s ± 4% ~ (p=0.621 n=18+16)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 666MB/s ± 3% 666MB/s ± 3% ~ (p=0.743 n=18+18)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 592MB/s ± 7% 593MB/s ± 4% ~ (p=0.988 n=18+19)
BM_SerializeDescriptor_Proto2 1.30GB/s ± 5% 1.32GB/s ± 5% ~ (p=0.433 n=17+17)
BM_SerializeDescriptor_Upb 756MB/s ± 5% 745MB/s ± 6% ~ (p=0.102 n=19+16)
```
PiperOrigin-RevId: 520144430
2 years ago
|
|
|
|
|
|
|
bool upb_Arena_Fuse(const upb_Arena* a1, const upb_Arena* a2) {
|
|
|
|
if (a1 == a2) return true; // trivial fuse
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixed that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
|
|
|
|
#ifdef UPB_TRACING_ENABLED
|
|
|
|
upb_Arena_LogFuse(a1, a2);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
upb_ArenaInternal* ai1 = upb_Arena_Internal(a1);
|
|
|
|
upb_ArenaInternal* ai2 = upb_Arena_Internal(a2);
|
|
|
|
|
|
|
|
// Do not fuse initial blocks since we cannot lifetime extend them.
|
|
|
|
// Any other fuse scenario is allowed.
|
|
|
|
if (_upb_ArenaInternal_HasInitialBlock(ai1) ||
|
|
|
|
_upb_ArenaInternal_HasInitialBlock(ai2)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// The number of refs we ultimately need to transfer to the new root.
|
|
|
|
uintptr_t ref_delta = 0;
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixed that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
while (true) {
|
|
|
|
upb_ArenaInternal* new_root = _upb_Arena_DoFuse(&ai1, &ai2, &ref_delta);
|
|
|
|
if (new_root != NULL && _upb_Arena_FixupRefs(new_root, ref_delta)) {
|
|
|
|
return true;
|
Allow fuse/fuse races, so that upb_Arena is fully thread-compatible.
Previously upb_Arena was not thread-compatible when `upb_Arena_Fuse(a, b)` and `upb_Arena_Fuse(c, d)` executed in parallel if `b` and `c` were previously fused. This CL fixed that by allowing `upb_Arena_Fuse()` to run in parallel without limitations.
Details on the design of the algorithm are captured in comments.
The CL slightly improves the performance of `upb_Arena_Fuse()`.
```
name old cpu/op new cpu/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.5ns ± 4% -12.30% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.65ns ± 4% 5.17ns ± 3% -22.23% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.1ns ± 7% 68.5ns ± 4% ~ (p=0.327 n=18+19)
BM_ArenaFuseUnbalanced/8 542ns ± 3% 513ns ± 4% -5.25% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.04µs ± 8% 4.74µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.80% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 71.8ns ± 7% 68.4ns ± 6% -4.75% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 541ns ± 3% 519ns ± 3% -4.21% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.00µs ± 7% 4.86µs ± 4% -2.78% (p=0.003 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 4% 9.7µs ± 4% -2.68% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.52ms ± 2% 5.54ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.18ms ± 3% 6.15ms ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.8ms ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.2µs ± 4% 12.3µs ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.1µs ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.6µs ±16% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 4% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.782 n=18+17)
BM_SerializeDescriptor_Proto2 5.69µs ± 5% 5.76µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old time/op new time/op delta
BM_ArenaOneAlloc 20.0ns ±19% 17.6ns ± 4% -12.37% (p=0.000 n=19+17)
BM_ArenaInitialBlockOneAlloc 6.66ns ± 4% 5.18ns ± 3% -22.24% (p=0.000 n=18+17)
BM_ArenaFuseUnbalanced/2 69.2ns ± 7% 68.6ns ± 4% ~ (p=0.343 n=18+19)
BM_ArenaFuseUnbalanced/8 543ns ± 3% 515ns ± 4% -5.21% (p=0.000 n=18+18)
BM_ArenaFuseUnbalanced/64 5.05µs ± 8% 4.75µs ± 4% -5.93% (p=0.000 n=17+17)
BM_ArenaFuseUnbalanced/128 10.1µs ± 4% 9.6µs ± 4% -4.78% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/2 72.0ns ± 7% 68.6ns ± 6% -4.73% (p=0.000 n=17+17)
BM_ArenaFuseBalanced/8 543ns ± 3% 520ns ± 3% -4.20% (p=0.000 n=18+17)
BM_ArenaFuseBalanced/64 5.01µs ± 7% 4.87µs ± 4% -2.78% (p=0.004 n=17+18)
BM_ArenaFuseBalanced/128 10.0µs ± 3% 9.8µs ± 4% -2.67% (p=0.001 n=16+18)
BM_LoadAdsDescriptor_Upb<NoLayout> 5.53ms ± 2% 5.56ms ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.20ms ± 3% 6.17ms ± 2% ~ (p=0.424 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 11.8ms ± 7% 11.7ms ± 5% ~ (p=0.297 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 11.9ms ± 3% 11.9ms ± 3% ~ (p=0.351 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 12.3µs ± 4% 12.3µs ± 4% ~ (p=1.000 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 11.3µs ± 6% 11.3µs ± 3% ~ (p=0.845 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 12.1µs ± 4% 12.1µs ± 3% ~ (p=0.542 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 11.1µs ± 4% 11.2µs ± 2% ~ (p=0.330 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 24.2µs ± 3% 25.7µs ±17% ~ (p=0.167 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 11.6µs ± 3% 11.7µs ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 11.5µs ± 7% 11.4µs ± 4% ~ (p=0.799 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 12.8µs ± 5% 13.0µs ±14% ~ (p=0.807 n=18+17)
BM_SerializeDescriptor_Proto2 5.71µs ± 5% 5.78µs ± 6% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 10.2µs ± 4% 10.2µs ± 3% ~ (p=0.613 n=18+17)
name old allocs/op new allocs/op delta
BM_ArenaOneAlloc 1.00 ± 0% 1.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseUnbalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/2 2.00 ± 0% 2.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/8 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/64 64.0 ± 0% 64.0 ± 0% ~ (all samples are equal)
BM_ArenaFuseBalanced/128 128 ± 0% 128 ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<NoLayout> 6.05k ± 0% 6.05k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Upb<WithLayout> 6.36k ± 0% 6.36k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<NoLayout> 83.4k ± 0% 83.4k ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 84.4k ± 0% 84.4k ± 0% -0.00% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Upb_FileDesc<UseArena, Alias> 7.00 ± 0% 7.00 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 765 ± 0% 765 ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 8.00 ± 0% 8.00 ± 0% ~ (all samples are equal)
name old peak-mem(Bytes)/op new peak-mem(Bytes)/op delta
BM_ArenaOneAlloc 336 ± 0% 328 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseUnbalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/2 672 ± 0% 656 ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/8 2.69k ± 0% 2.62k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/64 21.5k ± 0% 21.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_ArenaFuseBalanced/128 43.0k ± 0% 42.0k ± 0% -2.38% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<NoLayout> 10.0M ± 0% 9.9M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Upb<WithLayout> 10.0M ± 0% 10.0M ± 0% -0.05% (p=0.000 n=20+20)
BM_LoadAdsDescriptor_Proto2<NoLayout> 6.62M ± 0% 6.62M ± 0% ~ (all samples are equal)
BM_LoadAdsDescriptor_Proto2<WithLayout> 6.66M ± 0% 6.66M ± 0% -0.01% (p=0.013 n=19+20)
BM_Parse_Upb_FileDesc<UseArena, Copy> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Upb_FileDesc<UseArena, Alias> 36.5k ± 0% 36.5k ± 0% -0.02% (p=0.000 n=20+20)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 35.8k ± 0% 35.8k ± 0% ~ (all samples are equal)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 65.3k ± 0% 65.3k ± 0% ~ (all samples are equal)
name old speed new speed delta
BM_LoadAdsDescriptor_Upb<NoLayout> 137MB/s ± 2% 137MB/s ± 4% ~ (p=0.707 n=16+19)
BM_LoadAdsDescriptor_Upb<WithLayout> 122MB/s ± 3% 123MB/s ± 3% ~ (p=0.501 n=18+18)
BM_LoadAdsDescriptor_Proto2<NoLayout> 64.2MB/s ± 7% 64.7MB/s ± 5% ~ (p=0.330 n=16+18)
BM_LoadAdsDescriptor_Proto2<WithLayout> 63.6MB/s ± 3% 63.9MB/s ± 3% ~ (p=0.303 n=18+17)
BM_Parse_Upb_FileDesc<UseArena, Copy> 614MB/s ± 4% 613MB/s ± 4% ~ (p=0.935 n=17+18)
BM_Parse_Upb_FileDesc<UseArena, Alias> 665MB/s ± 6% 667MB/s ± 3% ~ (p=0.873 n=16+17)
BM_Parse_Upb_FileDesc<InitBlock, Copy> 624MB/s ± 4% 622MB/s ± 3% ~ (p=0.501 n=18+18)
BM_Parse_Upb_FileDesc<InitBlock, Alias> 681MB/s ± 4% 675MB/s ± 2% ~ (p=0.297 n=18+16)
BM_Parse_Proto2<FileDesc, NoArena, Copy> 311MB/s ± 3% 296MB/s ±15% ~ (p=0.177 n=17+20)
BM_Parse_Proto2<FileDesc, UseArena, Copy> 649MB/s ± 3% 644MB/s ± 3% ~ (p=0.232 n=17+18)
BM_Parse_Proto2<FileDesc, InitBlock, Copy> 656MB/s ± 7% 659MB/s ± 4% ~ (p=0.707 n=18+19)
BM_Parse_Proto2<FileDescSV, InitBlock, Alias> 587MB/s ± 5% 576MB/s ±16% ~ (p=0.584 n=18+18)
BM_SerializeDescriptor_Proto2 1.32GB/s ± 5% 1.31GB/s ± 7% ~ (p=0.143 n=18+18)
BM_SerializeDescriptor_Upb 737MB/s ± 4% 737MB/s ± 7% ~ (p=0.839 n=18+18)
```
PiperOrigin-RevId: 520452349
2 years ago
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool upb_Arena_IsFused(const upb_Arena* a, const upb_Arena* b) {
|
|
|
|
if (a == b) return true; // trivial fuse
|
|
|
|
upb_ArenaInternal* ra = _upb_Arena_FindRoot(upb_Arena_Internal(a)).root;
|
|
|
|
upb_ArenaInternal* rb = upb_Arena_Internal(b);
|
|
|
|
while (true) {
|
|
|
|
rb = _upb_Arena_FindRoot(rb).root;
|
|
|
|
if (ra == rb) return true;
|
|
|
|
upb_ArenaInternal* tmp = _upb_Arena_FindRoot(ra).root;
|
|
|
|
if (ra == tmp) return false;
|
|
|
|
// a's root changed since we last checked. Retry.
|
|
|
|
ra = tmp;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool upb_Arena_IncRefFor(const upb_Arena* a, const void* owner) {
|
|
|
|
upb_ArenaInternal* ai = upb_Arena_Internal(a);
|
|
|
|
if (_upb_ArenaInternal_HasInitialBlock(ai)) return false;
|
|
|
|
upb_ArenaRoot r;
|
|
|
|
r.root = ai;
|
|
|
|
|
|
|
|
retry:
|
|
|
|
r = _upb_Arena_FindRoot(r.root);
|
|
|
|
if (upb_Atomic_CompareExchangeWeak(
|
|
|
|
&r.root->parent_or_count, &r.tagged_count,
|
|
|
|
_upb_Arena_TaggedFromRefcount(
|
|
|
|
_upb_Arena_RefCountFromTagged(r.tagged_count) + 1),
|
|
|
|
// Relaxed order is safe on success, incrementing the refcount
|
|
|
|
// need not perform any synchronization with the eventual free of the
|
|
|
|
// arena - that's provided by decrements.
|
|
|
|
memory_order_relaxed,
|
|
|
|
// Relaxed order is safe on failure as r.tagged_count is immediately
|
|
|
|
// overwritten by retrying the find root operation.
|
|
|
|
memory_order_relaxed)) {
|
|
|
|
// We incremented it successfully, so we are done.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
// We failed update due to parent switching on the arena.
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
|
|
|
void upb_Arena_DecRefFor(const upb_Arena* a, const void* owner) {
|
|
|
|
upb_Arena_Free((upb_Arena*)a);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the upb_alloc that _upb_ArenaInternal_BlockAlloc() reports for the
// internal state of arena `a`.
upb_alloc* upb_Arena_GetUpbAlloc(upb_Arena* a) {
  // TSAN read check on the arena's `ptr` field, performed before the lookup.
  UPB_TSAN_CHECK_READ(a->UPB_ONLYBITS(ptr));
  return _upb_ArenaInternal_BlockAlloc(upb_Arena_Internal(a));
}
|
|
|
|
|
|
|
|
// Copies the state of `src` into `des`: the public upb_Arena struct itself,
// plus the internal `block_alloc` and `blocks` fields.
void UPB_PRIVATE(_upb_Arena_SwapIn)(upb_Arena* des, const upb_Arena* src) {
  // Resolve both internal structs before anything is overwritten.
  upb_ArenaInternal* dst_internal = upb_Arena_Internal(des);
  upb_ArenaInternal* src_internal = upb_Arena_Internal(src);

  // Public portion first, then the two internal fields.
  *des = *src;
  dst_internal->block_alloc = src_internal->block_alloc;
  // `blocks` is copied with a plain assignment (no atomic store).
  dst_internal->blocks = src_internal->blocks;
}
|
|
|
|
|
|
|
|
// Copies the state of `src` into `des`: the public upb_Arena struct itself,
// plus the internal `blocks` field. `block_alloc` is not copied here.
void UPB_PRIVATE(_upb_Arena_SwapOut)(upb_Arena* des, const upb_Arena* src) {
  // Resolve both internal structs before anything is overwritten.
  upb_ArenaInternal* dst_internal = upb_Arena_Internal(des);
  upb_ArenaInternal* src_internal = upb_Arena_Internal(src);

  // Public portion first, then the internal block list.
  *des = *src;
  // `blocks` is copied with a plain assignment (no atomic store).
  dst_internal->blocks = src_internal->blocks;
}
|