// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd
// Fast decoder: ~3x the speed of decode.c, but requires x86-64/ARM64.
// Also the table size grows by 2x.
//
// Could potentially be ported to other 64-bit archs that pass at least six
// arguments in registers and have 8 unused high bits in pointers.
//
// The overall design is to create specialized functions for every possible
// field type (e.g. a oneof boolean field with a 1-byte tag) and then dispatch
// to the specialized function as quickly as possible.
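//
// A rough key to the generated function names (an informal sketch inferred
// from the generator macros later in this file, not a spec):
//
//   upb_p<card><type><valbytes>_<tagbytes>bt[_max<ceil>b]
//
//   card:       s = singular, o = oneof, r = repeated, p = packed repeated
//   type:       b = bool, v = varint, z = zigzag varint, f = fixed,
//               s = UTF-8-validated string, b = bytes, m = submessage
//   valbytes:   width of the in-memory value where applicable (1, 4 or 8)
//   tagbytes:   length of the encoded tag on the wire (1bt or 2bt)
//   max<ceil>b: for submessages, the size ceiling used for arena allocation
//
// For example, upb_psv4_1bt would be the parser for a singular 4-byte varint
// field with a 1-byte tag. String parsers that must copy rather than alias
// use a upb_c prefix instead of upb_p.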
#include "upb/wire/decode_fast.h"
#include "upb/message/array.h"
#include "upb/message/internal/array.h"
#include "upb/message/internal/types.h"
#include "upb/mini_table/sub.h"
#include "upb/wire/internal/decode.h"
// Must be last.
#include "upb/port/def.inc"
#if UPB_FASTTABLE
// The standard set of arguments passed to each parsing function.
// Thanks to x86-64 calling conventions, these will stay in registers.
#define UPB_PARSE_PARAMS \
upb_Decoder *d, const char *ptr, upb_Message *msg, intptr_t table, \
uint64_t hasbits, uint64_t data
#define UPB_PARSE_ARGS d, ptr, msg, table, hasbits, data
#define RETURN_GENERIC(m) \
/* Uncomment either of these for debugging purposes. */ \
/* fprintf(stderr, m); */ \
/*__builtin_trap(); */ \
return _upb_FastDecoder_DecodeGeneric(d, ptr, msg, table, hasbits, 0);
typedef enum {
CARD_s = 0, /* Singular (optional, non-repeated) */
CARD_o = 1, /* Oneof */
CARD_r = 2, /* Repeated */
CARD_p = 3 /* Packed Repeated */
} upb_card;
UPB_NOINLINE
static const char* fastdecode_isdonefallback(UPB_PARSE_PARAMS) {
int overrun = data;
ptr = _upb_EpsCopyInputStream_IsDoneFallbackInline(
&d->input, ptr, overrun, _upb_Decoder_BufferFlipCallback);
data = _upb_FastDecoder_LoadTag(ptr);
UPB_MUSTTAIL return _upb_FastDecoder_TagDispatch(UPB_PARSE_ARGS);
}
UPB_FORCEINLINE
static const char* fastdecode_dispatch(UPB_PARSE_PARAMS) {
int overrun;
switch (upb_EpsCopyInputStream_IsDoneStatus(&d->input, ptr, &overrun)) {
case kUpb_IsDoneStatus_Done:
*(uint32_t*)msg |= hasbits; // Sync hasbits.
const upb_MiniTable* l = decode_totablep(table);
return UPB_UNLIKELY(l->required_count)
? _upb_Decoder_CheckRequired(d, ptr, msg, l)
: ptr;
case kUpb_IsDoneStatus_NotDone:
break;
case kUpb_IsDoneStatus_NeedFallback:
data = overrun;
UPB_MUSTTAIL return fastdecode_isdonefallback(UPB_PARSE_ARGS);
}
// Read two bytes of tag data (for a one-byte tag, the high byte is junk).
data = _upb_FastDecoder_LoadTag(ptr);
UPB_MUSTTAIL return _upb_FastDecoder_TagDispatch(UPB_PARSE_ARGS);
}
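// Each specialized parser either tail-calls back into fastdecode_dispatch or
// jumps straight to the next field's parser via _upb_FastDecoder_TagDispatch,
// so (when UPB_MUSTTAIL is honored) parsing proceeds as a chain of tail calls
// with the decoder state held in the UPB_PARSE_PARAMS registers.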
UPB_FORCEINLINE
static bool fastdecode_checktag(uint16_t data, int tagbytes) {
if (tagbytes == 1) {
return (data & 0xff) == 0;
} else {
return data == 0;
}
}
UPB_FORCEINLINE
static const char* fastdecode_longsize(const char* ptr, int* size) {
int i;
UPB_ASSERT(*size & 0x80);
*size &= 0xff;
for (i = 0; i < 3; i++) {
ptr++;
size_t byte = (uint8_t)ptr[-1];
*size += (byte - 1) << (7 + 7 * i);
if (UPB_LIKELY((byte & 0x80) == 0)) return ptr;
}
ptr++;
size_t byte = (uint8_t)ptr[-1];
// The length is limited to 2 GiB, not 4 GiB, hence the check against 8
// rather than 16 as would normally be expected for a 32-bit varint.
if (UPB_UNLIKELY(byte >= 8)) return NULL;
*size += (byte - 1) << 28;
return ptr;
}
UPB_FORCEINLINE
static const char* fastdecode_delimited(
upb_Decoder* d, const char* ptr,
upb_EpsCopyInputStream_ParseDelimitedFunc* func, void* ctx) {
ptr++;
// Sign-extend the length byte so that a size varint longer than one byte
// becomes negative, causing the fast delimited parse below to fail.
int len = (int8_t)ptr[-1];
if (!upb_EpsCopyInputStream_TryParseDelimitedFast(&d->input, &ptr, len, func,
ctx)) {
// Slow case: Sub-message is >=128 bytes and/or exceeds the current buffer.
// If it exceeds the buffer limit, limit/limit_ptr will change during
// sub-message parsing, so we need to preserve delta, not limit.
if (UPB_UNLIKELY(len & 0x80)) {
// Size varint >1 byte (length >= 128).
ptr = fastdecode_longsize(ptr, &len);
if (!ptr) {
// Corrupt wire format: size exceeded INT_MAX.
return NULL;
}
}
if (!upb_EpsCopyInputStream_CheckSize(&d->input, ptr, len)) {
// Corrupt wire format: invalid limit.
return NULL;
}
int delta = upb_EpsCopyInputStream_PushLimit(&d->input, ptr, len);
ptr = func(&d->input, ptr, ctx);
upb_EpsCopyInputStream_PopLimit(&d->input, ptr, delta);
}
return ptr;
}
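// Illustrative example of the sign-extension trick above: a length byte of
// 0x05 yields len == 5, which TryParseDelimitedFast can handle inline, while
// a length byte of 0x85 (the first byte of a multi-byte size varint) yields
// len == -123; the negative length forces the slow path, where
// fastdecode_longsize() decodes the full size.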
/* singular, oneof, repeated field handling ***********************************/
typedef struct {
upb_Array* arr;
void* end;
} fastdecode_arr;
typedef enum {
FD_NEXT_ATLIMIT,
FD_NEXT_SAMEFIELD,
FD_NEXT_OTHERFIELD
} fastdecode_next;
typedef struct {
void* dst;
fastdecode_next next;
uint32_t tag;
} fastdecode_nextret;
UPB_FORCEINLINE
static void* fastdecode_resizearr(upb_Decoder* d, void* dst,
fastdecode_arr* farr, int valbytes) {
if (UPB_UNLIKELY(dst == farr->end)) {
size_t old_capacity = farr->arr->UPB_PRIVATE(capacity);
size_t old_bytes = old_capacity * valbytes;
size_t new_capacity = old_capacity * 2;
size_t new_bytes = new_capacity * valbytes;
char* old_ptr = _upb_array_ptr(farr->arr);
char* new_ptr = upb_Arena_Realloc(&d->arena, old_ptr, old_bytes, new_bytes);
uint8_t elem_size_lg2 = __builtin_ctz(valbytes);
UPB_PRIVATE(_upb_Array_SetTaggedPtr)(farr->arr, new_ptr, elem_size_lg2);
farr->arr->UPB_PRIVATE(capacity) = new_capacity;
dst = (void*)(new_ptr + (old_capacity * valbytes));
farr->end = (void*)(new_ptr + (new_capacity * valbytes));
}
return dst;
}
UPB_FORCEINLINE
static bool fastdecode_tagmatch(uint32_t tag, uint64_t data, int tagbytes) {
if (tagbytes == 1) {
return (uint8_t)tag == (uint8_t)data;
} else {
return (uint16_t)tag == (uint16_t)data;
}
}
UPB_FORCEINLINE
static void fastdecode_commitarr(void* dst, fastdecode_arr* farr,
int valbytes) {
farr->arr->size =
(size_t)((char*)dst - (char*)_upb_array_ptr(farr->arr)) / valbytes;
}
UPB_FORCEINLINE
static fastdecode_nextret fastdecode_nextrepeated(upb_Decoder* d, void* dst,
const char** ptr,
fastdecode_arr* farr,
uint64_t data, int tagbytes,
int valbytes) {
fastdecode_nextret ret;
dst = (char*)dst + valbytes;
if (UPB_LIKELY(!_upb_Decoder_IsDone(d, ptr))) {
ret.tag = _upb_FastDecoder_LoadTag(*ptr);
if (fastdecode_tagmatch(ret.tag, data, tagbytes)) {
ret.next = FD_NEXT_SAMEFIELD;
} else {
fastdecode_commitarr(dst, farr, valbytes);
ret.next = FD_NEXT_OTHERFIELD;
}
} else {
fastdecode_commitarr(dst, farr, valbytes);
ret.next = FD_NEXT_ATLIMIT;
}
ret.dst = dst;
return ret;
}
UPB_FORCEINLINE
static void* fastdecode_fieldmem(upb_Message* msg, uint64_t data) {
size_t ofs = data >> 48;
return (char*)msg + ofs;
}
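// The 64-bit `data` word packs the per-field dispatch information. The layout
// below is a sketch inferred from the accessors in this file:
//
//   bits  0-15: tag-check value; fastdecode_checktag() expects 0 on a match
//   bits 16-23: submessage index into the MiniTable's subs[] (messages only)
//   bits 24-31: hasbit index (singular) or field number written to the
//               oneof case (oneofs)
//   bits 32-47: offset of the oneof case field (oneofs only)
//   bits 48-63: offset of the field data within the message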
UPB_FORCEINLINE
static void* fastdecode_getfield(upb_Decoder* d, const char* ptr,
upb_Message* msg, uint64_t* data,
uint64_t* hasbits, fastdecode_arr* farr,
int valbytes, upb_card card) {
switch (card) {
case CARD_s: {
uint8_t hasbit_index = *data >> 24;
// Set hasbit and return pointer to scalar field.
*hasbits |= 1ull << hasbit_index;
return fastdecode_fieldmem(msg, *data);
}
case CARD_o: {
uint16_t case_ofs = *data >> 32;
uint32_t* oneof_case = UPB_PTR_AT(msg, case_ofs, uint32_t);
uint8_t field_number = *data >> 24;
*oneof_case = field_number;
return fastdecode_fieldmem(msg, *data);
}
case CARD_r: {
// Get pointer to upb_Array and allocate/expand if necessary.
uint8_t elem_size_lg2 = __builtin_ctz(valbytes);
upb_Array** arr_p = fastdecode_fieldmem(msg, *data);
char* begin;
*(uint32_t*)msg |= *hasbits;
*hasbits = 0;
if (UPB_LIKELY(!*arr_p)) {
farr->arr = UPB_PRIVATE(_upb_Array_New)(&d->arena, 8, elem_size_lg2);
*arr_p = farr->arr;
} else {
farr->arr = *arr_p;
}
begin = _upb_array_ptr(farr->arr);
farr->end = begin + (farr->arr->UPB_PRIVATE(capacity) * valbytes);
*data = _upb_FastDecoder_LoadTag(ptr);
return begin + (farr->arr->size * valbytes);
}
default:
UPB_UNREACHABLE();
}
}
UPB_FORCEINLINE
static bool fastdecode_flippacked(uint64_t* data, int tagbytes) {
*data ^= (0x2 ^ 0x0); // Patch data to match packed wiretype.
return fastdecode_checktag(*data, tagbytes);
}
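// A repeated scalar field may legitimately arrive either packed
// (length-delimited, wire type 2) or unpacked (e.g. wire type 0 for varints).
// The stored tag-check value only matches one of the two forms, so on a
// mismatch for a repeated field we toggle the wire-type bits with (0x0 ^ 0x2)
// and retry; if the flipped tag matches, FASTDECODE_CHECKPACKED below
// tail-calls the handler for the other representation instead of falling
// back to the generic decoder.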
#define FASTDECODE_CHECKPACKED(tagbytes, card, func) \
if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) { \
if (card == CARD_r && fastdecode_flippacked(&data, tagbytes)) { \
UPB_MUSTTAIL return func(UPB_PARSE_ARGS); \
} \
RETURN_GENERIC("packed check tag mismatch\n"); \
}
/* varint fields **************************************************************/
UPB_FORCEINLINE
static uint64_t fastdecode_munge(uint64_t val, int valbytes, bool zigzag) {
if (valbytes == 1) {
return val != 0;
} else if (zigzag) {
if (valbytes == 4) {
uint32_t n = val;
return (n >> 1) ^ -(int32_t)(n & 1);
} else if (valbytes == 8) {
return (val >> 1) ^ -(int64_t)(val & 1);
}
UPB_UNREACHABLE();
}
return val;
}
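// Zigzag decoding reference for (n >> 1) ^ -(n & 1): it maps 0 -> 0, 1 -> -1,
// 2 -> 1, 3 -> -2, ..., so values small in magnitude stay small on the wire.
// E.g. n = 3: (3 >> 1) == 1, -(3 & 1) == ~0, and 1 ^ ~0 == -2.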
UPB_FORCEINLINE
static const char* fastdecode_varint64(const char* ptr, uint64_t* val) {
ptr++;
*val = (uint8_t)ptr[-1];
if (UPB_UNLIKELY(*val & 0x80)) {
int i;
for (i = 0; i < 8; i++) {
ptr++;
uint64_t byte = (uint8_t)ptr[-1];
*val += (byte - 1) << (7 + 7 * i);
if (UPB_LIKELY((byte & 0x80) == 0)) goto done;
}
ptr++;
uint64_t byte = (uint8_t)ptr[-1];
if (byte > 1) {
return NULL;
}
*val += (byte - 1) << 63;
}
done:
UPB_ASSUME(ptr != NULL);
return ptr;
}
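// The (byte - 1) << shift accumulation above is equivalent to the usual
// (byte & 0x7f) << shift: the previous byte was added including its 0x80
// continuation bit, and subtracting 1 at the next shift position cancels
// exactly that bit. Worked example for the encoding of 300 (0xAC 0x02):
//   *val = 0xAC = 172
//   i = 0: byte = 0x02, *val += (2 - 1) << 7 = 128  ->  *val == 300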
#define FASTDECODE_UNPACKEDVARINT(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, zigzag, packed) \
uint64_t val; \
void* dst; \
fastdecode_arr farr; \
\
FASTDECODE_CHECKPACKED(tagbytes, card, packed); \
\
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &farr, valbytes, \
card); \
if (card == CARD_r) { \
if (UPB_UNLIKELY(!dst)) { \
RETURN_GENERIC("need array resize\n"); \
} \
} \
\
again: \
if (card == CARD_r) { \
dst = fastdecode_resizearr(d, dst, &farr, valbytes); \
} \
\
ptr += tagbytes; \
ptr = fastdecode_varint64(ptr, &val); \
if (ptr == NULL) _upb_FastDecoder_ErrorJmp(d, kUpb_DecodeStatus_Malformed); \
val = fastdecode_munge(val, valbytes, zigzag); \
memcpy(dst, &val, valbytes); \
\
if (card == CARD_r) { \
fastdecode_nextret ret = fastdecode_nextrepeated( \
d, dst, &ptr, &farr, data, tagbytes, valbytes); \
switch (ret.next) { \
case FD_NEXT_SAMEFIELD: \
dst = ret.dst; \
goto again; \
case FD_NEXT_OTHERFIELD: \
data = ret.tag; \
UPB_MUSTTAIL return _upb_FastDecoder_TagDispatch(UPB_PARSE_ARGS); \
case FD_NEXT_ATLIMIT: \
return ptr; \
} \
} \
\
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
typedef struct {
uint8_t valbytes;
bool zigzag;
void* dst;
fastdecode_arr farr;
} fastdecode_varintdata;
UPB_FORCEINLINE
static const char* fastdecode_topackedvarint(upb_EpsCopyInputStream* e,
const char* ptr, void* ctx) {
upb_Decoder* d = (upb_Decoder*)e;
fastdecode_varintdata* data = ctx;
void* dst = data->dst;
uint64_t val;
while (!_upb_Decoder_IsDone(d, &ptr)) {
dst = fastdecode_resizearr(d, dst, &data->farr, data->valbytes);
ptr = fastdecode_varint64(ptr, &val);
if (ptr == NULL) return NULL;
val = fastdecode_munge(val, data->valbytes, data->zigzag);
memcpy(dst, &val, data->valbytes);
dst = (char*)dst + data->valbytes;
}
fastdecode_commitarr(dst, &data->farr, data->valbytes);
return ptr;
}
#define FASTDECODE_PACKEDVARINT(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, zigzag, unpacked) \
fastdecode_varintdata ctx = {valbytes, zigzag}; \
\
FASTDECODE_CHECKPACKED(tagbytes, CARD_r, unpacked); \
\
ctx.dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &ctx.farr, \
valbytes, CARD_r); \
if (UPB_UNLIKELY(!ctx.dst)) { \
RETURN_GENERIC("need array resize\n"); \
} \
\
ptr += tagbytes; \
ptr = fastdecode_delimited(d, ptr, &fastdecode_topackedvarint, &ctx); \
\
if (UPB_UNLIKELY(ptr == NULL)) { \
_upb_FastDecoder_ErrorJmp(d, kUpb_DecodeStatus_Malformed); \
} \
\
UPB_MUSTTAIL return fastdecode_dispatch(d, ptr, msg, table, hasbits, 0);
#define FASTDECODE_VARINT(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, zigzag, unpacked, packed) \
if (card == CARD_p) { \
FASTDECODE_PACKEDVARINT(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, zigzag, unpacked); \
} else { \
FASTDECODE_UNPACKEDVARINT(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, zigzag, packed); \
}
#define z_ZZ true
#define b_ZZ false
#define v_ZZ false
/* Generate all combinations:
* {s,o,r,p} x {b1,v4,z4,v8,z8} x {1bt,2bt} */
#define F(card, type, valbytes, tagbytes) \
UPB_NOINLINE \
const char* upb_p##card##type##valbytes##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
FASTDECODE_VARINT(d, ptr, msg, table, hasbits, data, tagbytes, valbytes, \
CARD_##card, type##_ZZ, \
upb_pr##type##valbytes##_##tagbytes##bt, \
upb_pp##type##valbytes##_##tagbytes##bt); \
}
#define TYPES(card, tagbytes) \
F(card, b, 1, tagbytes) \
F(card, v, 4, tagbytes) \
F(card, v, 8, tagbytes) \
F(card, z, 4, tagbytes) \
F(card, z, 8, tagbytes)
#define TAGBYTES(card) \
TYPES(card, 1) \
TYPES(card, 2)
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
TAGBYTES(p)
#undef z_ZZ
#undef b_ZZ
#undef v_ZZ
#undef o_ONEOF
#undef s_ONEOF
#undef r_ONEOF
#undef F
#undef TYPES
#undef TAGBYTES
#undef FASTDECODE_UNPACKEDVARINT
#undef FASTDECODE_PACKEDVARINT
#undef FASTDECODE_VARINT
/* fixed fields ***************************************************************/
#define FASTDECODE_UNPACKEDFIXED(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, packed) \
void* dst; \
fastdecode_arr farr; \
\
FASTDECODE_CHECKPACKED(tagbytes, card, packed) \
\
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &farr, valbytes, \
card); \
if (card == CARD_r) { \
if (UPB_UNLIKELY(!dst)) { \
RETURN_GENERIC("couldn't allocate array in arena\n"); \
} \
} \
\
again: \
if (card == CARD_r) { \
dst = fastdecode_resizearr(d, dst, &farr, valbytes); \
} \
\
ptr += tagbytes; \
memcpy(dst, ptr, valbytes); \
ptr += valbytes; \
\
if (card == CARD_r) { \
fastdecode_nextret ret = fastdecode_nextrepeated( \
d, dst, &ptr, &farr, data, tagbytes, valbytes); \
switch (ret.next) { \
case FD_NEXT_SAMEFIELD: \
dst = ret.dst; \
goto again; \
case FD_NEXT_OTHERFIELD: \
data = ret.tag; \
UPB_MUSTTAIL return _upb_FastDecoder_TagDispatch(UPB_PARSE_ARGS); \
case FD_NEXT_ATLIMIT: \
return ptr; \
} \
} \
\
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
#define FASTDECODE_PACKEDFIXED(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, unpacked) \
FASTDECODE_CHECKPACKED(tagbytes, CARD_r, unpacked) \
\
ptr += tagbytes; \
int size = (uint8_t)ptr[0]; \
ptr++; \
if (size & 0x80) { \
ptr = fastdecode_longsize(ptr, &size); \
} \
\
if (UPB_UNLIKELY(!upb_EpsCopyInputStream_CheckDataSizeAvailable( \
&d->input, ptr, size) || \
(size % valbytes) != 0)) { \
_upb_FastDecoder_ErrorJmp(d, kUpb_DecodeStatus_Malformed); \
} \
\
upb_Array** arr_p = fastdecode_fieldmem(msg, data); \
upb_Array* arr = *arr_p; \
uint8_t elem_size_lg2 = __builtin_ctz(valbytes); \
int elems = size / valbytes; \
\
if (UPB_LIKELY(!arr)) { \
*arr_p = arr = \
UPB_PRIVATE(_upb_Array_New)(&d->arena, elems, elem_size_lg2); \
if (!arr) { \
_upb_FastDecoder_ErrorJmp(d, kUpb_DecodeStatus_Malformed); \
} \
} else { \
_upb_Array_ResizeUninitialized(arr, elems, &d->arena); \
} \
\
char* dst = _upb_array_ptr(arr); \
memcpy(dst, ptr, size); \
arr->size = elems; \
\
ptr += size; \
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
#define FASTDECODE_FIXED(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, unpacked, packed) \
if (card == CARD_p) { \
FASTDECODE_PACKEDFIXED(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, unpacked); \
} else { \
FASTDECODE_UNPACKEDFIXED(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, packed); \
}
/* Generate all combinations:
* {s,o,r,p} x {f4,f8} x {1bt,2bt} */
#define F(card, valbytes, tagbytes) \
UPB_NOINLINE \
const char* upb_p##card##f##valbytes##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
FASTDECODE_FIXED(d, ptr, msg, table, hasbits, data, tagbytes, valbytes, \
CARD_##card, upb_ppf##valbytes##_##tagbytes##bt, \
upb_prf##valbytes##_##tagbytes##bt); \
}
#define TYPES(card, tagbytes) \
F(card, 4, tagbytes) \
F(card, 8, tagbytes)
#define TAGBYTES(card) \
TYPES(card, 1) \
TYPES(card, 2)
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
TAGBYTES(p)
#undef F
#undef TYPES
#undef TAGBYTES
#undef FASTDECODE_UNPACKEDFIXED
#undef FASTDECODE_PACKEDFIXED
/* string fields **************************************************************/
typedef const char* fastdecode_copystr_func(struct upb_Decoder* d,
const char* ptr, upb_Message* msg,
const upb_MiniTable* table,
uint64_t hasbits,
upb_StringView* dst);
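// The string parsers come in two flavors: the upb_p* entry points alias the
// upb_StringView directly into the input buffer when
// upb_EpsCopyInputStream_AliasingAvailable() says that is safe, and otherwise
// tail-call their upb_c* counterparts, which copy the bytes into the arena
// (see FASTDECODE_STRING / FASTDECODE_COPYSTRING below). Strings too long for
// either fast path fall back to fastdecode_longstring_{utf8,noutf8}.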
UPB_NOINLINE
static const char* fastdecode_verifyutf8(upb_Decoder* d, const char* ptr,
upb_Message* msg, intptr_t table,
uint64_t hasbits, uint64_t data) {
upb_StringView* dst = (upb_StringView*)data;
if (!_upb_Decoder_VerifyUtf8Inline(dst->data, dst->size)) {
_upb_FastDecoder_ErrorJmp(d, kUpb_DecodeStatus_BadUtf8);
}
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
}
#define FASTDECODE_LONGSTRING(d, ptr, msg, table, hasbits, dst, validate_utf8) \
int size = (uint8_t)ptr[0]; /* Could plumb through hasbits. */ \
ptr++; \
if (size & 0x80) { \
ptr = fastdecode_longsize(ptr, &size); \
} \
\
if (UPB_UNLIKELY(!upb_EpsCopyInputStream_CheckSize(&d->input, ptr, size))) { \
dst->size = 0; \
_upb_FastDecoder_ErrorJmp(d, kUpb_DecodeStatus_Malformed); \
} \
\
const char* s_ptr = ptr; \
ptr = upb_EpsCopyInputStream_ReadString(&d->input, &s_ptr, size, &d->arena); \
if (!ptr) _upb_FastDecoder_ErrorJmp(d, kUpb_DecodeStatus_OutOfMemory); \
dst->data = s_ptr; \
dst->size = size; \
\
if (validate_utf8) { \
data = (uint64_t)dst; \
UPB_MUSTTAIL return fastdecode_verifyutf8(UPB_PARSE_ARGS); \
} else { \
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS); \
}
UPB_NOINLINE
static const char* fastdecode_longstring_utf8(struct upb_Decoder* d,
const char* ptr, upb_Message* msg,
intptr_t table, uint64_t hasbits,
uint64_t data) {
upb_StringView* dst = (upb_StringView*)data;
FASTDECODE_LONGSTRING(d, ptr, msg, table, hasbits, dst, true);
}
UPB_NOINLINE
static const char* fastdecode_longstring_noutf8(
struct upb_Decoder* d, const char* ptr, upb_Message* msg, intptr_t table,
uint64_t hasbits, uint64_t data) {
upb_StringView* dst = (upb_StringView*)data;
FASTDECODE_LONGSTRING(d, ptr, msg, table, hasbits, dst, false);
}
UPB_FORCEINLINE
static void fastdecode_docopy(upb_Decoder* d, const char* ptr, uint32_t size,
int copy, char* data, size_t data_offset,
upb_StringView* dst) {
d->arena.head.ptr += copy;
dst->data = data + data_offset;
UPB_UNPOISON_MEMORY_REGION(data, copy);
memcpy(data, ptr, copy);
UPB_POISON_MEMORY_REGION(data + data_offset + size,
copy - data_offset - size);
}
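// fastdecode_docopy bump-allocates directly off the arena and copies a fixed,
// power-of-two sized block (16/32/64/128 bytes, chosen by the caller below)
// rather than exactly `size` bytes, which lets the compiler emit a few wide
// unconditional loads/stores. In the 16-byte case the copy starts at the tag
// byte, so data_offset skips past the tag and length prefix; the slack beyond
// the string itself is re-poisoned for ASAN.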
#define FASTDECODE_COPYSTRING(d, ptr, msg, table, hasbits, data, tagbytes, \
card, validate_utf8) \
upb_StringView* dst; \
fastdecode_arr farr; \
int64_t size; \
size_t arena_has; \
size_t common_has; \
char* buf; \
\
UPB_ASSERT(!upb_EpsCopyInputStream_AliasingAvailable(&d->input, ptr, 0)); \
UPB_ASSERT(fastdecode_checktag(data, tagbytes)); \
\
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &farr, \
sizeof(upb_StringView), card); \
\
again: \
if (card == CARD_r) { \
dst = fastdecode_resizearr(d, dst, &farr, sizeof(upb_StringView)); \
} \
\
size = (uint8_t)ptr[tagbytes]; \
ptr += tagbytes + 1; \
dst->size = size; \
\
buf = d->arena.head.ptr; \
arena_has = _upb_ArenaHas(&d->arena); \
common_has = UPB_MIN(arena_has, \
upb_EpsCopyInputStream_BytesAvailable(&d->input, ptr)); \
\
if (UPB_LIKELY(size <= 15 - tagbytes)) { \
if (arena_has < 16) goto longstr; \
fastdecode_docopy(d, ptr - tagbytes - 1, size, 16, buf, tagbytes + 1, \
dst); \
} else if (UPB_LIKELY(size <= 32)) { \
if (UPB_UNLIKELY(common_has < 32)) goto longstr; \
fastdecode_docopy(d, ptr, size, 32, buf, 0, dst); \
} else if (UPB_LIKELY(size <= 64)) { \
if (UPB_UNLIKELY(common_has < 64)) goto longstr; \
fastdecode_docopy(d, ptr, size, 64, buf, 0, dst); \
} else if (UPB_LIKELY(size < 128)) { \
if (UPB_UNLIKELY(common_has < 128)) goto longstr; \
fastdecode_docopy(d, ptr, size, 128, buf, 0, dst); \
} else { \
goto longstr; \
} \
\
ptr += size; \
\
if (card == CARD_r) { \
if (validate_utf8 && \
!_upb_Decoder_VerifyUtf8Inline(dst->data, dst->size)) { \
_upb_FastDecoder_ErrorJmp(d, kUpb_DecodeStatus_BadUtf8); \
} \
fastdecode_nextret ret = fastdecode_nextrepeated( \
d, dst, &ptr, &farr, data, tagbytes, sizeof(upb_StringView)); \
switch (ret.next) { \
case FD_NEXT_SAMEFIELD: \
dst = ret.dst; \
goto again; \
case FD_NEXT_OTHERFIELD: \
data = ret.tag; \
UPB_MUSTTAIL return _upb_FastDecoder_TagDispatch(UPB_PARSE_ARGS); \
case FD_NEXT_ATLIMIT: \
return ptr; \
} \
} \
\
if (card != CARD_r && validate_utf8) { \
data = (uint64_t)dst; \
UPB_MUSTTAIL return fastdecode_verifyutf8(UPB_PARSE_ARGS); \
} \
\
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS); \
\
longstr: \
if (card == CARD_r) { \
fastdecode_commitarr(dst + 1, &farr, sizeof(upb_StringView)); \
} \
ptr--; \
if (validate_utf8) { \
UPB_MUSTTAIL return fastdecode_longstring_utf8(d, ptr, msg, table, \
hasbits, (uint64_t)dst); \
} else { \
UPB_MUSTTAIL return fastdecode_longstring_noutf8(d, ptr, msg, table, \
hasbits, (uint64_t)dst); \
}
#define FASTDECODE_STRING(d, ptr, msg, table, hasbits, data, tagbytes, card, \
copyfunc, validate_utf8) \
upb_StringView* dst; \
fastdecode_arr farr; \
int64_t size; \
\
if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) { \
RETURN_GENERIC("string field tag mismatch\n"); \
} \
\
if (UPB_UNLIKELY( \
!upb_EpsCopyInputStream_AliasingAvailable(&d->input, ptr, 0))) { \
UPB_MUSTTAIL return copyfunc(UPB_PARSE_ARGS); \
} \
\
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &farr, \
sizeof(upb_StringView), card); \
\
again: \
if (card == CARD_r) { \
dst = fastdecode_resizearr(d, dst, &farr, sizeof(upb_StringView)); \
} \
\
size = (int8_t)ptr[tagbytes]; \
ptr += tagbytes + 1; \
\
if (UPB_UNLIKELY( \
!upb_EpsCopyInputStream_AliasingAvailable(&d->input, ptr, size))) { \
ptr--; \
if (validate_utf8) { \
return fastdecode_longstring_utf8(d, ptr, msg, table, hasbits, \
(uint64_t)dst); \
} else { \
return fastdecode_longstring_noutf8(d, ptr, msg, table, hasbits, \
(uint64_t)dst); \
} \
} \
\
dst->data = ptr; \
dst->size = size; \
ptr = upb_EpsCopyInputStream_ReadStringAliased(&d->input, &dst->data, \
dst->size); \
\
if (card == CARD_r) { \
if (validate_utf8 && \
!_upb_Decoder_VerifyUtf8Inline(dst->data, dst->size)) { \
_upb_FastDecoder_ErrorJmp(d, kUpb_DecodeStatus_BadUtf8); \
} \
fastdecode_nextret ret = fastdecode_nextrepeated( \
d, dst, &ptr, &farr, data, tagbytes, sizeof(upb_StringView)); \
switch (ret.next) { \
case FD_NEXT_SAMEFIELD: \
dst = ret.dst; \
goto again; \
case FD_NEXT_OTHERFIELD: \
data = ret.tag; \
UPB_MUSTTAIL return _upb_FastDecoder_TagDispatch(UPB_PARSE_ARGS); \
case FD_NEXT_ATLIMIT: \
return ptr; \
} \
} \
\
if (card != CARD_r && validate_utf8) { \
data = (uint64_t)dst; \
UPB_MUSTTAIL return fastdecode_verifyutf8(UPB_PARSE_ARGS); \
} \
\
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
/* Generate all combinations:
* {p,c} x {s,o,r} x {s, b} x {1bt,2bt} */
#define s_VALIDATE true
#define b_VALIDATE false
#define F(card, tagbytes, type) \
UPB_NOINLINE \
const char* upb_c##card##type##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
FASTDECODE_COPYSTRING(d, ptr, msg, table, hasbits, data, tagbytes, \
CARD_##card, type##_VALIDATE); \
} \
const char* upb_p##card##type##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
FASTDECODE_STRING(d, ptr, msg, table, hasbits, data, tagbytes, \
CARD_##card, upb_c##card##type##_##tagbytes##bt, \
type##_VALIDATE); \
}
#define UTF8(card, tagbytes) \
F(card, tagbytes, s) \
F(card, tagbytes, b)
#define TAGBYTES(card) \
UTF8(card, 1) \
UTF8(card, 2)
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
#undef s_VALIDATE
#undef b_VALIDATE
#undef F
#undef TAGBYTES
#undef FASTDECODE_LONGSTRING
#undef FASTDECODE_COPYSTRING
#undef FASTDECODE_STRING
/* message fields *************************************************************/
UPB_INLINE
upb_Message* decode_newmsg_ceil(upb_Decoder* d, const upb_MiniTable* l,
int msg_ceil_bytes) {
size_t size = l->size + sizeof(upb_Message_Internal);
char* msg_data;
if (UPB_LIKELY(msg_ceil_bytes > 0 &&
_upb_ArenaHas(&d->arena) >= msg_ceil_bytes)) {
UPB_ASSERT(size <= (size_t)msg_ceil_bytes);
msg_data = d->arena.head.ptr;
d->arena.head.ptr += size;
UPB_UNPOISON_MEMORY_REGION(msg_data, msg_ceil_bytes);
memset(msg_data, 0, msg_ceil_bytes);
UPB_POISON_MEMORY_REGION(msg_data + size, msg_ceil_bytes - size);
} else {
msg_data = (char*)upb_Arena_Malloc(&d->arena, size);
memset(msg_data, 0, size);
}
return msg_data + sizeof(upb_Message_Internal);
}
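// decode_newmsg_ceil exploits the compile-time size ceiling baked into the
// specialized function name (e.g. the _max64b variants generated below): if
// the arena's current block has at least msg_ceil_bytes free, the new message
// is carved out with a simple pointer bump and a fixed-size memset; otherwise
// it falls back to upb_Arena_Malloc with the exact size.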
typedef struct {
intptr_t table;
upb_Message* msg;
} fastdecode_submsgdata;
UPB_FORCEINLINE
static const char* fastdecode_tosubmsg(upb_EpsCopyInputStream* e,
const char* ptr, void* ctx) {
upb_Decoder* d = (upb_Decoder*)e;
fastdecode_submsgdata* submsg = ctx;
ptr = fastdecode_dispatch(d, ptr, submsg->msg, submsg->table, 0, 0);
UPB_ASSUME(ptr != NULL);
return ptr;
}
#define FASTDECODE_SUBMSG(d, ptr, msg, table, hasbits, data, tagbytes, \
msg_ceil_bytes, card) \
\
if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) { \
RETURN_GENERIC("submessage field tag mismatch\n"); \
} \
\
if (--d->depth == 0) { \
_upb_FastDecoder_ErrorJmp(d, kUpb_DecodeStatus_MaxDepthExceeded); \
} \
\
upb_Message** dst; \
uint32_t submsg_idx = (data >> 16) & 0xff; \
const upb_MiniTable* tablep = decode_totablep(table); \
const upb_MiniTable* subtablep = \
upb_MiniTableSub_Message(tablep->subs[submsg_idx]); \
fastdecode_submsgdata submsg = {decode_totable(subtablep)}; \
fastdecode_arr farr; \
\
if (subtablep->table_mask == (uint8_t)-1) { \
d->depth++; \
RETURN_GENERIC("submessage doesn't have fast tables."); \
} \
\
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &farr, \
sizeof(upb_Message*), card); \
\
if (card == CARD_s) { \
*(uint32_t*)msg |= hasbits; \
hasbits = 0; \
} \
\
again: \
if (card == CARD_r) { \
dst = fastdecode_resizearr(d, dst, &farr, sizeof(upb_Message*)); \
} \
\
submsg.msg = *dst; \
\
if (card == CARD_r || UPB_LIKELY(!submsg.msg)) { \
*dst = submsg.msg = decode_newmsg_ceil(d, subtablep, msg_ceil_bytes); \
} \
\
ptr += tagbytes; \
ptr = fastdecode_delimited(d, ptr, fastdecode_tosubmsg, &submsg); \
\
if (UPB_UNLIKELY(ptr == NULL || d->end_group != DECODE_NOGROUP)) { \
_upb_FastDecoder_ErrorJmp(d, kUpb_DecodeStatus_Malformed); \
} \
\
if (card == CARD_r) { \
fastdecode_nextret ret = fastdecode_nextrepeated( \
d, dst, &ptr, &farr, data, tagbytes, sizeof(upb_Message*)); \
switch (ret.next) { \
case FD_NEXT_SAMEFIELD: \
dst = ret.dst; \
goto again; \
case FD_NEXT_OTHERFIELD: \
d->depth++; \
data = ret.tag; \
UPB_MUSTTAIL return _upb_FastDecoder_TagDispatch(UPB_PARSE_ARGS); \
case FD_NEXT_ATLIMIT: \
d->depth++; \
return ptr; \
} \
} \
\
d->depth++; \
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
#define F(card, tagbytes, size_ceil, ceil_arg) \
const char* upb_p##card##m_##tagbytes##bt_max##size_ceil##b( \
UPB_PARSE_PARAMS) { \
FASTDECODE_SUBMSG(d, ptr, msg, table, hasbits, data, tagbytes, ceil_arg, \
CARD_##card); \
}
#define SIZES(card, tagbytes) \
F(card, tagbytes, 64, 64) \
F(card, tagbytes, 128, 128) \
F(card, tagbytes, 192, 192) \
F(card, tagbytes, 256, 256) \
F(card, tagbytes, max, -1)
#define TAGBYTES(card) \
SIZES(card, 1) \
SIZES(card, 2)
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
#undef TAGBYTES
#undef SIZES
#undef F
#undef FASTDECODE_SUBMSG
#endif /* UPB_FASTTABLE */