Protocol Buffers - Google's data interchange format (grpc dependency) https://developers.google.com/protocol-buffers/

/*
* Copyright (c) 2009-2021, Google LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Google LLC nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Fast decoder: ~3x the speed of decode.c, but requires x86-64/ARM64.
// Also the table size grows by 2x.
//
// Could potentially be ported to other 64-bit archs that pass at least six
// arguments in registers and have 8 unused high bits in pointers.
//
// The overall design is to create specialized functions for every possible
// field type (e.g. oneof boolean field with a 1-byte tag) and then dispatch
// to the specialized function as quickly as possible.
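//
// As an illustrative reading of the generated names (see the F()/TYPES()
// macros later in this file), upb_psv4_1bt is the handler for a singular
// ("s") varint field stored in 4 bytes ("v4") whose tag fits in 1 byte
// ("1bt"), and upb_prz8_2bt handles a repeated ("r") zigzag 64-bit field
// ("z8") with a 2-byte tag ("2bt"). The hot loop is then just: load two tag
// bytes, look up the matching handler in the message's fasttable, and tail
// call it (fastdecode_tagdispatch() is defined elsewhere).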
#include "upb/decode_fast.h"
#include "upb/internal/decode.h"
/* Must be last. */
#include "upb/port_def.inc"
#if UPB_FASTTABLE
// The standard set of arguments passed to each parsing function.
// Thanks to x86-64 calling conventions, these will stay in registers.
#define UPB_PARSE_PARAMS \
upb_Decoder *d, const char *ptr, upb_Message *msg, intptr_t table, \
uint64_t hasbits, uint64_t data
#define UPB_PARSE_ARGS d, ptr, msg, table, hasbits, data
#define RETURN_GENERIC(m) \
/* Uncomment either of these for debugging purposes. */ \
/* fprintf(stderr, m); */ \
/*__builtin_trap(); */ \
return fastdecode_generic(d, ptr, msg, table, hasbits, 0);
typedef enum {
CARD_s = 0, /* Singular (optional, non-repeated) */
CARD_o = 1, /* Oneof */
CARD_r = 2, /* Repeated */
CARD_p = 3 /* Packed Repeated */
} upb_card;
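// For illustration (assuming ordinary protobuf field semantics, which are not
// restated in this file):
//
//   optional int32 a = 1;                   -> CARD_s
//   oneof kind { int32 b = 2; ... }         -> CARD_o (for b)
//   repeated int32 c = 3 [packed = false];  -> CARD_r
//   repeated int32 d = 4;                   -> CARD_p (proto3 packs scalar
//                                              numeric fields by default)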
UPB_NOINLINE
static const char* fastdecode_isdonefallback(UPB_PARSE_PARAMS) {
int overrun = data;
int status;
ptr = decode_isdonefallback_inl(d, ptr, overrun, &status);
if (ptr == NULL) {
return fastdecode_err(d, status);
}
data = fastdecode_loadtag(ptr);
UPB_MUSTTAIL return fastdecode_tagdispatch(UPB_PARSE_ARGS);
}
UPB_FORCEINLINE
static const char* fastdecode_dispatch(UPB_PARSE_PARAMS) {
if (UPB_UNLIKELY(ptr >= d->limit_ptr)) {
int overrun = ptr - d->end;
if (UPB_LIKELY(overrun == d->limit)) {
// Parse is finished.
*(uint32_t*)msg |= hasbits; // Sync hasbits.
const upb_MiniTable* l = decode_totablep(table);
return UPB_UNLIKELY(l->required_count)
? decode_checkrequired(d, ptr, msg, l)
: ptr;
} else {
data = overrun;
UPB_MUSTTAIL return fastdecode_isdonefallback(UPB_PARSE_ARGS);
}
}
// Read two bytes of tag data (for a one-byte tag, the high byte is junk).
data = fastdecode_loadtag(ptr);
UPB_MUSTTAIL return fastdecode_tagdispatch(UPB_PARSE_ARGS);
}
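// For reference, the tag layout this dispatcher consumes is the standard wire
// format: tag = (field_number << 3) | wire_type, encoded as a varint. Field 1
// with wire type VARINT encodes as the single byte 0x08 (handled by the *_1bt
// variants); field 16 encodes as 0x80 0x01 (handled by *_2bt). As noted
// above, fastdecode_loadtag() always reads two bytes, and for a one-byte tag
// fastdecode_checktag() below only inspects the low byte.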
UPB_FORCEINLINE
static bool fastdecode_checktag(uint16_t data, int tagbytes) {
if (tagbytes == 1) {
return (data & 0xff) == 0;
} else {
return data == 0;
}
}
UPB_FORCEINLINE
static const char* fastdecode_longsize(const char* ptr, int* size) {
int i;
UPB_ASSERT(*size & 0x80);
*size &= 0xff;
for (i = 0; i < 3; i++) {
ptr++;
size_t byte = (uint8_t)ptr[-1];
*size += (byte - 1) << (7 + 7 * i);
if (UPB_LIKELY((byte & 0x80) == 0)) return ptr;
}
ptr++;
size_t byte = (uint8_t)ptr[-1];
// len is limited by 2gb not 4gb, hence 8 and not 16 as normally expected
// for a 32 bit varint.
if (UPB_UNLIKELY(byte >= 8)) return NULL;
*size += (byte - 1) << 28;
return ptr;
}
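// Worked example of the "(byte - 1) << shift" accumulation above
// (illustrative): the continuation bit (0x80) of the previous byte is left in
// the running total, and subtracting 1 before shifting cancels it, since
// 0x80 == 1 << 7. A length of 150 arrives as 0x96 0x01:
//
//   after byte 0: *size  = 0x96              = 150  (0x80 bit still included)
//   after byte 1: *size += (0x01 - 1) << 7   = 150  (the -1 cancels the 0x80)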
UPB_FORCEINLINE
static bool fastdecode_boundscheck(const char* ptr, size_t len,
const char* end) {
uintptr_t uptr = (uintptr_t)ptr;
uintptr_t uend = (uintptr_t)end + 16;
uintptr_t res = uptr + len;
return res < uptr || res > uend;
}
UPB_FORCEINLINE
static bool fastdecode_boundscheck2(const char* ptr, size_t len,
const char* end) {
// This is one extra branch compared to the more normal:
// return (size_t)(end - ptr) < size;
// However it is one less computation if we are just about to use "ptr + len":
// https://godbolt.org/z/35YGPz
// In microbenchmarks this shows an overall 4% improvement.
uintptr_t uptr = (uintptr_t)ptr;
uintptr_t uend = (uintptr_t)end;
uintptr_t res = uptr + len;
return res < uptr || res > uend;
}
typedef const char* fastdecode_delimfunc(upb_Decoder* d, const char* ptr,
void* ctx);
UPB_FORCEINLINE
static const char* fastdecode_delimited(upb_Decoder* d, const char* ptr,
fastdecode_delimfunc* func, void* ctx) {
ptr++;
int len = (int8_t)ptr[-1];
if (fastdecode_boundscheck2(ptr, len, d->limit_ptr)) {
// Slow case: Sub-message is >=128 bytes and/or exceeds the current buffer.
// If it exceeds the buffer limit, limit/limit_ptr will change during
// sub-message parsing, so we need to preserve delta, not limit.
if (UPB_UNLIKELY(len & 0x80)) {
// Size varint >1 byte (length >= 128).
ptr = fastdecode_longsize(ptr, &len);
if (!ptr) {
// Corrupt wire format: size exceeded INT_MAX.
return NULL;
}
}
if (ptr - d->end + (int)len > d->limit) {
// Corrupt wire format: invalid limit.
return NULL;
}
int delta = decode_pushlimit(d, ptr, len);
ptr = func(d, ptr, ctx);
decode_poplimit(d, ptr, delta);
} else {
// Fast case: Sub-message is <128 bytes and fits in the current buffer.
// This means we can preserve limit/limit_ptr verbatim.
const char* saved_limit_ptr = d->limit_ptr;
int saved_limit = d->limit;
d->limit_ptr = ptr + len;
d->limit = d->limit_ptr - d->end;
UPB_ASSERT(d->limit_ptr == d->end + UPB_MIN(0, d->limit));
ptr = func(d, ptr, ctx);
d->limit_ptr = saved_limit_ptr;
d->limit = saved_limit;
UPB_ASSERT(d->limit_ptr == d->end + UPB_MIN(0, d->limit));
}
return ptr;
}
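// Wire layout handled above (standard length-delimited encoding, shown for
// illustration): one tag byte, a varint length, then `length` payload bytes.
//
//   0x12 0x03 0x08 0x96 0x01
//    tag  len  `-- 3-byte sub-message payload --'
//
// Since the length here is < 128 and the payload sits inside the current
// buffer, this example would take the fast path that saves and restores
// limit/limit_ptr directly instead of calling decode_pushlimit().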
/* singular, oneof, repeated field handling ***********************************/
typedef struct {
upb_Array* arr;
void* end;
} fastdecode_arr;
typedef enum {
FD_NEXT_ATLIMIT,
FD_NEXT_SAMEFIELD,
FD_NEXT_OTHERFIELD
} fastdecode_next;
typedef struct {
void* dst;
fastdecode_next next;
uint32_t tag;
} fastdecode_nextret;
UPB_FORCEINLINE
static void* fastdecode_resizearr(upb_Decoder* d, void* dst,
fastdecode_arr* farr, int valbytes) {
if (UPB_UNLIKELY(dst == farr->end)) {
size_t old_size = farr->arr->size;
size_t old_bytes = old_size * valbytes;
size_t new_size = old_size * 2;
size_t new_bytes = new_size * valbytes;
char* old_ptr = _upb_array_ptr(farr->arr);
char* new_ptr = upb_Arena_Realloc(&d->arena, old_ptr, old_bytes, new_bytes);
uint8_t elem_size_lg2 = __builtin_ctz(valbytes);
farr->arr->size = new_size;
farr->arr->data = _upb_array_tagptr(new_ptr, elem_size_lg2);
dst = (void*)(new_ptr + (old_size * valbytes));
farr->end = (void*)(new_ptr + (new_size * valbytes));
}
return dst;
}
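// Note on growth (derived from the code): repeated-field arrays are created
// with capacity 8 in fastdecode_getfield() below, so a long run of elements
// reallocates at 8 -> 16 -> 32 -> ... entries, always out of the decoder's
// arena.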
UPB_FORCEINLINE
static bool fastdecode_tagmatch(uint32_t tag, uint64_t data, int tagbytes) {
if (tagbytes == 1) {
return (uint8_t)tag == (uint8_t)data;
} else {
return (uint16_t)tag == (uint16_t)data;
}
}
UPB_FORCEINLINE
static void fastdecode_commitarr(void* dst, fastdecode_arr* farr,
int valbytes) {
farr->arr->len =
(size_t)((char*)dst - (char*)_upb_array_ptr(farr->arr)) / valbytes;
}
UPB_FORCEINLINE
static fastdecode_nextret fastdecode_nextrepeated(upb_Decoder* d, void* dst,
const char** ptr,
fastdecode_arr* farr,
uint64_t data, int tagbytes,
int valbytes) {
fastdecode_nextret ret;
dst = (char*)dst + valbytes;
if (UPB_LIKELY(!decode_isdone(d, ptr))) {
ret.tag = fastdecode_loadtag(*ptr);
if (fastdecode_tagmatch(ret.tag, data, tagbytes)) {
ret.next = FD_NEXT_SAMEFIELD;
} else {
fastdecode_commitarr(dst, farr, valbytes);
ret.next = FD_NEXT_OTHERFIELD;
}
} else {
fastdecode_commitarr(dst, farr, valbytes);
ret.next = FD_NEXT_ATLIMIT;
}
ret.dst = dst;
return ret;
}
UPB_FORCEINLINE
static void* fastdecode_fieldmem(upb_Message* msg, uint64_t data) {
size_t ofs = data >> 48;
return (char*)msg + ofs;
}
UPB_FORCEINLINE
static void* fastdecode_getfield(upb_Decoder* d, const char* ptr,
upb_Message* msg, uint64_t* data,
uint64_t* hasbits, fastdecode_arr* farr,
int valbytes, upb_card card) {
switch (card) {
case CARD_s: {
uint8_t hasbit_index = *data >> 24;
// Set hasbit and return pointer to scalar field.
*hasbits |= 1ull << hasbit_index;
return fastdecode_fieldmem(msg, *data);
}
case CARD_o: {
uint16_t case_ofs = *data >> 32;
uint32_t* oneof_case = UPB_PTR_AT(msg, case_ofs, uint32_t);
uint8_t field_number = *data >> 24;
*oneof_case = field_number;
return fastdecode_fieldmem(msg, *data);
}
case CARD_r: {
// Get pointer to upb_Array and allocate/expand if necessary.
uint8_t elem_size_lg2 = __builtin_ctz(valbytes);
upb_Array** arr_p = fastdecode_fieldmem(msg, *data);
char* begin;
*(uint32_t*)msg |= *hasbits;
*hasbits = 0;
if (UPB_LIKELY(!*arr_p)) {
farr->arr = _upb_Array_New(&d->arena, 8, elem_size_lg2);
*arr_p = farr->arr;
} else {
farr->arr = *arr_p;
}
begin = _upb_array_ptr(farr->arr);
farr->end = begin + (farr->arr->size * valbytes);
*data = fastdecode_loadtag(ptr);
return begin + (farr->arr->len * valbytes);
}
default:
UPB_UNREACHABLE();
}
}
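// Layout of the 64-bit `data` word as consumed above (read off the code; the
// table entries themselves are built elsewhere):
//
//   bits 48..63  offset of the field (or array pointer) inside the message
//   bits 32..47  offset of the oneof case word        (CARD_o only)
//   bits 24..31  hasbit index (CARD_s) / oneof number (CARD_o)
//   bits  0..15  tag check bits; fastdecode_checktag() expects these to be
//                zero when the incoming tag matches
//
// For CARD_r the function also reloads *data with the raw tag bytes so that
// fastdecode_nextrepeated() can detect further elements of the same field.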
UPB_FORCEINLINE
static bool fastdecode_flippacked(uint64_t* data, int tagbytes) {
*data ^= (0x2 ^ 0x0); // Patch data to match packed wiretype.
return fastdecode_checktag(*data, tagbytes);
}
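// Example of the wire-type flip above (illustrative): repeated int32 field
// number 1 uses tag 0x08 when unpacked (wire type 0, VARINT) and tag 0x0A
// when packed (wire type 2, LEN). XOR-ing with 0x2 toggles between the two,
// so a repeated field that arrives in the other encoding can be retried
// against its packed/unpacked counterpart via FASTDECODE_CHECKPACKED below.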
#define FASTDECODE_CHECKPACKED(tagbytes, card, func) \
if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) { \
if (card == CARD_r && fastdecode_flippacked(&data, tagbytes)) { \
UPB_MUSTTAIL return func(UPB_PARSE_ARGS); \
} \
RETURN_GENERIC("packed check tag mismatch\n"); \
}
/* varint fields **************************************************************/
UPB_FORCEINLINE
static uint64_t fastdecode_munge(uint64_t val, int valbytes, bool zigzag) {
if (valbytes == 1) {
return val != 0;
} else if (zigzag) {
if (valbytes == 4) {
uint32_t n = val;
return (n >> 1) ^ -(int32_t)(n & 1);
} else if (valbytes == 8) {
return (val >> 1) ^ -(int64_t)(val & 1);
}
UPB_UNREACHABLE();
}
return val;
}
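// ZigZag reference values for the munge above (standard protobuf encoding):
// encoded 0 -> 0, 1 -> -1, 2 -> 1, 3 -> -2, 4 -> 2; i.e. (n >> 1) ^ -(n & 1)
// maps small magnitudes of either sign to small varints. valbytes == 1 is the
// bool case, which only normalizes the value to 0 or 1.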
UPB_FORCEINLINE
static const char* fastdecode_varint64(const char* ptr, uint64_t* val) {
ptr++;
*val = (uint8_t)ptr[-1];
if (UPB_UNLIKELY(*val & 0x80)) {
int i;
for (i = 0; i < 8; i++) {
ptr++;
uint64_t byte = (uint8_t)ptr[-1];
*val += (byte - 1) << (7 + 7 * i);
if (UPB_LIKELY((byte & 0x80) == 0)) goto done;
}
ptr++;
uint64_t byte = (uint8_t)ptr[-1];
if (byte > 1) {
return NULL;
}
*val += (byte - 1) << 63;
}
done:
UPB_ASSUME(ptr != NULL);
return ptr;
}
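// The tenth (final) byte of a 64-bit varint can only contribute bit 63, so
// any value other than 0 or 1 is rejected above as an overflow. For example,
// UINT64_MAX is encoded as nine 0xff bytes followed by 0x01.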
#define FASTDECODE_UNPACKEDVARINT(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, zigzag, packed) \
uint64_t val; \
void* dst; \
fastdecode_arr farr; \
\
FASTDECODE_CHECKPACKED(tagbytes, card, packed); \
\
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &farr, valbytes, \
card); \
if (card == CARD_r) { \
if (UPB_UNLIKELY(!dst)) { \
RETURN_GENERIC("need array resize\n"); \
} \
} \
\
again: \
if (card == CARD_r) { \
dst = fastdecode_resizearr(d, dst, &farr, valbytes); \
} \
\
ptr += tagbytes; \
ptr = fastdecode_varint64(ptr, &val); \
if (ptr == NULL) return fastdecode_err(d, kUpb_DecodeStatus_Malformed); \
val = fastdecode_munge(val, valbytes, zigzag); \
memcpy(dst, &val, valbytes); \
\
if (card == CARD_r) { \
fastdecode_nextret ret = fastdecode_nextrepeated( \
d, dst, &ptr, &farr, data, tagbytes, valbytes); \
switch (ret.next) { \
case FD_NEXT_SAMEFIELD: \
dst = ret.dst; \
goto again; \
case FD_NEXT_OTHERFIELD: \
data = ret.tag; \
UPB_MUSTTAIL return fastdecode_tagdispatch(UPB_PARSE_ARGS); \
case FD_NEXT_ATLIMIT: \
return ptr; \
} \
} \
\
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
typedef struct {
uint8_t valbytes;
bool zigzag;
void* dst;
fastdecode_arr farr;
} fastdecode_varintdata;
UPB_FORCEINLINE
static const char* fastdecode_topackedvarint(upb_Decoder* d, const char* ptr,
void* ctx) {
fastdecode_varintdata* data = ctx;
void* dst = data->dst;
uint64_t val;
while (!decode_isdone(d, &ptr)) {
dst = fastdecode_resizearr(d, dst, &data->farr, data->valbytes);
ptr = fastdecode_varint64(ptr, &val);
if (ptr == NULL) return NULL;
val = fastdecode_munge(val, data->valbytes, data->zigzag);
memcpy(dst, &val, data->valbytes);
dst = (char*)dst + data->valbytes;
}
fastdecode_commitarr(dst, &data->farr, data->valbytes);
return ptr;
}
#define FASTDECODE_PACKEDVARINT(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, zigzag, unpacked) \
fastdecode_varintdata ctx = {valbytes, zigzag}; \
\
FASTDECODE_CHECKPACKED(tagbytes, CARD_r, unpacked); \
\
ctx.dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &ctx.farr, \
valbytes, CARD_r); \
if (UPB_UNLIKELY(!ctx.dst)) { \
RETURN_GENERIC("need array resize\n"); \
} \
\
ptr += tagbytes; \
ptr = fastdecode_delimited(d, ptr, &fastdecode_topackedvarint, &ctx); \
\
if (UPB_UNLIKELY(ptr == NULL)) { \
return fastdecode_err(d, kUpb_DecodeStatus_Malformed); \
} \
\
UPB_MUSTTAIL return fastdecode_dispatch(d, ptr, msg, table, hasbits, 0);
#define FASTDECODE_VARINT(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, zigzag, unpacked, packed) \
if (card == CARD_p) { \
FASTDECODE_PACKEDVARINT(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, zigzag, unpacked); \
} else { \
FASTDECODE_UNPACKEDVARINT(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, zigzag, packed); \
}
#define z_ZZ true
#define b_ZZ false
#define v_ZZ false
/* Generate all combinations:
* {s,o,r,p} x {b1,v4,z4,v8,z8} x {1bt,2bt} */
#define F(card, type, valbytes, tagbytes) \
UPB_NOINLINE \
const char* upb_p##card##type##valbytes##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
FASTDECODE_VARINT(d, ptr, msg, table, hasbits, data, tagbytes, valbytes, \
CARD_##card, type##_ZZ, \
upb_pr##type##valbytes##_##tagbytes##bt, \
upb_pp##type##valbytes##_##tagbytes##bt); \
}
#define TYPES(card, tagbytes) \
F(card, b, 1, tagbytes) \
F(card, v, 4, tagbytes) \
F(card, v, 8, tagbytes) \
F(card, z, 4, tagbytes) \
F(card, z, 8, tagbytes)
#define TAGBYTES(card) \
TYPES(card, 1) \
TYPES(card, 2)
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
TAGBYTES(p)
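// As a concrete example, F(s, v, 4, 1) above expands (roughly) to:
//
//   UPB_NOINLINE
//   const char* upb_psv4_1bt(UPB_PARSE_PARAMS) {
//     FASTDECODE_VARINT(d, ptr, msg, table, hasbits, data, /*tagbytes=*/1,
//                       /*valbytes=*/4, CARD_s, /*zigzag=*/false,
//                       upb_prv4_1bt, upb_ppv4_1bt);
//   }
//
// i.e. a singular 4-byte varint field with a 1-byte tag. The last two
// arguments name the repeated and packed counterparts that
// FASTDECODE_CHECKPACKED can bounce to when a repeated field shows up in the
// other encoding.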
#undef z_ZZ
#undef b_ZZ
#undef v_ZZ
#undef o_ONEOF
#undef s_ONEOF
#undef r_ONEOF
#undef F
#undef TYPES
#undef TAGBYTES
#undef FASTDECODE_UNPACKEDVARINT
#undef FASTDECODE_PACKEDVARINT
#undef FASTDECODE_VARINT
/* fixed fields ***************************************************************/
#define FASTDECODE_UNPACKEDFIXED(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, packed) \
void* dst; \
fastdecode_arr farr; \
\
FASTDECODE_CHECKPACKED(tagbytes, card, packed) \
\
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &farr, valbytes, \
card); \
if (card == CARD_r) { \
if (UPB_UNLIKELY(!dst)) { \
RETURN_GENERIC("couldn't allocate array in arena\n"); \
} \
} \
\
again: \
if (card == CARD_r) { \
dst = fastdecode_resizearr(d, dst, &farr, valbytes); \
} \
\
ptr += tagbytes; \
memcpy(dst, ptr, valbytes); \
ptr += valbytes; \
\
if (card == CARD_r) { \
fastdecode_nextret ret = fastdecode_nextrepeated( \
d, dst, &ptr, &farr, data, tagbytes, valbytes); \
switch (ret.next) { \
case FD_NEXT_SAMEFIELD: \
dst = ret.dst; \
goto again; \
case FD_NEXT_OTHERFIELD: \
data = ret.tag; \
UPB_MUSTTAIL return fastdecode_tagdispatch(UPB_PARSE_ARGS); \
case FD_NEXT_ATLIMIT: \
return ptr; \
} \
} \
\
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
#define FASTDECODE_PACKEDFIXED(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, unpacked) \
FASTDECODE_CHECKPACKED(tagbytes, CARD_r, unpacked) \
\
ptr += tagbytes; \
int size = (uint8_t)ptr[0]; \
ptr++; \
if (size & 0x80) { \
ptr = fastdecode_longsize(ptr, &size); \
} \
\
if (UPB_UNLIKELY(fastdecode_boundscheck(ptr, size, d->limit_ptr) || \
(size % valbytes) != 0)) { \
return fastdecode_err(d, kUpb_DecodeStatus_Malformed); \
} \
\
upb_Array** arr_p = fastdecode_fieldmem(msg, data); \
upb_Array* arr = *arr_p; \
uint8_t elem_size_lg2 = __builtin_ctz(valbytes); \
int elems = size / valbytes; \
\
if (UPB_LIKELY(!arr)) { \
*arr_p = arr = _upb_Array_New(&d->arena, elems, elem_size_lg2); \
if (!arr) { \
return fastdecode_err(d, kUpb_DecodeStatus_Malformed); \
} \
} else { \
_upb_Array_Resize(arr, elems, &d->arena); \
} \
\
char* dst = _upb_array_ptr(arr); \
memcpy(dst, ptr, size); \
arr->len = elems; \
\
ptr += size; \
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
#define FASTDECODE_FIXED(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, unpacked, packed) \
if (card == CARD_p) { \
FASTDECODE_PACKEDFIXED(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, unpacked); \
} else { \
FASTDECODE_UNPACKEDFIXED(d, ptr, msg, table, hasbits, data, tagbytes, \
valbytes, card, packed); \
}
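// Packed fixed example for the macros above (illustrative): a packed repeated
// fixed32 field number 2 with three elements arrives as
//
//   0x12 0x0c <12 bytes of little-endian fixed32 data>
//
// and FASTDECODE_PACKEDFIXED memcpy()s the whole 12-byte payload into the
// array after checking that the size is a multiple of valbytes (4 here).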
/* Generate all combinations:
* {s,o,r,p} x {f4,f8} x {1bt,2bt} */
#define F(card, valbytes, tagbytes) \
UPB_NOINLINE \
const char* upb_p##card##f##valbytes##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
FASTDECODE_FIXED(d, ptr, msg, table, hasbits, data, tagbytes, valbytes, \
CARD_##card, upb_ppf##valbytes##_##tagbytes##bt, \
upb_prf##valbytes##_##tagbytes##bt); \
}
#define TYPES(card, tagbytes) \
F(card, 4, tagbytes) \
F(card, 8, tagbytes)
#define TAGBYTES(card) \
TYPES(card, 1) \
TYPES(card, 2)
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
TAGBYTES(p)
#undef F
#undef TYPES
#undef TAGBYTES
#undef FASTDECODE_UNPACKEDFIXED
#undef FASTDECODE_PACKEDFIXED
/* string fields **************************************************************/
typedef const char* fastdecode_copystr_func(struct upb_Decoder* d,
const char* ptr, upb_Message* msg,
const upb_MiniTable* table,
uint64_t hasbits,
upb_StringView* dst);
UPB_NOINLINE
static const char* fastdecode_verifyutf8(upb_Decoder* d, const char* ptr,
upb_Message* msg, intptr_t table,
uint64_t hasbits, uint64_t data) {
upb_StringView* dst = (upb_StringView*)data;
if (!decode_verifyutf8_inl(dst->data, dst->size)) {
return fastdecode_err(d, kUpb_DecodeStatus_BadUtf8);
}
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
}
#define FASTDECODE_LONGSTRING(d, ptr, msg, table, hasbits, dst, validate_utf8) \
int size = (uint8_t)ptr[0]; /* Could plumb through hasbits. */ \
ptr++; \
if (size & 0x80) { \
ptr = fastdecode_longsize(ptr, &size); \
} \
\
if (UPB_UNLIKELY(fastdecode_boundscheck(ptr, size, d->limit_ptr))) { \
dst->size = 0; \
return fastdecode_err(d, kUpb_DecodeStatus_Malformed); \
} \
\
if (d->options & kUpb_DecodeOption_AliasString) { \
dst->data = ptr; \
dst->size = size; \
} else { \
char* data = upb_Arena_Malloc(&d->arena, size); \
if (!data) { \
return fastdecode_err(d, kUpb_DecodeStatus_OutOfMemory); \
} \
memcpy(data, ptr, size); \
dst->data = data; \
dst->size = size; \
} \
\
ptr += size; \
if (validate_utf8) { \
data = (uint64_t)dst; \
UPB_MUSTTAIL return fastdecode_verifyutf8(UPB_PARSE_ARGS); \
} else { \
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS); \
}
UPB_NOINLINE
static const char* fastdecode_longstring_utf8(struct upb_Decoder* d,
const char* ptr, upb_Message* msg,
intptr_t table, uint64_t hasbits,
uint64_t data) {
upb_StringView* dst = (upb_StringView*)data;
FASTDECODE_LONGSTRING(d, ptr, msg, table, hasbits, dst, true);
}
UPB_NOINLINE
static const char* fastdecode_longstring_noutf8(
struct upb_Decoder* d, const char* ptr, upb_Message* msg, intptr_t table,
uint64_t hasbits, uint64_t data) {
upb_StringView* dst = (upb_StringView*)data;
FASTDECODE_LONGSTRING(d, ptr, msg, table, hasbits, dst, false);
}
UPB_FORCEINLINE
static void fastdecode_docopy(upb_Decoder* d, const char* ptr, uint32_t size,
int copy, char* data, upb_StringView* dst) {
d->arena.head.ptr += copy;
dst->data = data;
UPB_UNPOISON_MEMORY_REGION(data, copy);
memcpy(data, ptr, copy);
UPB_POISON_MEMORY_REGION(data + size, copy - size);
}
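// Copy-size rounding used by FASTDECODE_COPYSTRING below (illustrative): a
// 20-byte string is copied with a single fixed-size 32-byte memcpy when both
// the input buffer and the arena have at least 32 bytes of slack, and the 12
// trailing bytes are re-poisoned for ASAN. This trades a little arena waste
// for memcpy calls whose lengths are compile-time constants.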
#define FASTDECODE_COPYSTRING(d, ptr, msg, table, hasbits, data, tagbytes, \
card, validate_utf8) \
upb_StringView* dst; \
fastdecode_arr farr; \
int64_t size; \
size_t arena_has; \
size_t common_has; \
char* buf; \
\
UPB_ASSERT((d->options & kUpb_DecodeOption_AliasString) == 0); \
UPB_ASSERT(fastdecode_checktag(data, tagbytes)); \
\
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &farr, \
sizeof(upb_StringView), card); \
\
again: \
if (card == CARD_r) { \
dst = fastdecode_resizearr(d, dst, &farr, sizeof(upb_StringView)); \
} \
\
size = (uint8_t)ptr[tagbytes]; \
ptr += tagbytes + 1; \
dst->size = size; \
\
buf = d->arena.head.ptr; \
arena_has = _upb_ArenaHas(&d->arena); \
common_has = UPB_MIN(arena_has, (d->end - ptr) + 16); \
\
if (UPB_LIKELY(size <= 15 - tagbytes)) { \
if (arena_has < 16) goto longstr; \
d->arena.head.ptr += 16; \
memcpy(buf, ptr - tagbytes - 1, 16); \
dst->data = buf + tagbytes + 1; \
} else if (UPB_LIKELY(size <= 32)) { \
if (UPB_UNLIKELY(common_has < 32)) goto longstr; \
fastdecode_docopy(d, ptr, size, 32, buf, dst); \
} else if (UPB_LIKELY(size <= 64)) { \
if (UPB_UNLIKELY(common_has < 64)) goto longstr; \
fastdecode_docopy(d, ptr, size, 64, buf, dst); \
} else if (UPB_LIKELY(size < 128)) { \
if (UPB_UNLIKELY(common_has < 128)) goto longstr; \
fastdecode_docopy(d, ptr, size, 128, buf, dst); \
} else { \
goto longstr; \
} \
\
ptr += size; \
\
if (card == CARD_r) { \
if (validate_utf8 && !decode_verifyutf8_inl(dst->data, dst->size)) { \
return fastdecode_err(d, kUpb_DecodeStatus_BadUtf8); \
} \
fastdecode_nextret ret = fastdecode_nextrepeated( \
d, dst, &ptr, &farr, data, tagbytes, sizeof(upb_StringView)); \
switch (ret.next) { \
case FD_NEXT_SAMEFIELD: \
dst = ret.dst; \
goto again; \
case FD_NEXT_OTHERFIELD: \
data = ret.tag; \
UPB_MUSTTAIL return fastdecode_tagdispatch(UPB_PARSE_ARGS); \
case FD_NEXT_ATLIMIT: \
return ptr; \
} \
} \
\
if (card != CARD_r && validate_utf8) { \
data = (uint64_t)dst; \
UPB_MUSTTAIL return fastdecode_verifyutf8(UPB_PARSE_ARGS); \
} \
\
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS); \
\
longstr: \
if (card == CARD_r) { \
fastdecode_commitarr(dst + 1, &farr, sizeof(upb_StringView)); \
} \
ptr--; \
if (validate_utf8) { \
UPB_MUSTTAIL return fastdecode_longstring_utf8(d, ptr, msg, table, \
hasbits, (uint64_t)dst); \
} else { \
UPB_MUSTTAIL return fastdecode_longstring_noutf8(d, ptr, msg, table, \
hasbits, (uint64_t)dst); \
}
#define FASTDECODE_STRING(d, ptr, msg, table, hasbits, data, tagbytes, card, \
copyfunc, validate_utf8) \
upb_StringView* dst; \
fastdecode_arr farr; \
int64_t size; \
\
if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) { \
RETURN_GENERIC("string field tag mismatch\n"); \
} \
\
if (UPB_UNLIKELY((d->options & kUpb_DecodeOption_AliasString) == 0)) { \
UPB_MUSTTAIL return copyfunc(UPB_PARSE_ARGS); \
} \
\
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &farr, \
sizeof(upb_StringView), card); \
\
again: \
if (card == CARD_r) { \
dst = fastdecode_resizearr(d, dst, &farr, sizeof(upb_StringView)); \
} \
\
size = (int8_t)ptr[tagbytes]; \
ptr += tagbytes + 1; \
dst->data = ptr; \
dst->size = size; \
\
if (UPB_UNLIKELY(fastdecode_boundscheck(ptr, size, d->end))) { \
ptr--; \
if (validate_utf8) { \
return fastdecode_longstring_utf8(d, ptr, msg, table, hasbits, \
(uint64_t)dst); \
} else { \
return fastdecode_longstring_noutf8(d, ptr, msg, table, hasbits, \
(uint64_t)dst); \
} \
} \
\
ptr += size; \
\
if (card == CARD_r) { \
if (validate_utf8 && !decode_verifyutf8_inl(dst->data, dst->size)) { \
return fastdecode_err(d, kUpb_DecodeStatus_BadUtf8); \
} \
fastdecode_nextret ret = fastdecode_nextrepeated( \
d, dst, &ptr, &farr, data, tagbytes, sizeof(upb_StringView)); \
switch (ret.next) { \
case FD_NEXT_SAMEFIELD: \
dst = ret.dst; \
if (UPB_UNLIKELY((d->options & kUpb_DecodeOption_AliasString) == 0)) { \
/* Buffer flipped and we can't alias any more. Bounce to */ \
/* copyfunc(), but via dispatch since we need to reload table */ \
/* data also. */ \
fastdecode_commitarr(dst, &farr, sizeof(upb_StringView)); \
data = ret.tag; \
UPB_MUSTTAIL return fastdecode_tagdispatch(UPB_PARSE_ARGS); \
} \
goto again; \
case FD_NEXT_OTHERFIELD: \
data = ret.tag; \
UPB_MUSTTAIL return fastdecode_tagdispatch(UPB_PARSE_ARGS); \
case FD_NEXT_ATLIMIT: \
return ptr; \
} \
} \
\
if (card != CARD_r && validate_utf8) { \
data = (uint64_t)dst; \
UPB_MUSTTAIL return fastdecode_verifyutf8(UPB_PARSE_ARGS); \
} \
\
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
/* Generate all combinations:
* {p,c} x {s,o,r} x {s, b} x {1bt,2bt} */
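/* For example, F(s, 1, s) below expands to the pair upb_css_1bt() and
 * upb_pss_1bt(): the copy and primary variants of a singular string field
 * with UTF-8 validation and a 1-byte tag, while F(r, 2, b) yields
 * upb_crb_2bt()/upb_prb_2bt() for a repeated bytes field with a 2-byte tag. */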
#define s_VALIDATE true
#define b_VALIDATE false
#define F(card, tagbytes, type) \
UPB_NOINLINE \
const char* upb_c##card##type##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
FASTDECODE_COPYSTRING(d, ptr, msg, table, hasbits, data, tagbytes, \
CARD_##card, type##_VALIDATE); \
} \
const char* upb_p##card##type##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
FASTDECODE_STRING(d, ptr, msg, table, hasbits, data, tagbytes, \
CARD_##card, upb_c##card##type##_##tagbytes##bt, \
type##_VALIDATE); \
}
#define UTF8(card, tagbytes) \
F(card, tagbytes, s) \
F(card, tagbytes, b)
#define TAGBYTES(card) \
UTF8(card, 1) \
UTF8(card, 2)
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
#undef s_VALIDATE
#undef b_VALIDATE
#undef F
#undef TAGBYTES
#undef FASTDECODE_LONGSTRING
#undef FASTDECODE_COPYSTRING
#undef FASTDECODE_STRING
/* message fields *************************************************************/
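/* Allocates and zero-initializes a new submessage (layout `l` plus the
 * upb_Message_Internal header).  When msg_ceil_bytes is a positive
 * compile-time constant and the arena's current block has that much room,
 * we bump-allocate from the arena head and memset() the whole constant-size
 * ceiling, which the compiler can lower to a fixed-size store sequence; the
 * bytes past the real size are re-poisoned for ASAN.  Otherwise we fall
 * back to an exact-size upb_Arena_Malloc(). */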
UPB_INLINE
upb_Message* decode_newmsg_ceil(upb_Decoder* d, const upb_MiniTable* l,
int msg_ceil_bytes) {
size_t size = l->size + sizeof(upb_Message_Internal);
char* msg_data;
if (UPB_LIKELY(msg_ceil_bytes > 0 &&
_upb_ArenaHas(&d->arena) >= msg_ceil_bytes)) {
UPB_ASSERT(size <= (size_t)msg_ceil_bytes);
msg_data = d->arena.head.ptr;
d->arena.head.ptr += size;
UPB_UNPOISON_MEMORY_REGION(msg_data, msg_ceil_bytes);
memset(msg_data, 0, msg_ceil_bytes);
UPB_POISON_MEMORY_REGION(msg_data + size, msg_ceil_bytes - size);
} else {
msg_data = (char*)upb_Arena_Malloc(&d->arena, size);
memset(msg_data, 0, size);
}
return msg_data + sizeof(upb_Message_Internal);
}
typedef struct {
intptr_t table;
upb_Message* msg;
} fastdecode_submsgdata;
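/* Parser callback handed to fastdecode_delimited() in FASTDECODE_SUBMSG:
 * decodes the body of a length-delimited submessage by re-entering the
 * fast-table dispatch loop with the submessage's own table and message. */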
UPB_FORCEINLINE
static const char* fastdecode_tosubmsg(upb_Decoder* d, const char* ptr,
void* ctx) {
fastdecode_submsgdata* submsg = ctx;
ptr = fastdecode_dispatch(d, ptr, submsg->msg, submsg->table, 0, 0);
UPB_ASSUME(ptr != NULL);
return ptr;
}
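/* Parses a submessage field (singular, oneof, or repeated).  Falls back to
 * the generic decoder on a tag mismatch or when the submessage type has no
 * fast table; errors out if the depth limit is exceeded.  A new submessage
 * is allocated with decode_newmsg_ceil() when needed, then its contents are
 * parsed via fastdecode_delimited()/fastdecode_tosubmsg(). */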
#define FASTDECODE_SUBMSG(d, ptr, msg, table, hasbits, data, tagbytes, \
msg_ceil_bytes, card) \
\
if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) { \
RETURN_GENERIC("submessage field tag mismatch\n"); \
} \
\
if (--d->depth == 0) { \
return fastdecode_err(d, kUpb_DecodeStatus_MaxDepthExceeded); \
} \
\
upb_Message** dst; \
uint32_t submsg_idx = (data >> 16) & 0xff; \
const upb_MiniTable* tablep = decode_totablep(table); \
const upb_MiniTable* subtablep = tablep->subs[submsg_idx].submsg; \
fastdecode_submsgdata submsg = {decode_totable(subtablep)}; \
fastdecode_arr farr; \
\
if (subtablep->table_mask == (uint8_t)-1) { \
RETURN_GENERIC("submessage doesn't have fast tables."); \
} \
\
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, &farr, \
sizeof(upb_Message*), card); \
\
if (card == CARD_s) { \
*(uint32_t*)msg |= hasbits; \
hasbits = 0; \
} \
\
again: \
if (card == CARD_r) { \
dst = fastdecode_resizearr(d, dst, &farr, sizeof(upb_Message*)); \
} \
\
submsg.msg = *dst; \
\
if (card == CARD_r || UPB_LIKELY(!submsg.msg)) { \
*dst = submsg.msg = decode_newmsg_ceil(d, subtablep, msg_ceil_bytes); \
} \
\
ptr += tagbytes; \
ptr = fastdecode_delimited(d, ptr, fastdecode_tosubmsg, &submsg); \
\
if (UPB_UNLIKELY(ptr == NULL || d->end_group != DECODE_NOGROUP)) { \
return fastdecode_err(d, kUpb_DecodeStatus_Malformed); \
} \
\
if (card == CARD_r) { \
fastdecode_nextret ret = fastdecode_nextrepeated( \
d, dst, &ptr, &farr, data, tagbytes, sizeof(upb_Message*)); \
switch (ret.next) { \
case FD_NEXT_SAMEFIELD: \
dst = ret.dst; \
goto again; \
case FD_NEXT_OTHERFIELD: \
d->depth++; \
data = ret.tag; \
UPB_MUSTTAIL return fastdecode_tagdispatch(UPB_PARSE_ARGS); \
case FD_NEXT_ATLIMIT: \
d->depth++; \
return ptr; \
} \
} \
\
d->depth++; \
UPB_MUSTTAIL return fastdecode_dispatch(UPB_PARSE_ARGS);
#define F(card, tagbytes, size_ceil, ceil_arg) \
const char* upb_p##card##m_##tagbytes##bt_max##size_ceil##b( \
UPB_PARSE_PARAMS) { \
FASTDECODE_SUBMSG(d, ptr, msg, table, hasbits, data, tagbytes, ceil_arg, \
CARD_##card); \
}
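/* For example, SIZES(s, 1) below instantiates upb_psm_1bt_max64b() through
 * upb_psm_1bt_max256b() and upb_psm_1bt_maxmaxb(): singular submessage
 * fields with a 1-byte tag, specialized by the arena-ceiling size passed to
 * decode_newmsg_ceil() (-1 disables the ceiling fast path). */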
#define SIZES(card, tagbytes) \
F(card, tagbytes, 64, 64) \
F(card, tagbytes, 128, 128) \
F(card, tagbytes, 192, 192) \
F(card, tagbytes, 256, 256) \
F(card, tagbytes, max, -1)
#define TAGBYTES(card) \
SIZES(card, 1) \
SIZES(card, 2)
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
#undef TAGBYTES
#undef SIZES
#undef F
#undef FASTDECODE_SUBMSG
#endif /* UPB_FASTTABLE */