|
|
|
|
|
|
|
#include "upb/decode.h"
|
|
|
|
|
|
|
|
#include "upb/port_def.inc"
|
|
|
|
|
|
|
|
/* Parameter list shared by every fast-path field parser in this file.  Using
 * one macro keeps all parser signatures identical, so a parser can hand its
 * own arguments to the next one by passing UPB_PARSE_ARGS. */
#define UPB_PARSE_PARAMS                                                      \
  upb_decstate *d, const char *ptr, upb_msg *msg, const upb_msglayout *table, \
      uint64_t hasbits, uint64_t data

/* Forwards the enclosing parser's parameters, in UPB_PARSE_PARAMS order. */
#define UPB_PARSE_ARGS d, ptr, msg, table, hasbits, data
|
|
|
|
|
|
|
|
/* Leaves the fast path: returns whatever fastdecode_generic() returns when
 * called with the current parser arguments.  `msg` is a reason string that is
 * only consumed if the debugging fprintf() below is re-enabled.  The expansion
 * already ends in ';' and is a bare `return`, so use it as a full statement
 * inside a braced block. */
#define RETURN_GENERIC(msg)   \
  /* fprintf(stderr, msg); */ \
  return fastdecode_generic(UPB_PARSE_ARGS);
|
|
|
|
|
|
|
|
/* Field cardinality.  The parsers below receive it as a compile-time constant
 * and use it to pick how the destination storage for a field is located. */
typedef enum {
  CARD_s = 0, /* Singular (optional, non-repeated) */
  CARD_o = 1, /* Oneof */
  CARD_r = 2  /* Repeated */
} upb_card;
|
|
|
|
|
|
|
|
UPB_FORCEINLINE
|
|
|
|
const char *fastdecode_dispatch(upb_decstate *d, const char *ptr, upb_msg *msg,
|
|
|
|
const upb_msglayout *table, uint64_t hasbits) {
|
|
|
|
uint16_t tag;
|
|
|
|
uint64_t data = 0;;
|
|
|
|
size_t idx;
|
|
|
|
if (UPB_UNLIKELY(ptr >= d->fastlimit)) {
|
|
|
|
if (UPB_LIKELY(ptr == d->limit)) {
|
|
|
|
*(uint32_t*)msg |= hasbits >> 16; /* Sync hasbits. */
|
|
|
|
return ptr;
|
|
|
|
}
|
|
|
|
RETURN_GENERIC("dispatch hit end\n");
|
|
|
|
}
|
|
|
|
memcpy(&tag, ptr, 2);
|
|
|
|
idx = (tag & 0xf8) >> 3;
|
|
|
|
data = table->field_data[idx] ^ tag;
|
|
|
|
return table->field_parser[idx](UPB_PARSE_ARGS);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Returns true when the tag that fastdecode_dispatch() XOR-ed into `data`
 * matched the expected one, i.e. when the low `tagbytes` bytes of `data` are
 * all zero.  Any `tagbytes` other than 1 is treated as a two-byte tag. */
UPB_FORCEINLINE
static bool fastdecode_checktag(uint64_t data, int tagbytes) {
  const uint64_t tagmask = (tagbytes == 1) ? 0xff : 0xffff;
  return (data & tagmask) == 0;
}
|
|
|
|
|
|
|
|
/* Loads the first `tagbytes` (1 or 2) bytes at `ptr` into a zero-initialised
 * uint16_t in host memory order and returns it.  With tagbytes == 1 the
 * second byte of the result stays zero. */
UPB_FORCEINLINE
static uint16_t fastdecode_readtag(const char *ptr, int tagbytes) {
  uint16_t tag = 0;
  memcpy(&tag, ptr, (size_t)tagbytes);
  return tag;
}
|
|
|
|
|
|
|
|
UPB_FORCEINLINE
|
|
|
|
static void *fastdecode_getfield_ofs(upb_decstate *d, const char *ptr,
|
|
|
|
upb_msg *msg, size_t ofs, uint64_t *data,
|
|
|
|
uint64_t *hasbits, upb_array **outarr,
|
|
|
|
void **end, int tagbytes, int valbytes,
|
|
|
|
upb_card card) {
|
|
|
|
void *field = (char *)msg + ofs;
|
|
|
|
|
|
|
|
switch (card) {
|
|
|
|
case CARD_s:
|
|
|
|
*hasbits |= *data;
|
|
|
|
return field;
|
|
|
|
case CARD_o: {
|
|
|
|
uint32_t *case_ptr = UPB_PTR_AT(msg, (*data >> 16) & 0xffff, uint32_t);
|
|
|
|
*case_ptr = (*data >> 32) & 0xffff;
|
|
|
|
return field;
|
|
|
|
}
|
|
|
|
case CARD_r: {
|
|
|
|
uint8_t elem_size_lg2 = __builtin_ctz(valbytes);
|
|
|
|
upb_array **arr_p = field;
|
|
|
|
upb_array *arr;
|
|
|
|
uint16_t expected_tag;
|
|
|
|
*hasbits >>= 16;
|
|
|
|
*(uint32_t*)msg |= *hasbits;
|
|
|
|
*hasbits = 0;
|
|
|
|
if (UPB_LIKELY(!*arr_p)) {
|
|
|
|
const size_t initial_len = 8;
|
|
|
|
size_t need = (valbytes * initial_len) + sizeof(upb_array);
|
|
|
|
if (UPB_UNLIKELY((size_t)(d->arena_end - d->arena_ptr) < need)) {
|
|
|
|
*outarr = NULL;
|
|
|
|
*data = 0;
|
|
|
|
*end = NULL;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
arr = (void*)d->arena_ptr;
|
|
|
|
field = arr + 1;
|
|
|
|
arr->data = _upb_array_tagptr(field, elem_size_lg2);
|
|
|
|
*arr_p = arr;
|
|
|
|
arr->size = initial_len;
|
|
|
|
*end = (char*)field + (arr->size * valbytes);
|
|
|
|
d->arena_ptr += need;
|
|
|
|
} else {
|
|
|
|
arr = *arr_p;
|
|
|
|
field = _upb_array_ptr(arr);
|
|
|
|
*end = (char*)field + (arr->size * valbytes);
|
|
|
|
field = (char*)field + (arr->len * valbytes);
|
|
|
|
}
|
|
|
|
expected_tag = fastdecode_readtag(ptr, tagbytes);
|
|
|
|
*data = expected_tag;
|
|
|
|
*outarr = arr;
|
|
|
|
return field;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
UPB_UNREACHABLE();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
UPB_FORCEINLINE
|
|
|
|
static void *fastdecode_getfield(upb_decstate *d, const char *ptr, upb_msg *msg,
|
|
|
|
uint64_t *data, uint64_t *hasbits,
|
|
|
|
int tagbytes, int valbytes, upb_card card) {
|
|
|
|
return fastdecode_getfield_ofs(d, ptr, msg, *data >> 48, data, hasbits, NULL,
|
|
|
|
NULL, tagbytes, valbytes, card);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* varint fields **************************************************************/
|
|
|
|
|
|
|
|
#ifdef __BMI2__
|
|
|
|
#include <immintrin.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Converts a raw decoded varint into the value that is stored in the field.
 *
 *   valbytes == 1: collapses to 0/1 (checked first, so `zigzag` is ignored).
 *   zigzag:        undoes zigzag encoding, in 32 bits for valbytes == 4 and
 *                  in 64 bits for valbytes == 8.  The 32-bit result is
 *                  returned zero-extended.
 *   otherwise:     returns `val` unchanged.
 *
 * Any other `valbytes` combined with `zigzag` is a programming error. */
UPB_FORCEINLINE uint64_t fastdecode_munge(uint64_t val, int valbytes, bool zigzag) {
  if (valbytes == 1) {
    return val != 0;
  } else if (zigzag) {
    if (valbytes == 4) {
      uint32_t n = (uint32_t)val; /* Only the low 32 bits are meaningful. */
      return (n >> 1) ^ -(int32_t)(n & 1);
    } else if (valbytes == 8) {
      return (val >> 1) ^ -(int64_t)(val & 1);
    }
    UPB_UNREACHABLE();
  }
  return val;
}
|
|
|
|
|
|
|
|
/* Given eight consecutive input bytes loaded into `data64` (first byte in the
 * least-significant position), returns how many of them belong to the varint
 * that starts at the first byte: the 1-based index of the first byte whose
 * continuation bit (0x80) is clear.  Returns -1 if all eight bytes have the
 * continuation bit set.
 *
 * Uses the `long long` flavour of count-trailing-zeros so that the full
 * 64-bit operand is examined even where `long` is 32 bits wide. */
UPB_FORCEINLINE
static int fastdecode_varintlen(uint64_t data64) {
  uint64_t clear_bits = ~data64 & 0x8080808080808080ULL;
  if (clear_bits == 0) return -1;
  return __builtin_ctzll(clear_bits) / 8 + 1;
}
|
|
|
|
|
|
|
|
UPB_FORCEINLINE
|
|
|
|
static const char *fastdecode_longvarint(UPB_PARSE_PARAMS, int valbytes,
|
|
|
|
int varintbytes, bool zigzag) {
|
|
|
|
uint64_t val = data >> 18;
|
|
|
|
size_t ofs = (uint16_t)data;
|
|
|
|
uint64_t data64;
|
|
|
|
int sawbytes;
|
|
|
|
memcpy(&data64, ptr + 2, 8);
|
|
|
|
sawbytes = fastdecode_varintlen(data64) + 2;
|
|
|
|
UPB_ASSERT(sawbytes == varintbytes);
|
|
|
|
#ifdef __BMI2__
|
|
|
|
if (varintbytes != 3) {
|
|
|
|
uint64_t mask = 0x7f7f7f7f7f7f7f7f >> (8 * (10 - varintbytes));
|
|
|
|
val |= _pext_u64(data64, mask) << 14;
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
for (i = 2; i < varintbytes; i++) {
|
|
|
|
uint64_t byte = ptr[i];
|
|
|
|
if (i != varintbytes - 1) byte &= 0x7f;
|
|
|
|
val |= byte << (7 * i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
val = fastdecode_munge(val, valbytes, zigzag);
|
|
|
|
memcpy((char*)msg + ofs, &val, valbytes);
|
|
|
|
return fastdecode_dispatch(d, ptr + varintbytes, msg, table, hasbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
UPB_FORCEINLINE
|
|
|
|
static const char *fastdecode_longvarintjmp(UPB_PARSE_PARAMS,
|
|
|
|
_upb_field_parser **funcs) {
|
|
|
|
int len;
|
|
|
|
uint64_t data64;
|
|
|
|
memcpy(&data64, ptr + 2, 8);
|
|
|
|
len = fastdecode_varintlen(data64);
|
|
|
|
if (len < 0) return fastdecode_err(d);
|
|
|
|
return funcs[len - 1](UPB_PARSE_ARGS);
|
|
|
|
}
|
|
|
|
|
|
|
|
UPB_FORCEINLINE
|
|
|
|
static const char *fastdecode_varint(UPB_PARSE_PARAMS, int tagbytes,
|
|
|
|
int valbytes, upb_card card, bool zigzag,
|
|
|
|
_upb_field_parser **funcs) {
|
|
|
|
uint64_t val;
|
|
|
|
void *dst;
|
|
|
|
if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) {
|
|
|
|
RETURN_GENERIC("varint field tag mismatch\n");
|
|
|
|
}
|
|
|
|
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, tagbytes, valbytes,
|
|
|
|
card);
|
|
|
|
val = (uint8_t)ptr[tagbytes];
|
|
|
|
if (UPB_UNLIKELY(val & 0x80)) {
|
|
|
|
uint32_t byte = (uint8_t)ptr[tagbytes + 1];
|
|
|
|
val += (byte - 1) << 7;
|
|
|
|
if (UPB_UNLIKELY(byte & 0x80)) {
|
|
|
|
ptr += tagbytes;
|
|
|
|
data = (uint32_t)(val << 18 | data >> 48);
|
|
|
|
return fastdecode_longvarintjmp(UPB_PARSE_ARGS, funcs);
|
|
|
|
}
|
|
|
|
ptr += tagbytes + 2;
|
|
|
|
} else {
|
|
|
|
ptr += tagbytes + 1;
|
|
|
|
}
|
|
|
|
val = fastdecode_munge(val, valbytes, zigzag);
|
|
|
|
memcpy(dst, &val, valbytes);
|
|
|
|
return fastdecode_dispatch(d, ptr, msg, table, hasbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define z_ZZ true
|
|
|
|
#define b_ZZ false
|
|
|
|
#define v_ZZ false
|
|
|
|
|
|
|
|
/* Generate varint vallbacks. */
|
|
|
|
|
|
|
|
#define FUNCNAME(type, valbytes, varintbytes) \
|
|
|
|
upb_pl##type##valbytes##_##varintbytes##bv
|
|
|
|
|
|
|
|
#define TABLENAME(type, valbytes) \
|
|
|
|
upb_pl##type##valbytes##_table
|
|
|
|
|
|
|
|
#define F(type, valbytes, varintbytes) \
|
|
|
|
static const char *FUNCNAME(type, valbytes, varintbytes)(UPB_PARSE_PARAMS) { \
|
|
|
|
return fastdecode_longvarint(UPB_PARSE_ARGS, valbytes, varintbytes, \
|
|
|
|
type##_ZZ); \
|
|
|
|
}
|
|
|
|
|
|
|
|
#define FALLBACKS(type, valbytes) \
|
|
|
|
F(type, valbytes, 3) \
|
|
|
|
F(type, valbytes, 4) \
|
|
|
|
F(type, valbytes, 5) \
|
|
|
|
F(type, valbytes, 6) \
|
|
|
|
F(type, valbytes, 7) \
|
|
|
|
F(type, valbytes, 8) \
|
|
|
|
F(type, valbytes, 9) \
|
|
|
|
F(type, valbytes, 10) \
|
|
|
|
static _upb_field_parser *TABLENAME(type, valbytes)[8] = { \
|
|
|
|
&FUNCNAME(type, valbytes, 3), &FUNCNAME(type, valbytes, 4), \
|
|
|
|
&FUNCNAME(type, valbytes, 5), &FUNCNAME(type, valbytes, 6), \
|
|
|
|
&FUNCNAME(type, valbytes, 7), &FUNCNAME(type, valbytes, 8), \
|
|
|
|
&FUNCNAME(type, valbytes, 9), &FUNCNAME(type, valbytes, 10)};
|
|
|
|
|
|
|
|
FALLBACKS(b, 1)
|
|
|
|
FALLBACKS(v, 4)
|
|
|
|
FALLBACKS(v, 8)
|
|
|
|
FALLBACKS(z, 4)
|
|
|
|
FALLBACKS(z, 8)
|
|
|
|
|
|
|
|
#undef F
|
|
|
|
#undef FALLBACKS
|
|
|
|
#undef FUNCNAME
|
|
|
|
|
|
|
|
/* Generate all varint functions.
|
|
|
|
* {s,o,r} x {b1,v4,z4,v8,z8} x {1bt,2bt} */
|
|
|
|
|
|
|
|
#define F(card, type, valbytes, tagbytes) \
|
|
|
|
const char *upb_p##card##type##valbytes##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
|
|
|
|
return fastdecode_varint(UPB_PARSE_ARGS, tagbytes, valbytes, CARD_##card, \
|
|
|
|
type##_ZZ, TABLENAME(type, valbytes)); \
|
|
|
|
}
|
|
|
|
|
|
|
|
#define TYPES(card, tagbytes) \
|
|
|
|
F(card, b, 1, tagbytes) \
|
|
|
|
F(card, v, 4, tagbytes) \
|
|
|
|
F(card, v, 8, tagbytes) \
|
|
|
|
F(card, z, 4, tagbytes) \
|
|
|
|
F(card, z, 8, tagbytes)
|
|
|
|
|
|
|
|
#define TAGBYTES(card) \
|
|
|
|
TYPES(card, 1) \
|
|
|
|
TYPES(card, 2)
|
|
|
|
|
|
|
|
TAGBYTES(s)
|
|
|
|
TAGBYTES(o)
|
|
|
|
/* TAGBYTES(r) */
|
|
|
|
|
|
|
|
#undef z_ZZ
|
|
|
|
#undef b_ZZ
|
|
|
|
#undef v_ZZ
|
|
|
|
#undef o_ONEOF
|
|
|
|
#undef s_ONEOF
|
|
|
|
#undef r_ONEOF
|
|
|
|
#undef F
|
|
|
|
#undef TYPES
|
|
|
|
#undef TAGBYTES
|
|
|
|
|
|
|
|
/* string fields **************************************************************/
|
|
|
|
|
|
|
|
/* Returns true when the range [ptr, ptr + len) is NOT acceptable: either
 * ptr + len wraps around the address space, or it extends past `end`.
 * A range that stops exactly at `end` is accepted (returns false).
 * The arithmetic is done on uintptr_t so an oversized `len` cannot create an
 * out-of-range pointer. */
UPB_FORCEINLINE
bool fastdecode_boundscheck(const char *ptr, size_t len, const char *end) {
  const uintptr_t start = (uintptr_t)ptr;
  const uintptr_t limit = (uintptr_t)end;
  const uintptr_t stop = start + len;
  return stop < start || stop > limit;
}
|
|
|
|
|
|
|
|
UPB_FORCEINLINE
|
|
|
|
static const char *fastdecode_string(UPB_PARSE_PARAMS, int tagbytes,
|
|
|
|
upb_card card) {
|
|
|
|
upb_strview *dst;
|
|
|
|
const char *str;
|
|
|
|
int64_t len;
|
|
|
|
if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) {
|
|
|
|
RETURN_GENERIC("string field tag mismatch\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, tagbytes,
|
|
|
|
sizeof(upb_strview), card);
|
|
|
|
len = (int8_t)ptr[tagbytes];
|
|
|
|
str = ptr + tagbytes + 1;
|
|
|
|
dst->data = str;
|
|
|
|
dst->size = len;
|
|
|
|
if (UPB_UNLIKELY(fastdecode_boundscheck(str, len, d->limit))) {
|
|
|
|
dst->size = 0;
|
|
|
|
RETURN_GENERIC("string field len >1 byte\n");
|
|
|
|
}
|
|
|
|
return fastdecode_dispatch(d, str + len, msg, table, hasbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
const char *upb_pss_1bt(UPB_PARSE_PARAMS) {
|
|
|
|
return fastdecode_string(UPB_PARSE_ARGS, 1, CARD_s);
|
|
|
|
}
|
|
|
|
|
|
|
|
const char *upb_pos_1bt(UPB_PARSE_PARAMS) {
|
|
|
|
return fastdecode_string(UPB_PARSE_ARGS, 1, CARD_o);
|
|
|
|
}
|
|
|
|
|
|
|
|
const char *upb_pss_2bt(UPB_PARSE_PARAMS) {
|
|
|
|
return fastdecode_string(UPB_PARSE_ARGS, 2, CARD_s);
|
|
|
|
}
|
|
|
|
|
|
|
|
const char *upb_pos_2bt(UPB_PARSE_PARAMS) {
|
|
|
|
return fastdecode_string(UPB_PARSE_ARGS, 2, CARD_o);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* message fields *************************************************************/
|
|
|
|
|
|
|
|
/* Same test as fastdecode_boundscheck(), but takes the length as `unsigned`:
 * returns true when ptr + len wraps around or extends past `end`, and false
 * when the range fits (ending exactly at `end` is accepted). */
UPB_FORCEINLINE
bool fastdecode_boundscheck2(const char *ptr, unsigned len, const char *end) {
  const uintptr_t start = (uintptr_t)ptr;
  const uintptr_t limit = (uintptr_t)end;
  const uintptr_t stop = start + len;
  return stop < start || stop > limit;
}
|
|
|
|
|
|
|
|
UPB_FORCEINLINE
|
|
|
|
static const char *fastdecode_submsg(UPB_PARSE_PARAMS, int tagbytes,
|
I think this may have reached the optimization limit.
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
BM_ArenaOneAlloc 21 ns 21 ns 32994231
BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005
BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s
BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s
BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s
BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s
BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s
BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s
$ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap
2020-10-08 14:07:06
Running bazel-bin/benchmark
Run on (72 X 3700 MHz CPU s)
CPU Caches:
L1 Data 32K (x36)
L1 Instruction 32K (x36)
L2 Unified 1024K (x36)
L3 Unified 25344K (x2)
----------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------
BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s
Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap':
1,050.22 msec task-clock # 0.978 CPUs utilized
4 context-switches # 0.004 K/sec
0 cpu-migrations # 0.000 K/sec
179 page-faults # 0.170 K/sec
3,875,796,334 cycles # 3.690 GHz
13,282,835,967 instructions # 3.43 insn per cycle
2,887,725,848 branches # 2749.627 M/sec
8,324,912 branch-misses # 0.29% of all branches
1.073924364 seconds time elapsed
1.042806000 seconds user
0.008021000 seconds sys
Profile:
23.96% benchmark benchmark [.] upb_prm_1bt_max192b
22.44% benchmark benchmark [.] fastdecode_dispatch
18.96% benchmark benchmark [.] upb_pss_1bt
14.20% benchmark benchmark [.] upb_psv4_1bt
8.33% benchmark benchmark [.] upb_prm_1bt_max64b
6.66% benchmark benchmark [.] upb_prm_1bt_max128b
1.29% benchmark benchmark [.] upb_psm_1bt_max64b
0.77% benchmark benchmark [.] fastdecode_generic
0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single
0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave
0.42% benchmark benchmark [.] upb_psm_1bt_max256b
0.31% benchmark benchmark [.] upb_psb1_1bt
0.21% benchmark benchmark [.] upb_plv4_5bv
0.14% benchmark benchmark [.] upb_psb1_2bt
0.12% benchmark benchmark [.] decode_longvarint64
0.08% benchmark [kernel.kallsyms] [k] vsnprintf
0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock
0.07% benchmark benchmark [.] _upb_msg_new
0.06% benchmark ld-2.31.so [.] check_match
4 years ago
|
|
|
int msg_ceil_bytes, upb_card card) {
|
|
|
|
const char *saved_limit = d->limit;
|
|
|
|
const char *saved_fastlimit = d->fastlimit;
|
|
|
|
const upb_msglayout_field *field = &table->fields[data >> 48];
|
|
|
|
size_t ofs = field->offset;
|
|
|
|
const upb_msglayout *subl = table->submsgs[field->submsg_index];
|
|
|
|
upb_array *arr;
|
|
|
|
upb_msg **submsg;
|
|
|
|
void *end;
|
|
|
|
|
|
|
|
if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) {
|
|
|
|
RETURN_GENERIC("submessage field tag mismatch\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (--d->depth < 0) return fastdecode_err(d);
|
|
|
|
|
|
|
|
submsg = fastdecode_getfield_ofs(d, ptr, msg, ofs, &data, &hasbits, &arr,
|
|
|
|
&end, tagbytes, sizeof(upb_msg *), card);
|
|
|
|
|
|
|
|
again:
|
|
|
|
if (card == CARD_r) {
|
|
|
|
if (UPB_UNLIKELY(submsg == end)) {
|
|
|
|
if (arr) {
|
|
|
|
size_t old_size = arr->size;
|
|
|
|
size_t old_bytes = old_size * sizeof(upb_msg*);
|
|
|
|
size_t new_size = old_size * 2;
|
|
|
|
size_t new_bytes = new_size * sizeof(upb_msg*);
|
|
|
|
char *old_ptr = _upb_array_ptr(arr);
|
|
|
|
if (UPB_UNLIKELY((size_t)(d->arena_end - d->arena_ptr) < new_bytes)) {
|
|
|
|
d->limit = saved_limit;
|
|
|
|
d->fastlimit = saved_fastlimit;
|
|
|
|
arr->len = submsg - (upb_msg**)_upb_array_ptr(arr);
|
|
|
|
d->depth++;
|
|
|
|
RETURN_GENERIC("repeated realloc failed: arena full");
|
|
|
|
}
|
|
|
|
memcpy(d->arena_ptr, old_ptr, old_bytes);
|
|
|
|
arr->size = new_size;
|
|
|
|
arr->data = _upb_array_tagptr(d->arena_ptr, 3);
|
|
|
|
submsg = (void*)(d->arena_ptr + (old_size * sizeof(upb_msg*)));
|
|
|
|
end = (void*)(d->arena_ptr + (new_size * sizeof(upb_msg*)));
|
|
|
|
d->arena_ptr += new_bytes;
|
|
|
|
} else {
|
|
|
|
d->limit = saved_limit;
|
|
|
|
d->fastlimit = saved_fastlimit;
|
|
|
|
d->depth++;
|
|
|
|
RETURN_GENERIC("need array realloc\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
uint32_t len = (uint8_t)ptr[tagbytes];
|
|
|
|
if (UPB_UNLIKELY(len & 0x80)) {
|
|
|
|
uint32_t byte = (uint8_t)ptr[tagbytes + 1];
|
|
|
|
len += (byte - 1) << 7;
|
|
|
|
if (UPB_UNLIKELY(byte & 0x80)) {
|
|
|
|
if (card == CARD_r) {
|
|
|
|
arr->len = submsg - (upb_msg**)_upb_array_ptr(arr);
|
|
|
|
}
|
|
|
|
d->limit = saved_limit;
|
|
|
|
d->fastlimit = saved_fastlimit;
|
|
|
|
d->depth++;
|
|
|
|
RETURN_GENERIC("submessage field len >2 bytes\n");
|
|
|
|
}
|
|
|
|
ptr++;
|
|
|
|
}
|
|
|
|
ptr += tagbytes + 1;
|
|
|
|
if (UPB_UNLIKELY(fastdecode_boundscheck2(ptr, len, saved_limit))) {
|
|
|
|
return fastdecode_err(d);
|
|
|
|
}
|
|
|
|
d->limit = ptr + len;
|
|
|
|
d->fastlimit = UPB_MIN(d->limit, d->fastend);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (card == CARD_r || !*submsg) {
|
I think this may have reached the optimization limit.
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
BM_ArenaOneAlloc 21 ns 21 ns 32994231
BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005
BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s
BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s
BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s
BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s
BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s
BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s
$ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap
2020-10-08 14:07:06
Running bazel-bin/benchmark
Run on (72 X 3700 MHz CPU s)
CPU Caches:
L1 Data 32K (x36)
L1 Instruction 32K (x36)
L2 Unified 1024K (x36)
L3 Unified 25344K (x2)
----------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------
BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s
Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap':
1,050.22 msec task-clock # 0.978 CPUs utilized
4 context-switches # 0.004 K/sec
0 cpu-migrations # 0.000 K/sec
179 page-faults # 0.170 K/sec
3,875,796,334 cycles # 3.690 GHz
13,282,835,967 instructions # 3.43 insn per cycle
2,887,725,848 branches # 2749.627 M/sec
8,324,912 branch-misses # 0.29% of all branches
1.073924364 seconds time elapsed
1.042806000 seconds user
0.008021000 seconds sys
Profile:
23.96% benchmark benchmark [.] upb_prm_1bt_max192b
22.44% benchmark benchmark [.] fastdecode_dispatch
18.96% benchmark benchmark [.] upb_pss_1bt
14.20% benchmark benchmark [.] upb_psv4_1bt
8.33% benchmark benchmark [.] upb_prm_1bt_max64b
6.66% benchmark benchmark [.] upb_prm_1bt_max128b
1.29% benchmark benchmark [.] upb_psm_1bt_max64b
0.77% benchmark benchmark [.] fastdecode_generic
0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single
0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave
0.42% benchmark benchmark [.] upb_psm_1bt_max256b
0.31% benchmark benchmark [.] upb_psb1_1bt
0.21% benchmark benchmark [.] upb_plv4_5bv
0.14% benchmark benchmark [.] upb_psb1_2bt
0.12% benchmark benchmark [.] decode_longvarint64
0.08% benchmark [kernel.kallsyms] [k] vsnprintf
0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock
0.07% benchmark benchmark [.] _upb_msg_new
0.06% benchmark ld-2.31.so [.] check_match
4 years ago
|
|
|
*submsg = decode_newmsg_ceil(d, subl, msg_ceil_bytes);
|
|
|
|
}
|
|
|
|
ptr = fastdecode_dispatch(d, ptr, *submsg, subl, 0);
|
|
|
|
submsg++;
|
|
|
|
|
|
|
|
if (ptr != d->limit || d->end_group != 0) {
|
|
|
|
return fastdecode_err(d);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (card == CARD_r) {
|
|
|
|
if (UPB_LIKELY(ptr < saved_fastlimit) &&
|
|
|
|
fastdecode_readtag(ptr, tagbytes) == (uint16_t)data) {
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
arr->len = submsg - (upb_msg**)_upb_array_ptr(arr);
|
|
|
|
}
|
|
|
|
|
|
|
|
d->limit = saved_limit;
|
|
|
|
d->fastlimit = saved_fastlimit;
|
|
|
|
d->depth++;
|
|
|
|
|
|
|
|
return fastdecode_dispatch(d, ptr, msg, table, hasbits);
|
|
|
|
}
|
|
|
|
|
I think this may have reached the optimization limit.
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
BM_ArenaOneAlloc 21 ns 21 ns 32994231
BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005
BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s
BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s
BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s
BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s
BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s
BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s
$ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap
2020-10-08 14:07:06
Running bazel-bin/benchmark
Run on (72 X 3700 MHz CPU s)
CPU Caches:
L1 Data 32K (x36)
L1 Instruction 32K (x36)
L2 Unified 1024K (x36)
L3 Unified 25344K (x2)
----------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------
BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s
Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap':
1,050.22 msec task-clock # 0.978 CPUs utilized
4 context-switches # 0.004 K/sec
0 cpu-migrations # 0.000 K/sec
179 page-faults # 0.170 K/sec
3,875,796,334 cycles # 3.690 GHz
13,282,835,967 instructions # 3.43 insn per cycle
2,887,725,848 branches # 2749.627 M/sec
8,324,912 branch-misses # 0.29% of all branches
1.073924364 seconds time elapsed
1.042806000 seconds user
0.008021000 seconds sys
Profile:
23.96% benchmark benchmark [.] upb_prm_1bt_max192b
22.44% benchmark benchmark [.] fastdecode_dispatch
18.96% benchmark benchmark [.] upb_pss_1bt
14.20% benchmark benchmark [.] upb_psv4_1bt
8.33% benchmark benchmark [.] upb_prm_1bt_max64b
6.66% benchmark benchmark [.] upb_prm_1bt_max128b
1.29% benchmark benchmark [.] upb_psm_1bt_max64b
0.77% benchmark benchmark [.] fastdecode_generic
0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single
0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave
0.42% benchmark benchmark [.] upb_psm_1bt_max256b
0.31% benchmark benchmark [.] upb_psb1_1bt
0.21% benchmark benchmark [.] upb_plv4_5bv
0.14% benchmark benchmark [.] upb_psb1_2bt
0.12% benchmark benchmark [.] decode_longvarint64
0.08% benchmark [kernel.kallsyms] [k] vsnprintf
0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock
0.07% benchmark benchmark [.] _upb_msg_new
0.06% benchmark ld-2.31.so [.] check_match
4 years ago
|
|
|
#define F(card, tagbytes, size_ceil, ceil_arg) \
|
|
|
|
const char *upb_p##card##m_##tagbytes##bt_max##size_ceil##b( \
|
|
|
|
UPB_PARSE_PARAMS) { \
|
|
|
|
return fastdecode_submsg(UPB_PARSE_ARGS, tagbytes, ceil_arg, CARD_##card); \
|
|
|
|
}
|
|
|
|
|
I think this may have reached the optimization limit.
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
BM_ArenaOneAlloc 21 ns 21 ns 32994231
BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005
BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s
BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s
BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s
BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s
BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s
BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s
$ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap
2020-10-08 14:07:06
Running bazel-bin/benchmark
Run on (72 X 3700 MHz CPU s)
CPU Caches:
L1 Data 32K (x36)
L1 Instruction 32K (x36)
L2 Unified 1024K (x36)
L3 Unified 25344K (x2)
----------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------
BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s
Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap':
1,050.22 msec task-clock # 0.978 CPUs utilized
4 context-switches # 0.004 K/sec
0 cpu-migrations # 0.000 K/sec
179 page-faults # 0.170 K/sec
3,875,796,334 cycles # 3.690 GHz
13,282,835,967 instructions # 3.43 insn per cycle
2,887,725,848 branches # 2749.627 M/sec
8,324,912 branch-misses # 0.29% of all branches
1.073924364 seconds time elapsed
1.042806000 seconds user
0.008021000 seconds sys
Profile:
23.96% benchmark benchmark [.] upb_prm_1bt_max192b
22.44% benchmark benchmark [.] fastdecode_dispatch
18.96% benchmark benchmark [.] upb_pss_1bt
14.20% benchmark benchmark [.] upb_psv4_1bt
8.33% benchmark benchmark [.] upb_prm_1bt_max64b
6.66% benchmark benchmark [.] upb_prm_1bt_max128b
1.29% benchmark benchmark [.] upb_psm_1bt_max64b
0.77% benchmark benchmark [.] fastdecode_generic
0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single
0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave
0.42% benchmark benchmark [.] upb_psm_1bt_max256b
0.31% benchmark benchmark [.] upb_psb1_1bt
0.21% benchmark benchmark [.] upb_plv4_5bv
0.14% benchmark benchmark [.] upb_psb1_2bt
0.12% benchmark benchmark [.] decode_longvarint64
0.08% benchmark [kernel.kallsyms] [k] vsnprintf
0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock
0.07% benchmark benchmark [.] _upb_msg_new
0.06% benchmark ld-2.31.so [.] check_match
4 years ago
|
|
|
#define SIZES(card, tagbytes) \
|
|
|
|
F(card, tagbytes, 64, 64) \
|
|
|
|
F(card, tagbytes, 128, 128) \
|
|
|
|
F(card, tagbytes, 192, 192) \
|
|
|
|
F(card, tagbytes, 256, 256) \
|
|
|
|
F(card, tagbytes, max, -1)
|
|
|
|
|
I think this may have reached the optimization limit.
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
BM_ArenaOneAlloc 21 ns 21 ns 32994231
BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005
BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s
BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s
BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s
BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s
BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s
BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s
$ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap
2020-10-08 14:07:06
Running bazel-bin/benchmark
Run on (72 X 3700 MHz CPU s)
CPU Caches:
L1 Data 32K (x36)
L1 Instruction 32K (x36)
L2 Unified 1024K (x36)
L3 Unified 25344K (x2)
----------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------
BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s
Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap':
1,050.22 msec task-clock # 0.978 CPUs utilized
4 context-switches # 0.004 K/sec
0 cpu-migrations # 0.000 K/sec
179 page-faults # 0.170 K/sec
3,875,796,334 cycles # 3.690 GHz
13,282,835,967 instructions # 3.43 insn per cycle
2,887,725,848 branches # 2749.627 M/sec
8,324,912 branch-misses # 0.29% of all branches
1.073924364 seconds time elapsed
1.042806000 seconds user
0.008021000 seconds sys
Profile:
23.96% benchmark benchmark [.] upb_prm_1bt_max192b
22.44% benchmark benchmark [.] fastdecode_dispatch
18.96% benchmark benchmark [.] upb_pss_1bt
14.20% benchmark benchmark [.] upb_psv4_1bt
8.33% benchmark benchmark [.] upb_prm_1bt_max64b
6.66% benchmark benchmark [.] upb_prm_1bt_max128b
1.29% benchmark benchmark [.] upb_psm_1bt_max64b
0.77% benchmark benchmark [.] fastdecode_generic
0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single
0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave
0.42% benchmark benchmark [.] upb_psm_1bt_max256b
0.31% benchmark benchmark [.] upb_psb1_1bt
0.21% benchmark benchmark [.] upb_plv4_5bv
0.14% benchmark benchmark [.] upb_psb1_2bt
0.12% benchmark benchmark [.] decode_longvarint64
0.08% benchmark [kernel.kallsyms] [k] vsnprintf
0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock
0.07% benchmark benchmark [.] _upb_msg_new
0.06% benchmark ld-2.31.so [.] check_match
4 years ago
|
|
|
#define TAGBYTES(card) \
|
|
|
|
SIZES(card, 1) \
|
|
|
|
SIZES(card, 2)
|
|
|
|
|
I think this may have reached the optimization limit.
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
BM_ArenaOneAlloc 21 ns 21 ns 32994231
BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005
BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s
BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s
BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s
BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s
BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s
BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s
$ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap
2020-10-08 14:07:06
Running bazel-bin/benchmark
Run on (72 X 3700 MHz CPU s)
CPU Caches:
L1 Data 32K (x36)
L1 Instruction 32K (x36)
L2 Unified 1024K (x36)
L3 Unified 25344K (x2)
----------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------
BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s
Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap':
1,050.22 msec task-clock # 0.978 CPUs utilized
4 context-switches # 0.004 K/sec
0 cpu-migrations # 0.000 K/sec
179 page-faults # 0.170 K/sec
3,875,796,334 cycles # 3.690 GHz
13,282,835,967 instructions # 3.43 insn per cycle
2,887,725,848 branches # 2749.627 M/sec
8,324,912 branch-misses # 0.29% of all branches
1.073924364 seconds time elapsed
1.042806000 seconds user
0.008021000 seconds sys
Profile:
23.96% benchmark benchmark [.] upb_prm_1bt_max192b
22.44% benchmark benchmark [.] fastdecode_dispatch
18.96% benchmark benchmark [.] upb_pss_1bt
14.20% benchmark benchmark [.] upb_psv4_1bt
8.33% benchmark benchmark [.] upb_prm_1bt_max64b
6.66% benchmark benchmark [.] upb_prm_1bt_max128b
1.29% benchmark benchmark [.] upb_psm_1bt_max64b
0.77% benchmark benchmark [.] fastdecode_generic
0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single
0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave
0.42% benchmark benchmark [.] upb_psm_1bt_max256b
0.31% benchmark benchmark [.] upb_psb1_1bt
0.21% benchmark benchmark [.] upb_plv4_5bv
0.14% benchmark benchmark [.] upb_psb1_2bt
0.12% benchmark benchmark [.] decode_longvarint64
0.08% benchmark [kernel.kallsyms] [k] vsnprintf
0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock
0.07% benchmark benchmark [.] _upb_msg_new
0.06% benchmark ld-2.31.so [.] check_match
4 years ago
|
|
|
TAGBYTES(s)
|
|
|
|
TAGBYTES(o)
|
|
|
|
TAGBYTES(r)
|
|
|
|
|
I think this may have reached the optimization limit.
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
BM_ArenaOneAlloc 21 ns 21 ns 32994231
BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005
BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s
BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s
BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s
BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s
BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s
BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s
$ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap
2020-10-08 14:07:06
Running bazel-bin/benchmark
Run on (72 X 3700 MHz CPU s)
CPU Caches:
L1 Data 32K (x36)
L1 Instruction 32K (x36)
L2 Unified 1024K (x36)
L3 Unified 25344K (x2)
----------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------
BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s
Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap':
1,050.22 msec task-clock # 0.978 CPUs utilized
4 context-switches # 0.004 K/sec
0 cpu-migrations # 0.000 K/sec
179 page-faults # 0.170 K/sec
3,875,796,334 cycles # 3.690 GHz
13,282,835,967 instructions # 3.43 insn per cycle
2,887,725,848 branches # 2749.627 M/sec
8,324,912 branch-misses # 0.29% of all branches
1.073924364 seconds time elapsed
1.042806000 seconds user
0.008021000 seconds sys
Profile:
23.96% benchmark benchmark [.] upb_prm_1bt_max192b
22.44% benchmark benchmark [.] fastdecode_dispatch
18.96% benchmark benchmark [.] upb_pss_1bt
14.20% benchmark benchmark [.] upb_psv4_1bt
8.33% benchmark benchmark [.] upb_prm_1bt_max64b
6.66% benchmark benchmark [.] upb_prm_1bt_max128b
1.29% benchmark benchmark [.] upb_psm_1bt_max64b
0.77% benchmark benchmark [.] fastdecode_generic
0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single
0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave
0.42% benchmark benchmark [.] upb_psm_1bt_max256b
0.31% benchmark benchmark [.] upb_psb1_1bt
0.21% benchmark benchmark [.] upb_plv4_5bv
0.14% benchmark benchmark [.] upb_psb1_2bt
0.12% benchmark benchmark [.] decode_longvarint64
0.08% benchmark [kernel.kallsyms] [k] vsnprintf
0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock
0.07% benchmark benchmark [.] _upb_msg_new
0.06% benchmark ld-2.31.so [.] check_match
4 years ago
|
|
|
#undef TAGBYTES
|
|
|
|
#undef SIZES
|
|
|
|
#undef F
|