Protocol Buffers - Google's data interchange format (grpc依赖) https://developers.google.com/protocol-buffers/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

422 lines
12 KiB

4 years ago
#include "upb/decode_fast.h"
4 years ago
#include "upb/decode.h"
#include "upb/decode.int.h"
4 years ago
/* Must be last. */
4 years ago
#include "upb/port_def.inc"
/* Parameter list shared by every fast-path field parser.  The parameter names
 * are fixed because UPB_PARSE_ARGS forwards them verbatim. */
#define UPB_PARSE_PARAMS                                          \
  upb_decstate *d, const char *ptr, upb_msg *msg, intptr_t table, \
      uint64_t hasbits, uint64_t data

/* Argument list that forwards the UPB_PARSE_PARAMS variables unchanged. */
#define UPB_PARSE_ARGS d, ptr, msg, table, hasbits, data

/* Returns the result of fastdecode_generic() called with the current parser
 * arguments.  `reason` is a diagnostic string; it is only consumed if the
 * fprintf below is re-enabled.  The expansion has no trailing semicolon: the
 * call site supplies it, so `RETURN_GENERIC("...");` is exactly one
 * statement. */
#define RETURN_GENERIC(reason)   \
  /* fprintf(stderr, reason); */ \
  return fastdecode_generic(UPB_PARSE_ARGS)
/* Field cardinality handled by a fast-path parser.  The one-letter suffix is
 * pasted onto "CARD_" by the parser-generating macros, so the enumerator
 * names must not change.  Values are 0, 1, 2 in declaration order. */
typedef enum {
  CARD_s, /* Singular (optional, non-repeated) */
  CARD_o, /* Oneof */
  CARD_r  /* Repeated */
} upb_card;
/* Looks up the parser for `tag` in the message's fast table and calls it.
 *
 * `table` packs two values: the low 8 bits are a mask applied to the tag, and
 * the remaining bits (table >> 8) are a pointer to the upb_msglayout.  The
 * masked tag is assumed to be a multiple of 8; dividing it by 8 gives the
 * fasttable slot.  The slot's field_data is XOR-ed with the tag and handed to
 * the slot's parser as `data`. */
UPB_FORCEINLINE
const char *fastdecode_tag_dispatch(upb_decstate *d, const char *ptr,
                                    upb_msg *msg, intptr_t table,
                                    uint64_t hasbits, uint32_t tag) {
  const upb_msglayout *layout = (void*)(table >> 8);
  uint8_t tag_mask = table;
  size_t slot = tag & tag_mask;

  __builtin_assume((slot & 7) == 0);
  slot >>= 3;

  uint64_t data = layout->fasttable[slot].field_data ^ tag;
  return layout->fasttable[slot].field_parser(d, ptr, msg, table, hasbits,
                                              data);
}
/* Reads the two bytes at `ptr` as a native-endian uint16_t and returns them
 * widened to 32 bits.  memcpy is used so `ptr` need not be aligned. */
UPB_FORCEINLINE
uint32_t fastdecode_load_tag(const char* ptr) {
  uint16_t first_two_bytes;
  memcpy(&first_two_bytes, ptr, sizeof(first_two_bytes));
  return first_two_bytes;
}
/* Continues parsing at `ptr`.
 *
 * - Below d->fastlimit: load the next tag and dispatch on it.
 * - Exactly at d->limit: OR `hasbits >> 16` into the first 32 bits of `msg`
 *   and return `ptr`.
 * - Otherwise: hand over to fastdecode_generic() with data == 0. */
UPB_FORCEINLINE
const char *fastdecode_dispatch(upb_decstate *d, const char *ptr, upb_msg *msg,
                                intptr_t table, uint64_t hasbits) {
  if (UPB_LIKELY(ptr < d->fastlimit)) {
    uint32_t tag = fastdecode_load_tag(ptr);
    return fastdecode_tag_dispatch(d, ptr, msg, table, hasbits, tag);
  }

  if (UPB_LIKELY(ptr == d->limit)) {
    *(uint32_t*)msg |= hasbits >> 16; /* Sync hasbits. */
    return ptr;
  }

  uint64_t data = 0;
  RETURN_GENERIC("dispatch hit end\n");
}
4 years ago
/* Returns true when the low `tagbytes` bytes of `data` are all zero.
 * A `tagbytes` of 1 checks the low 8 bits; any other value checks the low 16
 * bits. */
UPB_FORCEINLINE
static bool fastdecode_checktag(uint64_t data, int tagbytes) {
  const uint64_t tag_bits = (tagbytes == 1) ? 0xff : 0xffff;
  return (data & tag_bits) == 0;
}
/* Locates the storage for the field being parsed and records its presence.
 *
 * The field's byte offset inside `msg` is the top 16 bits of `*data`.
 *
 * CARD_s: sets presence in `*hasbits` and returns the field address.  When
 *   `hasbit_is_idx` is true the bit index is bits 32..37 of `*data`;
 *   otherwise `*data` is OR-ed into `*hasbits` as a whole.
 *
 * CARD_r: ORs `*hasbits >> 16` into the first 32 bits of `msg` and clears
 *   `*hasbits`.  If the array pointer stored in the field is NULL, a new
 *   array with room for 8 elements of `valbytes` bytes is allocated from
 *   d->arena, directly followed by its element storage.  On return `*outarr`
 *   is the array, `*end` is the end of its capacity, `*data` has been
 *   replaced by the tag loaded from `ptr`, and the result is the address of
 *   the next free element.  `outarr` and `end` must be non-NULL for CARD_r.
 *
 * Returns NULL for CARD_r when `hasbit_is_idx` is false and _upb_arenahas()
 * reports that the arena cannot satisfy the request.  If the allocation
 * itself fails, fastdecode_err() is called; NULL is returned should that call
 * come back.
 *
 * NOTE(review): CARD_o has no case here and reaches UPB_UNREACHABLE(). */
UPB_FORCEINLINE
static void *fastdecode_getfield_ofs(upb_decstate *d, const char *ptr,
                                     upb_msg *msg, uint64_t *data,
                                     uint64_t *hasbits, upb_array **outarr,
                                     void **end, int valbytes,
                                     upb_card card, bool hasbit_is_idx) {
  size_t ofs = *data >> 48;
  void *field = (char *)msg + ofs;

  switch (card) {
    case CARD_s:
      if (hasbit_is_idx) {
        *hasbits |= 1ull << ((*data >> 32) & 63);
      } else {
        *hasbits |= *data;
      }
      return field;
    case CARD_r: {
      uint8_t elem_size_lg2 = __builtin_ctz(valbytes);
      upb_array **arr_p = field;
      upb_array *arr;

      /* Flush the pending hasbits into the message before touching the
       * array. */
      *hasbits >>= 16;
      *(uint32_t*)msg |= *hasbits;
      *hasbits = 0;

      if (UPB_LIKELY(!*arr_p)) {
        const size_t initial_len = 8;
        size_t need = (valbytes * initial_len) + sizeof(upb_array);
        if (!hasbit_is_idx && UPB_UNLIKELY(!_upb_arenahas(&d->arena, need))) {
          return NULL;
        }
        arr = upb_arena_malloc(&d->arena, need);
        if (UPB_UNLIKELY(!arr)) {
          /* Allocation failed: report a decode error instead of writing
           * through a NULL array pointer. */
          (void)fastdecode_err(d);
          return NULL;
        }
        field = arr + 1; /* Element storage directly follows the header. */
        arr->data = _upb_array_tagptr(field, elem_size_lg2);
        *arr_p = arr;
        arr->size = initial_len;
        *end = (char*)field + (arr->size * valbytes);
      } else {
        arr = *arr_p;
        field = _upb_array_ptr(arr);
        *end = (char*)field + (arr->size * valbytes);
        field = (char*)field + (arr->len * valbytes);
      }
      *data = fastdecode_load_tag(ptr);
      *outarr = arr;
      return field;
    }
    default:
      UPB_UNREACHABLE();
  }
}
/* Convenience form of fastdecode_getfield_ofs() for callers that do not need
 * the array or capacity outputs: both out-pointers are passed as NULL and
 * `hasbit_is_idx` is false. */
UPB_FORCEINLINE
static void *fastdecode_getfield(upb_decstate *d, const char *ptr, upb_msg *msg,
                                 uint64_t *data, uint64_t *hasbits,
                                 int valbytes, upb_card card) {
  upb_array **no_array_out = NULL;
  void **no_end_out = NULL;
  const bool hasbit_is_idx = false;

  return fastdecode_getfield_ofs(d, ptr, msg, data, hasbits, no_array_out,
                                 no_end_out, valbytes, card, hasbit_is_idx);
}
/* varint fields **************************************************************/
4 years ago
#ifdef __BMI2__
#include <immintrin.h>
#endif
/* Converts a decoded varint into the value to be stored.
 *
 * - valbytes == 1: collapses any non-zero value to 1.
 * - zigzag with valbytes 4 or 8: undoes zigzag encoding at that width.
 * - otherwise: returns `val` unchanged.
 *
 * Zigzag with any other width is marked unreachable. */
UPB_FORCEINLINE uint64_t fastdecode_munge(uint64_t val, int valbytes, bool zigzag) {
  if (valbytes == 1) return val != 0;
  if (!zigzag) return val;

  switch (valbytes) {
    case 4: {
      uint32_t n = val;
      return (n >> 1) ^ -(int32_t)(n & 1);
    }
    case 8:
      return (val >> 1) ^ -(int64_t)(val & 1);
    default:
      UPB_UNREACHABLE();
  }
  return val;
}
4 years ago
UPB_FORCEINLINE
4 years ago
static const char *fastdecode_varint(UPB_PARSE_PARAMS, int tagbytes,
int valbytes, upb_card card, bool zigzag) {
uint64_t val;
void *dst;
if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) {
RETURN_GENERIC("varint field tag mismatch\n");
4 years ago
}
dst = fastdecode_getfield(d, ptr, msg, &data, &hasbits, valbytes,
card);
ptr += tagbytes + 1;
val = (uint8_t)ptr[-1];
if (UPB_UNLIKELY(val & 0x80)) {
4 years ago
int i;
for (i = 0; i < 8; i++) {
ptr++;
uint64_t byte = (uint8_t)ptr[-1];
val += (byte - 1) << (7 + 7 * i);
if (UPB_LIKELY((byte & 0x80) == 0)) goto done;
}
ptr++;
uint64_t byte = (uint8_t)ptr[-1];
if (byte > 1) return fastdecode_err(d);
val += (byte - 1) << 63;
}
done:
val = fastdecode_munge(val, valbytes, zigzag);
memcpy(dst, &val, valbytes);
return fastdecode_dispatch(d, ptr, msg, table, hasbits);
4 years ago
}
4 years ago
#define z_ZZ true
#define b_ZZ false
#define v_ZZ false
4 years ago
/* Generate all varint functions.
* {s,o,r} x {b1,v4,z4,v8,z8} x {1bt,2bt} */
4 years ago
#define F(card, type, valbytes, tagbytes) \
const char *upb_p##card##type##valbytes##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
return fastdecode_varint(UPB_PARSE_ARGS, tagbytes, valbytes, CARD_##card, \
type##_ZZ); \
4 years ago
}
4 years ago
4 years ago
#define TYPES(card, tagbytes) \
F(card, b, 1, tagbytes) \
F(card, v, 4, tagbytes) \
F(card, v, 8, tagbytes) \
F(card, z, 4, tagbytes) \
F(card, z, 8, tagbytes)
#define TAGBYTES(card) \
TYPES(card, 1) \
TYPES(card, 2)
TAGBYTES(s)
TAGBYTES(o)
/* TAGBYTES(r) */
4 years ago
#undef z_ZZ
#undef b_ZZ
#undef v_ZZ
#undef o_ONEOF
#undef s_ONEOF
#undef r_ONEOF
#undef F
#undef TYPES
#undef TAGBYTES
/* string fields **************************************************************/
/* Returns true when the range [ptr, ptr + len) is NOT usable: either
 * ptr + len wraps around the address space or it lies beyond `end`.
 * Returns false when the range fits. */
UPB_FORCEINLINE
bool fastdecode_boundscheck(const char *ptr, size_t len, const char *end) {
  const uintptr_t start = (uintptr_t)ptr;
  const uintptr_t stop = start + len;
  const bool wrapped = stop < start;
  const bool past_end = stop > (uintptr_t)end;
  return wrapped || past_end;
}
/* Fast-path parser for a string field whose length fits in one byte.
 *
 * Falls back to the generic decoder when the tag check fails.  The length
 * byte after the tag is read as a signed 8-bit value, so a byte with the high
 * bit set becomes negative and fails the bounds check.  On a failed bounds
 * check the destination keeps the data pointer with size 0 and the generic
 * decoder takes over.  Otherwise the destination views the payload in place
 * and parsing continues after it. */
UPB_FORCEINLINE
static const char *fastdecode_string(UPB_PARSE_PARAMS, int tagbytes,
                                     upb_card card) {
  if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) {
    RETURN_GENERIC("string field tag mismatch\n");
  }

  upb_strview *view = fastdecode_getfield(d, ptr, msg, &data, &hasbits,
                                          sizeof(upb_strview), card);
  const char *payload = ptr + tagbytes + 1;
  int64_t payload_len = (int8_t)ptr[tagbytes];

  view->data = payload;
  if (UPB_UNLIKELY(fastdecode_boundscheck(payload, payload_len, d->limit))) {
    view->size = 0;
    RETURN_GENERIC("string field len >1 byte\n");
  }
  view->size = payload_len;

  return fastdecode_dispatch(d, payload + payload_len, msg, table, hasbits);
}
const char *upb_pss_1bt(UPB_PARSE_PARAMS) {
return fastdecode_string(UPB_PARSE_ARGS, 1, CARD_s);
}
const char *upb_pos_1bt(UPB_PARSE_PARAMS) {
return fastdecode_string(UPB_PARSE_ARGS, 1, CARD_o);
}
const char *upb_pss_2bt(UPB_PARSE_PARAMS) {
return fastdecode_string(UPB_PARSE_ARGS, 2, CARD_s);
}
const char *upb_pos_2bt(UPB_PARSE_PARAMS) {
return fastdecode_string(UPB_PARSE_ARGS, 2, CARD_o);
}
/* message fields *************************************************************/
/* Decodes the length prefix of a submessage and parses its body.
 *
 * On entry `ptr` points just past the first length byte (it is read from
 * ptr[-1]).  The length may span up to 5 bytes; the 5th byte must be below 8.
 * A body that does not fit before `saved_limit` is a decode error.
 * Otherwise d->limit and d->fastlimit are narrowed to the body and parsing
 * continues into `msg` with the given `table`. */
UPB_NOINLINE static const char *fastdecode_lendelim_submsg(
    upb_decstate *d, const char *ptr, upb_msg *msg, intptr_t table,
    uint64_t hasbits, const char *saved_limit) {
  size_t size = (uint8_t)ptr[-1];

  if (UPB_UNLIKELY(size & 0x80)) {
    /* Adding (byte - 1) cancels the continuation bit that the previous byte
     * left at this group's bit 0. */
    int shift;
    for (shift = 7; shift < 28; shift += 7) {
      size_t next = (uint8_t)*ptr++;
      size += (next - 1) << shift;
      if (UPB_LIKELY((next & 0x80) == 0)) goto have_size;
    }

    size_t last = (uint8_t)*ptr++;
    /* len is limited by 2gb not 4gb, hence 8 and not 16 as normally expected
     * for a 32 bit varint. */
    if (UPB_UNLIKELY(last >= 8)) return fastdecode_err(d);
    size += (last - 1) << 28;
  }

have_size:
  if (UPB_UNLIKELY(fastdecode_boundscheck(ptr, size, saved_limit))) {
    return fastdecode_err(d);
  }

  d->limit = ptr + size;
  d->fastlimit = UPB_MIN(d->limit, d->fastend);
  return fastdecode_dispatch(d, ptr, msg, table, hasbits);
}
/* Fast-path parser for a length-delimited submessage field.
 *
 * tagbytes:       size of the field tag on the wire (1 or 2).
 * msg_ceil_bytes: size hint forwarded to decode_newmsg_ceil().
 * card:           cardinality of the field.
 *
 * Falls back to the generic decoder on a tag mismatch.  Decrements d->depth
 * and reports a decode error when it reaches 0.  The sub-layout is taken from
 * the parent layout's submsgs[] at the index stored in bits 16..31 of `data`.
 *
 * For CARD_r the function keeps consuming consecutive elements while the next
 * tag equals the current one, doubling the array capacity when it is full,
 * and writes arr->len before leaving.  Every submessage must end exactly at
 * its limit with d->end_group == 0, otherwise a decode error is reported.
 * d->limit, d->fastlimit and d->depth are restored on all non-error exits. */
UPB_FORCEINLINE
static const char *fastdecode_submsg(UPB_PARSE_PARAMS, int tagbytes,
                                     int msg_ceil_bytes, upb_card card) {
  if (UPB_UNLIKELY(!fastdecode_checktag(data, tagbytes))) {
    RETURN_GENERIC("submessage field tag mismatch\n");
  }

  if (--d->depth == 0) return fastdecode_err(d);

  upb_msg **submsg;
  upb_array *arr;
  void *end;
  uint32_t submsg_idx = data;
  submsg_idx >>= 16;
  const upb_msglayout *table_p = (void*)(table >> 8);
  const upb_msglayout *subl = table_p->submsgs[submsg_idx];
  /* Same packing as `table`: layout pointer above the low 8 mask bits. */
  intptr_t subt = (intptr_t)subl << 8 | subl->table_mask;
  size_t submsg_size = subl->size + sizeof(upb_msg_internal);
  submsg = fastdecode_getfield_ofs(d, ptr, msg, &data, &hasbits, &arr, &end,
                                   sizeof(upb_msg *), card, true);

  if (card == CARD_s) {
    /* Flush hasbits now; the child parse starts with its own empty set. */
    *(uint32_t*)msg |= hasbits >> 16;
    hasbits = 0;
  }

  const char *saved_limit = d->limit;
  const char *saved_fastlimit = d->fastlimit;

again:
  if (card == CARD_r) {
    if (UPB_UNLIKELY(submsg == end)) {
      /* Array is full: double its capacity. */
      size_t old_size = arr->size;
      size_t old_bytes = old_size * sizeof(upb_msg*);
      size_t new_size = old_size * 2;
      size_t new_bytes = new_size * sizeof(upb_msg*);
      char *old_ptr = _upb_array_ptr(arr);
      char *new_ptr = upb_arena_realloc(&d->arena, old_ptr, old_bytes, new_bytes);
      /* Do not install or write through a NULL buffer if growing failed. */
      if (UPB_UNLIKELY(!new_ptr)) return fastdecode_err(d);
      arr->size = new_size;
      /* log2 of the element size, derived the same way as in
       * fastdecode_getfield_ofs(). */
      arr->data = _upb_array_tagptr(
          new_ptr, __builtin_ctz((unsigned)sizeof(upb_msg*)));
      submsg = (void*)(new_ptr + (old_size * sizeof(upb_msg*)));
      end = (void*)(new_ptr + (new_size * sizeof(upb_msg*)));
    }
  }

  upb_msg* child = *submsg;

  /* Repeated elements always get a fresh message; other cardinalities reuse
   * an existing one. */
  if (card == CARD_r || UPB_LIKELY(!child)) {
    *submsg = child = decode_newmsg_ceil(d, submsg_size, msg_ceil_bytes);
  }

  ptr += tagbytes + 1;
  ptr = fastdecode_lendelim_submsg(d, ptr, child, subt, 0, saved_limit);

  if (UPB_UNLIKELY(ptr != d->limit || d->end_group != 0)) {
    return fastdecode_err(d);
  }

  if (card == CARD_r) {
    submsg++;
    if (UPB_LIKELY(ptr < saved_fastlimit)) {
      /* `data` holds the tag of the element just parsed (set by
       * fastdecode_getfield_ofs); loop while the next tag matches it. */
      uint32_t tag = fastdecode_load_tag(ptr);
      if (tagbytes == 1) {
        if ((uint8_t)tag == (uint8_t)data) goto again;
      } else {
        if ((uint16_t)tag == (uint16_t)data) goto again;
      }
      arr->len = submsg - (upb_msg**)_upb_array_ptr(arr);
      d->limit = saved_limit;
      d->fastlimit = saved_fastlimit;
      d->depth++;
      return fastdecode_tag_dispatch(d, ptr, msg, table, hasbits, tag);
    } else {
      if (ptr == saved_limit) {
        arr->len = submsg - (upb_msg**)_upb_array_ptr(arr);
        d->limit = saved_limit;
        d->fastlimit = saved_fastlimit;
        d->depth++;
        return ptr;
      }
      goto repeated_generic;
    }
  }

  d->limit = saved_limit;
  d->fastlimit = saved_fastlimit;
  d->depth++;
  return fastdecode_dispatch(d, ptr, msg, table, hasbits);

repeated_generic:
  arr->len = submsg - (upb_msg**)_upb_array_ptr(arr);
  d->limit = saved_limit;
  d->fastlimit = saved_fastlimit;
  d->depth++;
  RETURN_GENERIC("repeated generic");
}
I think this may have reached the optimization limit. ------------------------------------------------------------------------- Benchmark Time CPU Iterations ------------------------------------------------------------------------- BM_ArenaOneAlloc 21 ns 21 ns 32994231 BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005 BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s $ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap 2020-10-08 14:07:06 Running bazel-bin/benchmark Run on (72 X 3700 MHz CPU s) CPU Caches: L1 Data 32K (x36) L1 Instruction 32K (x36) L2 Unified 1024K (x36) L3 Unified 25344K (x2) ---------------------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------------------- BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap': 1,050.22 msec task-clock # 0.978 CPUs utilized 4 context-switches # 0.004 K/sec 0 cpu-migrations # 0.000 K/sec 179 page-faults # 0.170 K/sec 3,875,796,334 cycles # 3.690 GHz 13,282,835,967 instructions # 3.43 insn per cycle 2,887,725,848 branches # 2749.627 M/sec 8,324,912 branch-misses # 0.29% of all branches 1.073924364 seconds time elapsed 1.042806000 seconds user 0.008021000 seconds sys Profile: 23.96% benchmark benchmark [.] upb_prm_1bt_max192b 22.44% benchmark benchmark [.] fastdecode_dispatch 18.96% benchmark benchmark [.] upb_pss_1bt 14.20% benchmark benchmark [.] upb_psv4_1bt 8.33% benchmark benchmark [.] upb_prm_1bt_max64b 6.66% benchmark benchmark [.] upb_prm_1bt_max128b 1.29% benchmark benchmark [.] 
upb_psm_1bt_max64b 0.77% benchmark benchmark [.] fastdecode_generic 0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single 0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave 0.42% benchmark benchmark [.] upb_psm_1bt_max256b 0.31% benchmark benchmark [.] upb_psb1_1bt 0.21% benchmark benchmark [.] upb_plv4_5bv 0.14% benchmark benchmark [.] upb_psb1_2bt 0.12% benchmark benchmark [.] decode_longvarint64 0.08% benchmark [kernel.kallsyms] [k] vsnprintf 0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock 0.07% benchmark benchmark [.] _upb_msg_new 0.06% benchmark ld-2.31.so [.] check_match
4 years ago
#define F(card, tagbytes, size_ceil, ceil_arg) \
const char *upb_p##card##m_##tagbytes##bt_max##size_ceil##b( \
UPB_PARSE_PARAMS) { \
return fastdecode_submsg(UPB_PARSE_ARGS, tagbytes, ceil_arg, CARD_##card); \
}
I think this may have reached the optimization limit. ------------------------------------------------------------------------- Benchmark Time CPU Iterations ------------------------------------------------------------------------- BM_ArenaOneAlloc 21 ns 21 ns 32994231 BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005 BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s $ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap 2020-10-08 14:07:06 Running bazel-bin/benchmark Run on (72 X 3700 MHz CPU s) CPU Caches: L1 Data 32K (x36) L1 Instruction 32K (x36) L2 Unified 1024K (x36) L3 Unified 25344K (x2) ---------------------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------------------- BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap': 1,050.22 msec task-clock # 0.978 CPUs utilized 4 context-switches # 0.004 K/sec 0 cpu-migrations # 0.000 K/sec 179 page-faults # 0.170 K/sec 3,875,796,334 cycles # 3.690 GHz 13,282,835,967 instructions # 3.43 insn per cycle 2,887,725,848 branches # 2749.627 M/sec 8,324,912 branch-misses # 0.29% of all branches 1.073924364 seconds time elapsed 1.042806000 seconds user 0.008021000 seconds sys Profile: 23.96% benchmark benchmark [.] upb_prm_1bt_max192b 22.44% benchmark benchmark [.] fastdecode_dispatch 18.96% benchmark benchmark [.] upb_pss_1bt 14.20% benchmark benchmark [.] upb_psv4_1bt 8.33% benchmark benchmark [.] upb_prm_1bt_max64b 6.66% benchmark benchmark [.] upb_prm_1bt_max128b 1.29% benchmark benchmark [.] 
upb_psm_1bt_max64b 0.77% benchmark benchmark [.] fastdecode_generic 0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single 0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave 0.42% benchmark benchmark [.] upb_psm_1bt_max256b 0.31% benchmark benchmark [.] upb_psb1_1bt 0.21% benchmark benchmark [.] upb_plv4_5bv 0.14% benchmark benchmark [.] upb_psb1_2bt 0.12% benchmark benchmark [.] decode_longvarint64 0.08% benchmark [kernel.kallsyms] [k] vsnprintf 0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock 0.07% benchmark benchmark [.] _upb_msg_new 0.06% benchmark ld-2.31.so [.] check_match
4 years ago
#define SIZES(card, tagbytes) \
F(card, tagbytes, 64, 64) \
F(card, tagbytes, 128, 128) \
F(card, tagbytes, 192, 192) \
F(card, tagbytes, 256, 256) \
F(card, tagbytes, max, -1)
I think this may have reached the optimization limit. ------------------------------------------------------------------------- Benchmark Time CPU Iterations ------------------------------------------------------------------------- BM_ArenaOneAlloc 21 ns 21 ns 32994231 BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005 BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s $ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap 2020-10-08 14:07:06 Running bazel-bin/benchmark Run on (72 X 3700 MHz CPU s) CPU Caches: L1 Data 32K (x36) L1 Instruction 32K (x36) L2 Unified 1024K (x36) L3 Unified 25344K (x2) ---------------------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------------------- BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap': 1,050.22 msec task-clock # 0.978 CPUs utilized 4 context-switches # 0.004 K/sec 0 cpu-migrations # 0.000 K/sec 179 page-faults # 0.170 K/sec 3,875,796,334 cycles # 3.690 GHz 13,282,835,967 instructions # 3.43 insn per cycle 2,887,725,848 branches # 2749.627 M/sec 8,324,912 branch-misses # 0.29% of all branches 1.073924364 seconds time elapsed 1.042806000 seconds user 0.008021000 seconds sys Profile: 23.96% benchmark benchmark [.] upb_prm_1bt_max192b 22.44% benchmark benchmark [.] fastdecode_dispatch 18.96% benchmark benchmark [.] upb_pss_1bt 14.20% benchmark benchmark [.] upb_psv4_1bt 8.33% benchmark benchmark [.] upb_prm_1bt_max64b 6.66% benchmark benchmark [.] upb_prm_1bt_max128b 1.29% benchmark benchmark [.] 
upb_psm_1bt_max64b 0.77% benchmark benchmark [.] fastdecode_generic 0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single 0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave 0.42% benchmark benchmark [.] upb_psm_1bt_max256b 0.31% benchmark benchmark [.] upb_psb1_1bt 0.21% benchmark benchmark [.] upb_plv4_5bv 0.14% benchmark benchmark [.] upb_psb1_2bt 0.12% benchmark benchmark [.] decode_longvarint64 0.08% benchmark [kernel.kallsyms] [k] vsnprintf 0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock 0.07% benchmark benchmark [.] _upb_msg_new 0.06% benchmark ld-2.31.so [.] check_match
4 years ago
#define TAGBYTES(card) \
SIZES(card, 1) \
SIZES(card, 2)
I think this may have reached the optimization limit. ------------------------------------------------------------------------- Benchmark Time CPU Iterations ------------------------------------------------------------------------- BM_ArenaOneAlloc 21 ns 21 ns 32994231 BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005 BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s $ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap 2020-10-08 14:07:06 Running bazel-bin/benchmark Run on (72 X 3700 MHz CPU s) CPU Caches: L1 Data 32K (x36) L1 Instruction 32K (x36) L2 Unified 1024K (x36) L3 Unified 25344K (x2) ---------------------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------------------- BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap': 1,050.22 msec task-clock # 0.978 CPUs utilized 4 context-switches # 0.004 K/sec 0 cpu-migrations # 0.000 K/sec 179 page-faults # 0.170 K/sec 3,875,796,334 cycles # 3.690 GHz 13,282,835,967 instructions # 3.43 insn per cycle 2,887,725,848 branches # 2749.627 M/sec 8,324,912 branch-misses # 0.29% of all branches 1.073924364 seconds time elapsed 1.042806000 seconds user 0.008021000 seconds sys Profile: 23.96% benchmark benchmark [.] upb_prm_1bt_max192b 22.44% benchmark benchmark [.] fastdecode_dispatch 18.96% benchmark benchmark [.] upb_pss_1bt 14.20% benchmark benchmark [.] upb_psv4_1bt 8.33% benchmark benchmark [.] upb_prm_1bt_max64b 6.66% benchmark benchmark [.] upb_prm_1bt_max128b 1.29% benchmark benchmark [.] 
upb_psm_1bt_max64b 0.77% benchmark benchmark [.] fastdecode_generic 0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single 0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave 0.42% benchmark benchmark [.] upb_psm_1bt_max256b 0.31% benchmark benchmark [.] upb_psb1_1bt 0.21% benchmark benchmark [.] upb_plv4_5bv 0.14% benchmark benchmark [.] upb_psb1_2bt 0.12% benchmark benchmark [.] decode_longvarint64 0.08% benchmark [kernel.kallsyms] [k] vsnprintf 0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock 0.07% benchmark benchmark [.] _upb_msg_new 0.06% benchmark ld-2.31.so [.] check_match
4 years ago
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
I think this may have reached the optimization limit. ------------------------------------------------------------------------- Benchmark Time CPU Iterations ------------------------------------------------------------------------- BM_ArenaOneAlloc 21 ns 21 ns 32994231 BM_ArenaInitialBlockOneAlloc 6 ns 6 ns 116318005 BM_ParseDescriptorNoHeap 3028 ns 3028 ns 231138 2.34354GB/s BM_ParseDescriptor 3557 ns 3557 ns 196583 1.99498GB/s BM_ParseDescriptorProto2NoArena 33228 ns 33226 ns 21196 218.688MB/s BM_ParseDescriptorProto2WithArena 22863 ns 22861 ns 30666 317.831MB/s BM_SerializeDescriptorProto2 5444 ns 5444 ns 127368 1.30348GB/s BM_SerializeDescriptor 12509 ns 12508 ns 55816 580.914MB/s $ perf stat bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap 2020-10-08 14:07:06 Running bazel-bin/benchmark Run on (72 X 3700 MHz CPU s) CPU Caches: L1 Data 32K (x36) L1 Instruction 32K (x36) L2 Unified 1024K (x36) L3 Unified 25344K (x2) ---------------------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------------------- BM_ParseDescriptorNoHeap 3071 ns 3071 ns 227743 2.31094GB/s Performance counter stats for 'bazel-bin/benchmark --benchmark_filter=BM_ParseDescriptorNoHeap': 1,050.22 msec task-clock # 0.978 CPUs utilized 4 context-switches # 0.004 K/sec 0 cpu-migrations # 0.000 K/sec 179 page-faults # 0.170 K/sec 3,875,796,334 cycles # 3.690 GHz 13,282,835,967 instructions # 3.43 insn per cycle 2,887,725,848 branches # 2749.627 M/sec 8,324,912 branch-misses # 0.29% of all branches 1.073924364 seconds time elapsed 1.042806000 seconds user 0.008021000 seconds sys Profile: 23.96% benchmark benchmark [.] upb_prm_1bt_max192b 22.44% benchmark benchmark [.] fastdecode_dispatch 18.96% benchmark benchmark [.] upb_pss_1bt 14.20% benchmark benchmark [.] upb_psv4_1bt 8.33% benchmark benchmark [.] upb_prm_1bt_max64b 6.66% benchmark benchmark [.] upb_prm_1bt_max128b 1.29% benchmark benchmark [.] 
upb_psm_1bt_max64b 0.77% benchmark benchmark [.] fastdecode_generic 0.55% benchmark [kernel.kallsyms] [k] smp_call_function_single 0.42% benchmark [kernel.kallsyms] [k] _raw_spin_lock_irqsave 0.42% benchmark benchmark [.] upb_psm_1bt_max256b 0.31% benchmark benchmark [.] upb_psb1_1bt 0.21% benchmark benchmark [.] upb_plv4_5bv 0.14% benchmark benchmark [.] upb_psb1_2bt 0.12% benchmark benchmark [.] decode_longvarint64 0.08% benchmark [kernel.kallsyms] [k] vsnprintf 0.07% benchmark [kernel.kallsyms] [k] _raw_spin_lock 0.07% benchmark benchmark [.] _upb_msg_new 0.06% benchmark ld-2.31.so [.] check_match
4 years ago
#undef TAGBYTES
#undef SIZES
#undef F