diff --git a/Makefile b/Makefile index f988657f50..f2bd34e5a6 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,9 @@ STREAM= \ src/upb_msg.c \ src/upb_glue.c \ +ASMCORE= \ + src/upb_decoder_x64.asm + # Parts of core that are yet to be converted. OTHERSRC=src/upb_encoder.c src/upb_text.c @@ -114,8 +117,8 @@ LIBUPB=src/libupb.a LIBUPB_PIC=src/libupb_pic.a lib: $(LIBUPB) -OBJ=$(patsubst %.c,%.o,$(SRC)) -PICOBJ=$(patsubst %.c,%.lo,$(SRC)) +OBJ=$(patsubst %.c,%.o,$(SRC)) src/upb_decoder_x64.o +PICOBJ=$(patsubst %.c,%.lo,$(SRC)) src/upb_decoder_x64.lo $(LIBUPB): $(OBJ) $(E) AR $(LIBUPB) $(Q) ar rcs $(LIBUPB) $(OBJ) @@ -135,12 +138,19 @@ $(LIBUPB_PIC): $(PICOBJ) # critical path but gets very large when -O3 is used. src/upb_def.o: src/upb_def.c $(E) CC $< - $(Q) $(CC) $(CFLAGS) $(CPPFLAGS) -O0 -c -o $@ $< + $(Q) $(CC) $(CFLAGS) $(CPPFLAGS) -Os -c -o $@ $< src/upb_def.lo: src/upb_def.c $(E) 'CC -fPIC' $< - $(Q) $(CC) $(CFLAGS) $(CPPFLAGS) -O0 -c -o $@ $< -fPIC + $(Q) $(CC) $(CFLAGS) $(CPPFLAGS) -Os -c -o $@ $< -fPIC + +src/upb_decoder_x64.o: src/upb_decoder_x64.asm + $(E) NASM $< + $(Q) nasm -Ox src/upb_decoder_x64.asm -o src/upb_decoder_x64.o -f macho64 +src/upb_decoder_x64.lo: src/upb_decoder_x64.asm + $(E) NASM $< + $(Q) nasm -Ox src/upb_decoder_x64.asm -o src/upb_decoder_x64.lo -f macho64 # Function to expand a wildcard pattern recursively. rwildcard=$(strip $(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2)$(filter $(subst *,%,$2),$d))) diff --git a/src/upb.h b/src/upb.h index c5b4310b4e..0698748cdc 100644 --- a/src/upb.h +++ b/src/upb.h @@ -29,6 +29,16 @@ extern "C" { #define UPB_MIN(x, y) ((x) < (y) ? (x) : (y)) #define UPB_INDEX(base, i, m) (void*)((char*)(base) + ((i)*(m))) +INLINE void nop_printf(const char *fmt, ...) { + (void)fmt; +} + +#ifdef NDEBUG +#define DEBUGPRINTF nop_printf +#else +#define DEBUGPRINTF printf +#endif + // Rounds val up to the next multiple of align. INLINE size_t upb_align_up(size_t val, size_t align) { return val % align == 0 ? 
val : val + align - (val % align); diff --git a/src/upb_decoder.c b/src/upb_decoder.c index 44674407de..0e6866baa8 100644 --- a/src/upb_decoder.c +++ b/src/upb_decoder.c @@ -14,7 +14,7 @@ // If the return value is other than UPB_CONTINUE, that is what the last // callback returned. extern upb_flow_t upb_fastdecode(const char **p, const char *end, - upb_value_handler_t *value_cb, void *closure, + upb_value_handler_t value_cb, void *closure, void *table, int table_size); /* Pure Decoding **************************************************************/ @@ -131,6 +131,7 @@ INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } INLINE void upb_decoder_advance(upb_decoder *d, size_t len) { d->ptr += len; + //d->bytes_parsed_slow += len; } INLINE size_t upb_decoder_bufleft(upb_decoder *d) { @@ -214,6 +215,7 @@ INLINE bool upb_decode_tag(upb_decoder *d, upb_tag *tag) { if ((*(p++) & 0x80) == 0) goto done; // likely slow: // Decode a full varint starting over from ptr. + DEBUGPRINTF("tag was >2 bytes.\n"); if (!upb_decode_varint_slow(d, &val)) return false; tag_int = upb_value_getint64(val); p = d->ptr; // Trick the next line into not overwriting us. @@ -317,6 +319,8 @@ void upb_decoder_run(upb_src *src, upb_status *status) { d->end = NULL; // Force a buffer pull. d->submsg_end = (void*)0x1; // But don't let end-of-message get triggered. d->msgdef = d->top->msgdef; + //d->bytes_parsed_fast = 0; + //d->bytes_parsed_slow = 0; // TODO: handle UPB_SKIPSUBMSG #define CHECK_FLOW(expr) if ((expr) == UPB_BREAK) { /*assert(!upb_ok(status));*/ goto err; } @@ -335,11 +339,25 @@ void upb_decoder_run(upb_src *src, upb_status *status) { CHECK_FLOW(upb_pop(d)); } + //const char *ptr = d->ptr; // Decodes as many fields as possible, updating d->ptr appropriately, // before falling through to the slow(er) path. 
- //CHECK_FLOW(upb_fastdecode(&d->ptr, d->end, - // d->dispatcher->top->handlers.set->value, - // d->top->handlers.closure)); + const char *end = UPB_MIN(d->end, d->submsg_end); +#ifdef USE_ASSEMBLY_FASTPATH + CHECK_FLOW(upb_fastdecode(&d->ptr, end, + d->dispatcher.top->handlers.set->value, + d->dispatcher.top->handlers.closure, + d->top->msgdef->itof.array, + d->top->msgdef->itof.array_size)); + if (end - d->ptr < 12) { + DEBUGPRINTF("Off the fast path because <12 bytes of data\n"); + } else { + DEBUGPRINTF("Off the fast path for some other reason.\n"); + } +#endif + //if (d->ptr > ptr) { + // d->bytes_parsed_fast += (d->ptr - ptr); + //} // Parse/handle tag. upb_tag tag; @@ -348,6 +366,7 @@ void upb_decoder_run(upb_src *src, upb_status *status) { // Normal end-of-file. upb_clearerr(status); CHECK_FLOW(upb_dispatch_endmsg(&d->dispatcher)); + //DEBUGPRINTF("bytes parsed fast: %d, bytes parsed slow; %d\n", d->bytes_parsed_fast, d->bytes_parsed_slow); return; } else { if (status->code == UPB_EOF) { @@ -363,8 +382,10 @@ void upb_decoder_run(upb_src *src, upb_status *status) { upb_value val; switch (tag.wire_type) { case UPB_WIRE_TYPE_START_GROUP: + DEBUGPRINTF("parsing start group\n"); break; // Nothing to do now, below we will push appropriately. case UPB_WIRE_TYPE_END_GROUP: + DEBUGPRINTF("parsing end group\n"); if(d->top->end_offset != UPB_GROUP_END_OFFSET) { upb_seterr(status, UPB_ERROR, "Unexpected END_GROUP tag."); goto err; @@ -395,6 +416,8 @@ void upb_decoder_run(upb_src *src, upb_status *status) { } upb_fielddef *f = e->f; + assert(e->field_type == f->type); + assert(e->native_wire_type == upb_types[f->type].native_wire_type); if (tag.wire_type != e->native_wire_type) { // TODO: Support packed fields. @@ -416,11 +439,13 @@ void upb_decoder_run(upb_src *src, upb_status *status) { // doesn't check this, and it would slow us down, so pass for now. 
switch (e->field_type) { case UPB_TYPE(MESSAGE): + DEBUGPRINTF("parsing submessage\n"); case UPB_TYPE(GROUP): CHECK_FLOW(upb_push(d, f, val, e->field_type)); continue; // We have no value to dispatch. case UPB_TYPE(STRING): case UPB_TYPE(BYTES): + DEBUGPRINTF("parsing string\n"); CHECK(upb_decode_string(d, &val, &d->tmp)); break; case UPB_TYPE(SINT32): diff --git a/src/upb_decoder.h b/src/upb_decoder.h index 5662c9c3fb..d0848f8a2c 100644 --- a/src/upb_decoder.h +++ b/src/upb_decoder.h @@ -70,6 +70,8 @@ struct _upb_decoder { // Msgdef for the current level. upb_msgdef *msgdef; + + size_t bytes_parsed_fast, bytes_parsed_slow; }; // A upb_decoder decodes the binary protocol buffer format, writing the data it diff --git a/src/upb_decoder_x64.asm b/src/upb_decoder_x64.asm new file mode 100644 index 0000000000..17d1ef7b00 --- /dev/null +++ b/src/upb_decoder_x64.asm @@ -0,0 +1,219 @@ +DEFAULT REL ; Default to RIP-relative addressing instead of absolute. + +extern _upb_decode_varint_fast64 + +SECTION .data + +; Our dispatch table; used to jump to the right handler, keyed on the field's +; type. +dispatch_table: + dq _upb_fastdecode.cant_fast_path ; field not in table (type == 0). (check_4). 
+ dq _upb_fastdecode.fixed64 ; double + dq _upb_fastdecode.fixed32 ; float + dq _upb_fastdecode.varint ; int64 + dq _upb_fastdecode.varint ; uint64 + dq _upb_fastdecode.varint ; int32 + dq _upb_fastdecode.fixed64 ; fixed64 + dq _upb_fastdecode.fixed32 ; fixed32 + dq _upb_fastdecode.varint ; bool + dq _upb_fastdecode.cant_fast_path ; string (TODO) + dq _upb_fastdecode.cant_fast_path ; group (check_6) + dq _upb_fastdecode.cant_fast_path ; message + dq _upb_fastdecode.cant_fast_path ; bytes (TODO) + dq _upb_fastdecode.varint ; uint32 + dq _upb_fastdecode.varint ; enum + dq _upb_fastdecode.fixed32 ; sfixed32 + dq _upb_fastdecode.fixed64 ; sfixed64 + dq _upb_fastdecode.varint_sint32 ; sint32 + dq _upb_fastdecode.varint_sint64 ; sint64 + + GLOBAL _upb_decode_fast + +SECTION .text +; Register allocation. +%define BUF rbx ; const char *p, current buf position. +%define END rbp ; const char *end, where the buf ends (either submsg end or buf end) +%define BUF_ADDR r12 ; upb_decoder *d. +%define FIELDDEF r13 ; upb_fielddef *f, needs to be preserved across varint decoding call. +%define CALLBACK r14 +%define CLOSURE r15 + +; Stack layout: *tableptr, uint32_t maxfield_times_8 +%define STACK_SPACE 24 ; this value + 8 must be a multiple of 16. +%define TABLE_SPILL [rsp] ; our lookup table, indexed by field number. +%define MAXFIELD_TIMES_8_SPILL [rsp+8] + + +; Executing the fast path requires the following conditions: +; - check_1: there are >=12 bytes left (<=2 byte tag and <=10 byte varint). +; - check_2: the tag is <= 2 bytes. +; - check_3: the field number is <= the table size +; (ie. it must be an array lookup, not a hash lookup). +; - check_4: the field is known (found in the table). +; - check_5: the wire type we read is correct for the field number, +; ("packed" fields are not accepted, yet. this could be handled +; efficiently by doing an extra check on the "type check failed" +; path that goes into a tight loop if the encoding was packed). 
+; - check_6: the field is not a group or a message (or string, TODO) +; (this could be relaxed, but due to delegation it's a bit tricky). +; - if the value is a string, the entire string is available in +; the buffer, and our cached string object can be recycled. + + +%macro decode_and_dispatch_ 0 +align 16 +.decode_and_dispatch: + ; Load a few values we'll need in a sec. + mov r8, TABLE_SPILL + mov r9d, MAXFIELD_TIMES_8_SPILL + + mov rax, END + sub rax, BUF + cmp rax, 12 + jb _upb_fastdecode.cant_fast_path ; check_1 (<12 bytes left). + + ; Decode a 1 or 2-byte varint -> eax. + mov cl, byte [BUF] + lea rdi, [BUF+1] + movzx rax, cl ; Need all of rax since we're doing a 64-bit lea later. + and eax, 0x7f + test cl, cl + jns .one_byte_tag ; Should be predictable if fields are in order. + movzx ecx, byte [BUF+1] + lea rdi, [BUF+2] + mov edx, ecx + and edx, 0x7f + shl edx, 7 + or eax, edx + test al, al + js _upb_fastdecode.cant_fast_path ; check_2 (tag was >2 bytes). +.one_byte_tag: + mov BUF, rdi + + ; Decode tag and dispatch. + mov ecx, eax + and eax, 0x3ff8 ; eax now contains field number * 8 + lea r11, [r8+rax*2] ; *2 is really *16, since rax is already *8. + and ecx, 0x7 ; ecx now contains wire type. + cmp eax, r9d + jae _upb_fastdecode.cant_fast_path ; check_3 (field number > table size) + mov FIELDDEF, [r11+8] ; Lookup fielddef (upb_itof_ent.f) + movzx rdx, BYTE [r11+1] ; Lookup field type. + mov rax, qword dispatch_table + jmp [rax+rdx*8] +%endmacro + +%macro decode_and_dispatch 0 + jmp .decode_and_dispatch +%endmacro + +%macro call_callback 0 + ; Value arg must already be in rdx when macro is called. + mov rdi, CLOSURE + mov rsi, FIELDDEF + mov rcx, 33 ; RAW; we could pass the correct type, or only do this in non-debug modes. + call CALLBACK + mov [BUF_ADDR], BUF + cmp eax, 0 + jne .done ; Caller requested BREAK or SKIPSUBMSG. +%endmacro + +%macro check_type 1 + cmp ecx, %1 + jne _upb_fastdecode.cant_fast_path ; check_5 (wire type check failed). 
+%endmacro + +; extern upb_flow_t upb_fastdecode(const char **p, const char *end, +; upb_value_handler_t value_cb, void *closure, +; void *table, int table_size); +align 16 +global _upb_fastdecode +_upb_fastdecode: + ; We use all callee-save regs. + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp, STACK_SPACE + + ; Parse arguments into reg vals and stack. + mov BUF_ADDR, rdi + mov BUF, [rdi] + mov END, rsi + mov CALLBACK, rdx + mov CLOSURE, rcx + mov TABLE_SPILL, r8 + shl r9, 3 + mov MAXFIELD_TIMES_8_SPILL, r9 + + decode_and_dispatch + +align 16 +.varint: + call _upb_decode_varint_fast64 ; BUF is already in rdi. + test rax, rax + jz _upb_fastdecode.cant_fast_path ; Varint was unterminated, slow path will handle error. + mov BUF, rax + call_callback ; rdx already holds value. + decode_and_dispatch_ + +align 16 +.fixed32: + mov edx, DWORD [BUF] ; Might be unaligned, but that's ok. + add BUF, 4 + call_callback + decode_and_dispatch + +align 16 +.fixed64: + mov rdx, QWORD [BUF] ; Might be unaligned, but that's ok. + add BUF, 8 + call_callback + decode_and_dispatch + +align 16 +.varint_sint32: + call _upb_decode_varint_fast64 ; BUF is already in rdi. + test rax, rax + jz _upb_fastdecode.cant_fast_path ; Varint was unterminated, slow path will handle error. + mov BUF, rax + + ; Perform 32-bit zig-zag decoding. + mov ecx, edx + shr edx, 1 + and ecx, 0x1 + neg ecx + xor edx, ecx + call_callback + decode_and_dispatch + +align 16 +.varint_sint64: + call _upb_decode_varint_fast64 ; BUF is already in rdi. + test rax, rax + jz _upb_fastdecode.cant_fast_path ; Varint was unterminated, slow path will handle error. + mov BUF, rax + + ; Perform 64-bit zig-zag decoding. + mov rcx, rdx + shr rdx, 1 + and ecx, 0x1 + neg rcx + xor rdx, rcx + call_callback + decode_and_dispatch + +.cant_fast_path: + mov rax, 0 ; UPB_CONTINUE -- continue as before. +.done: + ; If coming via done, preserve the user callback's return in rax. 
+ add rsp, STACK_SPACE + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret diff --git a/src/upb_def.c b/src/upb_def.c index 01143da00b..6a1e543d50 100644 --- a/src/upb_def.c +++ b/src/upb_def.c @@ -615,7 +615,7 @@ static upb_flow_t upb_fielddef_endmsg(void *_b) { // Field was successfully read, add it as a field of the msgdef. upb_msgdef *m = upb_defbuilder_top(b); - upb_itof_ent itof_ent = {0, upb_types[f->type].native_wire_type, f->type, f}; + upb_itof_ent itof_ent = {0, f->type, upb_types[f->type].native_wire_type, f}; upb_ntof_ent ntof_ent = {{f->name, 0}, f}; upb_inttable_insert(&m->itof, f->number, &itof_ent); upb_strtable_insert(&m->ntof, &ntof_ent.e); diff --git a/src/upb_def.h b/src/upb_def.h index cca908bcab..db33fbb337 100644 --- a/src/upb_def.h +++ b/src/upb_def.h @@ -163,8 +163,8 @@ typedef struct _upb_msgdef { // Hash table entries for looking up fields by name or number. typedef struct { bool junk; - upb_wire_type_t native_wire_type; upb_fieldtype_t field_type; + upb_wire_type_t native_wire_type; upb_fielddef *f; } upb_itof_ent; typedef struct { diff --git a/src/upb_msg.h b/src/upb_msg.h index 8a3c63f296..3246971d80 100644 --- a/src/upb_msg.h +++ b/src/upb_msg.h @@ -97,11 +97,13 @@ INLINE upb_value upb_value_read(upb_valueptr ptr, upb_fieldtype_t ft) { INLINE void upb_value_write(upb_valueptr ptr, upb_value val, upb_fieldtype_t ft) { +#ifndef NDEBUG if (ft == UPB_VALUETYPE_ARRAY) { assert(val.type == UPB_VALUETYPE_ARRAY); - } else { + } else if (val.type != UPB_VALUETYPE_RAW) { assert(val.type == upb_types[ft].inmemory_type); } +#endif #define CASE(t, member_name) \ case UPB_TYPE(t): *ptr.member_name = val.val.member_name; break; diff --git a/src/upb_table.c b/src/upb_table.c index 9a6cf7a622..d2cb5dacb2 100644 --- a/src/upb_table.c +++ b/src/upb_table.c @@ -188,7 +188,7 @@ void upb_inttable_compact(upb_inttable *t) { while ((1UL << lg2_array) < largest_key) ++lg2_array; ++lg2_array; // Undo the first iteration. 
size_t array_size; - int array_count; + int array_count = 0; while (lg2_array > 0) { array_size = (1 << --lg2_array); //printf("Considering size %d (btw, our table has %d things total)\n", array_size, upb_inttable_count(t)); diff --git a/src/upb_table.h b/src/upb_table.h index 3903c757f6..c658a6e38d 100644 --- a/src/upb_table.h +++ b/src/upb_table.h @@ -123,18 +123,21 @@ INLINE void *_upb_inttable_fastlookup(upb_inttable *t, uint32_t key, upb_inttable_value *arrval = (upb_inttable_value*)UPB_INDEX(t->array, key, value_size); if (_upb_inttable_isarrkey(t, key)) { - //printf("array lookup for key %d, &val=%p, has_entry=%d\n", key, val, val->has_entry); + //DEBUGPRINTF("array lookup for key %d, &val=%p, has_entry=%d\n", key, val, val->has_entry); return (arrval->has_entry) ? arrval : NULL; } uint32_t bucket = _upb_inttable_bucket(t, key); upb_inttable_entry *e = (upb_inttable_entry*)UPB_INDEX(t->t.entries, bucket, entry_size); - //printf("looking in first bucket %d, entry size=%zd, addr=%p\n", bucket, entry_size, e); + //DEBUGPRINTF("looking in first bucket %d, entry size=%zd, addr=%p\n", bucket, entry_size, e); while (1) { - //printf("%d, %d, %d\n", e->val.has_entry, e->hdr.key, key); - if (e->hdr.key == key) return &e->val; + //DEBUGPRINTF("%d, %d, %d\n", e->val.has_entry, e->hdr.key, key); + if (e->hdr.key == key) { + DEBUGPRINTF("returning val from hash part\n"); + return &e->val; + } if ((bucket = e->hdr.next) == UPB_END_OF_CHAIN) return NULL; - //printf("looking in bucket %d\n", bucket); + //DEBUGPRINTF("looking in bucket %d\n", bucket); e = (upb_inttable_entry*)UPB_INDEX(t->t.entries, bucket, entry_size); } }