Protocol Buffers - Google's data interchange format (grpc依赖)
https://developers.google.com/protocol-buffers/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1134 lines
32 KiB
1134 lines
32 KiB
|// |
|
|// upb - a minimalist implementation of protocol buffers. |
|
|// |
|
|// Copyright (c) 2011-2013 Google Inc. See LICENSE for details. |
|
|// Author: Josh Haberman <jhaberman@gmail.com> |
|
|// |
|
|// JIT compiler for upb_pbdecoder on x86-64. Generates machine code from the |
|
|// bytecode generated in compile_decoder.c. |
|
| |
|
// DynASM setup: selects the x64 target and names the symbols that the
// generated action list, global-label enum prefix and global-label name table
// will use.
|.arch x64 |
|
|.actionlist upb_jit_actionlist |
|
|.globals UPB_JIT_GLOBAL_ |
|
|.globalnames upb_jit_globalnames |
|
| |
|
|// Calling conventions. Note -- this will need to be changed for |
|
|// Windows, which uses a different calling convention! |
|
// The aliases below name the registers used to pass arguments to external C
// functions: five integer argument registers (rdi, rsi, rdx, rcx, r8) and one
// floating-point argument register (xmm0).  The _8/_32/_64 suffix selects the
// operand width of the same underlying register.
|.define ARG1_64, rdi |
|
|.define ARG2_8, r6b // DynASM's equivalent to "sil" -- low byte of esi. |
|
|.define ARG2_32, esi |
|
|.define ARG2_64, rsi |
|
|.define ARG3_8, dl |
|
|.define ARG3_32, edx |
|
|.define ARG3_64, rdx |
|
|.define ARG4_64, rcx |
|
|.define ARG5_64, r8 |
|
|.define XMMARG1, xmm0 |
|
| |
|
|// Register allocation / type map. |
|
|// ALL of the code in this file uses these register allocations. |
|
|// When we "call" within this file, we do not use regular calling |
|
|// conventions, but of course when calling to user callbacks we must. |
|
// "unsynced" registers hold values that may be newer than the copies in
// memory; the commit_regs / load_regs macros move them to and from memory.
|.define PTR, rbx // DECODER->ptr (unsynced) |
|
|.define DATAEND, r12 // DECODER->data_end (unsynced) |
|
|.define CLOSURE, r13 // FRAME->closure (unsynced) |
|
|.type FRAME, upb_pbdecoder_frame, r14 // DECODER->top (unsynced) |
|
|.type DECODER, upb_pbdecoder, r15 // DECODER (immutable) |
|
// DELIMEND is written to DECODER->delim_end by commit_regs and rebuilt from
// FRAME->end_ofs by load_regs, so it is also an unsynced register.
|.define DELIMEND, rbp |
|
| |
|
| // Spills unsynced registers back to memory. |
|
// Stores FRAME, PTR, DATAEND, DELIMEND and CLOSURE into the decoder/frame
// structs.  DELIMEND is additionally converted in place from a buffer pointer
// to a stream offset (DELIMEND - buf + bufstart_ofs) before it is stored in
// FRAME->end_ofs, so the DELIMEND register no longer holds a pointer once
// this macro has run; load_regs performs the inverse conversion.
|.macro commit_regs |
|
| mov DECODER->top, FRAME |
|
| mov DECODER->ptr, PTR |
|
| mov DECODER->data_end, DATAEND |
|
| // We don't guarantee that delim_end is NULL when out of range like the |
|
| // interpreter does. |
|
| mov DECODER->delim_end, DELIMEND |
|
| sub DELIMEND, DECODER->buf |
|
| add DELIMEND, DECODER->bufstart_ofs |
|
| mov FRAME->end_ofs, DELIMEND |
|
| mov FRAME->sink.closure, CLOSURE |
|
|.endmacro |
|
| |
|
| // Loads unsynced registers from memory back into registers. |
|
// Reloads FRAME, PTR, DATAEND and CLOSURE, then rebuilds DELIMEND as a buffer
// pointer from the stream offset in FRAME->end_ofs
// (end_ofs - bufstart_ofs + buf).  FRAME must be loaded first because CLOSURE
// and DELIMEND are read through it.
|.macro load_regs |
|
| mov FRAME, DECODER->top |
|
| mov PTR, DECODER->ptr |
|
| mov DATAEND, DECODER->data_end |
|
| mov CLOSURE, FRAME->sink.closure |
|
| mov DELIMEND, FRAME->end_ofs |
|
| sub DELIMEND, DECODER->bufstart_ofs |
|
| add DELIMEND, DECODER->buf |
|
|.endmacro |
|
| |
|
| // Calls an external C function at address "addr". |
|
// The target address is loaded into rax (so rax is clobbered before the call
// and holds the callee's return value afterwards).  r12 is saved on the stack
// and then used to remember the unaligned rsp across the call; both are
// restored afterwards, so r12 (DATAEND) is preserved.
|.macro callp, addr |
|
| mov64 rax, (uintptr_t)addr |
|
| |
|
| // Stack must be 16-byte aligned (x86-64 ABI requires this). |
|
| // |
|
| // OPT: possibly remove this by statically ensuring correct alignment. |
|
| // |
|
| // OPT: use "call rel32" where possible. |
|
| push r12 |
|
| mov r12, rsp |
|
| and rsp, 0xfffffffffffffff0UL // Align stack. |
|
| call rax |
|
| mov rsp, r12 |
|
| pop r12 |
|
|.endmacro |
|
| |
|
// Loads the compile-time constant "val" into ARG2_64, choosing the
// instruction at code-generation time: mov64 when the value does not fit in
// 32 bits, a 32-bit mov into ARG2_32 for other non-zero values, and
// "xor ARG2_32, ARG2_32" for zero.
|.macro ld64, val |
|
|| { |
|
|| uintptr_t v = (uintptr_t)val; |
|
|| if (v > 0xffffffff) { |
|
| mov64 ARG2_64, v |
|
|| } else if (v) { |
|
| mov ARG2_32, v |
|
|| } else { |
|
| xor ARG2_32, ARG2_32 |
|
|| } |
|
|| } |
|
|.endmacro |
|
| |
|
// Loads the result of upb_handlers_gethandlerdata(h, arg) -- evaluated at
// code-generation time -- into ARG2_64 via ld64.  Callers use this to set up
// the second ("hd") argument before invoking a handler with callp.
|.macro load_handler_data, h, arg |
|
| ld64 upb_handlers_gethandlerdata(h, arg) |
|
|.endmacro |
|
| |
|
// Branches to "target" when fewer than "bytes" bytes are available between
// PTR and DATAEND.  For bytes == 1 this is a plain PTR == DATAEND test;
// otherwise the remaining length is computed in rcx (which is clobbered) and
// compared unsigned against "bytes".
|.macro chkeob, bytes, target |
|
|| if (bytes == 1) { |
|
| cmp PTR, DATAEND |
|
| je target |
|
|| } else { |
|
| mov rcx, DATAEND |
|
| sub rcx, PTR |
|
| cmp rcx, bytes |
|
| jb target |
|
|| } |
|
|.endmacro |
|
| |
|
// Inverse of chkeob: branches to "target" when at least "bytes" bytes are
// available between PTR and DATAEND.  Clobbers rcx when bytes != 1.
|.macro chkneob, bytes, target |
|
|| if (bytes == 1) { |
|
| cmp PTR, DATAEND |
|
| jne target |
|
|| } else { |
|
| mov rcx, DATAEND |
|
| sub rcx, PTR |
|
| cmp rcx, bytes |
|
| jae target |
|
|| } |
|
|.endmacro |
|
|
|
// Sets bit number "hasbit" in the byte array starting at address "reg"
// (byte index hasbit / 8, bit index hasbit % 8).  Emits no code when hasbit
// is negative.
|.macro sethas, reg, hasbit |
|
|| if (hasbit >= 0) { |
|
| or byte [reg + ((uint32_t)hasbit / 8)], (1 << ((uint32_t)hasbit % 8)) |
|
|| } |
|
|.endmacro |
|
| |
|
| // Decodes 32-bit varint into rdx, inlining 1 byte. |
|
// Only a single-byte varint (no continuation bit) is handled inline.  If the
// buffer is empty or the first byte has its high bit set, control goes to
// ->decodev32_fallback, which leaves PTR one byte before the end of the
// value so that the shared "add PTR, 1" at label 8 finishes the advance.
// Uses local labels 7 and 8.
|.macro dv32 |
|
| chkeob 1, >7 |
|
| movzx edx, byte [PTR] |
|
| test dl, dl |
|
| jns >8 |
|
|7: |
|
| call ->decodev32_fallback |
|
|8: |
|
| add PTR, 1 |
|
|.endmacro |
|
|
|
// Status value that ->checktag_fallback returns in eax when PTR has reached
// DELIMEND after a resume; jittag compares eax against it to branch to the
// delimited-region end target.
#define DECODE_EOF -3 |
|
|
|
// Returns the handler registered in "h" for selector "sel".  A NULL handlers
// object is treated as "no handler" and yields NULL.
static upb_func *gethandler(const upb_handlers *h, upb_selector_t sel) {
  if (!h) {
    return NULL;
  }
  return upb_handlers_gethandler(h, sel);
}
|
|
|
// Defines an "assembly label" for the current code generation offset. |
|
// This label exists *purely* for debugging purposes: it is emitted into |
|
// the .so, and printed as part of JIT debugging output when UPB_JIT_LOAD_SO is |
|
// defined. |
|
// |
|
// We would define this in the .c file except that it conditionally defines a |
|
// pclabel. |
|
static void asmlabel(jitcompiler *jc, const char *fmt, ...) { |
|
#ifndef NDEBUG |
|
int ofs = jc->dynasm->section->ofs; |
|
assert(ofs != jc->lastlabelofs); |
|
jc->lastlabelofs = ofs; |
|
#endif |
|
|
|
#ifndef UPB_JIT_LOAD_SO |
|
UPB_UNUSED(jc); |
|
UPB_UNUSED(fmt); |
|
#else |
|
va_list args; |
|
va_start(args, fmt); |
|
char *str = upb_vasprintf(fmt, args); |
|
va_end(args); |
|
|
|
int pclabel = alloc_pclabel(jc); |
|
// Normally we would prefer to allocate this inline with the codegen, |
|
// ie. |
|
// |=>asmlabel(...) |
|
// But since we do this conditionally, only when UPB_JIT_LOAD_SO is defined, |
|
// we do it here instead. |
|
|=>pclabel: |
|
upb_inttable_insert(&jc->asmlabels, pclabel, upb_value_ptr(str)); |
|
#endif |
|
} |
|
|
|
// Should only be called when the associated handler is known to exist. |
|
static bool alwaysok(const upb_handlers *h, upb_selector_t sel) { |
|
upb_handlerattr attr = UPB_HANDLERATTR_INITIALIZER; |
|
bool ok = upb_handlers_getattr(h, sel, &attr); |
|
UPB_ASSERT_VAR(ok, ok); |
|
bool ret = upb_handlerattr_alwaysok(&attr); |
|
upb_handlerattr_uninit(&attr); |
|
return ret; |
|
} |
|
|
|
// Emit static assembly routines; code that does not vary based on the message |
|
// schema. Since it's not input-dependent, we only need one single copy of it. |
|
// For the moment we generate a single copy per generated handlers. Eventually |
|
// we should generate this code at compile time and link it into the binary so |
|
// we have one copy total. To do that we'll want to be sure that it is within |
|
// 2GB of our JIT code, so that branches between the two are near (rel32). |
|
// |
|
// We'd put this assembly in a .s file directly, but DynASM's ability to |
|
// calculate structure offsets automatically is too useful to pass up (it's way |
|
// more convenient to write DECODER->sink than [rbx + 0x96], especially since |
|
// the latter would have to be changed whenever the structure is updated). |
|
//
// Routines emitted here, each under its own DynASM global label: enterjit,
// exitjit, suspend, pushlendelim, getvalue_slow, parse_unknown, the f32/f64
// and varint decode/skip fallbacks, decode_unknown_tag_fallback,
// decode_varint_slow, checktag_fallback and hashlookup.
static void emit_static_asm(jitcompiler *jc) { |
|
| // Trampolines for entering/exiting the JIT. These are a bit tricky to |
|
| // support full resuming; when we suspend we copy the JIT's portion of |
|
| // the call stack into the upb_pbdecoder and restore it when we resume. |
|
asmlabel(jc, "enterjit"); |
|
|->enterjit: |
|
|1: |
|
| push rbp |
|
| push r15 |
|
| push r14 |
|
| push r13 |
|
| push r12 |
|
| push rbx |
|
| |
|
| mov rbx, ARG2_64 // Preserve JIT method. |
|
| |
|
| mov DECODER, rdi |
|
| callp upb_pbdecoder_resume // Same args as us; reuse regs. |
|
| test eax, eax |
|
// A non-negative result from upb_pbdecoder_resume is returned to our caller
// as-is (via the epilogue at the next label 1) without running JIT code.
| jns >1 |
|
// saved_rsp marks the base of the JIT's portion of the stack; ->exitjit
// copies everything between the current rsp and this value.
| mov DECODER->saved_rsp, rsp |
|
| mov rax, rbx |
|
| load_regs |
|
| |
|
| // Test whether we have a saved stack to resume. |
|
| mov ARG3_64, DECODER->call_len |
|
| test ARG3_64, ARG3_64 |
|
| jnz >2 |
|
| |
|
| call rax |
|
| |
|
| mov rax, DECODER->size_param |
|
| mov qword DECODER->call_len, 0 |
|
|1: |
|
| pop rbx |
|
| pop r12 |
|
| pop r13 |
|
| pop r14 |
|
| pop r15 |
|
| pop rbp |
|
| ret |
|
| |
|
|2: |
|
| // Resume decoder. |
|
// Copies the saved call stack (call_len bytes, still in ARG3_64) back to
// just below the current rsp.  The final "ret" then pops the return address
// that was on top of the stack when ->exitjit saved it.
| lea ARG2_64, DECODER->callstack |
|
| sub rsp, ARG3_64 |
|
| mov ARG1_64, rsp |
|
| callp memcpy // Restore stack. |
|
| ret // Return to resumed function (not ->enterjit caller). |
|
| |
|
| // Other code can call this to suspend the JIT. |
|
| // To the calling code, it will appear that the function returns when |
|
| // the JIT resumes, and more buffer space will be available. |
|
| // Args: eax=the value that decode() should return. |
|
asmlabel(jc, "exitjit"); |
|
|->exitjit: |
|
| // Save the stack into DECODER->callstack. |
|
| lea ARG1_64, DECODER->callstack |
|
| mov ARG2_64, rsp |
|
| mov ARG3_64, DECODER->saved_rsp |
|
| sub ARG3_64, rsp |
|
| mov DECODER->call_len, ARG3_64 // Preserve len for next resume. |
|
| mov ebx, eax // Preserve return value across memcpy. |
|
| callp memcpy // Copy stack into decoder. |
|
| mov eax, ebx // This will be our return value. |
|
| |
|
| // Must NOT do this before the memcpy(), otherwise memcpy() will |
|
| // clobber the stack we are trying to save! |
|
| mov rsp, DECODER->saved_rsp |
|
// These pops mirror the pushes at the top of ->enterjit (saved_rsp was
// recorded right after them), so the "ret" returns to ->enterjit's caller.
| pop rbx |
|
| pop r12 |
|
| pop r13 |
|
| pop r14 |
|
| pop r15 |
|
| pop rbp |
|
| ret |
|
| |
|
| // Like suspend() in the C decoder, except that the function appears |
|
| // (from the caller's perspective) not to return until the decoder is |
|
| // resumed. |
|
asmlabel(jc, "suspend"); |
|
|->suspend: |
|
// The checkpoint is only overwritten when PTR differs from DECODER->ptr; the
// fallback routines below set DECODER->ptr = PTR to signal that the
// checkpoint they recorded is already current.
| cmp DECODER->ptr, PTR |
|
| je >1 |
|
| mov DECODER->checkpoint, PTR |
|
|1: |
|
| commit_regs |
|
| mov rdi, DECODER |
|
| callp upb_pbdecoder_suspend |
|
| jmp ->exitjit |
|
| |
|
// Reads a length prefix with dv32, pushes a new frame whose delimited region
// ends "length" bytes past PTR, and clamps DATAEND to that region when it
// lies inside the current buffer.
asmlabel(jc, "pushlendelim"); |
|
|->pushlendelim: |
|
|1: |
|
| mov FRAME->sink.closure, CLOSURE |
|
| mov DECODER->checkpoint, PTR |
|
| dv32 |
|
| mov rcx, DELIMEND |
|
| sub rcx, PTR |
|
| sub rcx, rdx |
|
// NOTE(review): no "->err" global label is defined anywhere in this
// function -- confirm where this branch target is expected to come from.
| jb ->err // Len is greater than enclosing message. |
|
| mov FRAME->end_ofs, rcx |
|
| add FRAME, sizeof(upb_pbdecoder_frame) |
|
| mov DELIMEND, PTR |
|
| add DELIMEND, rdx |
|
| cmp FRAME, DECODER->limit |
|
| je >3 // Stack overflow |
|
| mov dword FRAME->groupnum, 0 |
|
| test rcx, rcx |
|
| jz >2 |
|
| mov DATAEND, DECODER->end |
|
| cmp PTR, DELIMEND |
|
| ja >2 |
|
| cmp DELIMEND, DATAEND |
|
| ja >2 |
|
| mov DATAEND, DELIMEND // If DELIMEND >= PTR && DELIMEND < DATAEND |
|
|2: |
|
| ret |
|
|3: |
|
| // Error -- call seterr. |
|
| mov PTR, DECODER->checkpoint // Rollback to before the delim len. |
|
| // Prepare seterr args. |
|
| mov ARG1_64, DECODER |
|
| ld64 kPbDecoderStackOverflow |
|
| callp upb_pbdecoder_seterr |
|
| call ->suspend |
|
| jmp <1 |
|
| |
|
| // For getting a value that spans a buffer seam. Falls back to C. |
|
| // Args: rdi=C decoding function (prototype: int f(upb_pbdecoder*, void*)) |
|
asmlabel(jc, "getvalue_slow"); |
|
|->getvalue_slow: |
|
| sub rsp, 16 // Stack is [8-byte value, 8-byte func pointer] |
|
| mov [rsp + 8], rdi // Need to preserve fptr across suspends. |
|
|1: |
|
| mov qword [rsp], 0 // For parsing routines that only parse 32 bits. |
|
| mov ARG1_64, DECODER |
|
| mov ARG2_64, rsp |
|
| mov DECODER->checkpoint, PTR |
|
| commit_regs |
|
| call aword [rsp + 8] |
|
| load_regs |
|
| test eax, eax |
|
// A negative result means the value was parsed.  Any other result is handed
// to ->exitjit, and the parse is retried from label 1 once the JIT resumes.
| jns >2 |
|
| // Success; return parsed data (in rdx AND xmm0). |
|
| mov rdx, [rsp] |
|
| movsd xmm0, qword [rsp] |
|
| add rsp, 16 |
|
| ret |
|
|2: |
|
| call ->exitjit // Return eax from decode function. |
|
| jmp <1 |
|
| |
|
asmlabel(jc, "parse_unknown"); |
|
| // Args: edx=fieldnum, cl=wire type |
|
// Returns eax = DECODE_ENDGROUP when upb_pbdecoder_skipunknown reports an end
// group, otherwise eax = 0 (after suspending through ->exitjit if the result
// was not DECODE_OK).
|->parse_unknown: |
|
| // OPT: handle directly instead of kicking to C. |
|
| // Check for ENDGROUP. |
|
| mov ARG1_64, DECODER |
|
| mov ARG2_32, edx |
|
| movzx ARG3_32, cl |
|
| commit_regs |
|
| callp upb_pbdecoder_skipunknown |
|
| load_regs |
|
| cmp eax, DECODE_ENDGROUP |
|
| jne >1 |
|
| ret // Return eax=DECODE_ENDGROUP, not zero |
|
|1: |
|
| cmp eax, DECODE_OK |
|
| je >1 |
|
| call ->exitjit // Return eax from decode function. |
|
|1: |
|
| xor eax, eax |
|
| ret |
|
| |
|
| // Fallback functions for parsing single values. These are used when the |
|
| // buffer doesn't contain enough remaining data for the fast path. Each |
|
| // primitive type (v32, v64, f32, f64) has two functions: decode & skip. |
|
| // Decode functions return their value in rsi/esi. |
|
| // |
|
| // These functions leave PTR = value_end - fast_path_bytes, so that we can |
|
| // re-join the fast path which will add fast_path_bytes after the callback |
|
| // completes. We also set DECODER->ptr to this value which is a signal to |
|
| // ->suspend that DECODER->checkpoint is up to date. |
|
// NOTE(review): the comment above says values come back in rsi/esi, but
// ->getvalue_slow loads the parsed value into rdx and xmm0 -- confirm which
// description is current.
asmlabel(jc, "skip_decode_f32_fallback"); |
|
|->skipf32_fallback: |
|
|->decodef32_fallback: |
|
| mov64 rdi, (uintptr_t)upb_pbdecoder_decode_f32 |
|
| call ->getvalue_slow |
|
| sub PTR, 4 |
|
| mov DECODER->ptr, PTR |
|
| ret |
|
| |
|
asmlabel(jc, "skip_decode_f64_fallback"); |
|
|->skipf64_fallback: |
|
|->decodef64_fallback: |
|
| mov64 rdi, (uintptr_t)upb_pbdecoder_decode_f64 |
|
| call ->getvalue_slow |
|
| sub PTR, 8 |
|
| mov DECODER->ptr, PTR |
|
| ret |
|
| |
|
| // Called for varint >= 1 byte. |
|
asmlabel(jc, "skip_decode_v32_fallback"); |
|
|->skipv32_fallback: |
|
|->skipv64_fallback: |
|
| chkeob 16, >1 |
|
| // With at least 16 bytes left, we can do a branch-less SSE version. |
|
| movdqu xmm0, [PTR] |
|
| pmovmskb eax, xmm0 // bits 0-15 are continuation bits, 16-31 are 0. |
|
// After inversion, the lowest set bit is the index of the first byte without
// a continuation bit, i.e. the last byte of the varint.
| not eax |
|
| bsf eax, eax |
|
| cmp al, 10 |
|
| jae ->decode_varint_slow // Error (>10 byte varint). |
|
| add PTR, rax // bsf result is 0-based, so PTR=end-1, as desired. |
|
| ret |
|
| |
|
|1: |
|
| // With fewer than 16 bytes, we have to read byte by byte. |
|
| lea rcx, [PTR + 10] |
|
| mov rax, PTR // Preserve PTR in case of fallback to slow path. |
|
| cmp rcx, DATAEND |
|
| cmova rcx, DATAEND // rcx = MIN(DATAEND, PTR + 10) |
|
|2: |
|
| cmp rax, rcx |
|
| je ->decode_varint_slow |
|
| test byte [rax], 0x80 |
|
| jz >3 |
|
| add rax, 1 |
|
| jmp <2 |
|
|3: |
|
| mov PTR, rax // PTR = varint_end - 1, as desired |
|
| ret |
|
| |
|
| // Returns tag in edx |
|
// Also returns eax = 0 when PTR == DELIMEND (no tag was read); on success
// eax holds the negative result of upb_pbdecoder_decode_varint_slow, which
// the caller in jitdispatch distinguishes with "test eax, eax".
asmlabel(jc, "decode_unknown_tag_fallback"); |
|
|->decode_unknown_tag_fallback: |
|
| sub rsp, 16 |
|
|1: |
|
| cmp PTR, DELIMEND |
|
| jne >2 |
|
| add rsp, 16 |
|
| xor eax, eax |
|
| ret |
|
|2: |
|
| // OPT: Have a medium-fast path before falling back to _slow. |
|
| mov ARG1_64, DECODER |
|
| mov ARG2_64, rsp |
|
| commit_regs |
|
| callp upb_pbdecoder_decode_varint_slow |
|
| load_regs |
|
| cmp eax, 0 |
|
| jge >3 |
|
| mov edx, [rsp] // Success; return parsed data. |
|
| add rsp, 16 |
|
| ret |
|
|3: |
|
| call ->exitjit // Return eax from decode function. |
|
| jmp <1 |
|
| |
|
| // Called for varint >= 1 byte. |
|
asmlabel(jc, "decode_v32_v64_fallback"); |
|
|->decodev32_fallback: |
|
|->decodev64_fallback: |
|
| chkeob 10, ->decode_varint_slow |
|
| // OPT: do something faster than just calling the C version. |
|
| mov rdi, PTR |
|
| callp upb_vdecode_fast |
|
| test rax, rax |
|
| je ->decode_varint_slow // Unterminated varint. |
|
// rax is treated as the end-of-varint pointer; PTR is left one byte before
// it so the caller's "add PTR, 1" lands on the end.
// NOTE(review): the decoded value is presumably returned in rdx by
// upb_vdecode_fast (second return register) -- confirm against its
// declaration.
| mov PTR, rax |
|
| sub PTR, 1 |
|
| mov DECODER->ptr, PTR |
|
| ret |
|
| |
|
asmlabel(jc, "decode_varint_slow"); |
|
|->decode_varint_slow: |
|
| // Slow path: end of buffer or error (varint length >= 10). |
|
| mov64 rdi, (uintptr_t)upb_pbdecoder_decode_varint_slow |
|
| call ->getvalue_slow |
|
| sub PTR, 1 |
|
| mov DECODER->ptr, PTR |
|
| ret |
|
| |
|
| // Args: rsi=expected tag, return=rax (DECODE_{OK,MISMATCH}) |
|
// May additionally return DECODE_EOF when, after a resume, PTR has reached
// DELIMEND.  The expected tag is kept on the stack so it can be reloaded into
// rsi before each retry.
asmlabel(jc, "checktag_fallback"); |
|
|->checktag_fallback: |
|
| sub rsp, 8 |
|
| mov [rsp], rsi // Preserve expected tag. |
|
|1: |
|
| mov ARG1_64, DECODER |
|
| commit_regs |
|
| mov DECODER->checkpoint, PTR |
|
| callp upb_pbdecoder_checktag_slow |
|
| load_regs |
|
| cmp eax, 0 |
|
| jge >2 |
|
| add rsp, 8 |
|
| ret |
|
|2: |
|
| call ->exitjit |
|
| mov rsi, [rsp] |
|
| cmp PTR, DELIMEND |
|
| jne <1 |
|
| mov eax, DECODE_EOF |
|
| add rsp, 8 |
|
| ret |
|
| |
|
| // Args: rsi=upb_inttable, rdx=key, return=rax (-1 if not found). |
|
| // Preserves: rcx, rdx |
|
| // OPT: Could write this in assembly if it's a hotspot. |
|
asmlabel(jc, "hashlookup"); |
|
|->hashlookup: |
|
| push rcx |
|
| push rdx |
|
| sub rsp, 16 |
|
| mov rdi, rsi |
|
| mov rsi, rdx |
|
| mov rdx, rsp |
|
| callp upb_inttable_lookup |
|
| add rsp, 16 |
|
| pop rdx |
|
| pop rcx |
|
| test al, al |
|
| jz >2 // Unknown field. |
|
// The lookup result was written at what was then the top of the stack; after
// releasing 16 bytes and popping two registers it sits 32 bytes below rsp.
// NOTE(review): this reads memory below the stack pointer -- presumably it
// relies on that area not being overwritten in between; confirm.
| mov rax, [rsp-32] // Value from table. |
|
| ret |
|
|2: |
|
| xor rax, rax |
|
| not rax |
|
| ret |
|
} |
|
|
|
// Emits code that parses one primitive value for bytecode opcode "op".
// If a handler is registered for selector "sel" in "h", the value is decoded
// and then either stored directly at CLOSURE + offset (when
// upb_shim_getdata() returns shim data) or passed to the handler.  If no
// handler is registered, the emitted code only skips over the value.
//
// Local labels in the handler path: 1 = retry point after a suspend,
// 2 = slow path (fallback call), 3 = fast path, 4 = second-stage decode,
// 5 = continue after a successful handler call.
static void jitprimitive(jitcompiler *jc, opcode op, |
|
const upb_handlers *h, upb_selector_t sel) { |
|
typedef enum { V32, V64, F32, F64, X } valtype_t; |
|
// Maps an opcode value to the wire-level value class it parses.
// NOTE(review): types[op] and fastpath_bytes[type] below are not bounds
// checked; fastpath_bytes has no entry for X, so this presumably relies on
// callers only passing OP_PARSE_* opcodes -- confirm.
static valtype_t types[] = { |
|
X, F64, F32, V64, V64, V32, F64, F32, V64, X, X, X, X, V32, V32, F32, F64, |
|
V32, V64 }; |
|
// Bytes consumed by the fast path for V32, V64, F32 and F64 respectively.
static char fastpath_bytes[] = { 1, 1, 4, 8 }; |
|
const valtype_t type = types[op]; |
|
const int fastbytes = fastpath_bytes[type]; |
|
upb_func *handler = gethandler(h, sel); |
|
|
|
if (handler) { |
|
|1: |
|
| chkneob fastbytes, >3 |
|
|2: |
|
switch (type) { |
|
case V32: |
|
| call ->decodev32_fallback |
|
break; |
|
case V64: |
|
| call ->decodev64_fallback |
|
break; |
|
case F32: |
|
| call ->decodef32_fallback |
|
break; |
|
case F64: |
|
| call ->decodef64_fallback |
|
break; |
|
case X: break; |
|
} |
|
| jmp >4 |
|
|
|
// Fast path decode; for when check_bytes bytes are available. |
|
|3: |
|
switch (op) { |
|
case OP_PARSE_SFIXED32: |
|
case OP_PARSE_FIXED32: |
|
| mov edx, dword [PTR] |
|
break; |
|
case OP_PARSE_SFIXED64: |
|
case OP_PARSE_FIXED64: |
|
| mov rdx, qword [PTR] |
|
break; |
|
case OP_PARSE_FLOAT: |
|
| movss xmm0, dword [PTR] |
|
break; |
|
case OP_PARSE_DOUBLE: |
|
| movsd xmm0, qword [PTR] |
|
break; |
|
default: |
|
// Inline one byte of varint decoding. |
|
| movzx edx, byte [PTR] |
|
| test dl, dl |
|
| js <2 // Fallback to slow path for >1 byte varint. |
|
break; |
|
} |
|
|
|
// Second-stage decode; used for both fast and slow paths |
|
// (only needed for a few types). |
|
|4: |
|
switch (op) { |
|
case OP_PARSE_SINT32: |
|
// 32-bit zig-zag decode. |
|
// Computes (n >> 1) ^ -(n & 1) on edx, using eax as scratch.
| mov eax, edx |
|
| shr edx, 1 |
|
| and eax, 1 |
|
| neg eax |
|
| xor edx, eax |
|
break; |
|
case OP_PARSE_SINT64: |
|
// 64-bit zig-zag decode. |
|
| mov rax, rdx |
|
| shr rdx, 1 |
|
| and rax, 1 |
|
| neg rax |
|
| xor rdx, rax |
|
break; |
|
case OP_PARSE_BOOL: |
|
// Normalizes any non-zero varint to 1 in dl.
| test rdx, rdx |
|
| setne dl |
|
break; |
|
default: break; |
|
} |
|
|
|
// Call callback (or specialize if we can). |
|
// Note: this "type" shadows the valtype_t "type" declared at function
// scope for the remainder of this block.
upb_fieldtype_t type; |
|
const upb_shim_data *data = upb_shim_getdata(h, sel, &type); |
|
if (data) { |
|
switch (type) { |
|
case UPB_TYPE_INT64: |
|
case UPB_TYPE_UINT64: |
|
| mov [CLOSURE + data->offset], rdx |
|
break; |
|
case UPB_TYPE_INT32: |
|
case UPB_TYPE_UINT32: |
|
case UPB_TYPE_ENUM: |
|
| mov [CLOSURE + data->offset], edx |
|
break; |
|
case UPB_TYPE_DOUBLE: |
|
| movsd qword [CLOSURE + data->offset], XMMARG1 |
|
break; |
|
case UPB_TYPE_FLOAT: |
|
| movss dword [CLOSURE + data->offset], XMMARG1 |
|
break; |
|
case UPB_TYPE_BOOL: |
|
| mov [CLOSURE + data->offset], dl |
|
break; |
|
case UPB_TYPE_STRING: |
|
case UPB_TYPE_BYTES: |
|
case UPB_TYPE_MESSAGE: |
|
assert(false); break; |
|
} |
|
| sethas CLOSURE, data->hasbit |
|
} else if (handler) { |
|
// The decoded value is already in rdx / xmm0, which are the third integer
// argument register and the first floating-point argument register.
| mov ARG1_64, CLOSURE |
|
| load_handler_data h, sel |
|
| callp handler |
|
if (!alwaysok(h, sel)) { |
|
// A false return from the handler suspends the decoder; after resume the
// value is parsed again from label 1.
| test al, al |
|
| jnz >5 |
|
| call ->suspend |
|
| jmp <1 |
|
|5: |
|
} |
|
} |
|
|
|
// We do this last so that the checkpoint is not advanced past the user's |
|
// data until the callback has returned success. |
|
| add PTR, fastbytes |
|
} else { |
|
// No handler registered for this value, just skip it. |
|
| chkneob fastbytes, >3 |
|
|2: |
|
switch (type) { |
|
case V32: |
|
| call ->skipv32_fallback |
|
break; |
|
case V64: |
|
| call ->skipv64_fallback |
|
break; |
|
case F32: |
|
| call ->skipf32_fallback |
|
break; |
|
case F64: |
|
| call ->skipf64_fallback |
|
break; |
|
case X: break; |
|
} |
|
|
|
// Fast-path skip. |
|
// The slow path above falls through into this code: the skip fallbacks
// leave PTR "fastbytes" short of the end of the value, so the shared
// "add PTR, fastbytes" below completes the skip in both cases.
|3: |
|
if (type == V32 || type == V64) { |
|
| test byte [PTR], 0x80 |
|
| jnz <2 |
|
} |
|
| add PTR, fastbytes |
|
} |
|
} |
|
|
|
// Emits the dispatch routine for "method".  The emitted code decodes the next
// field tag, looks the field number up in method->dispatch (array part first,
// then hash part), and returns in rax the address of the code for that
// field.  It returns 0 in rax when the tag fallback reports that DELIMEND was
// reached, and the address of local label 9 when an unknown field turned out
// to be an ENDGROUP.
//
// The routine's entry point is the jmptarget for &method->dispatch; jittag
// calls it and then jumps to the returned address.
static void jitdispatch(jitcompiler *jc, |
|
const upb_pbdecodermethod *method) { |
|
// Lots of room for tweaking/optimization here. |
|
|
|
const upb_inttable *dispatch = &method->dispatch; |
|
bool has_hash_entries = (dispatch->t.count > 0); |
|
|
|
// Whether any of the fields for this message can have two wire types which |
|
// are both valid (packed & non-packed). |
|
// |
|
// OPT: populate this more precisely; not all messages with hash entries have |
|
// this characteristic. |
|
bool has_multi_wiretype = has_hash_entries; |
|
|
|
|=>define_jmptarget(jc, &method->dispatch): |
|
|1: |
|
// Decode the field tag. |
|
// One- and two-byte tags are decoded inline into edx; anything longer, or a
// buffer with fewer than two bytes left, goes through
// ->decode_unknown_tag_fallback at label 6.
| mov aword DECODER->checkpoint, PTR |
|
| chkeob 2, >6 |
|
| movzx edx, byte [PTR] |
|
| test dl, dl |
|
| jns >7 // Jump if first byte has no continuation bit. |
|
| movzx ecx, byte [PTR + 1] |
|
| test cl, cl |
|
| js >6 // Jump if second byte has continuation bit. |
|
| // Confirmed two-byte varint. |
|
| shl ecx, 7 |
|
| and edx, 0x7f |
|
| or edx, ecx |
|
| add PTR, 2 |
|
| jmp >8 |
|
|6: |
|
| call ->decode_unknown_tag_fallback |
|
| test eax, eax // Hit DELIMEND? |
|
| jnz >8 |
|
| ret |
|
|7: |
|
| add PTR, 1 |
|
|8: |
|
// Split the tag: edx = field number (tag >> 3), cl = wire type (tag & 7).
| mov ecx, edx |
|
| shr edx, 3 |
|
| and cl, 7 |
|
|
|
// See comment attached to upb_pbdecodermethod.dispatch for layout of the |
|
// dispatch table. |
|
// As used below, a table entry carries the primary wire type in its low
// byte (al), the secondary wire type in the next byte (ah), and a code
// offset in the bits from 16 upwards.
|2: |
|
| cmp edx, dispatch->array_size |
|
if (has_hash_entries) { |
|
| jae >7 |
|
} else { |
|
| jae >5 |
|
} |
|
| // OPT: Compact the lookup arr into 32-bit entries. |
|
// An absolute address can only be encoded as a displacement when it fits in
// a signed 32-bit value; otherwise it is materialized in rax first.
if ((uintptr_t)dispatch->array > 0x7fffffff) { |
|
| mov64 rax, (uintptr_t)dispatch->array |
|
| mov rax, qword [rax + rdx * 8] |
|
} else { |
|
| mov rax, qword [rdx * 8 + dispatch->array] |
|
} |
|
|3: |
|
| // We take advantage of the fact that non-present entries are stored |
|
| // as -1, which will result in wire types that will never match. |
|
| cmp al, cl |
|
if (has_multi_wiretype) { |
|
| jne >6 |
|
} else { |
|
| jne >5 |
|
} |
|
| shr rax, 16 |
|
| |
|
| // Load the machine code address from the table entry. |
|
| // The table entry is relative to the dispatch->array jmptarget |
|
| // (patchdispatch() took care of this) which is the same as |
|
| // local label "4". The "lea" is really just trying to do |
|
| // lea rax, [>4 + rax] |
|
| // |
|
| // But we can't write that directly for some reason, so we use |
|
| // rdx as a temporary. |
|
| lea rdx, [>4] |
|
|=>define_jmptarget(jc, dispatch->array): |
|
|4: |
|
| add rax, rdx |
|
| ret |
|
| |
|
|5: |
|
| // Field isn't in our table. |
|
| call ->parse_unknown |
|
| test eax, eax // ENDGROUP? |
|
| jz <1 |
|
// Local label 9 is defined by jitbytecode (at OP_ENDMSG and OP_RET), not in
// this function.
| lea rax, [>9] // ENDGROUP; Load address of OP_ENDMSG. |
|
| ret |
|
|
|
if (has_multi_wiretype) { |
|
|6: |
|
| // Primary wire type didn't match, check secondary wire type. |
|
| cmp ah, cl |
|
| jne <5 |
|
| // Secondary wire type is a match, look up fn + UPB_MAX_FIELDNUMBER. |
|
| add rdx, UPB_MAX_FIELDNUMBER |
|
| // This key will never be in the array part, so do a hash lookup. |
|
assert(has_hash_entries); |
|
| ld64 dispatch |
|
| jmp ->hashlookup // Tail call. |
|
} |
|
|
|
if (has_hash_entries) { |
|
|7: |
|
| // Hash table lookup. |
|
// ->hashlookup preserves rcx and rdx, so the wire type in cl is still valid
// when control returns to the comparison at label 3.
| ld64 dispatch |
|
| call ->hashlookup |
|
| jmp <3 |
|
} |
|
} |
|
|
|
// Emits code that checks whether the next "n" bytes of input equal the
// already-encoded tag "tag".
//   - On a match, PTR is advanced by n and execution continues after the
//     emitted code.
//   - On a mismatch, the code calls the dispatch routine of "method" when
//     ofs == 0 (jumping to the address it returns, or to the delimited-end
//     target when it returns 0), or jumps to the instruction at
//     jc->pc + ofs otherwise.
//   - When fewer than n bytes are buffered, ->checktag_fallback decides
//     between match, mismatch and end of the delimited region.
//
// For OP_TAGN this also advances jc->pc past the tag operand stored inline in
// the bytecode.
static void jittag(jitcompiler *jc, uint64_t tag, int n, int ofs, |
|
const upb_pbdecodermethod *method) { |
|
// Internally we parse unknown fields; if this runs us into DELIMEND we jump |
|
// to the corresponding DELIMEND target (either msg end or repeated field |
|
// end), which we find from the OP_CHECKDELIM which must have necessarily |
|
// preceded us. |
|
// jc->pc has already been advanced past the tag instruction, so pc - 1 is
// the tag instruction itself and pc - 2 the instruction before it.
uint32_t last_instruction = *(jc->pc - 2); |
|
int last_arg = (int32_t)last_instruction >> 8; |
|
assert((last_instruction & 0xff) == OP_CHECKDELIM); |
|
uint32_t *delimend = (jc->pc - 1) + last_arg; |
|
const size_t ptr_words = sizeof(void*) / sizeof(uint32_t); |
|
|
|
if (getop(*(jc->pc - 1)) == OP_TAGN) { |
|
jc->pc += ptr_words; |
|
} |
|
|
|
| chkneob n, >1 |
|
|
|
| // OPT: this is way too much fallback code to put here. |
|
| // Reduce and/or move to a separate section to make better icache usage. |
|
| ld64 tag |
|
| call ->checktag_fallback |
|
| cmp eax, DECODE_MISMATCH |
|
| je >3 |
|
| cmp eax, DECODE_EOF |
|
| je =>jmptarget(jc, delimend) |
|
// Any other result skips the "add PTR, n" at label 4.
// NOTE(review): presumably upb_pbdecoder_checktag_slow has already consumed
// the tag in that case -- confirm.
| jmp >5 |
|
|
|
|1: |
|
// Compare the tag bytes in place; each case leaves the flags set so that the
// shared "je >4" below is taken on a full match.
// NOTE(review): there is no case for n outside 1..5 and no default --
// presumably tags never encode to more than five bytes; confirm.
switch (n) { |
|
case 1: |
|
| cmp byte [PTR], tag |
|
break; |
|
case 2: |
|
| cmp word [PTR], tag |
|
break; |
|
case 3: |
|
| // OPT: Slightly more efficient code, but depends on an extra byte. |
|
| // mov eax, dword [PTR] |
|
| // shl eax, 8 |
|
| // cmp eax, tag << 8 |
|
| cmp word [PTR], (tag & 0xffff) |
|
| jne >2 |
|
| cmp byte [PTR + 2], (tag >> 16) |
|
|2: |
|
break; |
|
case 4: |
|
| cmp dword [PTR], tag |
|
break; |
|
case 5: |
|
| cmp dword [PTR], (tag & 0xffffffff) |
|
| jne >3 |
|
| cmp byte [PTR + 4], (tag >> 32) |
|
} |
|
| je >4 |
|
|3: |
|
if (ofs == 0) { |
|
| call =>jmptarget(jc, &method->dispatch) |
|
| test rax, rax |
|
| jz =>jmptarget(jc, delimend) |
|
| jmp rax |
|
} else { |
|
| jmp =>jmptarget(jc, jc->pc + ofs) |
|
} |
|
|4: |
|
| add PTR, n |
|
|5: |
|
} |
|
|
|
// Compile the bytecode to x64. |
|
//
// Walks jc->group->bytecode one instruction at a time and emits machine code
// for each opcode.  Each instruction word carries the opcode in its low 8
// bits and an argument in the remaining bits; some opcodes (OP_SETDISPATCH,
// OP_TAGN, OP_SETBIGGROUPNUM) are followed by extra operand words that are
// consumed here or in jittag.  "method" and "h" track the decoder method and
// destination handlers selected by the most recent OP_SETDISPATCH.
static void jitbytecode(jitcompiler *jc) { |
|
upb_pbdecodermethod *method = NULL; |
|
const upb_handlers *h = NULL; |
|
for (jc->pc = jc->group->bytecode; jc->pc < jc->group->bytecode_end; ) { |
|
int32_t instr = *jc->pc; |
|
opcode op = instr & 0xff; |
|
// instr is signed, so the shift below keeps the sign of the argument;
// longofs reads the same bits back as a signed offset.
uint32_t arg = instr >> 8; |
|
int32_t longofs = arg; |
|
|
|
if (op != OP_SETDISPATCH) { |
|
// Skipped for SETDISPATCH because it defines its own asmlabel for the |
|
// dispatch code it emits. |
|
asmlabel(jc, "0x%lx.%s", pcofs(jc), upb_pbdecoder_getopname(op)); |
|
|
|
// Skipped for SETDISPATCH because it should point at the function |
|
// prologue, not the dispatch function that is emitted first. |
|
// TODO: optimize this to only define pclabels that are actually used. |
|
|=>define_jmptarget(jc, jc->pc): |
|
} |
|
|
|
// From here on jc->pc points at the word after the current instruction.
jc->pc++; |
|
|
|
switch (op) { |
|
case OP_STARTMSG: { |
|
upb_func *startmsg = gethandler(h, UPB_STARTMSG_SELECTOR); |
|
if (startmsg) { |
|
// bool startmsg(void *closure, const void *hd) |
|
|1: |
|
| mov ARG1_64, CLOSURE |
|
| load_handler_data h, UPB_STARTMSG_SELECTOR |
|
| callp startmsg |
|
if (!alwaysok(h, UPB_STARTMSG_SELECTOR)) { |
|
| test al, al |
|
| jnz >2 |
|
| call ->suspend |
|
| jmp <1 |
|
|2: |
|
} |
|
} else { |
|
// Emits at least one byte so that the next asmlabel() does not land on the
// same code offset (asmlabel asserts on that in debug builds).
| nop |
|
} |
|
break; |
|
} |
|
case OP_ENDMSG: { |
|
upb_func *endmsg = gethandler(h, UPB_ENDMSG_SELECTOR); |
|
// Local label 9 is the target that jitdispatch loads with "lea rax, [>9]"
// when an unknown field turns out to be an ENDGROUP.
|9: |
|
if (endmsg) { |
|
// bool endmsg(void *closure, const void *hd, upb_status *status) |
|
| mov ARG1_64, CLOSURE |
|
| load_handler_data h, UPB_ENDMSG_SELECTOR |
|
| mov ARG3_64, DECODER->status |
|
| callp endmsg |
|
} |
|
break; |
|
} |
|
case OP_SETDISPATCH: { |
|
uint32_t *op_pc = jc->pc - 1; |
|
|
|
// Load info for new method. |
|
upb_inttable *dispatch; |
|
memcpy(&dispatch, jc->pc, sizeof(void*)); |
|
jc->pc += sizeof(void*) / sizeof(uint32_t); |
|
// The OP_SETDISPATCH bytecode contains a pointer that is |
|
// &method->dispatch; we want to go backwards and recover method. |
|
method = |
|
(void*)((char*)dispatch - offsetof(upb_pbdecodermethod, dispatch)); |
|
// May be NULL, in which case no handlers for this message will be found. |
|
// OPT: we should do better by completely skipping the message in this |
|
// case instead of parsing it field by field. We should also do the skip |
|
// in the containing message's code. |
|
h = method->dest_handlers_; |
|
// NOTE(review): the comment above says h may be NULL, yet h is passed
// straight to upb_handlers_msgdef() here -- confirm that call tolerates a
// NULL handlers pointer.
const char *msgname = upb_msgdef_fullname(upb_handlers_msgdef(h)); |
|
|
|
// Emit dispatch code for new method. |
|
asmlabel(jc, "0x%lx.dispatch.%s", pcofs(jc), msgname); |
|
jitdispatch(jc, method); |
|
|
|
// Emit function prologue for new method. |
|
asmlabel(jc, "0x%lx.parse.%s", pcofs(jc), msgname); |
|
|=>define_jmptarget(jc, op_pc): |
|
|=>define_jmptarget(jc, method): |
|
// The 8 bytes reserved here are released by the "add rsp, 8" emitted for
// OP_RET.
| sub rsp, 8 |
|
|
|
break; |
|
} |
|
case OP_PARSE_DOUBLE: |
|
case OP_PARSE_FLOAT: |
|
case OP_PARSE_INT64: |
|
case OP_PARSE_UINT64: |
|
case OP_PARSE_INT32: |
|
case OP_PARSE_FIXED64: |
|
case OP_PARSE_FIXED32: |
|
case OP_PARSE_BOOL: |
|
case OP_PARSE_UINT32: |
|
case OP_PARSE_SFIXED32: |
|
case OP_PARSE_SFIXED64: |
|
case OP_PARSE_SINT32: |
|
case OP_PARSE_SINT64: |
|
jitprimitive(jc, op, h, arg); |
|
break; |
|
case OP_STARTSEQ: |
|
case OP_STARTSUBMSG: |
|
case OP_STARTSTR: { |
|
upb_func *start = gethandler(h, arg); |
|
if (start) { |
|
// void *startseq(void *closure, const void *hd) |
|
// void *startsubmsg(void *closure, const void *hd) |
|
// void *startstr(void *closure, const void *hd, size_t size_hint) |
|
|1: |
|
| mov ARG1_64, CLOSURE |
|
| load_handler_data h, arg |
|
if (op == OP_STARTSTR) { |
|
// size_hint = bytes remaining in the delimited region.
| mov ARG3_64, DELIMEND |
|
| sub ARG3_64, PTR |
|
} |
|
| callp start |
|
if (!alwaysok(h, arg)) { |
|
| test rax, rax |
|
| jnz >2 |
|
| call ->suspend |
|
| jmp <1 |
|
|2: |
|
} |
|
// The pointer returned by the start handler becomes the new closure.
| mov CLOSURE, rax |
|
} else { |
|
// TODO: nop is only required because of asmlabel(). |
|
| nop |
|
} |
|
break; |
|
} |
|
case OP_ENDSEQ: |
|
case OP_ENDSUBMSG: |
|
case OP_ENDSTR: { |
|
upb_func *end = gethandler(h, arg); |
|
if (end) { |
|
// bool endseq(void *closure, const void *hd) |
|
// bool endsubmsg(void *closure, const void *hd) |
|
// bool endstr(void *closure, const void *hd) |
|
|1: |
|
| mov ARG1_64, CLOSURE |
|
| load_handler_data h, arg |
|
| callp end |
|
if (!alwaysok(h, arg)) { |
|
| test al, al |
|
| jnz >2 |
|
| call ->suspend |
|
| jmp <1 |
|
|2: |
|
} |
|
} else { |
|
// TODO: nop is only required because of asmlabel(). |
|
| nop |
|
} |
|
break; |
|
} |
|
case OP_STRING: { |
|
upb_func *str = gethandler(h, arg); |
|
// Loop: hand the buffered part of the string to the handler (or skip it
// when there is no handler) until PTR reaches DELIMEND, suspending whenever
// the buffer runs dry first.
| cmp PTR, DELIMEND |
|
| je >4 |
|
|1: |
|
| cmp PTR, DATAEND |
|
| jne >2 |
|
| call ->suspend |
|
| jmp <1 |
|
|2: |
|
if (str) { |
|
// size_t str(void *closure, const void *hd, const char *str, size_t n) |
|
| mov ARG1_64, CLOSURE |
|
| load_handler_data h, arg |
|
| mov ARG3_64, PTR |
|
| mov ARG4_64, DATAEND |
|
| sub ARG4_64, PTR |
|
| mov ARG5_64, qword DECODER->handle |
|
| callp str |
|
// The handler's return value is the number of bytes it consumed.
| add PTR, rax |
|
if (!alwaysok(h, arg)) { |
|
| cmp PTR, DATAEND |
|
| je >3 |
|
// NOTE(review): no "->strret_fallback" label is defined in
// emit_static_asm -- confirm where this call target comes from.
| call ->strret_fallback |
|
|3: |
|
} |
|
} else { |
|
| mov PTR, DATAEND |
|
} |
|
| cmp PTR, DELIMEND |
|
| jne <1 |
|
|4: |
|
break; |
|
} |
|
case OP_PUSHTAGDELIM: |
|
| mov FRAME->sink.closure, CLOSURE |
|
| // This shouldn't need to be read, because tag-delimited fields |
|
| // shouldn't have an OP_SETDELIM after them. But for the moment |
|
| // non-packed repeated fields do OP_SETDELIM so they can share more |
|
| // code with the packed code-path. If this is changed later, this |
|
| // store can be removed. |
|
| mov qword FRAME->end_ofs, 0 |
|
| add FRAME, sizeof(upb_pbdecoder_frame) |
|
| cmp FRAME, DECODER->limit |
|
// NOTE(review): no "->err" label is defined in emit_static_asm -- confirm
// where this branch target comes from.
| je ->err |
|
| mov dword FRAME->groupnum, arg |
|
break; |
|
case OP_PUSHLENDELIM: |
|
| call ->pushlendelim |
|
break; |
|
case OP_POP: |
|
| sub FRAME, sizeof(upb_pbdecoder_frame) |
|
| mov CLOSURE, FRAME->sink.closure |
|
break; |
|
case OP_SETDELIM: |
|
// OPT: experiment with testing vs old offset to optimize away. |
|
// Recomputes DELIMEND from the current frame's end_ofs and clamps DATAEND
// to it when it falls between DECODER->buf and DECODER->end.
| mov DATAEND, DECODER->end |
|
| add DELIMEND, FRAME->end_ofs |
|
| cmp DELIMEND, DECODER->buf |
|
| jb >1 |
|
| cmp DELIMEND, DATAEND |
|
| ja >1 // OPT: try cmov. |
|
| mov DATAEND, DELIMEND |
|
|1: |
|
break; |
|
case OP_SETBIGGROUPNUM: |
|
// The group number is stored in the bytecode word that follows the
// instruction; reading it also advances jc->pc past that word.
| mov dword FRAME->groupnum, *jc->pc++ |
|
break; |
|
case OP_CHECKDELIM: |
|
| cmp DELIMEND, PTR |
|
| je =>jmptarget(jc, jc->pc + longofs) |
|
break; |
|
case OP_CALL: |
|
| call =>jmptarget(jc, jc->pc + longofs) |
|
break; |
|
case OP_BRANCH: |
|
| jmp =>jmptarget(jc, jc->pc + longofs); |
|
break; |
|
case OP_RET: |
|
|9: |
|
| add rsp, 8 |
|
| ret |
|
break; |
|
case OP_TAG1: |
|
// Argument layout for the tag ops: low 8 bits = signed branch offset for a
// mismatch, remaining bits = the tag bytes (or, for OP_TAGN, the byte
// count).
jittag(jc, (arg >> 8) & 0xff, 1, (int8_t)arg, method); |
|
break; |
|
case OP_TAG2: |
|
jittag(jc, (arg >> 8) & 0xffff, 2, (int8_t)arg, method); |
|
break; |
|
case OP_TAGN: { |
|
uint64_t tag; |
|
// The 8-byte tag follows the instruction word; jittag advances jc->pc past
// it.
memcpy(&tag, jc->pc, 8); |
|
jittag(jc, tag, arg >> 8, (int8_t)arg, method); |
|
break; |
|
} |
|
case OP_HALT: |
|
assert(false); |
|
} |
|
} |
|
|
|
asmlabel(jc, "eof"); |
|
| nop |
|
}
|
|
|