It is slower than the C decoder for now because it falls off the fast path too often. But it can successfully decode varints, fixed32 and fixed64.
pull/13171/head
parent
f1e1cc4695
commit
d8b2154862
10 changed files with 288 additions and 17 deletions
@ -0,0 +1,219 @@ |
|||||||
|
default rel                     ; Use RIP-relative addressing by default, not absolute.

extern  _upb_decode_varint_fast64

section .data

; Handler dispatch table.  The decode loop indexes it with the field's type
; byte (read from the field lookup table) and jumps to the selected entry.
; Every entry is a local label inside _upb_fastdecode; types the fast path
; does not handle are routed to .cant_fast_path.
dispatch_table:
        dq      _upb_fastdecode.cant_fast_path  ;  0: field not in table (check_4)
        dq      _upb_fastdecode.fixed64         ;  1: double
        dq      _upb_fastdecode.fixed32         ;  2: float
        dq      _upb_fastdecode.varint          ;  3: int64
        dq      _upb_fastdecode.varint          ;  4: uint64
        dq      _upb_fastdecode.varint          ;  5: int32
        dq      _upb_fastdecode.fixed64         ;  6: fixed64
        dq      _upb_fastdecode.fixed32         ;  7: fixed32
        dq      _upb_fastdecode.varint          ;  8: bool
        dq      _upb_fastdecode.cant_fast_path  ;  9: string (TODO)
        dq      _upb_fastdecode.cant_fast_path  ; 10: group (check_6)
        dq      _upb_fastdecode.cant_fast_path  ; 11: message
        dq      _upb_fastdecode.cant_fast_path  ; 12: bytes (TODO)
        dq      _upb_fastdecode.varint          ; 13: uint32
        dq      _upb_fastdecode.varint          ; 14: enum
        dq      _upb_fastdecode.fixed32         ; 15: sfixed32
        dq      _upb_fastdecode.fixed64         ; 16: sfixed64
        dq      _upb_fastdecode.varint_sint32   ; 17: sint32
        dq      _upb_fastdecode.varint_sint64   ; 18: sint64

; NOTE(review): no label named _upb_decode_fast is defined in the part of the
; file visible here; the entry point further down is exported as
; _upb_fastdecode.  Confirm whether this declaration is stale.
global  _upb_decode_fast

section .text
||||||
|
; Register roles.  All six are callee-saved registers, so their contents
; survive the calls made to the varint decoder and to the user callback.
%define BUF       rbx   ; const char *p: current read position in the buffer.
%define END       rbp   ; const char *end: end of buffer (either submsg end or buf end).
%define BUF_ADDR  r12   ; const char **p: caller's pointer slot; BUF is written back through it.
%define FIELDDEF  r13   ; upb_fielddef *f: must survive the varint decoding call.
%define CALLBACK  r14   ; value_cb argument: called once per decoded value.
%define CLOSURE   r15   ; closure argument: passed back to CALLBACK as its first argument.

; Stack frame: table pointer at [rsp], then maxfield * 8 (read back as a uint32_t).
%define STACK_SPACE             24      ; this value + 8 must be a multiple of 16.
%define TABLE_SPILL             [rsp]   ; our lookup table, indexed by field number.
%define MAXFIELD_TIMES_8_SPILL  [rsp+8]
||||||
|
|
||||||
|
|
||||||
|
; Executing the fast path requires the following conditions:
; - check_1: there are >=12 bytes left (<=2 byte tag and <=10 byte varint).
; - check_2: the tag is <= 2 bytes.
; - check_3: the field number is <= the table size
;            (i.e. it must be an array lookup, not a hash lookup).
; - check_4: the field is known (found in the table).
; - check_5: the wire type we read is correct for the field number
;            ("packed" fields are not accepted yet; this could be handled
;            efficiently by doing an extra check on the "type check failed"
;            path that goes into a tight loop if the encoding was packed).
; - check_6: the field is not a group or a message (or string, TODO)
;            (this could be relaxed, but due to delegation it's a bit tricky).
; - if the value is a string, the entire string is available in
;   the buffer, and our cached string object can be recycled.
||||||
|
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------
; decode_and_dispatch_ -- body of the decode loop.
;
; Emits the local label .decode_and_dispatch, so it must be expanded exactly
; once inside _upb_fastdecode.  It reads a 1- or 2-byte tag at BUF, looks the
; field up in the spilled table and jumps to the handler for its type.
;
; In:     BUF, END, TABLE_SPILL, MAXFIELD_TIMES_8_SPILL.
; Out (at the handler): BUF = rdi = first byte after the tag,
;                       ecx = wire type (low 3 bits of the tag),
;                       FIELDDEF = fielddef pointer from the table entry.
; Clobb:  rax, rcx, rdx, rdi, r8, r9, r11, flags.
; Bails out to _upb_fastdecode.cant_fast_path when check_1/2/3 fail; check_4
; and check_6 are enforced through the dispatch table entries.
;-----------------------------------------------------------------------
%macro decode_and_dispatch_ 0
align 16
.decode_and_dispatch:
        ; Reload spilled values; volatile registers do not survive the calls
        ; made by the handlers.
        mov     r8, TABLE_SPILL
        mov     r9d, MAXFIELD_TIMES_8_SPILL

        ; check_1: need at least 12 bytes (<=2 byte tag + <=10 byte varint).
        mov     rax, END
        sub     rax, BUF
        cmp     rax, 12
        jb      _upb_fastdecode.cant_fast_path  ; check_1 (<12 bytes left).

        ; Decode a 1- or 2-byte varint tag -> eax (upper half of rax is zero,
        ; which the 64-bit lea below relies on).
        movzx   ecx, byte [BUF]                 ; ecx = first tag byte
        lea     rdi, [BUF+1]
        mov     eax, ecx
        and     eax, 0x7f                       ; low 7 payload bits
        test    cl, cl
        jns     .one_byte_tag                   ; Should be predictable if fields are in order.
        movzx   ecx, byte [BUF+1]               ; ecx = second tag byte
        lea     rdi, [BUF+2]
        mov     edx, ecx
        and     edx, 0x7f
        shl     edx, 7
        or      eax, edx                        ; eax = 14-bit tag
        ; The continuation bit to test is bit 7 of the *second byte* (cl).
        ; After the merge above, bit 7 of al is payload bit 0 of that byte,
        ; so testing al would reject valid 2-byte tags and accept longer ones.
        test    cl, cl
        js      _upb_fastdecode.cant_fast_path  ; check_2 (tag was >2 bytes).
.one_byte_tag:
        mov     BUF, rdi                        ; BUF now points just past the tag

        ; Split the tag into field number and wire type, then dispatch.
        mov     ecx, eax
        and     eax, 0x3ff8                     ; eax = field number * 8
        lea     r11, [r8+rax*2]                 ; *2 is really *16, since rax is already *8.
        and     ecx, 0x7                        ; ecx = wire type
        cmp     eax, r9d
        jae     _upb_fastdecode.cant_fast_path  ; check_3 (field number not below table bound)
        mov     FIELDDEF, [r11+8]               ; Lookup fielddef (upb_itof_ent.f)
        movzx   edx, byte [r11+1]               ; Lookup field type.
        lea     rax, [rel dispatch_table]       ; RIP-relative: no absolute relocation needed
        jmp     [rax+rdx*8]
%endmacro
||||||
|
|
||||||
|
; decode_and_dispatch -- continue with the next field by jumping to the single
; .decode_and_dispatch loop body emitted by decode_and_dispatch_.
%macro decode_and_dispatch 0
        jmp     .decode_and_dispatch
%endmacro
||||||
|
|
||||||
|
; call_callback -- hand one decoded value to the user callback.
;
; In:   rdx = value (must already be set by the caller of this macro),
;       CLOSURE, FIELDDEF, CALLBACK, BUF, BUF_ADDR.
; Out:  *BUF_ADDR = BUF, so the caller's position reflects the consumed value.
;       Falls through when the callback returns 0; otherwise jumps to .done
;       with the callback's return value still in eax.
%macro call_callback 0
        mov     rsi, FIELDDEF                   ; arg 2: fielddef
        mov     rdi, CLOSURE                    ; arg 1: closure
        mov     ecx, 33                         ; RAW; we could pass the correct type, or only do this in non-debug modes.
        call    CALLBACK
        mov     [BUF_ADDR], BUF                 ; commit the new buffer position
        test    eax, eax
        jnz     .done                           ; Caller requested BREAK or SKIPSUBMSG.
%endmacro
||||||
|
|
||||||
|
; check_type <wire_type> -- leave the fast path unless ecx equals the expected
; wire type given as the macro argument.  Only flags are modified.
%macro check_type 1
        cmp     ecx, %1
        jnz     _upb_fastdecode.cant_fast_path  ; check_5 (wire type check failed).
%endmacro
||||||
|
|
||||||
|
;-----------------------------------------------------------------------
; extern upb_flow_t upb_fastdecode(const char **p, const char *end,
;                                  upb_value_handler_t value_cb, void *closure,
;                                  void *table, int table_size);
;
; In:    rdi = p, rsi = end, rdx = value_cb, rcx = closure,
;        r8 = table, r9 = table_size.
; Out:   rax = 0 (UPB_CONTINUE) when the loop falls off the fast path, or the
;        callback's non-zero return value when the callback stops decoding.
;        *p is advanced past every value that was delivered to the callback.
; Stack: six pushes + STACK_SPACE bytes of locals, which keeps rsp 16-byte
;        aligned at each call made below.
;
; Each handler is entered from the dispatch jump with ecx = wire type and
; rdi = BUF = first byte after the tag.  Handlers validate the wire type
; (check_5) before consuming anything; on mismatch (e.g. a packed encoding)
; they bail out without having committed BUF, so the slow path re-reads the
; field from the last committed position.
;-----------------------------------------------------------------------

; Protobuf wire types (low three bits of the tag) accepted by the handlers.
%define WIRE_TYPE_VARINT  0
%define WIRE_TYPE_64BIT   1
%define WIRE_TYPE_32BIT   5

align 16
global _upb_fastdecode
_upb_fastdecode:
        ; We use all callee-save regs.
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        sub     rsp, STACK_SPACE

        ; Parse arguments into reg vals and stack.
        mov     BUF_ADDR, rdi
        mov     BUF, [rdi]
        mov     END, rsi
        mov     CALLBACK, rdx
        mov     CLOSURE, rcx
        mov     TABLE_SPILL, r8
        shl     r9, 3                           ; table_size * 8, to compare against fieldnum * 8
        mov     MAXFIELD_TIMES_8_SPILL, r9      ; only the low 32 bits are read back

        decode_and_dispatch

align 16
.varint:
        check_type WIRE_TYPE_VARINT
        call    _upb_decode_varint_fast64       ; BUF is already in rdi.
        test    rax, rax
        jz      _upb_fastdecode.cant_fast_path  ; Varint was unterminated, slow path will handle error.
        mov     BUF, rax                        ; rax = position after the varint
        call_callback                           ; rdx already holds value.
        decode_and_dispatch_                    ; loop body lives here; .varint falls through into it

align 16
.fixed32:
        check_type WIRE_TYPE_32BIT
        mov     edx, dword [BUF]                ; Might be unaligned, but that's ok.
        add     BUF, 4
        call_callback
        decode_and_dispatch

align 16
.fixed64:
        check_type WIRE_TYPE_64BIT
        mov     rdx, qword [BUF]                ; Might be unaligned, but that's ok.
        add     BUF, 8
        call_callback
        decode_and_dispatch

align 16
.varint_sint32:
        check_type WIRE_TYPE_VARINT
        call    _upb_decode_varint_fast64       ; BUF is already in rdi.
        test    rax, rax
        jz      _upb_fastdecode.cant_fast_path  ; Varint was unterminated, slow path will handle error.
        mov     BUF, rax

        ; Perform 32-bit zig-zag decoding: edx = (edx >> 1) ^ -(edx & 1).
        mov     ecx, edx
        shr     edx, 1
        and     ecx, 0x1
        neg     ecx
        xor     edx, ecx
        call_callback
        decode_and_dispatch

align 16
.varint_sint64:
        check_type WIRE_TYPE_VARINT
        call    _upb_decode_varint_fast64       ; BUF is already in rdi.
        test    rax, rax
        jz      _upb_fastdecode.cant_fast_path  ; Varint was unterminated, slow path will handle error.
        mov     BUF, rax

        ; Perform 64-bit zig-zag decoding: rdx = (rdx >> 1) ^ -(rdx & 1).
        mov     rcx, rdx
        shr     rdx, 1
        and     ecx, 0x1                        ; also zeroes the upper half of rcx
        neg     rcx
        xor     rdx, rcx
        call_callback
        decode_and_dispatch

.cant_fast_path:
        xor     eax, eax                        ; UPB_CONTINUE -- continue as before.
.done:
        ; If coming via done, preserve the user callback's return in rax.
        add     rsp, STACK_SPACE
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
        ret
Loading…
Reference in new issue