|
|
|
@ -24,6 +24,7 @@ |
|
|
|
|
|
|
|
|
|
SECTION_RODATA |
|
|
|
|
|
|
|
|
|
pb_mask_shuffle2103_mmx times 8 dw 255 |
|
|
|
|
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 |
|
|
|
|
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13 |
|
|
|
|
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 |
|
|
|
@ -42,6 +43,68 @@ SECTION .text |
|
|
|
|
%endif |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
;------------------------------------------------------------------------------ |
|
|
|
|
; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size) |
|
|
|
|
;------------------------------------------------------------------------------ |
|
|
|
|
INIT_MMX mmxext |
|
|
|
|
cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x |
|
|
|
|
mova m6, [pb_mask_shuffle2103_mmx] |
|
|
|
|
mova m7, m6 |
|
|
|
|
psllq m7, 8 |
|
|
|
|
|
|
|
|
|
movsxdifnidn wq, wd |
|
|
|
|
mov xq, wq |
|
|
|
|
|
|
|
|
|
add srcq, wq |
|
|
|
|
add dstq, wq |
|
|
|
|
neg wq |
|
|
|
|
|
|
|
|
|
;calc scalar loop |
|
|
|
|
and xq, mmsize*2 -4 |
|
|
|
|
je .loop_simd |
|
|
|
|
|
|
|
|
|
.loop_scalar: |
|
|
|
|
mov tmpb, [srcq + wq + 2] |
|
|
|
|
mov [dstq+wq + 0], tmpb |
|
|
|
|
mov tmpb, [srcq + wq + 1] |
|
|
|
|
mov [dstq+wq + 1], tmpb |
|
|
|
|
mov tmpb, [srcq + wq + 0] |
|
|
|
|
mov [dstq+wq + 2], tmpb |
|
|
|
|
mov tmpb, [srcq + wq + 3] |
|
|
|
|
mov [dstq+wq + 3], tmpb |
|
|
|
|
add wq, 4 |
|
|
|
|
sub xq, 4 |
|
|
|
|
jg .loop_scalar |
|
|
|
|
|
|
|
|
|
;check if src_size < mmsize * 2 |
|
|
|
|
cmp wq, 0 |
|
|
|
|
jge .end |
|
|
|
|
|
|
|
|
|
.loop_simd: |
|
|
|
|
movu m0, [srcq+wq] |
|
|
|
|
movu m1, [srcq+wq+8] |
|
|
|
|
|
|
|
|
|
pshufw m3, m0, 177 |
|
|
|
|
pshufw m5, m1, 177 |
|
|
|
|
|
|
|
|
|
pand m0, m7 |
|
|
|
|
pand m3, m6 |
|
|
|
|
|
|
|
|
|
pand m1, m7 |
|
|
|
|
pand m5, m6 |
|
|
|
|
|
|
|
|
|
por m0, m3 |
|
|
|
|
por m1, m5 |
|
|
|
|
|
|
|
|
|
movu [dstq+wq], m0 |
|
|
|
|
movu [dstq+wq + 8], m1 |
|
|
|
|
|
|
|
|
|
add wq, mmsize*2 |
|
|
|
|
jl .loop_simd |
|
|
|
|
|
|
|
|
|
.end: |
|
|
|
|
RET |
|
|
|
|
|
|
|
|
|
;------------------------------------------------------------------------------ |
|
|
|
|
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size) |
|
|
|
|
;------------------------------------------------------------------------------ |
|
|
|
|