|
|
@ -29,11 +29,16 @@ pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 |
|
|
|
|
|
|
|
|
|
|
|
SECTION .text |
|
|
|
SECTION .text |
|
|
|
|
|
|
|
|
|
|
|
INIT_XMM ssse3 |
|
|
|
;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short) |
|
|
|
cglobal hflip_byte, 3, 5, 3, src, dst, w, r, x |
|
|
|
%macro HFLIP 3 |
|
|
|
mova m0, [pb_flip_byte] |
|
|
|
cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x |
|
|
|
|
|
|
|
mova m0, [pb_flip_%1] |
|
|
|
xor xq, xq |
|
|
|
xor xq, xq |
|
|
|
|
|
|
|
%if %3 == 1 |
|
|
|
movsxdifnidn wq, wd |
|
|
|
movsxdifnidn wq, wd |
|
|
|
|
|
|
|
%else ; short |
|
|
|
|
|
|
|
add wd, wd |
|
|
|
|
|
|
|
%endif |
|
|
|
mov rq, wq |
|
|
|
mov rq, wq |
|
|
|
and rq, 2 * mmsize - 1 |
|
|
|
and rq, 2 * mmsize - 1 |
|
|
|
cmp wq, 2 * mmsize |
|
|
|
cmp wq, 2 * mmsize |
|
|
@ -42,8 +47,8 @@ cglobal hflip_byte, 3, 5, 3, src, dst, w, r, x |
|
|
|
|
|
|
|
|
|
|
|
.loop0: |
|
|
|
.loop0: |
|
|
|
neg xq |
|
|
|
neg xq |
|
|
|
movu m1, [srcq + xq - mmsize + 1] |
|
|
|
movu m1, [srcq + xq - mmsize + %3] |
|
|
|
movu m2, [srcq + xq - 2 * mmsize + 1] |
|
|
|
movu m2, [srcq + xq - 2 * mmsize + %3] |
|
|
|
pshufb m1, m0 |
|
|
|
pshufb m1, m0 |
|
|
|
pshufb m2, m0 |
|
|
|
pshufb m2, m0 |
|
|
|
neg xq |
|
|
|
neg xq |
|
|
@ -59,49 +64,17 @@ cglobal hflip_byte, 3, 5, 3, src, dst, w, r, x |
|
|
|
|
|
|
|
|
|
|
|
.loop1: |
|
|
|
.loop1: |
|
|
|
neg xq |
|
|
|
neg xq |
|
|
|
mov rb, [srcq + xq] |
|
|
|
mov r%2, [srcq + xq] |
|
|
|
neg xq |
|
|
|
neg xq |
|
|
|
mov [dstq + xq], rb |
|
|
|
mov [dstq + xq], r%2 |
|
|
|
add xq, 1 |
|
|
|
add xq, %3 |
|
|
|
cmp xq, wq |
|
|
|
cmp xq, wq |
|
|
|
jl .loop1 |
|
|
|
jl .loop1 |
|
|
|
.end: |
|
|
|
.end: |
|
|
|
RET |
|
|
|
RET |
|
|
|
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
|
|
cglobal hflip_short, 3, 5, 3, src, dst, w, r, x |
|
|
|
INIT_XMM ssse3 |
|
|
|
mova m0, [pb_flip_short] |
|
|
|
HFLIP byte, b, 1 |
|
|
|
xor xq, xq |
|
|
|
HFLIP short, w, 2 |
|
|
|
add wd, wd |
|
|
|
|
|
|
|
mov rq, wq |
|
|
|
|
|
|
|
and rq, 2 * mmsize - 1 |
|
|
|
|
|
|
|
cmp wq, 2 * mmsize |
|
|
|
|
|
|
|
jl .loop1 |
|
|
|
|
|
|
|
sub wq, rq |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.loop0: |
|
|
|
|
|
|
|
neg xq |
|
|
|
|
|
|
|
movu m1, [srcq + xq - mmsize + 2] |
|
|
|
|
|
|
|
movu m2, [srcq + xq - 2 * mmsize + 2] |
|
|
|
|
|
|
|
pshufb m1, m0 |
|
|
|
|
|
|
|
pshufb m2, m0 |
|
|
|
|
|
|
|
neg xq |
|
|
|
|
|
|
|
movu [dstq + xq ], m1 |
|
|
|
|
|
|
|
movu [dstq + xq + mmsize], m2 |
|
|
|
|
|
|
|
add xq, mmsize * 2 |
|
|
|
|
|
|
|
cmp xq, wq |
|
|
|
|
|
|
|
jl .loop0 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cmp rq, 0 |
|
|
|
|
|
|
|
je .end |
|
|
|
|
|
|
|
add wq, rq |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.loop1: |
|
|
|
|
|
|
|
neg xq |
|
|
|
|
|
|
|
mov rw, [srcq + xq] |
|
|
|
|
|
|
|
neg xq |
|
|
|
|
|
|
|
mov [dstq + xq], rw |
|
|
|
|
|
|
|
add xq, 2 |
|
|
|
|
|
|
|
cmp xq, wq |
|
|
|
|
|
|
|
jl .loop1 |
|
|
|
|
|
|
|
.end: |
|
|
|
|
|
|
|
RET |
|
|
|
|
|
|
|