|
|
|
@ -146,8 +146,13 @@ pw_20091: times 4 dw 20091 |
|
|
|
|
pw_17734: times 4 dw 17734 |
|
|
|
|
|
|
|
|
|
cextern pw_3 |
|
|
|
|
cextern pb_3 |
|
|
|
|
cextern pw_4 |
|
|
|
|
cextern pb_4 |
|
|
|
|
cextern pw_64 |
|
|
|
|
cextern pb_80 |
|
|
|
|
cextern pb_F8 |
|
|
|
|
cextern pb_FE |
|
|
|
|
|
|
|
|
|
SECTION .text |
|
|
|
|
|
|
|
|
@ -1063,3 +1068,304 @@ cglobal vp8_luma_dc_wht_mmxext, 2,3 |
|
|
|
|
add r0, 2*16*4 |
|
|
|
|
SCATTER_WHT 3 |
|
|
|
|
RET |
|
|
|
|
|
|
|
|
|
;----------------------------------------------------------------------------- |
|
|
|
|
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |
|
|
|
|
;----------------------------------------------------------------------------- |
|
|
|
|
|
|
|
|
|
; macro called with 7 mm register indexes as argument, and 4 regular registers |
|
|
|
|
; |
|
|
|
|
; first 4 mm registers will carry the transposed pixel data |
|
|
|
|
; the other three are scratchspace (one would be sufficient, but this allows |
|
|
|
|
; for more spreading/pipelining and thus faster execution on OOE CPUs) |
|
|
|
|
; |
|
|
|
|
; first two regular registers are buf+4*stride and buf+5*stride |
|
|
|
|
; third is -stride, fourth is +stride |
|
|
|
|
%macro READ_8x4_INTERLEAVED 11 |
|
|
|
|
; interleave 8 (A-H) rows of 4 pixels each |
|
|
|
|
movd m%1, [%8+%10*4] ; A0-3 |
|
|
|
|
movd m%5, [%9+%10*4] ; B0-3 |
|
|
|
|
movd m%2, [%8+%10*2] ; C0-3 |
|
|
|
|
movd m%6, [%8+%10] ; D0-3 |
|
|
|
|
movd m%3, [%8] ; E0-3 |
|
|
|
|
movd m%7, [%9] ; F0-3 |
|
|
|
|
movd m%4, [%9+%11] ; G0-3 |
|
|
|
|
punpcklbw m%1, m%5 ; A/B interleaved |
|
|
|
|
movd m%5, [%9+%11*2] ; H0-3 |
|
|
|
|
punpcklbw m%2, m%6 ; C/D interleaved |
|
|
|
|
punpcklbw m%3, m%7 ; E/F interleaved |
|
|
|
|
punpcklbw m%4, m%5 ; G/H interleaved |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; macro called with 7 mm register indexes as argument, and 5 regular registers |
|
|
|
|
; first 11 mean the same as READ_8x4_TRANSPOSED above |
|
|
|
|
; fifth regular register is scratchspace to reach the bottom 8 rows, it |
|
|
|
|
; will be set to second regular register + 8*stride at the end |
|
|
|
|
%macro READ_16x4_INTERLEAVED 12 |
|
|
|
|
; transpose 16 (A-P) rows of 4 pixels each |
|
|
|
|
lea %12, [r0+8*r2] |
|
|
|
|
|
|
|
|
|
; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M |
|
|
|
|
movd m%1, [%8+%10*4] ; A0-3 |
|
|
|
|
movd m%3, [%12+%10*4] ; I0-3 |
|
|
|
|
movd m%2, [%8+%10*2] ; C0-3 |
|
|
|
|
movd m%4, [%12+%10*2] ; K0-3 |
|
|
|
|
movd m%6, [%8+%10] ; D0-3 |
|
|
|
|
movd m%5, [%12+%10] ; L0-3 |
|
|
|
|
movd m%7, [%12] ; M0-3 |
|
|
|
|
add %12, %11 |
|
|
|
|
punpcklbw m%1, m%3 ; A/I |
|
|
|
|
movd m%3, [%8] ; E0-3 |
|
|
|
|
punpcklbw m%2, m%4 ; C/K |
|
|
|
|
punpcklbw m%6, m%5 ; D/L |
|
|
|
|
punpcklbw m%3, m%7 ; E/M |
|
|
|
|
punpcklbw m%2, m%6 ; C/D/K/L interleaved |
|
|
|
|
|
|
|
|
|
; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P |
|
|
|
|
movd m%5, [%9+%10*4] ; B0-3 |
|
|
|
|
movd m%4, [%12+%10*4] ; J0-3 |
|
|
|
|
movd m%7, [%9] ; F0-3 |
|
|
|
|
movd m%6, [%12] ; N0-3 |
|
|
|
|
punpcklbw m%5, m%4 ; B/J |
|
|
|
|
punpcklbw m%7, m%6 ; F/N |
|
|
|
|
punpcklbw m%1, m%5 ; A/B/I/J interleaved |
|
|
|
|
punpcklbw m%3, m%7 ; E/F/M/N interleaved |
|
|
|
|
movd m%4, [%9+%11] ; G0-3 |
|
|
|
|
movd m%6, [%12+%11] ; O0-3 |
|
|
|
|
movd m%5, [%9+%11*2] ; H0-3 |
|
|
|
|
movd m%7, [%12+%11*2] ; P0-3 |
|
|
|
|
punpcklbw m%4, m%6 ; G/O |
|
|
|
|
punpcklbw m%5, m%7 ; H/P |
|
|
|
|
punpcklbw m%4, m%5 ; G/H/O/P interleaved |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; write 4 mm registers of 2 dwords each |
|
|
|
|
; first four arguments are mm register indexes containing source data |
|
|
|
|
; last four are registers containing buf+4*stride, buf+5*stride, |
|
|
|
|
; -stride and +stride |
|
|
|
|
%macro WRITE_4x2D 8 |
|
|
|
|
; write out (2 dwords per register) |
|
|
|
|
movd [%5+%7*4], m%1 |
|
|
|
|
movd [%5+%7*2], m%2 |
|
|
|
|
movd [%5], m%3 |
|
|
|
|
movd [%6+%8], m%4 |
|
|
|
|
punpckhdq m%1, m%1 |
|
|
|
|
punpckhdq m%2, m%2 |
|
|
|
|
punpckhdq m%3, m%3 |
|
|
|
|
punpckhdq m%4, m%4 |
|
|
|
|
movd [%6+%7*4], m%1 |
|
|
|
|
movd [%5+%7], m%2 |
|
|
|
|
movd [%6], m%3 |
|
|
|
|
movd [%6+%8*2], m%4 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; write 4 xmm registers of 4 dwords each |
|
|
|
|
; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular |
|
|
|
|
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride |
|
|
|
|
; we add 1*stride to the third regular registry in the process |
|
|
|
|
%macro WRITE_4x4D 9 |
|
|
|
|
; write out (4 dwords per register), start with dwords zero |
|
|
|
|
movd [%5+%8*4], m%1 |
|
|
|
|
movd [%5], m%2 |
|
|
|
|
movd [%5+%9*4], m%3 |
|
|
|
|
movd [%5+%9*8], m%4 |
|
|
|
|
|
|
|
|
|
; store dwords 1 |
|
|
|
|
psrldq m%1, 4 |
|
|
|
|
psrldq m%2, 4 |
|
|
|
|
psrldq m%3, 4 |
|
|
|
|
psrldq m%4, 4 |
|
|
|
|
movd [%6+%8*4], m%1 |
|
|
|
|
movd [%6], m%2 |
|
|
|
|
movd [%6+%9*4], m%3 |
|
|
|
|
movd [%6+%9*8], m%4 |
|
|
|
|
|
|
|
|
|
; write dwords 2 |
|
|
|
|
psrldq m%1, 4 |
|
|
|
|
psrldq m%2, 4 |
|
|
|
|
psrldq m%3, 4 |
|
|
|
|
psrldq m%4, 4 |
|
|
|
|
movd [%5+%8*2], m%1 |
|
|
|
|
movd [%6+%9], m%2 |
|
|
|
|
movd [%7+%8*2], m%3 |
|
|
|
|
movd [%7+%9*2], m%4 |
|
|
|
|
add %7, %9 |
|
|
|
|
|
|
|
|
|
; store dwords 3 |
|
|
|
|
psrldq m%1, 4 |
|
|
|
|
psrldq m%2, 4 |
|
|
|
|
psrldq m%3, 4 |
|
|
|
|
psrldq m%4, 4 |
|
|
|
|
movd [%5+%8], m%1 |
|
|
|
|
movd [%6+%9*2], m%2 |
|
|
|
|
movd [%7+%8*2], m%3 |
|
|
|
|
movd [%7+%9*2], m%4 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%macro SIMPLE_LOOPFILTER 3 |
|
|
|
|
cglobal vp8_%2_loop_filter_simple_%1, 3, %3 |
|
|
|
|
%ifidn %2, h |
|
|
|
|
mov r5, rsp ; backup stack pointer |
|
|
|
|
and rsp, ~(mmsize-1) ; align stack |
|
|
|
|
%endif |
|
|
|
|
%if mmsize == 8 ; mmx/mmxext |
|
|
|
|
mov r3, 2 |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
; splat register with "flim" |
|
|
|
|
movd m7, r2 |
|
|
|
|
punpcklbw m7, m7 |
|
|
|
|
%if mmsize == 16 ; sse2 |
|
|
|
|
punpcklwd m7, m7 |
|
|
|
|
pshufd m7, m7, 0x0 |
|
|
|
|
%elifidn %1, mmx |
|
|
|
|
punpcklwd m7, m7 |
|
|
|
|
punpckldq m7, m7 |
|
|
|
|
%else ; mmxext |
|
|
|
|
pshufw m7, m7, 0x0 |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
; set up indexes to address 4 rows |
|
|
|
|
mov r2, r1 |
|
|
|
|
neg r1 |
|
|
|
|
%ifidn %2, h |
|
|
|
|
lea r0, [r0+4*r2-2] |
|
|
|
|
sub rsp, mmsize*2 ; (aligned) storage space for saving p1/q1 |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
%if mmsize == 8 ; mmx / mmxext |
|
|
|
|
.next8px |
|
|
|
|
%endif |
|
|
|
|
%ifidn %2, v |
|
|
|
|
; read 4 half/full rows of pixels |
|
|
|
|
mova m0, [r0+r1*2] ; p1 |
|
|
|
|
mova m1, [r0+r1] ; p0 |
|
|
|
|
mova m2, [r0] ; q0 |
|
|
|
|
mova m3, [r0+r2] ; q1 |
|
|
|
|
%else ; h |
|
|
|
|
lea r4, [r0+r2] |
|
|
|
|
|
|
|
|
|
%if mmsize == 8 ; mmx/mmxext |
|
|
|
|
READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 |
|
|
|
|
%else ; sse2 |
|
|
|
|
READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 |
|
|
|
|
%endif |
|
|
|
|
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
|
|
|
|
|
|
|
|
|
mova [rsp], m0 ; store p1 |
|
|
|
|
mova [rsp+mmsize], m3 ; store q1 |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
; simple_limit |
|
|
|
|
mova m5, m2 ; m5=backup of q0 |
|
|
|
|
mova m6, m1 ; m6=backup of p0 |
|
|
|
|
psubusb m1, m2 ; p0-q0 |
|
|
|
|
psubusb m2, m6 ; q0-p0 |
|
|
|
|
por m1, m2 ; FFABS(p0-q0) |
|
|
|
|
paddusb m1, m1 ; m1=FFABS(p0-q0)*2 |
|
|
|
|
|
|
|
|
|
mova m4, m3 |
|
|
|
|
mova m2, m0 |
|
|
|
|
psubusb m3, m0 ; q1-p1 |
|
|
|
|
psubusb m0, m4 ; p1-q1 |
|
|
|
|
por m3, m0 ; FFABS(p1-q1) |
|
|
|
|
mova m0, [pb_80] |
|
|
|
|
pxor m2, m0 |
|
|
|
|
pxor m4, m0 |
|
|
|
|
psubsb m2, m4 ; m2=p1-q1 (signed) backup for below |
|
|
|
|
pand m3, [pb_FE] |
|
|
|
|
psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed |
|
|
|
|
paddusb m3, m1 |
|
|
|
|
psubusb m3, m7 |
|
|
|
|
pxor m1, m1 |
|
|
|
|
pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) |
|
|
|
|
|
|
|
|
|
; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) |
|
|
|
|
mova m4, m5 |
|
|
|
|
pxor m5, m0 |
|
|
|
|
pxor m0, m6 |
|
|
|
|
psubsb m5, m0 ; q0-p0 (signed) |
|
|
|
|
paddsb m2, m5 |
|
|
|
|
paddsb m2, m5 |
|
|
|
|
paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) |
|
|
|
|
pand m2, m3 ; apply filter mask (m3) |
|
|
|
|
|
|
|
|
|
mova m3, [pb_F8] |
|
|
|
|
mova m1, m2 |
|
|
|
|
paddsb m2, [pb_4] ; f1<<3=a+4 |
|
|
|
|
paddsb m1, [pb_3] ; f2<<3=a+3 |
|
|
|
|
pand m2, m3 |
|
|
|
|
pand m1, m3 ; cache f2<<3 |
|
|
|
|
|
|
|
|
|
pxor m0, m0 |
|
|
|
|
pxor m3, m3 |
|
|
|
|
pcmpgtb m0, m2 ; which values are <0? |
|
|
|
|
psubb m3, m2 ; -f1<<3 |
|
|
|
|
psrlq m2, 3 ; +f1 |
|
|
|
|
psrlq m3, 3 ; -f1 |
|
|
|
|
pand m3, m0 |
|
|
|
|
pandn m0, m2 |
|
|
|
|
psubusb m4, m0 |
|
|
|
|
paddusb m4, m3 ; q0-f1 |
|
|
|
|
|
|
|
|
|
pxor m0, m0 |
|
|
|
|
pxor m3, m3 |
|
|
|
|
pcmpgtb m0, m1 ; which values are <0? |
|
|
|
|
psubb m3, m1 ; -f2<<3 |
|
|
|
|
psrlq m1, 3 ; +f2 |
|
|
|
|
psrlq m3, 3 ; -f2 |
|
|
|
|
pand m3, m0 |
|
|
|
|
pandn m0, m1 |
|
|
|
|
paddusb m6, m0 |
|
|
|
|
psubusb m6, m3 ; p0+f2 |
|
|
|
|
|
|
|
|
|
; store |
|
|
|
|
%ifidn %2, v |
|
|
|
|
mova [r0], m4 |
|
|
|
|
mova [r0+r1], m6 |
|
|
|
|
%else ; h |
|
|
|
|
mova m0, [rsp] ; p1 |
|
|
|
|
SWAP 2, 4 ; p0 |
|
|
|
|
SWAP 1, 6 ; q0 |
|
|
|
|
mova m3, [rsp+mmsize] ; q1 |
|
|
|
|
|
|
|
|
|
TRANSPOSE4x4B 0, 1, 2, 3, 4 |
|
|
|
|
%if mmsize == 16 ; sse2 |
|
|
|
|
add r3, r1 ; change from r4*8*stride to r0+8*stride |
|
|
|
|
WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2 |
|
|
|
|
%else ; mmx/mmxext |
|
|
|
|
WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2 |
|
|
|
|
%endif |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
%if mmsize == 8 ; mmx/mmxext |
|
|
|
|
; next 8 pixels |
|
|
|
|
%ifidn %2, v |
|
|
|
|
add r0, 8 ; advance 8 cols = pixels |
|
|
|
|
%else ; h |
|
|
|
|
lea r0, [r0+r2*8] ; advance 8 rows = lines |
|
|
|
|
%endif |
|
|
|
|
dec r3 |
|
|
|
|
jg .next8px |
|
|
|
|
%ifidn %2, v |
|
|
|
|
REP_RET |
|
|
|
|
%else ; h |
|
|
|
|
mov rsp, r5 ; restore stack pointer |
|
|
|
|
RET |
|
|
|
|
%endif |
|
|
|
|
%else ; sse2 |
|
|
|
|
%ifidn %2, h |
|
|
|
|
mov rsp, r5 ; restore stack pointer |
|
|
|
|
%endif |
|
|
|
|
RET |
|
|
|
|
%endif |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
INIT_MMX |
|
|
|
|
SIMPLE_LOOPFILTER mmx, v, 4 |
|
|
|
|
SIMPLE_LOOPFILTER mmx, h, 6 |
|
|
|
|
SIMPLE_LOOPFILTER mmxext, v, 4 |
|
|
|
|
SIMPLE_LOOPFILTER mmxext, h, 6 |
|
|
|
|
INIT_XMM |
|
|
|
|
SIMPLE_LOOPFILTER sse2, v, 3 |
|
|
|
|
SIMPLE_LOOPFILTER sse2, h, 6 |
|
|
|
|