From ede1b9665a568bc038641f4154e33028c02d2778 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Fri, 16 Jul 2010 19:38:10 +0000
Subject: [PATCH] Give x86 r%d registers names; this will simplify
 implementation of the chroma inner loopfilter, and it also allows us to
 save one register on x86-64/sse2.

Originally committed as revision 24269 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/x86/vp8dsp.asm | 140 ++++++++++++++++++++++----------------
 1 file changed, 81 insertions(+), 59 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index e640fe3144..5507508e5b 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1380,14 +1380,32 @@ SIMPLE_LOOPFILTER sse2, h, 6
 
 %macro INNER_LOOPFILTER 4
 cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
+%define dst_reg     r0
+%define mstride_reg r1
+%define E_reg       r2
+%define I_reg       r3
+%define hev_thr_reg r4
+%ifdef m8 ; x86-64, sse2
+%define dst8_reg    r4
+%elif mmsize == 16 ; x86-32, sse2
+%define dst8_reg    r5
+%else ; x86-32, mmx/mmxext
+%define cnt_reg     r5
+%endif
+%define stride_reg  E_reg       ; reuse E's GPR once E has been splatted
+%define dst2_reg    I_reg       ; likewise I's
+%ifndef m8
+%define stack_reg   hev_thr_reg ; likewise hev_thr's (x86-32 only)
+%endif
+
 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
 ; splat function arguments
-    SPLATB_REG m0, r2, %1 ; E
-    SPLATB_REG m1, r3, %1 ; I
-    SPLATB_REG m2, r4, %1 ; hev_thresh
+    SPLATB_REG m0, E_reg, %1       ; E
+    SPLATB_REG m1, I_reg, %1       ; I
+    SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh
 
 ; align stack
-    mov r4, rsp          ; backup stack pointer
+    mov stack_reg, rsp   ; backup stack pointer
     and rsp, ~(mmsize-1) ; align stack
 %ifidn %2, v
     sub rsp, mmsize * 4  ; stack layout: [0]=E, [1]=I, [2]=hev_thr
@@ -1413,41 +1431,41 @@ cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
 %define mask_res m12
 
 ; splat function arguments
-    SPLATB_REG flim_E, r2, %1   ; E
-    SPLATB_REG flim_I, r3, %1   ; I
-    SPLATB_REG hev_thr, r4, %1  ; hev_thresh
+    SPLATB_REG flim_E, E_reg, %1        ; E
+    SPLATB_REG flim_I, I_reg, %1        ; I
+    SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh
 %endif
 
 %if mmsize == 8 ; mmx/mmxext
-    mov r5, 2
+    mov cnt_reg, 2
 %endif
-    mov r2, r1
-    neg r1
+    mov stride_reg, mstride_reg
+    neg mstride_reg
 %ifidn %2, h
-    lea r0, [r0+4*r2-4]
+    lea dst_reg, [dst_reg + stride_reg*4-4]
 %endif
 
 %if mmsize == 8
 .next8px
 %endif
 ; read
-    lea r3, [r0+r2]
+    lea dst2_reg, [dst_reg + stride_reg]
 %ifidn %2, v
-    mova m0, [r0+r1*4] ; p3
-    mova m1, [r3+r1*4] ; p2
-    mova m2, [r0+r1*2] ; p1
-    mova m5, [r3]      ; q1
-    mova m6, [r3+r2]   ; q2
-    mova m7, [r3+r2*2] ; q3
+    mova m0, [dst_reg +mstride_reg*4] ; p3
+    mova m1, [dst2_reg+mstride_reg*4] ; p2
+    mova m2, [dst_reg +mstride_reg*2] ; p1
+    mova m5, [dst2_reg]               ; q1
+    mova m6, [dst2_reg+ stride_reg]   ; q2
+    mova m7, [dst2_reg+ stride_reg*2] ; q3
 %elif mmsize == 8 ; mmx/mmxext (h)
 ; read 8 rows of 8px each
-    movu m0, [r0+r1*4]
-    movu m1, [r3+r1*4]
-    movu m2, [r0+r1*2]
-    movu m3, [r0+r1]
-    movu m4, [r0]
-    movu m5, [r3]
-    movu m6, [r3+r2]
+    movu m0, [dst_reg +mstride_reg*4]
+    movu m1, [dst2_reg+mstride_reg*4]
+    movu m2, [dst_reg +mstride_reg*2]
+    movu m3, [dst_reg +mstride_reg]
+    movu m4, [dst_reg]
+    movu m5, [dst2_reg]
+    movu m6, [dst2_reg+ stride_reg]
 
 ; 8x8 transpose
     TRANSPOSE4x4B 0, 1, 2, 3, 7
@@ -1456,7 +1474,7 @@ cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
 %else
     mova [rsp+mmsize*4], m1
 %endif
-    movu m7, [r3+r2*2]
+    movu m7, [dst2_reg+ stride_reg*2]
     TRANSPOSE4x4B 4, 5, 6, 7, 1
     SBUTTERFLY dq, 0, 4, 1 ; p3/p2
     SBUTTERFLY dq, 2, 6, 1 ; q0/q1
@@ -1479,31 +1497,31 @@ cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
     SWAP 6, 3
     SWAP 5, 3
 %else ; sse2 (h)
-    lea r5, [r0+r2*8]
+    lea dst8_reg, [dst_reg + stride_reg*8]
 
 ; read 16 rows of 8px each, interleave
-    movh m0, [r0+r1*4]
-    movh m1, [r5+r1*4]
-    movh m2, [r0+r1*2]
-    movh m5, [r5+r1*2]
-    movh m3, [r0+r1]
-    movh m6, [r5+r1]
-    movh m4, [r0]
-    movh m7, [r5]
+    movh m0, [dst_reg +mstride_reg*4]
+    movh m1, [dst8_reg+mstride_reg*4]
+    movh m2, [dst_reg +mstride_reg*2]
+    movh m5, [dst8_reg+mstride_reg*2]
+    movh m3, [dst_reg +mstride_reg]
+    movh m6, [dst8_reg+mstride_reg]
+    movh m4, [dst_reg]
+    movh m7, [dst8_reg]
     punpcklbw m0, m1 ; A/I
     punpcklbw m2, m5 ; C/K
    punpcklbw m3, m6 ; D/L
     punpcklbw m4, m7 ; E/M
-    add r5, r2
-    movh m1, [r3+r1*4]
-    movh m6, [r5+r1*4]
-    movh m5, [r3]
-    movh m7, [r5]
+    add dst8_reg, stride_reg
+    movh m1, [dst2_reg+mstride_reg*4]
+    movh m6, [dst8_reg+mstride_reg*4]
+    movh m5, [dst2_reg]
+    movh m7, [dst8_reg]
     punpcklbw m1, m6 ; B/J
     punpcklbw m5, m7 ; F/N
-    movh m6, [r3+r2]
-    movh m7, [r5+r2]
+    movh m6, [dst2_reg+ stride_reg]
+    movh m7, [dst8_reg+ stride_reg]
     punpcklbw m6, m7 ; G/O
 
 ; 8x16 transpose
@@ -1513,8 +1531,8 @@ cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
 %else
     mova [rsp+mmsize*4], m1
 %endif
-    movh m7, [r3+r2*2]
-    movh m1, [r5+r2*2]
+    movh m7, [dst2_reg+ stride_reg*2]
+    movh m1, [dst8_reg+ stride_reg*2]
     punpcklbw m7, m1 ; H/P
     TRANSPOSE4x4B 4, 5, 6, 7, 1
     SBUTTERFLY dq, 0, 4, 1 ; p3/p2
@@ -1591,7 +1609,7 @@ cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
 ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP 7, 3 ; now m7 is zero
 %ifidn %2, v
-    mova m3, [r0+r1] ; p0
+    mova m3, [dst_reg +mstride_reg] ; p0
 %elifdef m14
     SWAP 3, 14
 %else
@@ -1624,7 +1642,7 @@ cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
     SWAP 6, 4 ; now m6 is I
 %ifidn %2, v
-    mova m4, [r0] ; q0
+    mova m4, [dst_reg] ; q0
 %elifdef m13
     SWAP 4, 13
 %else
@@ -1775,37 +1793,37 @@ cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
 
 ; store
 %ifidn %2, v
-    mova [r0+r1*2], m2
-    mova [r0+r1], m3
-    mova [r0], m4
-    mova [r0+r2], m5
+    mova [dst_reg+mstride_reg*2], m2
+    mova [dst_reg+mstride_reg  ], m3
+    mova [dst_reg], m4
+    mova [dst_reg+ stride_reg  ], m5
 %else ; h
-    add r0, 2
-    add r3, 2
+    add dst_reg, 2
+    add dst2_reg, 2
 
 ; 4x8/16 transpose
     TRANSPOSE4x4B 2, 3, 4, 5, 6
 
 %if mmsize == 8 ; mmx/mmxext (h)
-    WRITE_4x2D 2, 3, 4, 5, r0, r3, r1, r2
+    WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
 %else ; sse2 (h)
-    lea r5, [r5+r1+2]
-    WRITE_4x4D 2, 3, 4, 5, r0, r3, r5, r1, r2
+    lea dst8_reg, [dst8_reg+mstride_reg+2]
+    WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg
 %endif
 %endif
 
 %if mmsize == 8
 %ifidn %2, h
-    lea r0, [r0+8*r2-2]
+    lea dst_reg, [dst_reg + stride_reg*8-2]
 %else ; v
-    add r0, 8
+    add dst_reg, 8
 %endif
-    dec r5
+    dec cnt_reg
     jg .next8px
 %endif
 
 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
-    mov rsp, r4 ; restore stack pointer
+    mov rsp, stack_reg ; restore stack pointer
 %endif
     RET
 %endmacro
@@ -1817,4 +1835,8 @@ INNER_LOOPFILTER mmxext, v, 6, 8
 INNER_LOOPFILTER mmxext, h, 6, 8
 INIT_XMM
 INNER_LOOPFILTER sse2, v, 5, 13
+%ifdef m8
+INNER_LOOPFILTER sse2, h, 5, 15
+%else
 INNER_LOOPFILTER sse2, h, 6, 15
+%endif
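
Note (editor's addition, not part of the patch): the %define block is purely textual aliasing of the x86inc.asm abstracted GPRs; it emits no instructions. Two role names may share one physical register as long as their live ranges do not overlap, which is how the x86-64/sse2 horizontal filter drops from 6 to 5 GPRs: hev_thresh arrives in r4, is broadcast into a simd register by SPLATB_REG, and r4 is afterwards reused under the name dst8_reg as the pointer to rows 8-15. A minimal sketch of the idiom, under those assumptions:

    ; Illustrative fragment only (editor's sketch, not code from vp8dsp.asm);
    ; assumes the x86inc.asm conventions used by this file, where r0..r5 are
    ; abstracted GPRs and SPLATB_REG broadcasts a byte into a simd register.
    %define dst_reg     r0
    %define stride_reg  r2
    %define hev_thr_reg r4 ; phase 1: r4 holds the hev_thresh argument
    %define dst8_reg    r4 ; phase 2: the same GPR, renamed for its next role

        SPLATB_REG m2, hev_thr_reg, sse2     ; last read of r4 as hev_thresh
        lea dst8_reg, [dst_reg+stride_reg*8] ; r4 now dead, reused as pointer

Since both names expand to the same register, the rename costs nothing at runtime; it only documents which role the register plays at each point, which is what makes the planned chroma inner-loopfilter reuse of this macro straightforward.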