diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index 76bb06e61b..6438644867 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -283,9 +283,7 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \ } \ dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \ - if (ARCH_X86_64) { \ - dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \ - } \ + dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \ dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \ if (ARCH_X86_64) { \ dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \ diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm index 57536b9d83..881bdab8a8 100644 --- a/libavcodec/x86/vp9lpf.asm +++ b/libavcodec/x86/vp9lpf.asm @@ -291,38 +291,6 @@ SECTION .text SWAP %12, %14 %endmacro -; transpose 16 half lines (high part) to 8 full centered lines -%macro TRANSPOSE16x8B 16 - punpcklbw m%1, m%2 - punpcklbw m%3, m%4 - punpcklbw m%5, m%6 - punpcklbw m%7, m%8 - punpcklbw m%9, m%10 - punpcklbw m%11, m%12 - punpcklbw m%13, m%14 - punpcklbw m%15, m%16 - SBUTTERFLY wd, %1, %3, %2 - SBUTTERFLY wd, %5, %7, %2 - SBUTTERFLY wd, %9, %11, %2 - SBUTTERFLY wd, %13, %15, %2 - SBUTTERFLY dq, %1, %5, %2 - SBUTTERFLY dq, %3, %7, %2 - SBUTTERFLY dq, %9, %13, %2 - SBUTTERFLY dq, %11, %15, %2 - SBUTTERFLY qdq, %1, %9, %2 - SBUTTERFLY qdq, %3, %11, %2 - SBUTTERFLY qdq, %5, %13, %2 - SBUTTERFLY qdq, %7, %15, %2 - SWAP %5, %1 - SWAP %6, %9 - SWAP %7, %1 - SWAP %8, %13 - SWAP %9, %3 - SWAP %10, %11 - SWAP %11, %1 - SWAP %12, %15 -%endmacro - %macro DEFINE_REAL_P7_TO_Q7 0-1 0 %define P7 dstq + 4*mstrideq + %1 %define P6 dstq + mstride3q + %1 @@ -398,6 +366,7 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride, movx m5, [P2] movx m6, [P1] movx m7, [P0] +%if ARCH_X86_64 movx m8, [Q0] movx m9, [Q1] movx m10, [Q2] @@ -406,32 +375,67 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride, movx m13, [Q5] movx m14, [Q6] movx m15, [Q7] -%define P7 rsp + 0 -%define P6 rsp + 16 -%define P5 rsp + 32 -%define P4 rsp + 48 -%define P3 rsp + 64 -%define P2 rsp + 80 -%define P1 rsp + 96 -%define P0 rsp + 112 -%define Q0 rsp + 128 -%define Q1 rsp + 144 -%define Q2 rsp + 160 -%define Q3 rsp + 176 +%if %2 == 16 + TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp] +%define P7 rsp + 128 +%define P6 rsp + 144 +%define P5 rsp + 160 +%define P4 rsp + 176 %define Q4 rsp + 192 %define Q5 rsp + 208 %define Q6 rsp + 224 %define Q7 rsp + 240 - -%if %2 == 16 - TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp] mova [P7], m0 mova [P6], m1 mova [P5], m2 mova [P4], m3 %else - TRANSPOSE16x8B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -%endif + ; 8x16 transpose + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 + punpcklbw m8, m9 + punpcklbw m10, m11 + punpcklbw m12, m13 + punpcklbw m14, m15 + TRANSPOSE8x8W 0, 2, 4, 6, 8, 10, 12, 14, 15 + SWAP 0, 4 + SWAP 2, 5 + SWAP 0, 6 + SWAP 0, 7 + SWAP 10, 9 + SWAP 12, 10 + SWAP 14, 11 +%endif +%else ; x86-32 + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 + movx m1, [Q0] + movx m3, [Q1] + movx m5, [Q2] + movx m7, [Q3] + punpcklbw m1, m3 + punpcklbw m5, m7 + movx m3, [Q4] + movx m7, [Q5] + punpcklbw m3, m7 + mova [rsp], m3 + movx m3, [Q6] + movx m7, [Q7] + punpcklbw m3, m7 +%endif +%define P3 rsp + 0 +%define P2 rsp + 16 +%define P1 rsp + 32 +%define P0 rsp + 48 +%define Q0 rsp + 64 +%define Q1 rsp + 80 +%define Q2 rsp + 96 +%define Q3 rsp + 112 +%if ARCH_X86_64 mova [P3], m4 mova [P2], m5 mova [P1], m6 @@ -446,7 +450,17 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride, mova [Q6], m14 mova [Q7], m15 %endif +%else ; x86-32 + TRANSPOSE8x8W 0, 2, 4, 6, 1, 5, 7, 3, [rsp], [Q0], 1 + mova [P3], m0 + mova [P2], m2 + mova [P1], m4 + mova [P0], m6 + mova [Q1], m5 + mova [Q2], m7 + mova [Q3], m3 %endif +%endif ; %1 == h ; calc fm mask %if %2 == 16 @@ -962,22 +976,22 @@ cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride, RET %endmacro -%macro LPF_16_VH 4 -INIT_XMM %4 -LOOPFILTER v, %1, %2, 0, %3 -%if ARCH_X86_64 -LOOPFILTER h, %1, %2, 256, %3 +%macro LPF_16_VH 5 +INIT_XMM %5 +LOOPFILTER v, %1, %2, 0, %4 +%if ARCH_X86_64 || %1 == 44 +LOOPFILTER h, %1, %2, %3, %4 %endif %endmacro -%macro LPF_16_VH_ALL_OPTS 2-3 0 -LPF_16_VH %1, %2, %3, sse2 -LPF_16_VH %1, %2, %3, ssse3 -LPF_16_VH %1, %2, %3, avx +%macro LPF_16_VH_ALL_OPTS 4 +LPF_16_VH %1, %2, %3, %4, sse2 +LPF_16_VH %1, %2, %3, %4, ssse3 +LPF_16_VH %1, %2, %3, %4, avx %endmacro -LPF_16_VH_ALL_OPTS 16, 512, 32 -LPF_16_VH_ALL_OPTS 44, 0, 0 -LPF_16_VH_ALL_OPTS 48, 256, 16 -LPF_16_VH_ALL_OPTS 84, 256, 16 -LPF_16_VH_ALL_OPTS 88, 256, 16 +LPF_16_VH_ALL_OPTS 16, 512, 256, 32 +LPF_16_VH_ALL_OPTS 44, 0, 128, 0 +LPF_16_VH_ALL_OPTS 48, 256, 128, 16 +LPF_16_VH_ALL_OPTS 84, 256, 128, 16 +LPF_16_VH_ALL_OPTS 88, 256, 128, 16