From 406fbd24dc62db4853cb24b24f40caf3e70ee2e8 Mon Sep 17 00:00:00 2001 From: Daniel Kang Date: Thu, 21 Jul 2011 21:15:58 -0400 Subject: [PATCH] H.264: Add optimizations to predict x86 assembly. Signed-off-by: Ronald S. Bultje --- libavcodec/x86/h264_intrapred.asm | 5 +- libavcodec/x86/h264_intrapred_10bit.asm | 1117 +++++++++-------------- libavcodec/x86/h264_intrapred_init.c | 29 +- 3 files changed, 437 insertions(+), 714 deletions(-) diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index cbf3cf7a5c..c1cd5c4d25 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -2611,12 +2611,11 @@ cglobal pred4x4_down_left_mmxext, 3,3 punpckldq m1, [r1] movq m2, m1 movq m3, m1 - movq m4, m1 psllq m1, 8 pxor m2, m1 psrlq m2, 8 - pxor m3, m2 - PRED4x4_LOWPASS m0, m1, m3, m4, m5 + pxor m2, m3 + PRED4x4_LOWPASS m0, m1, m2, m3, m4 lea r1, [r0+r2*2] psrlq m0, 8 movd [r0+r2*1], m0 diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm index 24a7bfa875..e14e31a38c 100644 --- a/libavcodec/x86/h264_intrapred_10bit.asm +++ b/libavcodec/x86/h264_intrapred_10bit.asm @@ -27,8 +27,6 @@ SECTION_RODATA -SECTION .text - cextern pw_16 cextern pw_8 cextern pw_4 @@ -42,6 +40,8 @@ pw_512: times 8 dw 512 pd_17: times 4 dd 17 pd_16: times 4 dd 16 +SECTION .text + ; dest, left, right, src ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 %macro PRED4x4_LOWPASS 4 @@ -64,13 +64,11 @@ cglobal pred4x4_down_right_10_%1, 3,3 movq m3, [r0] punpckhdq m1, m2 PALIGNR m3, m1, 10, m1 - mova m1, m3 movhps m4, [r1+r2*1-8] - PALIGNR m3, m4, 14, m4 - mova m2, m3 + PALIGNR m0, m3, m4, 14, m4 movhps m4, [r1+r2*2-8] - PALIGNR m3, m4, 14, m4 - PRED4x4_LOWPASS m0, m3, m1, m2 + PALIGNR m2, m0, m4, 14, m4 + PRED4x4_LOWPASS m0, m2, m3, m0 movq [r1+r2*2], m0 psrldq m0, 2 movq [r1+r2*1], m0 @@ -104,22 +102,20 @@ cglobal pred4x4_vertical_right_10_%1, 3,3,6 pavgw m5, m0 movhps m1, [r0+r2*1-8] PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0 - mova m1, m0 movhps m2, [r0+r2*2-8] - PALIGNR m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1 - mova m2, m0 + PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1 movhps m3, [r1+r2*1-8] - PALIGNR m0, m3, 14, m3 ; t3t2t1t0ltl0l1l2 - PRED4x4_LOWPASS m3, m1, m0, m2 - pslldq m1, m3, 12 - psrldq m3, 4 + PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2 + PRED4x4_LOWPASS m1, m0, m2, m1 + pslldq m0, m1, 12 + psrldq m1, 4 movq [r0+r2*1], m5 - movq [r0+r2*2], m3 - PALIGNR m5, m1, 14, m2 - pslldq m1, 2 + movq [r0+r2*2], m1 + PALIGNR m5, m0, 14, m2 + pslldq m0, 2 movq [r1+r2*1], m5 - PALIGNR m3, m1, 14, m1 - movq [r1+r2*2], m3 + PALIGNR m1, m0, 14, m0 + movq [r1+r2*2], m1 RET %endmacro @@ -152,9 +148,9 @@ cglobal pred4x4_horizontal_down_10_%1, 3,3 punpckhdq m1, m2 ; l0 l1 l2 l3 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1 - psrldq m2, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2 - pavgw m5, m1, m2 - PRED4x4_LOWPASS m3, m1, m0, m2 + psrldq m3, m1, 2 ; .. 
t2 t1 t0 lt l0 l1 l2 + pavgw m5, m1, m3 + PRED4x4_LOWPASS m3, m1, m0, m3 punpcklwd m5, m3 psrldq m3, 8 PALIGNR m3, m5, 12, m4 @@ -220,17 +216,15 @@ cglobal pred4x4_dc_10_mmxext, 3,3 ;----------------------------------------------------------------------------- ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride) ;----------------------------------------------------------------------------- -;TODO: more AVX here %macro PRED4x4_DL 1 cglobal pred4x4_down_left_10_%1, 3,3 sub r0, r2 - movq m1, [r0] - movhps m1, [r1] - pslldq m5, m1, 2 - pxor m2, m5, m1 - psrldq m2, 2 - pxor m3, m1, m2 - PRED4x4_LOWPASS m0, m5, m3, m1 + movq m0, [r0] + movhps m0, [r1] + psrldq m2, m0, 2 + pslldq m3, m0, 2 + pshufhw m2, m2, 10100100b + PRED4x4_LOWPASS m0, m3, m2, m0 lea r1, [r0+r2*2] movhps [r1+r2*2], m0 psrldq m0, 2 @@ -257,10 +251,10 @@ cglobal pred4x4_vertical_left_10_%1, 3,3 sub r0, r2 movu m1, [r0] movhps m1, [r1] - psrldq m3, m1, 2 + psrldq m0, m1, 2 psrldq m2, m1, 4 - pavgw m4, m3, m1 - PRED4x4_LOWPASS m0, m1, m2, m3 + pavgw m4, m0, m1 + PRED4x4_LOWPASS m0, m1, m2, m0 lea r1, [r0+r2*2] movq [r0+r2*1], m4 movq [r0+r2*2], m0 @@ -298,13 +292,13 @@ cglobal pred4x4_horizontal_up_10_mmxext, 3,3 pavgw m2, m0 pshufw m5, m0, 11111110b - PRED4x4_LOWPASS m3, m0, m5, m1 + PRED4x4_LOWPASS m1, m0, m5, m1 movq m6, m2 - punpcklwd m6, m3 + punpcklwd m6, m1 movq [r0+r2*1], m6 psrlq m2, 16 - psrlq m3, 16 - punpcklwd m2, m3 + psrlq m1, 16 + punpcklwd m2, m1 movq [r0+r2*2], m2 psrlq m2, 32 movd [r1+r2*1], m2 @@ -333,7 +327,7 @@ cglobal pred8x8_vertical_10_sse2, 2,2 ;----------------------------------------------------------------------------- INIT_XMM cglobal pred8x8_horizontal_10_sse2, 2,3 - mov r2, 4 + mov r2d, 4 .loop: movq m0, [r0+r1*0-8] movq m1, [r0+r1*1-8] @@ -344,7 +338,7 @@ cglobal pred8x8_horizontal_10_sse2, 2,3 mova [r0+r1*0], m0 mova [r0+r1*1], m1 lea r0, [r0+r1*2] - dec r2 + dec r2d jg .loop REP_RET @@ -362,53 +356,53 @@ cglobal pred8x8_horizontal_10_sse2, 2,3 %endmacro %macro PRED8x8_DC 2 -cglobal pred8x8_dc_10_%1, 2,4 -%ifdef ARCH_X86_64 -%define t0 r10 -%else -%define t0 r0m -%endif +cglobal pred8x8_dc_10_%1, 2,6 sub r0, r1 pxor m4, m4 movq m0, [r0+0] movq m1, [r0+8] - HADDW m0, m2 - mov t0, r0 - HADDW m1, m2 +%if mmsize==16 + punpcklwd m0, m1 + movhlps m1, m0 + paddw m0, m1 +%else + pshufw m2, m0, 00001110b + pshufw m3, m1, 00001110b + paddw m0, m2 + paddw m1, m3 + punpcklwd m0, m1 +%endif + %2 m2, m0, 00001110b + paddw m0, m2 + lea r5, [r1*3] + lea r4, [r0+r1*4] movzx r2d, word [r0+r1*1-2] movzx r3d, word [r0+r1*2-2] - lea r0, [r0+r1*2] add r2d, r3d - movzx r3d, word [r0+r1*1-2] + movzx r3d, word [r0+r5*1-2] add r2d, r3d - movzx r3d, word [r0+r1*2-2] + movzx r3d, word [r4-2] add r2d, r3d - lea r0, [r0+r1*2] movd m2, r2d ; s2 - movzx r2d, word [r0+r1*1-2] - movzx r3d, word [r0+r1*2-2] - lea r0, [r0+r1*2] + movzx r2d, word [r4+r1*1-2] + movzx r3d, word [r4+r1*2-2] add r2d, r3d - movzx r3d, word [r0+r1*1-2] + movzx r3d, word [r4+r5*1-2] add r2d, r3d - movzx r3d, word [r0+r1*2-2] + movzx r3d, word [r4+r1*4-2] add r2d, r3d movd m3, r2d ; s3 - punpcklwd m0, m1 - mov r0, t0 punpcklwd m2, m3 punpckldq m0, m2 ; s0, s1, s2, s3 %2 m3, m0, 11110110b ; s2, s1, s3, s3 - lea r2, [r1+r1*2] %2 m0, m0, 01110100b ; s0, s1, s3, s1 paddw m0, m3 - lea r3, [r0+r1*4] psrlw m0, 2 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3 -%ifidn %1, sse2 +%if mmsize==16 punpcklwd m0, m0 pshufd m3, m0, 11111010b punpckldq m0, m0 @@ -421,12 +415,12 @@ cglobal pred8x8_dc_10_%1, 2,4 %endif MOV8 r0+r1*1, m1, m2 MOV8 r0+r1*2, m1, m2 - MOV8 
r0+r2*1, m1, m2 + MOV8 r0+r5*1, m1, m2 MOV8 r0+r1*4, m1, m2 - MOV8 r3+r1*1, m3, m4 - MOV8 r3+r1*2, m3, m4 - MOV8 r3+r2*1, m3, m4 - MOV8 r3+r1*4, m3, m4 + MOV8 r4+r1*1, m3, m4 + MOV8 r4+r1*2, m3, m4 + MOV8 r4+r5*1, m3, m4 + MOV8 r4+r1*4, m3, m4 RET %endmacro @@ -438,39 +432,29 @@ PRED8x8_DC sse2 , pshuflw ;----------------------------------------------------------------------------- ; void pred8x8_top_dc(pixel *src, int stride) ;----------------------------------------------------------------------------- -%macro PRED8x8_TOP_DC 2 -cglobal pred8x8_top_dc_10_%1, 2,4 +INIT_XMM +cglobal pred8x8_top_dc_10_sse2, 2,4 sub r0, r1 - movq m0, [r0+0] - movq m1, [r0+8] - HADDW m0, m2 - HADDW m1, m3 - lea r2, [r1+r1*2] - paddw m0, [pw_2] - paddw m1, [pw_2] + mova m0, [r0] + pshuflw m1, m0, 0x4e + pshufhw m1, m1, 0x4e + paddw m0, m1 + pshuflw m1, m0, 0xb1 + pshufhw m1, m1, 0xb1 + paddw m0, m1 + lea r2, [r1*3] lea r3, [r0+r1*4] + paddw m0, [pw_2] psrlw m0, 2 - psrlw m1, 2 - %2 m0, m0, 0 - %2 m1, m1, 0 -%ifidn %1, sse2 - punpcklqdq m0, m1 -%endif - MOV8 r0+r1*1, m0, m1 - MOV8 r0+r1*2, m0, m1 - MOV8 r0+r2*1, m0, m1 - MOV8 r0+r1*4, m0, m1 - MOV8 r3+r1*1, m0, m1 - MOV8 r3+r1*2, m0, m1 - MOV8 r3+r2*1, m0, m1 - MOV8 r3+r1*4, m0, m1 + mova [r0+r1*1], m0 + mova [r0+r1*2], m0 + mova [r0+r2*1], m0 + mova [r0+r1*4], m0 + mova [r3+r1*1], m0 + mova [r3+r1*2], m0 + mova [r3+r2*1], m0 + mova [r3+r1*4], m0 RET -%endmacro - -INIT_MMX -PRED8x8_TOP_DC mmxext, pshufw -INIT_XMM -PRED8x8_TOP_DC sse2 , pshuflw ;----------------------------------------------------------------------------- ; void pred8x8_plane(pixel *src, int stride) @@ -478,7 +462,7 @@ PRED8x8_TOP_DC sse2 , pshuflw INIT_XMM cglobal pred8x8_plane_10_sse2, 2,7,7 sub r0, r1 - lea r2, [r1+r1*2] + lea r2, [r1*3] lea r3, [r0+r1*4] mova m2, [r0] pmaddwd m2, [pw_m32101234] @@ -500,7 +484,7 @@ cglobal pred8x8_plane_10_sse2, 2,7,7 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1] movzx r6d, word [r0+r1*1-2] ; src[0*stride-1] sub r5d, r6d - lea r5d, [r5+r5*2] + lea r5d, [r5*3] add r4d, r5d movzx r6d, word [r3+r1*4-2] ; src[7*stride-1] movzx r5d, word [r0+r1*0-2] ; src[ -stride-1] @@ -540,8 +524,8 @@ cglobal pred8x8_plane_10_sse2, 2,7,7 ;----------------------------------------------------------------------------- %macro PRED8x8L_128_DC 1 cglobal pred8x8l_128_dc_10_%1, 4,4 - mova m0, [pw_512] - lea r1, [r3+r3*2] + mova m0, [pw_512] ; (1<<(BIT_DEPTH-1)) + lea r1, [r3*3] lea r2, [r0+r3*4] MOV8 r0+r3*0, m0, m0 MOV8 r0+r3*1, m0, m0 @@ -565,37 +549,17 @@ PRED8x8L_128_DC sse2 %macro PRED8x8L_TOP_DC 1 cglobal pred8x8l_top_dc_10_%1, 4,4,6 sub r0, r3 - pxor m7, m7 - mova m0, [r0-16] - mova m3, [r0] - mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 ; top_left - jz .fix_lt_2 - test r2, r2 ; top_right - jz .fix_tr_1 - jmp .body -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 ; top_right - jnz .body -.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 -.body - lea r1, [r3+r3*2] + mova m0, [r0] + shr r1d, 14 + shr r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + lea r1, [r3*3] lea r2, [r0+r3*4] - PRED4x4_LOWPASS m0, m2, m1, m3 + PRED4x4_LOWPASS m0, m2, m1, m0 HADDW m0, m1 paddw m0, [pw_4] psrlw m0, 3 @@ -612,110 +576,70 @@ cglobal pred8x8l_top_dc_10_%1, 4,4,6 %endmacro INIT_XMM -%define PALIGNR PALIGNR_MMX PRED8x8L_TOP_DC sse2 -%define PALIGNR PALIGNR_SSSE3 -PRED8x8L_TOP_DC ssse3 +%ifdef 
HAVE_AVX +INIT_AVX +PRED8x8L_TOP_DC avx +%endif ;----------------------------------------------------------------------------- ;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- ;TODO: see if scalar is faster %macro PRED8x8L_DC 1 -cglobal pred8x8l_dc_10_%1, 4,5,8 +cglobal pred8x8l_dc_10_%1, 4,6,6 sub r0, r3 - lea r4, [r0+r3*2] - mova m0, [r0+r3*1-16] - punpckhwd m0, [r0+r3*0-16] - mova m1, [r4+r3*1-16] - punpckhwd m1, [r0+r3*2-16] - mov r4, r0 + lea r4, [r0+r3*4] + lea r5, [r3*3] + mova m0, [r0+r3*2-16] + punpckhwd m0, [r0+r3*1-16] + mova m1, [r4+r3*0-16] + punpckhwd m1, [r0+r5*1-16] punpckhdq m1, m0 - lea r0, [r0+r3*4] - mova m2, [r0+r3*1-16] - punpckhwd m2, [r0+r3*0-16] - lea r0, [r0+r3*2] - mova m3, [r0+r3*1-16] - punpckhwd m3, [r0+r3*0-16] + mova m2, [r4+r3*2-16] + punpckhwd m2, [r4+r3*1-16] + mova m3, [r4+r3*4-16] + punpckhwd m3, [r4+r5*1-16] punpckhdq m3, m2 punpckhqdq m3, m1 - lea r0, [r0+r3*2] - mova m0, [r0+r3*0-16] - mova m1, [r4] - mov r0, r4 - mova m4, m3 - mova m2, m3 - PALIGNR m4, m0, 14, m0 - PALIGNR m1, m2, 2, m2 - test r1, r1 - jnz .do_left -.fix_lt_1: - mova m5, m3 - pxor m5, m4 - psrldq m5, 14 - pslldq m5, 12 - pxor m1, m5 - jmp .do_left -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 - jnz .body -.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 - jmp .body -.do_left: - mova m0, m4 - PRED4x4_LOWPASS m2, m1, m4, m3 - mova m4, m0 - mova m7, m2 - PRED4x4_LOWPASS m1, m3, m0, m4 - pslldq m1, 14 - PALIGNR m7, m1, 14, m3 - mova m0, [r0-16] - mova m3, [r0] - mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 -.body - lea r1, [r3+r3*2] - PRED4x4_LOWPASS m6, m2, m1, m3 - HADDW m7, m0 - HADDW m6, m0 - lea r2, [r0+r3*4] - paddw m7, [pw_8] - paddw m7, m6 - psrlw m7, 4 - SPLATW m7, m7 - mova [r0+r3*1], m7 - mova [r0+r3*2], m7 - mova [r0+r1*1], m7 - mova [r0+r3*4], m7 - mova [r2+r3*1], m7 - mova [r2+r3*2], m7 - mova [r2+r1*1], m7 - mova [r2+r3*4], m7 + mova m0, [r0] + shr r1d, 14 + shr r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + not r1 + and r1, r3 + pslldq m4, m3, 2 + psrldq m5, m3, 2 + pshuflw m4, m4, 11100101b + pinsrw m5, [r0+r1-2], 7 + PRED4x4_LOWPASS m3, m4, m5, m3 + PRED4x4_LOWPASS m0, m2, m1, m0 + paddw m0, m3 + HADDW m0, m1 + paddw m0, [pw_8] + psrlw m0, 4 + SPLATW m0, m0 + mova [r0+r3*1], m0 + mova [r0+r3*2], m0 + mova [r0+r5*1], m0 + mova [r0+r3*4], m0 + mova [r4+r3*1], m0 + mova [r4+r3*2], m0 + mova [r4+r5*1], m0 + mova [r4+r3*4], m0 RET %endmacro INIT_XMM -%define PALIGNR PALIGNR_MMX PRED8x8L_DC sse2 -%define PALIGNR PALIGNR_SSSE3 -PRED8x8L_DC ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_DC avx +%endif ;----------------------------------------------------------------------------- ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride) @@ -723,36 +647,17 @@ PRED8x8L_DC ssse3 %macro PRED8x8L_VERTICAL 1 cglobal pred8x8l_vertical_10_%1, 4,4,6 sub r0, r3 - mova m0, [r0-16] - mova m3, [r0] - mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 ; top_left - jz .fix_lt_2 - test r2, r2 ; top_right - jz .fix_tr_1 - jmp .body -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 ; top_right - jnz .body 
-.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 -.body - lea r1, [r3+r3*2] + mova m0, [r0] + shr r1d, 14 + shr r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + lea r1, [r3*3] lea r2, [r0+r3*4] - PRED4x4_LOWPASS m0, m2, m1, m3 + PRED4x4_LOWPASS m0, m2, m1, m0 mova [r0+r3*1], m0 mova [r0+r3*2], m0 mova [r0+r1*1], m0 @@ -765,70 +670,56 @@ cglobal pred8x8l_vertical_10_%1, 4,4,6 %endmacro INIT_XMM -%define PALIGNR PALIGNR_MMX PRED8x8L_VERTICAL sse2 -%define PALIGNR PALIGNR_SSSE3 -PRED8x8L_VERTICAL ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_VERTICAL avx +%endif ;----------------------------------------------------------------------------- ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_HORIZONTAL 1 -cglobal pred8x8l_horizontal_10_%1, 4,4,8 - sub r0, r3 - lea r2, [r0+r3*2] - mova m0, [r0+r3*1-16] - test r1, r1 - lea r1, [r0+r3] - cmovnz r1, r0 - punpckhwd m0, [r1+r3*0-16] - mova m1, [r2+r3*1-16] - punpckhwd m1, [r0+r3*2-16] - mov r2, r0 +cglobal pred8x8l_horizontal_10_%1, 4,4,5 + mova m0, [r0-16] + shr r1d, 14 + dec r1 + and r1, r3 + sub r1, r3 + punpckhwd m0, [r0+r1-16] + mova m1, [r0+r3*2-16] + punpckhwd m1, [r0+r3*1-16] + lea r2, [r0+r3*4] + lea r1, [r3*3] punpckhdq m1, m0 - lea r0, [r0+r3*4] - mova m2, [r0+r3*1-16] - punpckhwd m2, [r0+r3*0-16] - lea r0, [r0+r3*2] - mova m3, [r0+r3*1-16] - punpckhwd m3, [r0+r3*0-16] + mova m2, [r2+r3*0-16] + punpckhwd m2, [r0+r1-16] + mova m3, [r2+r3*2-16] + punpckhwd m3, [r2+r3*1-16] punpckhdq m3, m2 punpckhqdq m3, m1 - lea r0, [r0+r3*2] - mova m0, [r0+r3*0-16] - mova m1, [r1+r3*0-16] - mov r0, r2 - mova m4, m3 - mova m2, m3 - PALIGNR m4, m0, 14, m0 - PALIGNR m1, m2, 2, m2 - mova m0, m4 - PRED4x4_LOWPASS m2, m1, m4, m3 - mova m4, m0 - mova m7, m2 - PRED4x4_LOWPASS m1, m3, m0, m4 - pslldq m1, 14 - PALIGNR m7, m1, 14, m3 - lea r1, [r3+r3*2] - punpckhwd m3, m7, m7 - punpcklwd m7, m7 + PALIGNR m4, m3, [r2+r1-16], 14, m0 + pslldq m0, m4, 2 + pshuflw m0, m0, 11100101b + PRED4x4_LOWPASS m4, m3, m0, m4 + punpckhwd m3, m4, m4 + punpcklwd m4, m4 pshufd m0, m3, 0xff pshufd m1, m3, 0xaa - lea r2, [r0+r3*4] pshufd m2, m3, 0x55 pshufd m3, m3, 0x00 - pshufd m4, m7, 0xff - pshufd m5, m7, 0xaa - pshufd m6, m7, 0x55 - pshufd m7, m7, 0x00 - mova [r0+r3*1], m0 - mova [r0+r3*2], m1 - mova [r0+r1*1], m2 - mova [r0+r3*4], m3 - mova [r2+r3*1], m4 - mova [r2+r3*2], m5 - mova [r2+r1*1], m6 - mova [r2+r3*4], m7 + mova [r0+r3*0], m0 + mova [r0+r3*1], m1 + mova [r0+r3*2], m2 + mova [r0+r1*1], m3 + pshufd m0, m4, 0xff + pshufd m1, m4, 0xaa + pshufd m2, m4, 0x55 + pshufd m3, m4, 0x00 + mova [r2+r3*0], m0 + mova [r2+r3*1], m1 + mova [r2+r3*2], m2 + mova [r2+r1*1], m3 RET %endmacro @@ -837,116 +728,68 @@ INIT_XMM PRED8x8L_HORIZONTAL sse2 %define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_HORIZONTAL avx +%endif ;----------------------------------------------------------------------------- ;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_DOWN_LEFT 1 -cglobal pred8x8l_down_left_10_%1, 4,4,8 +cglobal pred8x8l_down_left_10_%1, 4,4,7 sub r0, r3 - mova m0, [r0-16] mova m3, [r0] + shr r1d, 14 + neg r1 + shr r2d, 13 + pslldq m1, m3, 2 + psrldq m2, m3, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 
+ PRED4x4_LOWPASS m6, m2, m1, m3 + jz .fix_tr ; flags from shr r2d mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 - jmp .do_top -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 - jnz .do_top -.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 - jmp .do_top -.fix_tr_2: - punpckhwd m3, m3 - pshufd m1, m3, 0xFF - jmp .do_topright -.do_top: - PRED4x4_LOWPASS m4, m2, m1, m3 - mova m7, m4 - test r2, r2 - jz .fix_tr_2 - mova m0, [r0+16] - mova m5, m0 - mova m2, m0 - mova m4, m0 - psrldq m5, 14 - PALIGNR m2, m3, 14, m3 - PALIGNR m5, m4, 2, m4 - PRED4x4_LOWPASS m1, m2, m5, m0 + psrldq m5, m1, 2 + PALIGNR m2, m1, m3, 14, m3 + pshufhw m5, m5, 10100100b + PRED4x4_LOWPASS m1, m2, m5, m1 .do_topright: - lea r1, [r3+r3*2] - mova m6, m1 - psrldq m1, 14 - mova m4, m1 + lea r1, [r3*3] + psrldq m5, m1, 14 lea r2, [r0+r3*4] - mova m2, m6 - PALIGNR m2, m7, 2, m0 - mova m3, m6 - PALIGNR m3, m7, 14, m0 - PALIGNR m4, m6, 2, m0 - mova m5, m7 - mova m1, m7 - mova m7, m6 - pslldq m1, 2 - PRED4x4_LOWPASS m0, m1, m2, m5 - PRED4x4_LOWPASS m1, m3, m4, m7 + PALIGNR m2, m1, m6, 2, m0 + PALIGNR m3, m1, m6, 14, m0 + PALIGNR m5, m1, 2, m0 + pslldq m4, m6, 2 + PRED4x4_LOWPASS m6, m4, m2, m6 + PRED4x4_LOWPASS m1, m3, m5, m1 mova [r2+r3*4], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r2+r1*1], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r2+r3*2], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r2+r3*1], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r0+r3*4], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r0+r1*1], m1 - mova m2, m0 - pslldq m1, 2 - psrldq m2, 14 - pslldq m0, 2 - por m1, m2 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 mova [r0+r3*2], m1 - pslldq m1, 2 - psrldq m0, 14 - por m1, m0 + PALIGNR m1, m6, 14, m6 mova [r0+r3*1], m1 RET +.fix_tr: + punpckhwd m3, m3 + pshufd m1, m3, 0xFF + jmp .do_topright %endmacro INIT_XMM @@ -954,139 +797,73 @@ INIT_XMM PRED8x8L_DOWN_LEFT sse2 %define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_LEFT ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_DOWN_LEFT avx +%endif ;----------------------------------------------------------------------------- -;void pred8x8l_down_right_mxext(pixel *src, int has_topleft, int has_topright, int stride) +;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_DOWN_RIGHT 1 +; standard forbids this when has_topleft is false +; no need to check cglobal pred8x8l_down_right_10_%1, 4,5,8 sub r0, r3 - lea r4, [r0+r3*2] + lea r4, [r0+r3*4] + lea r1, [r3*3] mova m0, [r0+r3*1-16] punpckhwd m0, [r0+r3*0-16] - mova m1, [r4+r3*1-16] + mova m1, [r0+r1*1-16] punpckhwd m1, [r0+r3*2-16] - mov r4, r0 punpckhdq m1, m0 - lea r0, [r0+r3*4] - mova m2, [r0+r3*1-16] - punpckhwd m2, [r0+r3*0-16] - lea r0, [r0+r3*2] - mova m3, [r0+r3*1-16] - punpckhwd m3, [r0+r3*0-16] + mova m2, [r4+r3*1-16] + punpckhwd m2, [r4+r3*0-16] + mova m3, [r4+r1*1-16] + punpckhwd m3, [r4+r3*2-16] punpckhdq m3, m2 punpckhqdq m3, m1 - lea r0, 
[r0+r3*2] - mova m0, [r0+r3*0-16] - mova m1, [r4] - mov r0, r4 - mova m4, m3 - mova m2, m3 - PALIGNR m4, m0, 14, m0 - PALIGNR m1, m2, 2, m2 - test r1, r1 ; top_left - jz .fix_lt_1 -.do_left: - mova m0, m4 - PRED4x4_LOWPASS m2, m1, m4, m3 - mova m4, m0 - mova m7, m2 - mova m6, m2 - PRED4x4_LOWPASS m1, m3, m0, m4 - pslldq m1, 14 - PALIGNR m7, m1, 14, m3 - mova m0, [r0-16] + mova m0, [r4+r3*4-16] + mova m1, [r0] + PALIGNR m4, m3, m0, 14, m0 + PALIGNR m1, m3, 2, m2 + pslldq m0, m4, 2 + pshuflw m0, m0, 11100101b + PRED4x4_LOWPASS m6, m1, m4, m3 + PRED4x4_LOWPASS m4, m3, m0, m4 mova m3, [r0] - mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 ; top_left - jz .fix_lt_2 - test r2, r2 ; top_right - jz .fix_tr_1 -.do_top: - PRED4x4_LOWPASS m4, m2, m1, m3 - mova m5, m4 - jmp .body -.fix_lt_1: - mova m5, m3 - pxor m5, m4 - psrldq m5, 14 - pslldq m5, 12 - pxor m1, m5 - jmp .do_left -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 ; top_right - jnz .do_top -.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 - jmp .do_top -.body - lea r1, [r3+r3*2] - mova m1, m7 - mova m7, m5 - mova m5, m6 - mova m2, m7 - lea r2, [r0+r3*4] - PALIGNR m2, m6, 2, m0 - mova m3, m7 - PALIGNR m3, m6, 14, m0 - mova m4, m7 - psrldq m4, 2 - PRED4x4_LOWPASS m0, m1, m2, m5 - PRED4x4_LOWPASS m1, m3, m4, m7 - mova [r2+r3*4], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r2+r1*1], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r2+r3*2], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r2+r3*1], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r0+r3*4], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r0+r1*1], m0 - mova m2, m1 - psrldq m0, 2 - pslldq m2, 14 - psrldq m1, 2 - por m0, m2 - mova [r0+r3*2], m0 - psrldq m0, 2 - pslldq m1, 14 - por m0, m1 - mova [r0+r3*1], m0 + shr r2d, 13 + pslldq m1, m3, 2 + psrldq m2, m3, 2 + pinsrw m1, [r0-2], 0 + pinsrw m2, [r0+r2+14], 7 + PRED4x4_LOWPASS m3, m2, m1, m3 + PALIGNR m2, m3, m6, 2, m0 + PALIGNR m5, m3, m6, 14, m0 + psrldq m7, m3, 2 + PRED4x4_LOWPASS m6, m4, m2, m6 + PRED4x4_LOWPASS m3, m5, m7, m3 + mova [r4+r3*4], m6 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*2], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r1*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*4], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r4+r3*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r4+r3*2], m3 + PALIGNR m3, m6, 14, m6 + mova [r4+r1*1], m3 RET %endmacro @@ -1095,114 +872,69 @@ INIT_XMM PRED8x8L_DOWN_RIGHT sse2 %define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_RIGHT ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_DOWN_RIGHT avx +%endif ;----------------------------------------------------------------------------- ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_VERTICAL_RIGHT 1 -cglobal pred8x8l_vertical_right_10_%1, 4,5,8 +; likewise with 8x8l_down_right +cglobal pred8x8l_vertical_right_10_%1, 4,5,7 sub r0, r3 - lea r4, [r0+r3*2] + lea r4, [r0+r3*4] + lea r1, [r3*3] mova m0, [r0+r3*1-16] punpckhwd m0, [r0+r3*0-16] - mova m1, [r4+r3*1-16] + mova m1, [r0+r1*1-16] 
punpckhwd m1, [r0+r3*2-16] - mov r4, r0 punpckhdq m1, m0 - lea r0, [r0+r3*4] - mova m2, [r0+r3*1-16] - punpckhwd m2, [r0+r3*0-16] - lea r0, [r0+r3*2] - mova m3, [r0+r3*1-16] - punpckhwd m3, [r0+r3*0-16] + mova m2, [r4+r3*1-16] + punpckhwd m2, [r4+r3*0-16] + mova m3, [r4+r1*1-16] + punpckhwd m3, [r4+r3*2-16] punpckhdq m3, m2 punpckhqdq m3, m1 - lea r0, [r0+r3*2] - mova m0, [r0+r3*0-16] - mova m1, [r4] - mov r0, r4 - mova m4, m3 - mova m2, m3 - PALIGNR m4, m0, 14, m0 - PALIGNR m1, m2, 2, m2 - test r1, r1 - jz .fix_lt_1 - jmp .do_left -.fix_lt_1: - mova m5, m3 - pxor m5, m4 - psrldq m5, 14 - pslldq m5, 12 - pxor m1, m5 - jmp .do_left -.fix_lt_2: - mova m5, m3 - pxor m5, m2 - pslldq m5, 14 - psrldq m5, 14 - pxor m2, m5 - test r2, r2 - jnz .do_top -.fix_tr_1: - mova m5, m3 - pxor m5, m1 - psrldq m5, 14 - pslldq m5, 14 - pxor m1, m5 - jmp .do_top -.do_left: - mova m0, m4 - PRED4x4_LOWPASS m2, m1, m4, m3 - mova m7, m2 - mova m0, [r0-16] - mova m3, [r0] - mova m1, [r0+16] - mova m2, m3 - mova m4, m3 - PALIGNR m2, m0, 14, m0 - PALIGNR m1, m4, 2, m4 - test r1, r1 - jz .fix_lt_2 - test r2, r2 - jz .fix_tr_1 -.do_top - PRED4x4_LOWPASS m6, m2, m1, m3 - lea r1, [r3+r3*2] - mova m2, m6 - mova m3, m6 - PALIGNR m3, m7, 14, m0 - PALIGNR m6, m7, 12, m1 - mova m4, m3 - pavgw m3, m2 - lea r2, [r0+r3*4] - PRED4x4_LOWPASS m0, m6, m2, m4 - mova [r0+r3*1], m3 + mova m0, [r4+r3*4-16] + mova m1, [r0] + PALIGNR m4, m3, m0, 14, m0 + PALIGNR m1, m3, 2, m2 + PRED4x4_LOWPASS m3, m1, m4, m3 + mova m2, [r0] + shr r2d, 13 + pslldq m1, m2, 2 + psrldq m5, m2, 2 + pinsrw m1, [r0-2], 0 + pinsrw m5, [r0+r2+14], 7 + PRED4x4_LOWPASS m2, m5, m1, m2 + PALIGNR m6, m2, m3, 12, m1 + PALIGNR m5, m2, m3, 14, m0 + PRED4x4_LOWPASS m0, m6, m2, m5 + pavgw m2, m5 mova [r0+r3*2], m0 - mova m5, m0 - mova m6, m3 - mova m1, m7 - mova m2, m1 - pslldq m2, 2 - mova m3, m1 - pslldq m3, 4 - PRED4x4_LOWPASS m0, m1, m3, m2 - PALIGNR m6, m0, 14, m2 - mova [r0+r1*1], m6 - pslldq m0, 2 - PALIGNR m5, m0, 14, m1 - mova [r0+r3*4], m5 - pslldq m0, 2 - PALIGNR m6, m0, 14, m2 - mova [r2+r3*1], m6 - pslldq m0, 2 - PALIGNR m5, m0, 14, m1 - mova [r2+r3*2], m5 - pslldq m0, 2 - PALIGNR m6, m0, 14, m2 - mova [r2+r1*1], m6 - pslldq m0, 2 - PALIGNR m5, m0, 14, m1 - mova [r2+r3*4], m5 + mova [r0+r3*1], m2 + pslldq m6, m3, 4 + pslldq m1, m3, 2 + PRED4x4_LOWPASS m1, m3, m6, m1 + PALIGNR m2, m1, 14, m4 + mova [r0+r1*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m3 + mova [r0+r3*4], m0 + pslldq m1, 2 + PALIGNR m2, m1, 14, m4 + mova [r4+r3*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m3 + mova [r4+r3*2], m0 + pslldq m1, 2 + PALIGNR m2, m1, 14, m4 + mova [r4+r1*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m1 + mova [r4+r3*4], m0 RET %endmacro @@ -1211,84 +943,60 @@ INIT_XMM PRED8x8L_VERTICAL_RIGHT sse2 %define PALIGNR PALIGNR_SSSE3 PRED8x8L_VERTICAL_RIGHT ssse3 +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_VERTICAL_RIGHT avx +%endif ;----------------------------------------------------------------------------- ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride) ;----------------------------------------------------------------------------- %macro PRED8x8L_HORIZONTAL_UP 1 -cglobal pred8x8l_horizontal_up_10_%1, 4,4,8 - sub r0, r3 - lea r2, [r0+r3*2] - mova m0, [r0+r3*1-16] - test r1, r1 - lea r1, [r0+r3] - cmovnz r1, r0 - punpckhwd m0, [r1+r3*0-16] - mova m1, [r2+r3*1-16] - punpckhwd m1, [r0+r3*2-16] - mov r2, r0 - punpckhdq m1, m0 - lea r0, [r0+r3*4] - mova m2, [r0+r3*1-16] - punpckhwd m2, [r0+r3*0-16] - lea r0, [r0+r3*2] - mova m3, [r0+r3*1-16] - 
punpckhwd m3, [r0+r3*0-16] - punpckhdq m3, m2 - punpckhqdq m3, m1 - lea r0, [r0+r3*2] +cglobal pred8x8l_horizontal_up_10_%1, 4,4,6 mova m0, [r0+r3*0-16] - mova m1, [r1+r3*0-16] - mov r0, r2 - mova m4, m3 - mova m2, m3 - PALIGNR m4, m0, 14, m0 - PALIGNR m1, m2, 2, m2 - mova m0, m4 - PRED4x4_LOWPASS m2, m1, m4, m3 - mova m4, m0 - mova m7, m2 - PRED4x4_LOWPASS m1, m3, m0, m4 - pslldq m1, 14 - PALIGNR m7, m1, 14, m3 - lea r1, [r3+r3*2] - pshufd m0, m7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 - pslldq m7, 14 ; l7 .. .. .. .. .. .. .. - mova m2, m0 - pslld m0, 16 - psrld m2, 16 - por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0 - mova m3, m2 - mova m4, m2 - mova m5, m2 - psrldq m2, 2 - psrldq m3, 4 + punpckhwd m0, [r0+r3*1-16] + shr r1d, 14 + dec r1 + and r1, r3 + sub r1, r3 + mova m4, [r0+r1*1-16] + lea r1, [r3*3] lea r2, [r0+r3*4] - por m2, m7 ; l7 l7 l6 l5 l4 l3 l2 l1 - punpckhwd m7, m7 - por m3, m7 ; l7 l7 l7 l6 l5 l4 l3 l2 - pavgw m4, m2 - PRED4x4_LOWPASS m1, m3, m5, m2 - mova m5, m4 - punpcklwd m4, m1 ; p4 p3 p2 p1 - punpckhwd m5, m1 ; p8 p7 p6 p5 - mova m6, m5 - mova m7, m5 - mova m0, m5 - PALIGNR m5, m4, 4, m1 - pshufd m1, m6, 11111001b - PALIGNR m6, m4, 8, m2 - pshufd m2, m7, 11111110b - PALIGNR m7, m4, 12, m3 - pshufd m3, m0, 11111111b - mova [r0+r3*1], m4 - mova [r0+r3*2], m5 - mova [r0+r1*1], m6 - mova [r0+r3*4], m7 + mova m1, [r0+r3*2-16] + punpckhwd m1, [r0+r1*1-16] + punpckhdq m0, m1 + mova m2, [r2+r3*0-16] + punpckhwd m2, [r2+r3*1-16] + mova m3, [r2+r3*2-16] + punpckhwd m3, [r2+r1*1-16] + punpckhdq m2, m3 + punpckhqdq m0, m2 + PALIGNR m1, m0, m4, 14, m4 + psrldq m2, m0, 2 + pshufhw m2, m2, 10100100b + PRED4x4_LOWPASS m0, m1, m2, m0 + psrldq m1, m0, 2 + psrldq m2, m0, 4 + pshufhw m1, m1, 10100100b + pshufhw m2, m2, 01010100b + pavgw m4, m0, m1 + PRED4x4_LOWPASS m1, m2, m0, m1 + punpckhwd m5, m4, m1 + punpcklwd m4, m1 + mova [r2+r3*0], m5 + mova [r0+r3*0], m4 + pshufd m0, m5, 11111001b + pshufd m1, m5, 11111110b + pshufd m2, m5, 11111111b mova [r2+r3*1], m0 mova [r2+r3*2], m1 mova [r2+r1*1], m2 - mova [r2+r3*4], m3 + PALIGNR m2, m5, m4, 4, m0 + PALIGNR m3, m5, m4, 8, m1 + PALIGNR m5, m5, m4, 12, m4 + mova [r0+r3*1], m2 + mova [r0+r3*2], m3 + mova [r0+r1*1], m5 RET %endmacro @@ -1297,7 +1005,10 @@ INIT_XMM PRED8x8L_HORIZONTAL_UP sse2 %define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL_UP ssse3 - +%ifdef HAVE_AVX +INIT_AVX +PRED8x8L_HORIZONTAL_UP avx +%endif ;----------------------------------------------------------------------------- @@ -1315,7 +1026,7 @@ PRED8x8L_HORIZONTAL_UP ssse3 %macro PRED16x16_VERTICAL 1 cglobal pred16x16_vertical_10_%1, 2,3 sub r0, r1 - mov r2, 8 + mov r2d, 8 mova m0, [r0+ 0] mova m1, [r0+mmsize] %if mmsize==8 @@ -1326,7 +1037,7 @@ cglobal pred16x16_vertical_10_%1, 2,3 MOV16 r0+r1*1, m0, m1, m2, m3 MOV16 r0+r1*2, m0, m1, m2, m3 lea r0, [r0+r1*2] - dec r2 + dec r2d jg .loop REP_RET %endmacro @@ -1341,7 +1052,7 @@ PRED16x16_VERTICAL sse2 ;----------------------------------------------------------------------------- %macro PRED16x16_HORIZONTAL 1 cglobal pred16x16_horizontal_10_%1, 2,3 - mov r2, 8 + mov r2d, 8 .vloop: movd m0, [r0+r1*0-4] movd m1, [r0+r1*1-4] @@ -1350,7 +1061,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3 MOV16 r0+r1*0, m0, m0, m0, m0 MOV16 r0+r1*1, m1, m1, m1, m1 lea r0, [r0+r1*2] - dec r2 + dec r2d jg .vloop REP_RET %endmacro @@ -1364,8 +1075,8 @@ PRED16x16_HORIZONTAL sse2 ; void pred16x16_dc(pixel *src, int stride) ;----------------------------------------------------------------------------- %macro PRED16x16_DC 1 -cglobal pred16x16_dc_10_%1, 2,7 - mov r4, r0 
+cglobal pred16x16_dc_10_%1, 2,6 + mov r5, r0 sub r0, r1 mova m0, [r0+0] paddw m0, [r0+mmsize] @@ -1375,17 +1086,17 @@ cglobal pred16x16_dc_10_%1, 2,7 %endif HADDW m0, m2 - sub r0, 2 - movzx r3d, word [r0+r1*1] - movzx r5d, word [r0+r1*2] + lea r0, [r0+r1-2] + movzx r3d, word [r0] + movzx r4d, word [r0+r1] %rep 7 lea r0, [r0+r1*2] - movzx r2d, word [r0+r1*1] + movzx r2d, word [r0] add r3d, r2d - movzx r2d, word [r0+r1*2] - add r5d, r2d + movzx r2d, word [r0+r1] + add r4d, r2d %endrep - lea r3d, [r3+r5+16] + lea r3d, [r3+r4+16] movd m1, r3d paddw m0, m1 @@ -1393,9 +1104,9 @@ cglobal pred16x16_dc_10_%1, 2,7 SPLATW m0, m0 mov r3d, 8 .loop: - MOV16 r4+r1*0, m0, m0, m0, m0 - MOV16 r4+r1*1, m0, m0, m0, m0 - lea r4, [r4+r1*2] + MOV16 r5+r1*0, m0, m0, m0, m0 + MOV16 r5+r1*1, m0, m0, m0, m0 + lea r5, [r5+r1*2] dec r3d jg .loop REP_RET @@ -1442,29 +1153,29 @@ PRED16x16_TOP_DC sse2 ; void pred16x16_left_dc(pixel *src, int stride) ;----------------------------------------------------------------------------- %macro PRED16x16_LEFT_DC 1 -cglobal pred16x16_left_dc_10_%1, 2,7 - mov r4, r0 +cglobal pred16x16_left_dc_10_%1, 2,6 + mov r5, r0 sub r0, 2 - movzx r5d, word [r0+r1*0] - movzx r6d, word [r0+r1*1] + movzx r3d, word [r0] + movzx r4d, word [r0+r1] %rep 7 lea r0, [r0+r1*2] - movzx r2d, word [r0+r1*0] - movzx r3d, word [r0+r1*1] - add r5d, r2d - add r6d, r3d + movzx r2d, word [r0] + add r3d, r2d + movzx r2d, word [r0+r1] + add r4d, r2d %endrep - lea r2d, [r5+r6+8] - shr r2d, 4 + lea r3d, [r3+r4+8] + shr r3d, 4 - movd m0, r2d + movd m0, r3d SPLATW m0, m0 mov r3d, 8 .loop: - MOV16 r4+r1*0, m0, m0, m0, m0 - MOV16 r4+r1*1, m0, m0, m0, m0 - lea r4, [r4+r1*2] + MOV16 r5+r1*0, m0, m0, m0, m0 + MOV16 r5+r1*1, m0, m0, m0, m0 + lea r5, [r5+r1*2] dec r3d jg .loop REP_RET diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c index 62e4c8796b..55387f623e 100644 --- a/libavcodec/x86/h264_intrapred_init.c +++ b/libavcodec/x86/h264_intrapred_init.c @@ -45,7 +45,6 @@ void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); PRED8x8(dc, 10, mmxext) PRED8x8(dc, 10, sse2) -PRED8x8(top_dc, 10, mmxext) PRED8x8(top_dc, 10, sse2) PRED8x8(plane, 10, sse2) PRED8x8(vertical, 10, sse2) @@ -55,23 +54,28 @@ PRED8x8(horizontal, 10, sse2) void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_topleft, int has_topright, int stride); PRED8x8L(dc, 10, sse2) -PRED8x8L(dc, 10, ssse3) +PRED8x8L(dc, 10, avx) PRED8x8L(128_dc, 10, mmxext) PRED8x8L(128_dc, 10, sse2) PRED8x8L(top_dc, 10, sse2) -PRED8x8L(top_dc, 10, ssse3) +PRED8x8L(top_dc, 10, avx) PRED8x8L(vertical, 10, sse2) -PRED8x8L(vertical, 10, ssse3) +PRED8x8L(vertical, 10, avx) PRED8x8L(horizontal, 10, sse2) PRED8x8L(horizontal, 10, ssse3) +PRED8x8L(horizontal, 10, avx) PRED8x8L(down_left, 10, sse2) PRED8x8L(down_left, 10, ssse3) +PRED8x8L(down_left, 10, avx) PRED8x8L(down_right, 10, sse2) PRED8x8L(down_right, 10, ssse3) +PRED8x8L(down_right, 10, avx) PRED8x8L(vertical_right, 10, sse2) PRED8x8L(vertical_right, 10, ssse3) +PRED8x8L(vertical_right, 10, avx) PRED8x8L(horizontal_up, 10, sse2) PRED8x8L(horizontal_up, 10, ssse3) +PRED8x8L(horizontal_up, 10, avx) #define PRED16x16(TYPE, DEPTH, OPT)\ void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); @@ -298,7 +302,6 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext; h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext; - h->pred8x8[TOP_DC_PRED8x8 ] 
= ff_pred8x8_top_dc_10_mmxext; h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext; @@ -344,18 +347,28 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3; h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3; - h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_ssse3; h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3; - h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_ssse3; - h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_ssse3; h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3; + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3; + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3; + h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3; } #if HAVE_AVX if (mm_flags & AV_CPU_FLAG_AVX) { h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx; h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx; + h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx; h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx; h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx; + + h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx; + h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx; + h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx; + h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx; + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx; + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx; + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx; + h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx; } #endif /* HAVE_AVX */ }
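
For reference while reading the new sequences: every PRED4x4_LOWPASS invocation in this patch computes the 3-tap lowpass documented at the macro's definition, out = (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2. A minimal C model of that macro (the function name here is illustrative, not something in the tree):

static inline int lowpass3(int left, int cur, int right)
{
    /* (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2, as in PRED4x4_LOWPASS */
    return (left + 2 * cur + right + 2) >> 2;
}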
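
The rewritten pred8x8l_* functions drop the old .fix_lt_*/.fix_tr_* branch ladders in favor of branchless loads. This works because the decoder passes has_topleft as either 0 or 0x8000 and has_topright as either 0 or 0x4000 (bit-masked availability flags), so shr r1d, 14 / shr r2d, 13 reduce each flag to 0 or 2, one 10-bit pixel's width in bytes; pinsrw then fetches either the real neighbor or a duplicate of the edge sample. As a side effect, shr r2d, 13 sets ZF when top-right is missing, which pred8x8l_down_left exploits directly (jz .fix_tr). A C sketch of the resulting address math, with r0 pointing at the row above the block (helper names are hypothetical, and the mask values are an assumption about the caller):

/* has_topleft is 0 or 0x8000; has_topright is 0 or 0x4000 */
static int topleft_byte_offset(int has_topleft)
{
    return -(has_topleft >> 14);      /* 0x8000 -> -2: real top-left; 0 -> 0: reuse top[0] */
}

static int topright_byte_offset(int has_topright)
{
    return (has_topright >> 13) + 14; /* 0x4000 -> 16: real top-right; 0 -> 14: reuse top[7] */
}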
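
Several of the shortened sequences lean on the PALIGNR macro (the SSE2 path emulates it with shifts and por; SSSE3/AVX map it to the palignr instruction). Its semantics: concatenate two 16-byte registers and extract 16 bytes starting at a byte offset. A plain-C model for cross-checking the shuffle chains above (illustrative only):

#include <stdint.h>
#include <string.h>

/* dst = low 16 bytes of (hi:lo) >> (8*imm), i.e. palignr hi, lo, imm */
static void palignr16(uint8_t dst[16], const uint8_t hi[16],
                      const uint8_t lo[16], int imm)
{
    uint8_t tmp[32];
    memcpy(tmp,      lo, 16);
    memcpy(tmp + 16, hi, 16);
    memcpy(dst, tmp + imm, 16); /* valid for 0 <= imm <= 16 */
}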
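
The DC predictors all boil down to the same arithmetic regardless of the register shuffling: sum the available neighbors, add half the divisor for rounding, shift. For instance, the pred16x16_dc_10 loop above accumulates the 16 top and 16 left samples into (sum + 16) >> 5. A scalar sketch, assuming uint16_t 10-bit samples and an element-counted stride (both assumptions; the asm addresses bytes):

static int pred16x16_dc_value(const uint16_t *src, int stride)
{
    int sum = 16; /* rounding: half of the 32-sample divisor */
    for (int i = 0; i < 16; i++) {
        sum += src[-stride + i];    /* top row */
        sum += src[i * stride - 1]; /* left column */
    }
    return sum >> 5;
}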