diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c index 3e05bb2fb9..d3f1456b71 100644 --- a/libavcodec/x86/vp8dsp-init.c +++ b/libavcodec/x86/vp8dsp-init.c @@ -29,16 +29,16 @@ /* * MC functions */ -extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_epel4_h4_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_epel4_h6_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_epel4_v4_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_epel4_v6_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); @@ -80,7 +80,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_bilinear4_h_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, @@ -93,7 +93,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, +extern void ff_put_vp8_bilinear4_v_mmx2 (uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, @@ -139,27 +139,27 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ } #if ARCH_X86_32 -TAP_W8 (mmxext, epel, h4) -TAP_W8 (mmxext, epel, h6) -TAP_W16(mmxext, epel, h6) -TAP_W8 (mmxext, epel, v4) -TAP_W8 (mmxext, epel, v6) -TAP_W16(mmxext, epel, v6) -TAP_W8 (mmxext, bilinear, h) -TAP_W16(mmxext, bilinear, h) -TAP_W8 (mmxext, bilinear, v) -TAP_W16(mmxext, bilinear, v) +TAP_W8 (mmx2, epel, h4) +TAP_W8 (mmx2, epel, h6) +TAP_W16(mmx2, epel, h6) +TAP_W8 (mmx2, epel, v4) +TAP_W8 (mmx2, epel, v6) +TAP_W16(mmx2, epel, v6) +TAP_W8 (mmx2, bilinear, h) +TAP_W16(mmx2, bilinear, h) +TAP_W8 (mmx2, bilinear, v) +TAP_W16(mmx2, bilinear, v) #endif -TAP_W16(sse2, epel, h6) -TAP_W16(sse2, epel, v6) -TAP_W16(sse2, bilinear, h) -TAP_W16(sse2, bilinear, v) +TAP_W16(sse2, epel, h6) +TAP_W16(sse2, epel, v6) +TAP_W16(sse2, bilinear, h) +TAP_W16(sse2, bilinear, v) -TAP_W16(ssse3, epel, h6) -TAP_W16(ssse3, epel, v6) -TAP_W16(ssse3, bilinear, h) -TAP_W16(ssse3, bilinear, v) +TAP_W16(ssse3, epel, h6) +TAP_W16(ssse3, epel, v6) +TAP_W16(ssse3, bilinear, h) +TAP_W16(ssse3, bilinear, v) #define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \ @@ -177,13 +177,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT #if ARCH_X86_32 #define HVTAPMMX(x, y) \ -HVTAP(mmxext, 8, x, y, 4, 8) \ -HVTAP(mmxext, 8, x, y, 8, 16) +HVTAP(mmx2, 8, x, y, 4, 8) \ +HVTAP(mmx2, 8, x, y, 8, 16) -HVTAP(mmxext, 8, 6, 6, 16, 16) +HVTAP(mmx2, 8, 6, 6, 16, 16) #else #define HVTAPMMX(x, y) \ -HVTAP(mmxext, 8, x, y, 4, 8) +HVTAP(mmx2, 8, x, y, 4, 8) #endif HVTAPMMX(4, 4) @@ -218,16 +218,16 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ dst, dststride, tmp, SIZE, height, mx, my); \ } -HVBILIN(mmxext, 8, 4, 8) +HVBILIN(mmx2, 8, 4, 8) #if ARCH_X86_32 -HVBILIN(mmxext, 8, 8, 16) -HVBILIN(mmxext, 8, 16, 16) +HVBILIN(mmx2, 8, 8, 16) +HVBILIN(mmx2, 8, 16, 16) #endif -HVBILIN(sse2, 8, 8, 16) -HVBILIN(sse2, 8, 16, 16) -HVBILIN(ssse3, 8, 4, 8) -HVBILIN(ssse3, 8, 8, 16) -HVBILIN(ssse3, 8, 16, 16) +HVBILIN(sse2, 8, 8, 16) +HVBILIN(sse2, 8, 16, 16) +HVBILIN(ssse3, 8, 4, 8) +HVBILIN(ssse3, 8, 8, 16) +HVBILIN(ssse3, 8, 16, 16) extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], ptrdiff_t stride); @@ -283,7 +283,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ int e, int i, int hvt); DECLARE_LOOP_FILTER(mmx) -DECLARE_LOOP_FILTER(mmxext) +DECLARE_LOOP_FILTER(mmx2) DECLARE_LOOP_FILTER(sse2) DECLARE_LOOP_FILTER(ssse3) DECLARE_LOOP_FILTER(sse4) @@ -351,26 +351,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) /* note that 4-tap width=16 functions are missing because w=16 * is only used for luma, and luma is always a copy or sixtap. */ if (mm_flags & AV_CPU_FLAG_MMX2) { - VP8_MC_FUNC(2, 4, mmxext); - VP8_BILINEAR_MC_FUNC(2, 4, mmxext); + VP8_MC_FUNC(2, 4, mmx2); + VP8_BILINEAR_MC_FUNC(2, 4, mmx2); #if ARCH_X86_32 - VP8_LUMA_MC_FUNC(0, 16, mmxext); - VP8_MC_FUNC(1, 8, mmxext); - VP8_BILINEAR_MC_FUNC(0, 16, mmxext); - VP8_BILINEAR_MC_FUNC(1, 8, mmxext); - - c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext; - c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext; - - c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext; - c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext; - c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext; - c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext; - - c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext; - c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext; - c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext; - c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext; + VP8_LUMA_MC_FUNC(0, 16, mmx2); + VP8_MC_FUNC(1, 8, mmx2); + VP8_BILINEAR_MC_FUNC(0, 16, mmx2); + VP8_BILINEAR_MC_FUNC(1, 8, mmx2); + + c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2; + c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2; + + c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2; + c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2; + c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2; + c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2; + + c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx2; + c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx2; + c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx2; + c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx2; #endif } diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index a7b83797ea..f21045d405 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -173,8 +173,8 @@ SECTION .text ; int height, int mx, int my); ;----------------------------------------------------------------------------- -%macro FILTER_SSSE3 3 -cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 +%macro FILTER_SSSE3 1 +cglobal put_vp8_epel%1_h6, 6, 6, 8 lea r5d, [r5*3] mova m3, [filter_h6_shuf2] mova m4, [filter_h6_shuf3] @@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 movu m0, [r2-2] mova m1, m0 mova m2, m0 -%ifidn %1, 4 +%if mmsize == 8 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the ; shuffle with a memory operand punpcklbw m0, [r2+3] @@ -215,7 +215,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 jg .nextrow REP_RET -cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 +cglobal put_vp8_epel%1_h4, 6, 6, 7 shl r5d, 4 mova m2, [pw_64] mova m3, [filter_h2_shuf] @@ -246,7 +246,7 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 jg .nextrow REP_RET -cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 +cglobal put_vp8_epel%1_v4, 7, 7, 8 shl r6d, 4 %ifdef PIC lea r11, [fourtap_filter_hb_m] @@ -285,7 +285,7 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 jg .nextrow REP_RET -cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 +cglobal put_vp8_epel%1_v6, 7, 7, 8 lea r6d, [r6*3] %ifdef PIC lea r11, [sixtap_filter_hb_m] @@ -333,13 +333,14 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 REP_RET %endmacro -INIT_MMX -FILTER_SSSE3 4, 0, 0 -INIT_XMM -FILTER_SSSE3 8, 8, 7 +INIT_MMX ssse3 +FILTER_SSSE3 4 +INIT_XMM ssse3 +FILTER_SSSE3 8 ; 4x4 block, H-only 4-tap filter -cglobal put_vp8_epel4_h4_mmxext, 6, 6 +INIT_MMX mmx2 +cglobal put_vp8_epel4_h4, 6, 6 shl r5d, 4 %ifdef PIC lea r11, [fourtap_filter_hw_m] @@ -386,7 +387,8 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6 REP_RET ; 4x4 block, H-only 6-tap filter -cglobal put_vp8_epel4_h6_mmxext, 6, 6 +INIT_MMX mmx2 +cglobal put_vp8_epel4_h6, 6, 6 lea r5d, [r5*3] %ifdef PIC lea r11, [sixtap_filter_hw_m] @@ -442,8 +444,8 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6 jg .nextrow REP_RET -INIT_XMM -cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 +INIT_XMM sse2 +cglobal put_vp8_epel8_h4, 6, 6, 10 shl r5d, 5 %ifdef PIC lea r11, [fourtap_filter_v_m] @@ -490,7 +492,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 jg .nextrow REP_RET -cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 +INIT_XMM sse2 +cglobal put_vp8_epel8_h6, 6, 6, 14 lea r5d, [r5*3] shl r5d, 4 %ifdef PIC @@ -552,9 +555,9 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 jg .nextrow REP_RET -%macro FILTER_V 3 +%macro FILTER_V 1 ; 4x4 block, V-only 4-tap filter -cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 +cglobal put_vp8_epel%1_v4, 7, 7, 8 shl r6d, 5 %ifdef PIC lea r11, [fourtap_filter_v_m] @@ -607,7 +610,7 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 ; 4x4 block, V-only 6-tap filter -cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 +cglobal put_vp8_epel%1_v6, 7, 7, 8 shl r6d, 4 lea r6, [r6*3] %ifdef PIC @@ -671,13 +674,13 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 REP_RET %endmacro -INIT_MMX -FILTER_V mmxext, 4, 0 -INIT_XMM -FILTER_V sse2, 8, 8 +INIT_MMX mmx2 +FILTER_V 4 +INIT_XMM sse2 +FILTER_V 8 -%macro FILTER_BILINEAR 3 -cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 +%macro FILTER_BILINEAR 1 +cglobal put_vp8_bilinear%1_v, 7, 7, 7 mov r5d, 8*16 shl r6d, 4 sub r5d, r6d @@ -705,7 +708,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%ifidn %1, mmxext +%if mmsize == 8 packuswb m0, m0 packuswb m2, m2 movh [r0+r1*0], m0 @@ -722,7 +725,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 jg .nextrow REP_RET -cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 +cglobal put_vp8_bilinear%1_h, 7, 7, 7 mov r6d, 8*16 shl r5d, 4 sub r6d, r5d @@ -751,7 +754,7 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%ifidn %1, mmxext +%if mmsize == 8 packuswb m0, m0 packuswb m2, m2 movh [r0+r1*0], m0 @@ -769,13 +772,13 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 REP_RET %endmacro -INIT_MMX -FILTER_BILINEAR mmxext, 4, 0 -INIT_XMM -FILTER_BILINEAR sse2, 8, 7 +INIT_MMX mmx2 +FILTER_BILINEAR 4 +INIT_XMM sse2 +FILTER_BILINEAR 8 %macro FILTER_BILINEAR_SSSE3 1 -cglobal put_vp8_bilinear%1_v_ssse3, 7,7 +cglobal put_vp8_bilinear%1_v, 7, 7, 5 shl r6d, 4 %ifdef PIC lea r11, [bilinear_filter_vb_m] @@ -811,7 +814,7 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7 jg .nextrow REP_RET -cglobal put_vp8_bilinear%1_h_ssse3, 7,7 +cglobal put_vp8_bilinear%1_h, 7, 7, 5 shl r5d, 4 %ifdef PIC lea r11, [bilinear_filter_vb_m] @@ -848,12 +851,13 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7 REP_RET %endmacro -INIT_MMX +INIT_MMX ssse3 FILTER_BILINEAR_SSSE3 4 -INIT_XMM +INIT_XMM ssse3 FILTER_BILINEAR_SSSE3 8 -cglobal put_vp8_pixels8_mmx, 5,5 +INIT_MMX mmx +cglobal put_vp8_pixels8, 5,5 .nextrow: movq mm0, [r2+r3*0] movq mm1, [r2+r3*1] @@ -866,7 +870,8 @@ cglobal put_vp8_pixels8_mmx, 5,5 REP_RET %if ARCH_X86_32 -cglobal put_vp8_pixels16_mmx, 5,5 +INIT_MMX mmx +cglobal put_vp8_pixels16, 5,5 .nextrow: movq mm0, [r2+r3*0+0] movq mm1, [r2+r3*0+8] @@ -883,7 +888,8 @@ cglobal put_vp8_pixels16_mmx, 5,5 REP_RET %endif -cglobal put_vp8_pixels16_sse, 5,5,2 +INIT_XMM sse +cglobal put_vp8_pixels16, 5,5,2 .nextrow: movups xmm0, [r2+r3*0] movups xmm1, [r2+r3*1] @@ -918,8 +924,8 @@ cglobal put_vp8_pixels16_sse, 5,5,2 %4 [r1+r2+%3], m5 %endmacro -INIT_MMX -cglobal vp8_idct_dc_add_mmx, 3, 3 +INIT_MMX mmx +cglobal vp8_idct_dc_add, 3, 3 ; load data movd m0, [r1] @@ -941,8 +947,8 @@ cglobal vp8_idct_dc_add_mmx, 3, 3 ADD_DC m0, m1, 0, movh RET -INIT_XMM -cglobal vp8_idct_dc_add_sse4, 3, 3, 6 +INIT_XMM sse4 +cglobal vp8_idct_dc_add, 3, 3, 6 ; load data movd m0, [r1] pxor m1, m1 @@ -976,8 +982,8 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6 ;----------------------------------------------------------------------------- %if ARCH_X86_32 -INIT_MMX -cglobal vp8_idct_dc_add4y_mmx, 3, 3 +INIT_MMX mmx +cglobal vp8_idct_dc_add4y, 3, 3 ; load data movd m0, [r1+32*0] ; A movd m1, [r1+32*2] ; C @@ -1012,8 +1018,8 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3 RET %endif -INIT_XMM -cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 +INIT_XMM sse2 +cglobal vp8_idct_dc_add4y, 3, 3, 6 ; load data movd m0, [r1+32*0] ; A movd m1, [r1+32*2] ; C @@ -1046,8 +1052,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 ; void vp8_idct_dc_add4uv_(uint8_t *dst, DCTELEM block[4][16], int stride); ;----------------------------------------------------------------------------- -INIT_MMX -cglobal vp8_idct_dc_add4uv_mmx, 3, 3 +INIT_MMX mmx +cglobal vp8_idct_dc_add4uv, 3, 3 ; load data movd m0, [r1+32*0] ; A movd m1, [r1+32*2] ; C @@ -1118,9 +1124,8 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3 SWAP %4, %3 %endmacro -INIT_MMX -%macro VP8_IDCT_ADD 1 -cglobal vp8_idct_add_%1, 3, 3 +%macro VP8_IDCT_ADD 0 +cglobal vp8_idct_add, 3, 3 ; load block data movq m0, [r1+ 0] movq m1, [r1+ 8] @@ -1128,7 +1133,7 @@ cglobal vp8_idct_add_%1, 3, 3 movq m3, [r1+24] movq m6, [pw_20091] movq m7, [pw_17734] -%ifidn %1, sse +%if cpuflag(sse) xorps xmm0, xmm0 movaps [r1+ 0], xmm0 movaps [r1+16], xmm0 @@ -1157,9 +1162,11 @@ cglobal vp8_idct_add_%1, 3, 3 %endmacro %if ARCH_X86_32 -VP8_IDCT_ADD mmx +INIT_MMX mmx +VP8_IDCT_ADD %endif -VP8_IDCT_ADD sse +INIT_MMX sse +VP8_IDCT_ADD ;----------------------------------------------------------------------------- ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) @@ -1192,13 +1199,13 @@ VP8_IDCT_ADD sse SWAP %1, %4, %3 %endmacro -%macro VP8_DC_WHT 1 -cglobal vp8_luma_dc_wht_%1, 2,3 +%macro VP8_DC_WHT 0 +cglobal vp8_luma_dc_wht, 2, 3 movq m0, [r1] movq m1, [r1+8] movq m2, [r1+16] movq m3, [r1+24] -%ifidn %1, sse +%if cpuflag(sse) xorps xmm0, xmm0 movaps [r1+ 0], xmm0 movaps [r1+16], xmm0 @@ -1222,11 +1229,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3 RET %endmacro -INIT_MMX %if ARCH_X86_32 -VP8_DC_WHT mmx +INIT_MMX mmx +VP8_DC_WHT %endif -VP8_DC_WHT sse +INIT_MMX sse +VP8_DC_WHT ;----------------------------------------------------------------------------- ; void vp8_h/v_loop_filter_simple_(uint8_t *dst, int stride, int flim);