From 7d7487e85c066bf3f4e5821a49081f520b6bc1e7 Mon Sep 17 00:00:00 2001 From: James Almer Date: Mon, 10 Mar 2014 17:09:20 -0300 Subject: [PATCH] x86/float_dsp: add ff_vector_{fmul_add, fmac_scalar}_fma3 ~7% faster than AVX Signed-off-by: James Almer Signed-off-by: Michael Niedermayer --- libavutil/x86/float_dsp.asm | 24 +++++++++++++++++++++++- libavutil/x86/float_dsp_init.c | 8 ++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index f762e34f6a..d0f4be8c53 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -80,10 +80,17 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len .loop: %assign a 0 %rep 32/mmsize +%if cpuflag(fma3) + mova m1, [dstq+lenq+(a+0)*mmsize] + mova m2, [dstq+lenq+(a+1)*mmsize] + fmaddps m1, m0, [srcq+lenq+(a+0)*mmsize], m1 + fmaddps m2, m0, [srcq+lenq+(a+1)*mmsize], m2 +%else mulps m1, m0, [srcq+lenq+(a+0)*mmsize] mulps m2, m0, [srcq+lenq+(a+1)*mmsize] addps m1, m1, [dstq+lenq+(a+0)*mmsize] addps m2, m2, [dstq+lenq+(a+1)*mmsize] +%endif mova [dstq+lenq+(a+0)*mmsize], m1 mova [dstq+lenq+(a+1)*mmsize], m2 %assign a a+2 @@ -99,6 +106,10 @@ VECTOR_FMAC_SCALAR INIT_YMM avx VECTOR_FMAC_SCALAR %endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_FMAC_SCALAR +%endif ;------------------------------------------------------------------------------ ; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len) @@ -182,16 +193,23 @@ VECTOR_DMUL_SCALAR ; const float *src2, int len) ;----------------------------------------------------------------------------- %macro VECTOR_FMUL_ADD 0 -cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len +cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len lea lenq, [lend*4 - 2*mmsize] ALIGN 16 .loop: mova m0, [src0q + lenq] mova m1, [src0q + lenq + mmsize] +%if cpuflag(fma3) + mova m2, [src2q + lenq] + mova m3, [src2q + lenq + mmsize] + fmaddps m0, m0, [src1q + lenq], m2 + fmaddps m1, m1, [src1q + lenq + mmsize], m3 +%else mulps m0, m0, [src1q + lenq] mulps m1, m1, [src1q + lenq + mmsize] addps m0, m0, [src2q + lenq] addps m1, m1, [src2q + lenq + mmsize] +%endif mova [dstq + lenq], m0 mova [dstq + lenq + mmsize], m1 @@ -206,6 +224,10 @@ VECTOR_FMUL_ADD INIT_YMM avx VECTOR_FMUL_ADD %endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_FMUL_ADD +%endif ;----------------------------------------------------------------------------- ; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c index 97f7b7c7ca..88ffbc11b5 100644 --- a/libavutil/x86/float_dsp_init.c +++ b/libavutil/x86/float_dsp_init.c @@ -33,6 +33,8 @@ void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul, int len); void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul, int len); +void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul, + int len); void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul, int len); @@ -46,6 +48,8 @@ void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1, const float *src2, int len); void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1, const float *src2, int len); +void ff_vector_fmul_add_fma3(float *dst, const float *src0, const float *src1, + const float *src2, int len); void ff_vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len); @@ -153,4 +157,8 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) fdsp->vector_fmul_add = ff_vector_fmul_add_avx; fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx; } + if (EXTERNAL_FMA3(cpu_flags)) { + fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3; + fdsp->vector_fmul_add = ff_vector_fmul_add_fma3; + } }