x86/float_dsp: add ff_vector_{fmul_add, fmac_scalar}_fma3

~7% faster than AVX

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
commit 7d7487e85c
parent 12ce58bebd
Author: James Almer
Committer: Michael Niedermayer

 libavutil/x86/float_dsp.asm    | 24
 libavutil/x86/float_dsp_init.c |  8
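
For reference, the two entry points gaining FMA3 versions perform simple per-element operations. A minimal C sketch of their scalar semantics (a paraphrase of the AVFloatDSPContext behaviour, not code from this patch):

    /* Scalar reference for the two kernels touched by this commit. */
    static void vector_fmac_scalar_ref(float *dst, const float *src, float mul,
                                       int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] += src[i] * mul;               /* one multiply-add per element */
    }

    static void vector_fmul_add_ref(float *dst, const float *src0,
                                    const float *src1, const float *src2,
                                    int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src0[i] * src1[i] + src2[i]; /* also a single multiply-add */
    }

Each element of either loop is one fused multiply-add on FMA3-capable CPUs, which is what the new fmaddps paths below exploit.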

--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -80,10 +80,17 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
 .loop:
 %assign a 0
 %rep 32/mmsize
+%if cpuflag(fma3)
+    mova    m1,     [dstq+lenq+(a+0)*mmsize]
+    mova    m2,     [dstq+lenq+(a+1)*mmsize]
+    fmaddps m1, m0, [srcq+lenq+(a+0)*mmsize], m1
+    fmaddps m2, m0, [srcq+lenq+(a+1)*mmsize], m2
+%else
     mulps   m1, m0, [srcq+lenq+(a+0)*mmsize]
     mulps   m2, m0, [srcq+lenq+(a+1)*mmsize]
     addps   m1, m1, [dstq+lenq+(a+0)*mmsize]
     addps   m2, m2, [dstq+lenq+(a+1)*mmsize]
+%endif
     mova    [dstq+lenq+(a+0)*mmsize], m1
     mova    [dstq+lenq+(a+1)*mmsize], m2
 %assign a a+2
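
Each fmaddps above folds one mulps/addps pair from the %else fallback into a single fused multiply-add: m1 = m0 * [srcq+...] + m1. Purely as an illustration (not part of the patch), the same rewrite written with AVX/FMA3 intrinsics might look like:

    #include <immintrin.h>

    /* Sketch of the fmac_scalar FMA3 inner loop; assumes 32-byte aligned
     * buffers and len a multiple of 8, unlike the real asm which handles
     * its own loop bounds. */
    static void fmac_scalar_fma3_sketch(float *dst, const float *src,
                                        float mul, int len)
    {
        __m256 m0 = _mm256_set1_ps(mul);        /* scalar splat, done once before the loop */
        for (int i = 0; i < len; i += 8) {
            __m256 d = _mm256_load_ps(dst + i); /* mova    m1, [dstq+...] */
            d = _mm256_fmadd_ps(m0, _mm256_load_ps(src + i), d);
                                                /* fmaddps m1, m0, [srcq+...], m1 */
            _mm256_store_ps(dst + i, d);        /* mova    [dstq+...], m1 */
        }
    }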
@@ -99,6 +106,10 @@ VECTOR_FMAC_SCALAR
 INIT_YMM avx
 VECTOR_FMAC_SCALAR
 %endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMAC_SCALAR
+%endif
 
 ;------------------------------------------------------------------------------
 ; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
@@ -182,16 +193,23 @@ VECTOR_DMUL_SCALAR
 ;                           const float *src2, int len)
 ;-----------------------------------------------------------------------------
 %macro VECTOR_FMUL_ADD 0
-cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
+cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
     lea     lenq, [lend*4 - 2*mmsize]
 ALIGN 16
 .loop:
     mova    m0,   [src0q + lenq]
     mova    m1,   [src0q + lenq + mmsize]
+%if cpuflag(fma3)
+    mova    m2,     [src2q + lenq]
+    mova    m3,     [src2q + lenq + mmsize]
+    fmaddps m0, m0, [src1q + lenq], m2
+    fmaddps m1, m1, [src1q + lenq + mmsize], m3
+%else
     mulps   m0, m0, [src1q + lenq]
     mulps   m1, m1, [src1q + lenq + mmsize]
     addps   m0, m0, [src2q + lenq]
     addps   m1, m1, [src2q + lenq + mmsize]
+%endif
     mova    [dstq + lenq], m0
     mova    [dstq + lenq + mmsize], m1
 
@@ -206,6 +224,10 @@ VECTOR_FMUL_ADD
 INIT_YMM avx
 VECTOR_FMUL_ADD
 %endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMUL_ADD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,

--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -33,6 +33,8 @@ void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
                                int len);
 void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
                                int len);
+void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul,
+                                int len);
 
 void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
                                int len);
@@ -46,6 +48,8 @@ void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                             const float *src2, int len);
 void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                             const float *src2, int len);
+void ff_vector_fmul_add_fma3(float *dst, const float *src0, const float *src1,
+                             const float *src2, int len);
 
 void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                 const float *src1, int len);
@@ -153,4 +157,8 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
         fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
         fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
     }
+    if (EXTERNAL_FMA3(cpu_flags)) {
+        fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
+        fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
+    }
 }
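
Callers never reference the new _fma3 symbols directly; they reach whichever kernel this dispatch selected through the AVFloatDSPContext function pointers. A usage sketch, assuming the avpriv_float_dsp_alloc() allocator of current libavutil (the setup call has changed across FFmpeg versions) and buffers meeting the alignment/length constraints documented in float_dsp.h:

    #include "libavutil/float_dsp.h"
    #include "libavutil/mem.h"

    /* dst[i] += src[i] * gain, using the fastest kernel available on the
     * host CPU (the FMA3 version added here when EXTERNAL_FMA3 passes). */
    void apply_gain(float *dst, const float *src, float gain, int len)
    {
        AVFloatDSPContext *fdsp = avpriv_float_dsp_alloc(0); /* 0: allow non-IEEE-exact (faster) versions */
        if (!fdsp)
            return;
        fdsp->vector_fmac_scalar(dst, src, gain, len);
        av_free(fdsp);
    }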
