diff --git a/libavfilter/x86/af_afir.asm b/libavfilter/x86/af_afir.asm
index 2cc09709a2..ed0276c7b9 100644
--- a/libavfilter/x86/af_afir.asm
+++ b/libavfilter/x86/af_afir.asm
@@ -67,3 +67,30 @@ INIT_XMM sse3
 FCMUL_ADD
 INIT_YMM avx
 FCMUL_ADD
+
+%if HAVE_FMA3_EXTERNAL                      ; only assemble when the build enables external FMA3
+INIT_YMM fma3                               ; 256-bit registers (mmsize = 32), fma3 function suffix
+cglobal fcmul_add, 4,4,4, sum, t, c, len    ; sum += t * c on interleaved (re, im) floats; 4 args, 4 GPRs, 4 vector regs
+    shl lend, 3                             ; len *= 8: byte count, presumably len = number of (re, im) float pairs - confirm with caller
+    add tq, lenq                            ; move all three pointers to the end of their buffers ...
+    add cq, lenq
+    add sumq, lenq
+    neg lenq                                ; ... and index with a negative offset that counts up to zero
+.loop:
+    movaps m0, [tq + lenq]                  ; m0 = 4 complex values of t (aligned load: address must be mmsize-aligned)
+    movaps m1, [cq + lenq]                  ; m1 = 4 complex values of c (aligned load)
+    vpermilps m3, m0, 177                   ; 177 = 10_11_00_01b: swap each pair        -> m3 = (t.im, t.re, ...)
+    vpermilps m2, m1, 160                   ; 160 = 10_10_00_00b: duplicate even floats -> m2 = (c.re, c.re, ...)
+    vpermilps m1, m1, 245                   ; 245 = 11_11_01_01b: duplicate odd floats  -> m1 = (c.im, c.im, ...)
+    mulps m1, m1, m3                        ; m1 = (c.im * t.im, c.im * t.re, ...)
+    vfmaddsub132ps m0, m1, m2               ; m0 = m0 * m2 -/+ m1: even = t.re*c.re - t.im*c.im, odd = t.im*c.re + t.re*c.im
+    addps m0, m0, [sumq + lenq]             ; accumulate onto the existing sum values
+    movaps [sumq + lenq], m0                ; aligned store back to sum
+    add lenq, mmsize                        ; advance one vector (32 bytes = 4 complex values)
+    jl .loop                                ; repeat while offset < 0; NOTE(review): body runs at least once and assumes len*8 is a multiple of mmsize - confirm
+    movss xm0, [tq + lenq]                  ; scalar tail: one float at the post-loop offset (0 when len*8 is a multiple of mmsize)
+    mulss xm0, [cq + lenq]                  ; plain real product t * c, no imaginary term
+    addss xm0, [sumq + lenq]                ; add the existing sum float
+    movss [sumq + lenq], xm0                ; store the single updated float
+    RET
+%endif
diff --git a/libavfilter/x86/af_afir_init.c b/libavfilter/x86/af_afir_init.c
index e53817b9c0..d573acf10b 100644
--- a/libavfilter/x86/af_afir_init.c
+++ b/libavfilter/x86/af_afir_init.c
@@ -26,6 +26,8 @@ void ff_fcmul_add_sse3(float *sum, const float *t, const float *c,
                        ptrdiff_t len);
 void ff_fcmul_add_avx(float *sum, const float *t, const float *c,
                       ptrdiff_t len);
+void ff_fcmul_add_fma3(float *sum, const float *t, const float *c, /* FMA3 variant, same signature as the SSE3/AVX versions */
+                       ptrdiff_t len);
 
 av_cold void ff_afir_init_x86(AudioFIRDSPContext *s)
 {
@@ -37,4 +39,7 @@ av_cold void ff_afir_init_x86(AudioFIRDSPContext *s)
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         s->fcmul_add = ff_fcmul_add_avx;
     }
+    if (EXTERNAL_FMA3_FAST(cpu_flags)) { /* checked after AVX, so FMA3 replaces the AVX pointer when both checks pass */
+        s->fcmul_add = ff_fcmul_add_fma3;
+    }
 }