From b545179fdff1ccfbbb9d422e4e9720cb6c6d9191 Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Tue, 18 Jun 2013 21:30:43 +0000
Subject: [PATCH] x86: lpc: simd av_evaluate_lls

1.5x-1.8x faster on sandybridge

Signed-off-by: Luca Barbato
---
 libavutil/x86/lls.asm    | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 libavutil/x86/lls_init.c |  3 +++
 2 files changed, 55 insertions(+)

diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
index 92c00fcda1..92b7f955c2 100644
--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@@ -194,3 +194,55 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
     jle .loop2x1
 .ret:
     REP_RET
+
+
+;------------------------------------------------------------------------------
+; double ff_evaluate_lls_sse2(LLSModel *ctx, double *var, int order)
+;
+; Returns the dot product of var[0..order] with the row of LLSModel.coeff
+; selected by order (rows are MAX_VARS doubles apart).
+;
+; The first pair is multiplied before the loop and the loop body always runs
+; at least once, so elements 0..3 are always read: order must be >= 3.
+; The coefficient row is used as a direct mulpd memory operand, so it has to
+; be 16-byte aligned; var is loaded in halves and may be unaligned.
+;
+; All three arguments are declared as loaded (3,4,2): on x86_32 they arrive on
+; the stack, and order would otherwise never be fetched into its register.
+;------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal evaluate_lls, 3,4,2, ctx, var, order, i
+    ; This function is often called on the same buffer as update_lls, but with
+    ; an offset. They can't both be aligned.
+    ; Load halves rather than movu to avoid store-forwarding stalls, since the
+    ; input was initialized immediately prior to this function using scalar math.
+    %define coefsq ctxq
+    mov     id, orderd                                  ; i = order (kept before order is scaled)
+    imul    orderd, MAX_VARS                            ; row offset, in doubles
+    lea     coefsq, [ctxq + LLSModel.coeff + orderq*8]  ; coefs = start of the selected row
+    movsd   m0, [varq]
+    movhpd  m0, [varq + 8]
+    mulpd   m0, [coefsq]                                ; acc = var[0..1] * coefs[0..1]
+    lea     coefsq, [coefsq + iq*8]                     ; rebase both pointers on element [order]
+    lea     varq, [varq + iq*8]
+    neg     iq                                          ; i = 2 - order: negative index that
+    add     iq, 2                                       ; counts up towards the last element
+.loop:
+    movsd   m1, [varq + iq*8]
+    movhpd  m1, [varq + iq*8 + 8]
+    mulpd   m1, [coefsq + iq*8]
+    addpd   m0, m1
+    add     iq, 2
+    jl .loop                                            ; i < 0: at least one full pair left
+    jg .skip1                                           ; i > 0: element count was even, done
+    movsd   m1, [varq + iq*8]                           ; i == 0: element [order] is left over
+    mulsd   m1, [coefsq + iq*8]
+    addpd   m0, m1                                      ; high lane of m1 is zero after movsd load
+.skip1:
+    movhlps m1, m0                                      ; fold the two lanes into the low one
+    addsd   m0, m1
+%if ARCH_X86_32
+    movsd  r0m, m0                                      ; x86_32: hand the result back in st0,
+    fld   qword r0m                                     ; bouncing through the argument stack area
+%endif
+    RET
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
index 8a80f83002..888bc54a39 100644
--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -25,12 +25,15 @@
 
 void ff_update_lls_sse2(LLSModel *m, double *var);
 void ff_update_lls_avx(LLSModel *m, double *var);
+double ff_evaluate_lls_sse2(LLSModel *m, double *var, int order);
 
 av_cold void ff_init_lls_x86(LLSModel *m)
 {
     int cpu_flags = av_get_cpu_flags();
     if (EXTERNAL_SSE2(cpu_flags)) {
         m->update_lls = ff_update_lls_sse2;
+        if (m->indep_count >= 4)
+            m->evaluate_lls = ff_evaluate_lls_sse2;
     }
     if (EXTERNAL_AVX(cpu_flags)) {
         m->update_lls = ff_update_lls_avx;