From 63dbba655e7b09bd5bd09d3a8eab270152bb803f Mon Sep 17 00:00:00 2001
From: James Almer
Date: Sun, 23 Mar 2014 19:05:17 -0300
Subject: [PATCH] swresample/resample: sse float linear interpolation

About two times faster

Signed-off-by: James Almer
Signed-off-by: Michael Niedermayer
---
 libswresample/resample_template.c | 1 +
 libswresample/x86/resample_mmx.h | 35 +++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/libswresample/resample_template.c b/libswresample/resample_template.c
index c6612766b4..396b4e3055 100644
--- a/libswresample/resample_template.c
+++ b/libswresample/resample_template.c
@@ -48,6 +48,7 @@
 # define RENAME(N) N ## _float
 # elif defined(TEMPLATE_RESAMPLE_FLT_SSE)
 # define COMMON_CORE COMMON_CORE_FLT_SSE
+# define LINEAR_CORE LINEAR_CORE_FLT_SSE /* select the SSE float core defined in x86/resample_mmx.h for this template instance */
 # define RENAME(N) N ## _float_sse
 # endif
 
diff --git a/libswresample/x86/resample_mmx.h b/libswresample/x86/resample_mmx.h
index 28a317ce78..a0df6e1e1e 100644
--- a/libswresample/x86/resample_mmx.h
+++ b/libswresample/x86/resample_mmx.h
@@ -156,3 +156,38 @@ __asm__ volatile(\
 "r" (((uint8_t*)filter)-len),\
 "r" (dst+dst_index)\
 );
+/* LINEAR_CORE_FLT_SSE: two SSE dot products over one source window: val = sum(src[sample_index+i] * filter[i]), v2 = sum(src[sample_index+i] * filter[c->filter_alloc+i]). */
+#define LINEAR_CORE_FLT_SSE /* statement macro; uses c, src, sample_index, filter, val and v2 from the expanding scope */ \
+    x86_reg len= -4*c->filter_length; /* negative byte offset, 4 bytes per element; assumes filter_length is a multiple of 4 since the loop steps 16 bytes - TODO confirm */\
+__asm__ volatile( /* operands: %0 len, %1 val, %2 v2, %3 source end, %4 filter end, %5 second filter end */\
+    "xorps %%xmm0, %%xmm0 \n\t" /* xmm0 = 0: packed accumulator that ends up in val */\
+    "xorps %%xmm2, %%xmm2 \n\t" /* xmm2 = 0: packed accumulator that ends up in v2 */\
+    "1: \n\t" /* loop head: one pass handles 16 bytes (four packed singles) */\
+    "movups (%3, %0), %%xmm1 \n\t" /* unaligned load of four source samples */\
+    "movaps %%xmm1, %%xmm3 \n\t" /* keep a second copy of the samples for the other coefficient row */\
+    "mulps (%4, %0), %%xmm1 \n\t" /* samples * filter[]; NOTE(review): legacy SSE mulps presumably needs a 16-byte aligned memory operand - confirm filter alignment */\
+    "mulps (%5, %0), %%xmm3 \n\t" /* samples * (filter + c->filter_alloc)[]; same alignment assumption */\
+    "addps %%xmm1, %%xmm0 \n\t" /* accumulate products for val */\
+    "addps %%xmm3, %%xmm2 \n\t" /* accumulate products for v2 */\
+    "add $16, %0 \n\t" /* advance the byte offset towards zero */\
+    " js 1b \n\t" /* repeat while the offset is still negative; the body therefore always runs at least once */\
+    "movhlps %%xmm0, %%xmm1 \n\t" /* horizontal sum, step 1: upper two lanes of xmm0 into lower lanes of xmm1 */\
+    "movhlps %%xmm2, %%xmm3 \n\t" /* same for the v2 accumulator */\
+    "addps %%xmm1, %%xmm0 \n\t" /* lanes 0 and 1 of xmm0 now hold lane0+lane2 and lane1+lane3 */\
+    "addps %%xmm3, %%xmm2 \n\t" /* same for the v2 accumulator */\
+    "movss %%xmm0, %%xmm1 \n\t" /* step 2: save lane 0 of xmm0 in xmm1 */\
+    "movss %%xmm2, %%xmm3 \n\t" /* same for the v2 accumulator */\
+    "shufps $1, %%xmm0, %%xmm0 \n\t" /* bring lane 1 down into lane 0 */\
+    "shufps $1, %%xmm2, %%xmm2 \n\t" /* same for the v2 accumulator */\
+    "addps %%xmm1, %%xmm0 \n\t" /* lane 0 of xmm0 = sum of all four partial sums */\
+    "addps %%xmm3, %%xmm2 \n\t" /* lane 0 of xmm2 = sum of all four partial sums */\
+    "movss %%xmm0, %1 \n\t" /* store the scalar result to val */\
+    "movss %%xmm2, %2 \n\t" /* store the scalar result to v2 */\
+    : "+r" (len), /* %0: byte offset, read and modified by the loop */\
+      "=m" (val), /* %1: written directly in memory */\
+      "=m" (v2) /* %2: written directly in memory */\
+    : "r" (((uint8_t*)(src+sample_index))-len), /* %3: byte address just past the source window; adding the negative offset yields its start */\
+      "r" (((uint8_t*)filter)-len), /* %4: byte address just past the first coefficient row */\
+      "r" (((uint8_t*)(filter+c->filter_alloc))-len) /* %5: byte address just past the row that starts filter_alloc elements later */\
+    XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3") /* macro defined elsewhere; presumably emits the xmm clobber list - confirm. NOTE(review): no memory clobber or m-inputs describe the arrays read through %3-%5 */\
+);