swresample/resample: mmx2/sse2 int16 linear interpolation

About three times faster

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
pull/64/head
James Almer 11 years ago committed by Michael Niedermayer
parent ffd77f94a2
commit fa25c4c400
  1. 7
      libswresample/resample_template.c
  2. 61
      libswresample/x86/resample_mmx.h

@@ -81,9 +81,11 @@
 # define RENAME(N) N ## _int16
 # elif defined(TEMPLATE_RESAMPLE_S16_MMX2)
 # define COMMON_CORE COMMON_CORE_INT16_MMX2
+# define LINEAR_CORE LINEAR_CORE_INT16_MMX2
 # define RENAME(N) N ## _int16_mmx2
 # elif defined(TEMPLATE_RESAMPLE_S16_SSE2)
 # define COMMON_CORE COMMON_CORE_INT16_SSE2
+# define LINEAR_CORE LINEAR_CORE_INT16_SSE2
 # define RENAME(N) N ## _int16_sse2
 # endif
@@ -163,10 +165,14 @@ int RENAME(swri_resample)(ResampleContext *c, DELEM *dst, const DELEM *src, int
             OUT(dst[dst_index], val);
         }else if(c->linear){
             FELEM2 v2=0;
+#ifdef LINEAR_CORE
+            LINEAR_CORE
+#else
             for(i=0; i<c->filter_length; i++){
                 val += src[sample_index + i] * (FELEM2)filter[i];
                 v2  += src[sample_index + i] * (FELEM2)filter[i + c->filter_alloc];
             }
+#endif
             val+=(v2-val)*(FELEML)frac / c->src_incr;
             OUT(dst[dst_index], val);
         }else{
@@ -213,6 +219,7 @@ int RENAME(swri_resample)(ResampleContext *c, DELEM *dst, const DELEM *src, int
 }
 #undef COMMON_CORE
+#undef LINEAR_CORE
 #undef RENAME
 #undef FILTER_SHIFT
 #undef DELEM

@@ -50,6 +50,34 @@ __asm__ volatile(\
 NAMED_CONSTRAINTS_ADD(ff_resample_int16_rounder)\
 );
#define LINEAR_CORE_INT16_MMX2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"pxor %%mm0, %%mm0 \n\t"\
"pxor %%mm2, %%mm2 \n\t"\
"1: \n\t"\
"movq (%3, %0), %%mm1 \n\t"\
"movq %%mm1, %%mm3 \n\t"\
"pmaddwd (%4, %0), %%mm1 \n\t"\
"pmaddwd (%5, %0), %%mm3 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"paddd %%mm3, %%mm2 \n\t"\
"add $8, %0 \n\t"\
" js 1b \n\t"\
"pshufw $0x0E, %%mm0, %%mm1 \n\t"\
"pshufw $0x0E, %%mm2, %%mm3 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"paddd %%mm3, %%mm2 \n\t"\
"movd %%mm0, %1 \n\t"\
"movd %%mm2, %2 \n\t"\
: "+r" (len),\
"=r" (val),\
"=r" (v2)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (((uint8_t*)(filter+c->filter_alloc))-len)\
);
 #define COMMON_CORE_INT16_SSE2 \
 x86_reg len= -2*c->filter_length;\
 __asm__ volatile(\
@@ -74,6 +102,39 @@ __asm__ volatile(\
 NAMED_CONSTRAINTS_ADD(ff_resample_int16_rounder)\
 );
#define LINEAR_CORE_INT16_SSE2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"pxor %%xmm0, %%xmm0 \n\t"\
"pxor %%xmm2, %%xmm2 \n\t"\
"1: \n\t"\
"movdqu (%3, %0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"pmaddwd (%4, %0), %%xmm1 \n\t"\
"pmaddwd (%5, %0), %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"add $16, %0 \n\t"\
" js 1b \n\t"\
"pshufd $0x0E, %%xmm0, %%xmm1 \n\t"\
"pshufd $0x0E, %%xmm2, %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"pshufd $0x01, %%xmm0, %%xmm1 \n\t"\
"pshufd $0x01, %%xmm2, %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"movd %%xmm0, %1 \n\t"\
"movd %%xmm2, %2 \n\t"\
: "+r" (len),\
"=r" (val),\
"=r" (v2)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (((uint8_t*)(filter+c->filter_alloc))-len)\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
);
 #define COMMON_CORE_FLT_SSE \
 x86_reg len= -4*c->filter_length;\
 __asm__ volatile(\

Loading…
Cancel
Save