diff --git a/libswresample/resample_template.c b/libswresample/resample_template.c
index 1982992e8a..2a64f50038 100644
--- a/libswresample/resample_template.c
+++ b/libswresample/resample_template.c
@@ -44,17 +44,15 @@
 
 #elif defined(TEMPLATE_RESAMPLE_FLT)
 
+#    define RENAME(N) N ## _float
 #    define FILTER_SHIFT 0
 #    define DELEM  float
 #    define FELEM  float
 #    define FELEM2 float
 #    define OUT(d, v) d = v
 
-#    if defined(TEMPLATE_RESAMPLE_FLT)
-#        define RENAME(N) N ## _float
-#    endif
-
 #elif defined(TEMPLATE_RESAMPLE_S32)
+
 #    define RENAME(N) N ## _int32
 #    define FILTER_SHIFT 30
 #    define DELEM  int32_t
@@ -65,10 +63,9 @@
 #    define OUT(d, v) v = (v + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;\
                       d = (uint64_t)(v + 0x80000000) > 0xFFFFFFFF ? (v>>63) ^ 0x7FFFFFFF : v
 
-#elif defined(TEMPLATE_RESAMPLE_S16) \
-   || defined(TEMPLATE_RESAMPLE_S16_MMX2) \
-   || defined(TEMPLATE_RESAMPLE_S16_SSE2)
+#elif defined(TEMPLATE_RESAMPLE_S16)
 
+#    define RENAME(N) N ## _int16
 #    define FILTER_SHIFT 15
 #    define DELEM  int16_t
 #    define FELEM  int16_t
@@ -79,18 +76,6 @@
 #    define OUT(d, v) v = (v + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;\
                       d = (unsigned)(v + 32768) > 65535 ? (v>>31) ^ 32767 : v
 
-#    if defined(TEMPLATE_RESAMPLE_S16)
-#        define RENAME(N) N ## _int16
-#    elif defined(TEMPLATE_RESAMPLE_S16_MMX2)
-#        define COMMON_CORE COMMON_CORE_INT16_MMX2
-#        define LINEAR_CORE LINEAR_CORE_INT16_MMX2
-#        define RENAME(N) N ## _int16_mmx2
-#    elif defined(TEMPLATE_RESAMPLE_S16_SSE2)
-#        define COMMON_CORE COMMON_CORE_INT16_SSE2
-#        define LINEAR_CORE LINEAR_CORE_INT16_SSE2
-#        define RENAME(N) N ## _int16_sse2
-#    endif
-
 #endif
 
 #if DO_RESAMPLE_ONE
diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
index bb63fc7e8e..2fe03c846b 100644
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -1,6 +1,7 @@
 ;******************************************************************************
 ;* Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at>
 ;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
+;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
 ;* This file is part of FFmpeg.
 ;*
@@ -48,18 +49,19 @@ endstruc
 
 SECTION_RODATA
 
-pf_1: dd 1.0
+pf_1:      dd 1.0
+pd_0x4000: dd 0x4000
 
 SECTION .text
 
-%macro RESAMPLE_FLOAT_FNS 0
-; int resample_common_float(ResampleContext *ctx, float *dst,
-;                           const float *src, int size, int update_ctx)
+%macro RESAMPLE_FNS 3 ; format [float or int16], bps, log2_bps
+; int resample_common_$format(ResampleContext *ctx, $format *dst,
+;                             const $format *src, int size, int update_ctx)
 %if ARCH_X86_64 ; unix64 and win64
-cglobal resample_common_float, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \
-                                         dst_incr_mod, size, min_filter_count_x4, \
-                                         min_filter_len_x4, dst_incr_div, src_incr, \
-                                         phase_mask, dst_end, filter_bank
+cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \
+                                      dst_incr_mod, size, min_filter_count_x4, \
+                                      min_filter_len_x4, dst_incr_div, src_incr, \
+                                      phase_mask, dst_end, filter_bank
 
 ; use red-zone for variable storage
 %define ctx_stackq [rsp-0x8]
@@ -85,8 +87,8 @@ cglobal resample_common_float, 0, 15, 2, ctx, dst, src, phase_shift, index, frac
     mov                     ctx_stackq, ctxq
     mov             min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
     mov                  dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
-    shl             min_filter_len_x4d, 2
-    lea                       dst_endq, [dstq+sizeq*4]
+    shl             min_filter_len_x4d, %3
+    lea                       dst_endq, [dstq+sizeq*%2]
 
 %if UNIX64
     mov                            ecx, [ctxq+ResampleContext.phase_shift]
@@ -109,8 +111,8 @@ cglobal resample_common_float, 0, 15, 2, ctx, dst, src, phase_shift, index, frac
     sub                           srcq, min_filter_len_x4q
     mov                     src_stackq, srcq
 %else ; x86-32
-cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
-                                        index, min_filter_length_x4, filter_bank
+cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
+                                     index, min_filter_length_x4, filter_bank
 
 ; push temp variables to stack
 %define ctx_stackq r0mp
@@ -119,7 +121,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
 
     mov                           dstq, r1mp
     mov                             r3, r3mp
-    lea                             r3, [dstq+r3*4]
+    lea                             r3, [dstq+r3*%2]
     PUSH dword [ctxq+ResampleContext.dst_incr_div]
     PUSH dword [ctxq+ResampleContext.dst_incr_mod]
     PUSH dword [ctxq+ResampleContext.filter_alloc]
@@ -128,7 +130,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
     PUSH dword [ctxq+ResampleContext.src_incr]
     mov          min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
     mov                         indexd, [ctxq+ResampleContext.index]
-    shl          min_filter_length_x4d, 2
+    shl          min_filter_length_x4d, %3
    mov                          fracd, [ctxq+ResampleContext.frac]
     neg          min_filter_length_x4q
     mov                   filter_bankq, [ctxq+ResampleContext.filter_bank]
@@ -157,19 +159,28 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
     imul                       filterd, indexd
 %if ARCH_X86_64
     mov           min_filter_count_x4q, min_filter_len_x4q
-    lea                        filterq, [filter_bankq+filterq*4]
+    lea                        filterq, [filter_bankq+filterq*%2]
 %else ; x86-32
     mov           min_filter_count_x4q, filter_bankq
-    lea                        filterq, [min_filter_count_x4q+filterq*4]
+    lea                        filterq, [min_filter_count_x4q+filterq*%2]
     mov           min_filter_count_x4q, min_filter_length_x4q
 %endif
+%ifidn %1, float
     xorps                           m0, m0, m0
+%else ; int16
+    movd                            m0, [pd_0x4000]
+%endif
 
     align 16
 .inner_loop:
-    movups                          m1, [srcq+min_filter_count_x4q*1]
+    movu                            m1, [srcq+min_filter_count_x4q*1]
+%ifidn %1, float
     mulps                           m1, m1, [filterq+min_filter_count_x4q*1]
     addps                           m0, m0, m1
+%else ; int16
+    pmaddwd                         m1, [filterq+min_filter_count_x4q*1]
+    paddd                           m0, m1
+%endif
     add           min_filter_count_x4q, mmsize
     js .inner_loop
 
@@ -179,6 +190,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
 %endif
 
 ; horizontal sum & store
+%ifidn %1, float
     movhlps                        xm1, xm0
     addps                          xm0, xm1
     shufps                         xm1, xm0, xm0, q0001
@@ -186,6 +198,21 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
     addps                          xm0, xm1
     add                         indexd, dst_incr_divd
     movss                       [dstq], xm0
+%else ; int16
+%if mmsize == 16
+    pshufd                          m1, m0, q0032
+    paddd                           m0, m1
+    pshufd                          m1, m0, q0001
+%else ; mmsize == 8
+    pshufw                          m1, m0, q0032
+%endif
+    paddd                           m0, m1
+    psrad                           m0, 15
+    add                          fracd, dst_incr_modd
+    packssdw                        m0, m0
+    add                         indexd, dst_incr_divd
+    movd                        [dstq], m0
+%endif
     cmp                          fracd, src_incrd
     jl .skip
     sub                          fracd, src_incrd
@@ -205,10 +232,10 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
 
 .skip:
     mov                    index_incrd, indexd
-    add                           dstq, 4
+    add                           dstq, %2
     and                         indexd, phase_maskd
     sar                    index_incrd, phase_shiftb
-    lea                           srcq, [srcq+index_incrq*4]
+    lea                           srcq, [srcq+index_incrq*%2]
     cmp                           dstq, dst_endq
     jne .loop
 
@@ -228,7 +255,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
     mov [ctxq+ResampleContext.frac ], fracd
     sub                            rax, src_stackq
     mov [ctxq+ResampleContext.index], indexd
-    shr                            rax, 2
+    shr                            rax, %3
 
 .skip_store:
 %if ARCH_X86_32
@@ -236,13 +263,24 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
 %endif
     RET
 
-; int resample_linear_float(ResampleContext *ctx, float *dst,
-;                           const float *src, int size, int update_ctx)
+; int resample_linear_$format(ResampleContext *ctx, float *dst,
+;                             const float *src, int size, int update_ctx)
 %if ARCH_X86_64 ; unix64 and win64
-cglobal resample_linear_float, 0, 15, 5, ctx, dst, src, phase_shift, index, frac, \
-                                         dst_incr_mod, size, min_filter_count_x4, \
-                                         min_filter_len_x4, dst_incr_div, src_incr, \
-                                         phase_mask, dst_end, filter_bank
+%if UNIX64
+cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_shift, index, frac, \
+                                      size, dst_incr_mod, min_filter_count_x4, \
+                                      min_filter_len_x4, dst_incr_div, src_incr, \
+                                      src, dst_end, filter_bank
+
+    mov                           srcq, r2mp
+%else ; win64
+cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index, frac, \
+                                      size, dst_incr_mod, min_filter_count_x4, \
+                                      min_filter_len_x4, dst_incr_div, src_incr, \
+                                      dst, dst_end, filter_bank
+
+    mov                           dstq, r1mp
+%endif
 
 ; use red-zone for variable storage
 %define ctx_stackq [rsp-0x8]
@@ -269,27 +307,31 @@ cglobal resample_linear_float, 0, 15, 5, ctx, dst, src, phase_shift, index, frac
 
     mov                     ctx_stackq, ctxq
     mov              phase_mask_stackd, phase_maskd
     mov             min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
+%ifidn %1, float
     cvtsi2ss                       xm0, src_incrd
     movss                          xm4, [pf_1]
     divss                          xm4, xm0
+%else ; int16
+    movd                            m4, [pd_0x4000]
+%endif
     mov                  dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
-    shl             min_filter_len_x4d, 2
-    lea                       dst_endq, [dstq+sizeq*4]
+    shl             min_filter_len_x4d, %3
+    lea                       dst_endq, [dstq+sizeq*%2]
 
 %if UNIX64
     mov                            ecx, [ctxq+ResampleContext.phase_shift]
     mov                            edi, [ctxq+ResampleContext.filter_alloc]
-    DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
-                filter1, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
-                src_incr, filter2, dst_end, filter_bank
+    DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, filter1, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, src, dst_end, filter_bank
 %elif WIN64
     mov                            R9d, [ctxq+ResampleContext.filter_alloc]
     mov                            ecx, [ctxq+ResampleContext.phase_shift]
-    DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
-                filter1, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
-                src_incr, filter2, dst_end, filter_bank
+    DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, filter1, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, dst, dst_end, filter_bank
 %endif
 
     neg             min_filter_len_x4q
@@ -297,8 +339,8 @@ cglobal resample_linear_float, 0, 15, 5, ctx, dst, src, phase_shift, index, frac
     sub                           srcq, min_filter_len_x4q
     mov                     src_stackq, srcq
 %else ; x86-32
-cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
-                                        index, min_filter_length_x4, filter_bank
+cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
+                                     frac, index, dst, filter_bank
 
 ; push temp variables to stack
 %define ctx_stackq r0mp
@@ -307,23 +349,27 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
 
     mov                           dstq, r1mp
     mov                             r3, r3mp
-    lea                             r3, [dstq+r3*4]
+    lea                             r3, [dstq+r3*%2]
     PUSH dword [ctxq+ResampleContext.dst_incr_div]
     PUSH r3
     mov r3, dword [ctxq+ResampleContext.filter_alloc]
     PUSH dword [ctxq+ResampleContext.dst_incr_mod]
     PUSH r3
-    shl r3, 2
+    shl r3, %3
     PUSH r3
     mov r3, dword [ctxq+ResampleContext.src_incr]
     PUSH dword [ctxq+ResampleContext.phase_mask]
     PUSH r3d
+%ifidn %1, float
     cvtsi2ss                       xm0, r3d
     movss                          xm4, [pf_1]
     divss                          xm4, xm0
+%else ; int16
+    movd                            m4, [pd_0x4000]
+%endif
     mov          min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
     mov                         indexd, [ctxq+ResampleContext.index]
-    shl          min_filter_length_x4d, 2
+    shl          min_filter_length_x4d, %3
     mov                          fracd, [ctxq+ResampleContext.frac]
     neg          min_filter_length_x4q
     mov                   filter_bankq, [ctxq+ResampleContext.filter_bank]
@@ -333,7 +379,7 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
 
     PUSH filter_bankq
     PUSH dword [ctxq+ResampleContext.phase_shift]
-    DEFINE_ARGS src, filter1, dst, frac, index, min_filter_count_x4, filter2
+    DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src
 
 %define phase_shift_stackd dword [rsp+0x0]
 %define filter_bankq dword [rsp+0x4]
@@ -354,25 +400,37 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
     imul                      filter1d, indexd
 %if ARCH_X86_64
     mov           min_filter_count_x4q, min_filter_len_x4q
-    lea                       filter1q, [filter_bankq+filter1q*4]
-    lea                       filter2q, [filter1q+filter_allocq*4]
+    lea                       filter1q, [filter_bankq+filter1q*%2]
+    lea                       filter2q, [filter1q+filter_allocq*%2]
 %else ; x86-32
     mov           min_filter_count_x4q, filter_bankq
-    lea                       filter1q, [min_filter_count_x4q+filter1q*4]
+    lea                       filter1q, [min_filter_count_x4q+filter1q*%2]
     mov           min_filter_count_x4q, min_filter_length_x4q
     mov                       filter2q, filter1q
     add                       filter2q, filter_alloc_x4q
 %endif
+%ifidn %1, float
     xorps                           m0, m0, m0
     xorps                           m2, m2, m2
+%else ; int16
+    mova                            m0, m4
+    mova                            m2, m4
+%endif
 
     align 16
 .inner_loop:
-    movups                          m1, [srcq+min_filter_count_x4q*1]
+    movu                            m1, [srcq+min_filter_count_x4q*1]
+%ifidn %1, float
     mulps                           m3, m1, [filter2q+min_filter_count_x4q*1]
     mulps                           m1, m1, [filter1q+min_filter_count_x4q*1]
     addps                           m2, m2, m3
     addps                           m0, m0, m1
+%else ; int16
+    pmaddwd                         m3, m1, [filter2q+min_filter_count_x4q*1]
+    pmaddwd                         m1, [filter1q+min_filter_count_x4q*1]
+    paddd                           m2, m3
+    paddd                           m0, m1
+%endif
     add           min_filter_count_x4q, mmsize
     js .inner_loop
 
@@ -383,6 +441,7 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
     addps                          xm2, xm3
 %endif
 
+%ifidn %1, float
     ; val += (v2 - val) * (FELEML) frac / c->src_incr;
     cvtsi2ss                       xm1, fracd
     subps                          xm2, xm0
@@ -399,21 +458,55 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
     addps                          xm0, xm1
     add                         indexd, dst_incr_divd
     movss                       [dstq], xm0
+%else ; int16
+%if mmsize == 16
+    pshufd                          m3, m2, q0032
+    pshufd                          m1, m0, q0032
+    paddd                           m2, m3
+    paddd                           m0, m1
+    pshufd                          m3, m2, q0001
+    pshufd                          m1, m0, q0001
+%else ; mmsize == 8
+    pshufw                          m3, m2, q0032
+    pshufw                          m1, m0, q0032
+%endif
+    paddd                           m2, m3
+    paddd                           m0, m1
+    psubd                           m2, m0
+    ; This is probably a really bad idea on atom and other machines with a
+    ; long transfer latency between GPRs and XMMs (atom). However, it does
+    ; make the clip a lot simpler...
+    movd                           eax, m2
+    add                         indexd, dst_incr_divd
+    imul fracd
+    idiv src_incrd
+    movd                            m1, eax
+    add                          fracd, dst_incr_modd
+    paddd                           m0, m1
+    psrad                           m0, 15
+    packssdw                        m0, m0
+    movd                        [dstq], m0
+
+    ; note that for imul/idiv, I need to move filter to edx/eax for each:
+    ; - 32bit: eax=r0[filter1], edx=r2[filter2]
+    ; - win64: eax=r6[filter1], edx=r1[todo]
+    ; - unix64: eax=r6[filter1], edx=r2[todo]
+%endif
     cmp                          fracd, src_incrd
     jl .skip
     sub                          fracd, src_incrd
     inc                         indexd
 
 %if UNIX64
-    DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
-                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
-                src_incr, filter2, dst_end, filter_bank
+    DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, index_incr, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, src, dst_end, filter_bank
 %elif WIN64
-    DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
-                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
-                src_incr, filter2, dst_end, filter_bank
+    DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, index_incr, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, dst, dst_end, filter_bank
 %else ; x86-32
-    DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr
+    DEFINE_ARGS filter1, phase_shift, index_incr, frac, index, dst, src
 %endif
 
 .skip:
@@ -421,17 +514,23 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
 %if ARCH_X86_32
     mov                   phase_shiftd, phase_shift_stackd
 %endif
     mov                    index_incrd, indexd
-    add                           dstq, 4
+    add                           dstq, %2
     and                         indexd, phase_mask_stackd
     sar                    index_incrd, phase_shiftb
-    lea                           srcq, [srcq+index_incrq*4]
+    lea                           srcq, [srcq+index_incrq*%2]
     cmp                           dstq, dst_endq
     jne .loop
 
-%if ARCH_X86_64
-    DEFINE_ARGS ctx, dst, src, phase_shift, index, frac
+%if UNIX64
+    DEFINE_ARGS ctx, dst, filter2, phase_shift, index, frac, index_incr, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, src, dst_end, filter_bank
+%elif WIN64
+    DEFINE_ARGS ctx, filter2, src, phase_shift, index, frac, index_incr, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, dst, dst_end, filter_bank
 %else ; x86-32
-    DEFINE_ARGS src, ctx, update_context, frac, index
+    DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
 %endif
 
     cmp dword update_context_stackd, 0
@@ -444,7 +543,7 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
     mov [ctxq+ResampleContext.frac ], fracd
     sub                            rax, src_stackq
     mov [ctxq+ResampleContext.index], indexd
-    shr                            rax, 2
+    shr                            rax, %3
 
 .skip_store:
 %if ARCH_X86_32
@@ -454,9 +553,17 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
 %endmacro
 
 INIT_XMM sse
-RESAMPLE_FLOAT_FNS
+RESAMPLE_FNS float, 4, 2
 %if HAVE_AVX_EXTERNAL
 INIT_YMM avx
-RESAMPLE_FLOAT_FNS
+RESAMPLE_FNS float, 4, 2
 %endif
+
+%if ARCH_X86_32
+INIT_MMX mmxext
+RESAMPLE_FNS int16, 2, 1
+%endif
+
+INIT_XMM sse2
+RESAMPLE_FNS int16, 2, 1
diff --git a/libswresample/x86/resample_mmx.h b/libswresample/x86/resample_mmx.h
index 94237b0507..b0ea496361 100644
--- a/libswresample/x86/resample_mmx.h
+++ b/libswresample/x86/resample_mmx.h
@@ -22,116 +22,6 @@
 #include "libavutil/cpu.h"
 #include "libswresample/swresample_internal.h"
 
-DECLARE_ALIGNED(16, const uint64_t, ff_resample_int16_rounder)[2] = { 0x0000000000004000ULL, 0x0000000000000000ULL};
-
-#define COMMON_CORE_INT16_MMX2 \
-    x86_reg len= -2*c->filter_length;\
-__asm__ volatile(\
-    "movq "MANGLE(ff_resample_int16_rounder)", %%mm0 \n\t"\
-    "1:                         \n\t"\
-    "movq    (%1, %0), %%mm1    \n\t"\
-    "pmaddwd (%2, %0), %%mm1    \n\t"\
-    "paddd      %%mm1, %%mm0    \n\t"\
-    "add           $8, %0       \n\t"\
-    " js 1b                     \n\t"\
-    "pshufw $0x0E, %%mm0, %%mm1 \n\t"\
-    "paddd      %%mm1, %%mm0    \n\t"\
-    "psrad        $15, %%mm0    \n\t"\
-    "packssdw   %%mm0, %%mm0    \n\t"\
-    "movd       %%mm0, (%3)     \n\t"\
-    : "+r" (len)\
-    : "r" (((uint8_t*)(src+sample_index))-len),\
-      "r" (((uint8_t*)filter)-len),\
-      "r" (dst+dst_index)\
-      NAMED_CONSTRAINTS_ARRAY_ADD(ff_resample_int16_rounder)\
-);
-
-#define LINEAR_CORE_INT16_MMX2 \
-    x86_reg len= -2*c->filter_length;\
-__asm__ volatile(\
-    "pxor       %%mm0, %%mm0    \n\t"\
-    "pxor       %%mm2, %%mm2    \n\t"\
-    "1:                         \n\t"\
-    "movq    (%3, %0), %%mm1    \n\t"\
-    "movq       %%mm1, %%mm3    \n\t"\
-    "pmaddwd (%4, %0), %%mm1    \n\t"\
-    "pmaddwd (%5, %0), %%mm3    \n\t"\
-    "paddd      %%mm1, %%mm0    \n\t"\
-    "paddd      %%mm3, %%mm2    \n\t"\
-    "add           $8, %0       \n\t"\
-    " js 1b                     \n\t"\
-    "pshufw $0x0E, %%mm0, %%mm1 \n\t"\
-    "pshufw $0x0E, %%mm2, %%mm3 \n\t"\
-    "paddd      %%mm1, %%mm0    \n\t"\
-    "paddd      %%mm3, %%mm2    \n\t"\
-    "movd       %%mm0, %1       \n\t"\
-    "movd       %%mm2, %2       \n\t"\
-    : "+r" (len),\
-      "=r" (val),\
-      "=r" (v2)\
-    : "r" (((uint8_t*)(src+sample_index))-len),\
-      "r" (((uint8_t*)filter)-len),\
-      "r" (((uint8_t*)(filter+c->filter_alloc))-len)\
-);
-
-#define COMMON_CORE_INT16_SSE2 \
-    x86_reg len= -2*c->filter_length;\
-__asm__ volatile(\
-    "movdqa "MANGLE(ff_resample_int16_rounder)", %%xmm0 \n\t"\
-    "1:                           \n\t"\
-    "movdqu  (%1, %0), %%xmm1     \n\t"\
-    "pmaddwd (%2, %0), %%xmm1     \n\t"\
-    "paddd     %%xmm1, %%xmm0     \n\t"\
-    "add          $16, %0         \n\t"\
-    " js 1b                       \n\t"\
-    "pshufd $0x0E, %%xmm0, %%xmm1 \n\t"\
-    "paddd     %%xmm1, %%xmm0     \n\t"\
-    "pshufd $0x01, %%xmm0, %%xmm1 \n\t"\
-    "paddd     %%xmm1, %%xmm0     \n\t"\
-    "psrad        $15, %%xmm0     \n\t"\
-    "packssdw  %%xmm0, %%xmm0     \n\t"\
-    "movd      %%xmm0, (%3)       \n\t"\
-    : "+r" (len)\
-    : "r" (((uint8_t*)(src+sample_index))-len),\
-      "r" (((uint8_t*)filter)-len),\
-      "r" (dst+dst_index)\
-      NAMED_CONSTRAINTS_ARRAY_ADD(ff_resample_int16_rounder)\
-      XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\
-);
-
-#define LINEAR_CORE_INT16_SSE2 \
-    x86_reg len= -2*c->filter_length;\
-__asm__ volatile(\
-    "pxor      %%xmm0, %%xmm0     \n\t"\
-    "pxor      %%xmm2, %%xmm2     \n\t"\
-    "1:                           \n\t"\
-    "movdqu  (%3, %0), %%xmm1     \n\t"\
-    "movdqa    %%xmm1, %%xmm3     \n\t"\
-    "pmaddwd (%4, %0), %%xmm1     \n\t"\
-    "pmaddwd (%5, %0), %%xmm3     \n\t"\
-    "paddd     %%xmm1, %%xmm0     \n\t"\
-    "paddd     %%xmm3, %%xmm2     \n\t"\
-    "add          $16, %0         \n\t"\
-    " js 1b                       \n\t"\
-    "pshufd $0x0E, %%xmm0, %%xmm1 \n\t"\
-    "pshufd $0x0E, %%xmm2, %%xmm3 \n\t"\
-    "paddd     %%xmm1, %%xmm0     \n\t"\
-    "paddd     %%xmm3, %%xmm2     \n\t"\
-    "pshufd $0x01, %%xmm0, %%xmm1 \n\t"\
-    "pshufd $0x01, %%xmm2, %%xmm3 \n\t"\
-    "paddd     %%xmm1, %%xmm0     \n\t"\
-    "paddd     %%xmm3, %%xmm2     \n\t"\
-    "movd      %%xmm0, %1         \n\t"\
-    "movd      %%xmm2, %2         \n\t"\
-    : "+r" (len),\
-      "=r" (val),\
-      "=r" (v2)\
-    : "r" (((uint8_t*)(src+sample_index))-len),\
-      "r" (((uint8_t*)filter)-len),\
-      "r" (((uint8_t*)(filter+c->filter_alloc))-len)\
-      XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
-);
-
 #define COMMON_CORE_DBL_SSE2 \
     x86_reg len= -8*c->filter_length;\
 __asm__ volatile(\
diff --git a/libswresample/x86/resample_x86_dsp.c b/libswresample/x86/resample_x86_dsp.c
index c7d2054f9c..5130ecdd1b 100644
--- a/libswresample/x86/resample_x86_dsp.c
+++ b/libswresample/x86/resample_x86_dsp.c
@@ -27,34 +27,14 @@
 
 #include "libswresample/resample.h"
"libswresample/resample.h" -int swri_resample_common_int16_mmx2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx); -int swri_resample_linear_int16_mmx2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx); -int swri_resample_common_int16_sse2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx); -int swri_resample_linear_int16_sse2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx); -int swri_resample_common_float_sse (ResampleContext *c, float *dst, const float *src, int n, int update_ctx); -int swri_resample_linear_float_sse (ResampleContext *c, float *dst, const float *src, int n, int update_ctx); -int swri_resample_common_float_avx (ResampleContext *c, float *dst, const float *src, int n, int update_ctx); -int swri_resample_linear_float_avx (ResampleContext *c, float *dst, const float *src, int n, int update_ctx); int swri_resample_common_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx); int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx); -#if HAVE_MMXEXT_INLINE - +#if HAVE_SSE2_INLINE #define DO_RESAMPLE_ONE 0 #include "resample_mmx.h" -#if ARCH_X86_32 -#define TEMPLATE_RESAMPLE_S16_MMX2 -#include "libswresample/resample_template.c" -#undef TEMPLATE_RESAMPLE_S16_MMX2 -#endif - -#if HAVE_SSE2_INLINE -#define TEMPLATE_RESAMPLE_S16_SSE2 -#include "libswresample/resample_template.c" -#undef TEMPLATE_RESAMPLE_S16_SSE2 - #define TEMPLATE_RESAMPLE_DBL_SSE2 #include "libswresample/resample_template.c" #undef TEMPLATE_RESAMPLE_DBL_SSE2 @@ -62,7 +42,15 @@ int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const do #undef DO_RESAMPLE_ONE -#endif // HAVE_MMXEXT_INLINE +int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst, + const uint8_t *src, int sz, int upd); +int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst, + const uint8_t *src, int sz, int upd); + +int ff_resample_common_int16_sse2(ResampleContext *c, uint8_t *dst, + const uint8_t *src, int sz, int upd); +int ff_resample_linear_int16_sse2(ResampleContext *c, uint8_t *dst, + const uint8_t *src, int sz, int upd); int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst, const uint8_t *src, int sz, int upd); @@ -79,17 +67,19 @@ void swresample_dsp_x86_init(ResampleContext *c) int av_unused mm_flags = av_get_cpu_flags(); #define FNIDX(fmt) (AV_SAMPLE_FMT_##fmt - AV_SAMPLE_FMT_S16P) - if (ARCH_X86_32 && HAVE_MMXEXT_INLINE && mm_flags & AV_CPU_FLAG_MMX2) { - c->dsp.resample_common[FNIDX(S16P)] = (resample_fn) swri_resample_common_int16_mmx2; - c->dsp.resample_linear[FNIDX(S16P)] = (resample_fn) swri_resample_linear_int16_mmx2; + if (ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL && mm_flags & AV_CPU_FLAG_MMX2) { + c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_mmxext; + c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_mmxext; } if (HAVE_SSE_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE) { c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_sse; c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_sse; } + if (HAVE_SSE2_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE2) { + c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_sse2; + c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_sse2; + } if (HAVE_SSE2_INLINE && mm_flags & AV_CPU_FLAG_SSE2) { - c->dsp.resample_common[FNIDX(S16P)] = (resample_fn) swri_resample_common_int16_sse2; - 
-        c->dsp.resample_linear[FNIDX(S16P)] = (resample_fn) swri_resample_linear_int16_sse2;
         c->dsp.resample_common[FNIDX(DBLP)] = (resample_fn) swri_resample_common_double_sse2;
         c->dsp.resample_linear[FNIDX(DBLP)] = (resample_fn) swri_resample_linear_double_sse2;
     }
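
Note: the rounding and clipping that the new int16 yasm kernels perform is the same arithmetic as the C template's OUT() macro quoted above (FILTER_SHIFT 15, bias 1<<14, signed saturation to 16 bits), with pd_0x4000 seeding the accumulator. A minimal scalar model of the common path, for reference only — the helper name is hypothetical and not part of the patch:

    #include <stdint.h>

    /* Scalar model of the int16 common kernel: pmaddwd/paddd build a
     * 32-bit dot product seeded with the pd_0x4000 rounding bias, psrad
     * shifts by FILTER_SHIFT (15), and packssdw clips like OUT() does. */
    static int16_t resample_one_int16_model(const int16_t *src,
                                            const int16_t *filter,
                                            int filter_length)
    {
        int32_t v = 0x4000;                      /* 1 << (FILTER_SHIFT - 1) */
        for (int i = 0; i < filter_length; i++)
            v += src[i] * filter[i];             /* pmaddwd + paddd */
        v >>= 15;                                /* psrad m0, 15 */
        if (v < -32768) v = -32768;              /* packssdw saturation */
        if (v >  32767) v =  32767;
        return (int16_t)v;
    }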
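
Likewise, the movd/imul/idiv sequence in the int16 linear path (the one the new comment flags as slow on Atom) computes val + (v2 - val) * frac / src_incr in a GPR before the same round-and-clip. A scalar sketch under the same caveat (hypothetical name, not part of the patch):

    #include <stdint.h>

    /* val and v2 are the two pmaddwd accumulators (filter1 and filter2),
     * both seeded with the 0x4000 bias; the bias cancels in (v2 - val),
     * so only the final sum stays biased for rounding. */
    static int16_t interp_int16_model(int32_t val, int32_t v2,
                                      int32_t frac, int32_t src_incr)
    {
        /* movd eax, m2 / imul fracd / idiv src_incrd: a 32x32->64 signed
         * multiply followed by a signed divide, truncating toward zero */
        int32_t v = val + (int32_t)((int64_t)(v2 - val) * frac / src_incr);
        v >>= 15;                                /* psrad m0, 15 */
        if (v < -32768) v = -32768;              /* packssdw saturation */
        if (v >  32767) v =  32767;
        return (int16_t)v;
    }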