diff --git a/libswresample/resample_template.c b/libswresample/resample_template.c index 2a64f50038..4f1638edb1 100644 --- a/libswresample/resample_template.c +++ b/libswresample/resample_template.c @@ -25,23 +25,15 @@ * @author Michael Niedermayer */ -#if defined(TEMPLATE_RESAMPLE_DBL) \ - || defined(TEMPLATE_RESAMPLE_DBL_SSE2) +#if defined(TEMPLATE_RESAMPLE_DBL) +# define RENAME(N) N ## _double # define FILTER_SHIFT 0 # define DELEM double # define FELEM double # define FELEM2 double # define OUT(d, v) d = v -# if defined(TEMPLATE_RESAMPLE_DBL) -# define RENAME(N) N ## _double -# elif defined(TEMPLATE_RESAMPLE_DBL_SSE2) -# define COMMON_CORE COMMON_CORE_DBL_SSE2 -# define LINEAR_CORE LINEAR_CORE_DBL_SSE2 -# define RENAME(N) N ## _double_sse2 -# endif - #elif defined(TEMPLATE_RESAMPLE_FLT) # define RENAME(N) N ## _float @@ -104,16 +96,12 @@ int RENAME(swri_resample_common)(ResampleContext *c, for (dst_index = 0; dst_index < n; dst_index++) { FELEM *filter = ((FELEM *) c->filter_bank) + c->filter_alloc * index; -#ifdef COMMON_CORE - COMMON_CORE -#else FELEM2 val=0; int i; for (i = 0; i < c->filter_length; i++) { val += src[sample_index + i] * (FELEM2)filter[i]; } OUT(dst[dst_index], val); -#endif frac += c->dst_incr_mod; index += c->dst_incr_div; @@ -150,15 +138,11 @@ int RENAME(swri_resample_linear)(ResampleContext *c, FELEM *filter = ((FELEM *) c->filter_bank) + c->filter_alloc * index; FELEM2 val=0, v2 = 0; -#ifdef LINEAR_CORE - LINEAR_CORE -#else int i; for (i = 0; i < c->filter_length; i++) { val += src[sample_index + i] * (FELEM2)filter[i]; v2 += src[sample_index + i] * (FELEM2)filter[i + c->filter_alloc]; } -#endif #ifdef FELEML val += (v2 - val) * (FELEML) frac / c->src_incr; #else @@ -188,8 +172,6 @@ int RENAME(swri_resample_linear)(ResampleContext *c, return sample_index; } -#undef COMMON_CORE -#undef LINEAR_CORE #undef RENAME #undef FILTER_SHIFT #undef DELEM diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm index 2fe03c846b..bce1389bec 100644 --- a/libswresample/x86/resample.asm +++ b/libswresample/x86/resample.asm @@ -50,11 +50,12 @@ endstruc SECTION_RODATA pf_1: dd 1.0 +pdbl_1: dq 1.0 pd_0x4000: dd 0x4000 SECTION .text -%macro RESAMPLE_FNS 3 ; format [float or int16], bps, log2_bps +%macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant ; int resample_common_$format(ResampleContext *ctx, $format *dst, ; const $format *src, int size, int update_ctx) %if ARCH_X86_64 ; unix64 and win64 @@ -165,21 +166,21 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \ lea filterq, [min_filter_count_x4q+filterq*%2] mov min_filter_count_x4q, min_filter_length_x4q %endif -%ifidn %1, float - xorps m0, m0, m0 -%else ; int16 +%ifidn %1, int16 movd m0, [pd_0x4000] +%else ; float/double + xorps m0, m0, m0 %endif align 16 .inner_loop: movu m1, [srcq+min_filter_count_x4q*1] -%ifidn %1, float - mulps m1, m1, [filterq+min_filter_count_x4q*1] - addps m0, m0, m1 -%else ; int16 +%ifidn %1, int16 pmaddwd m1, [filterq+min_filter_count_x4q*1] paddd m0, m1 +%else ; float/double + mulp%4 m1, m1, [filterq+min_filter_count_x4q*1] + addp%4 m0, m0, m1 %endif add min_filter_count_x4q, mmsize js .inner_loop @@ -189,16 +190,7 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \ addps xm0, xm1 %endif - ; horizontal sum & store -%ifidn %1, float - movhlps xm1, xm0 - addps xm0, xm1 - shufps xm1, xm0, xm0, q0001 - add fracd, dst_incr_modd - addps xm0, xm1 - add indexd, dst_incr_divd - movss [dstq], xm0 -%else ; int16 +%ifidn %1, int16 %if mmsize == 16 pshufd m1, m0, q0032 paddd m0, m1 @@ -212,6 +204,17 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \ packssdw m0, m0 add indexd, dst_incr_divd movd [dstq], m0 +%else ; float/double + ; horizontal sum & store + movhlps xm1, xm0 +%ifidn %1, float + addps xm0, xm1 + shufps xm1, xm0, xm0, q0001 +%endif + add fracd, dst_incr_modd + addp%4 xm0, xm1 + add indexd, dst_incr_divd + movs%4 [dstq], xm0 %endif cmp fracd, src_incrd jl .skip @@ -307,12 +310,12 @@ cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index, mov ctx_stackq, ctxq mov phase_mask_stackd, phase_maskd mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length] -%ifidn %1, float - cvtsi2ss xm0, src_incrd - movss xm4, [pf_1] - divss xm4, xm0 -%else ; int16 +%ifidn %1, int16 movd m4, [pd_0x4000] +%else ; float/double + cvtsi2s%4 xm0, src_incrd + movs%4 xm4, [%5] + divs%4 xm4, xm0 %endif mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div] shl min_filter_len_x4d, %3 @@ -360,12 +363,12 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \ mov r3, dword [ctxq+ResampleContext.src_incr] PUSH dword [ctxq+ResampleContext.phase_mask] PUSH r3d -%ifidn %1, float - cvtsi2ss xm0, r3d - movss xm4, [pf_1] - divss xm4, xm0 -%else ; int16 +%ifidn %1, int16 movd m4, [pd_0x4000] +%else ; float/double + cvtsi2s%4 xm0, r3d + movs%4 xm4, [%5] + divs%4 xm4, xm0 %endif mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length] mov indexd, [ctxq+ResampleContext.index] @@ -409,27 +412,27 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \ mov filter2q, filter1q add filter2q, filter_alloc_x4q %endif -%ifidn %1, float - xorps m0, m0, m0 - xorps m2, m2, m2 -%else ; int16 +%ifidn %1, int16 mova m0, m4 mova m2, m4 +%else ; float/double + xorps m0, m0, m0 + xorps m2, m2, m2 %endif align 16 .inner_loop: movu m1, [srcq+min_filter_count_x4q*1] -%ifidn %1, float - mulps m3, m1, [filter2q+min_filter_count_x4q*1] - mulps m1, m1, [filter1q+min_filter_count_x4q*1] - addps m2, m2, m3 - addps m0, m0, m1 -%else ; int16 +%ifidn %1, int16 pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1] pmaddwd m1, [filter1q+min_filter_count_x4q*1] paddd m2, m3 paddd m0, m1 +%else ; float/double + mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1] + mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1] + addp%4 m2, m2, m3 + addp%4 m0, m0, m1 %endif add min_filter_count_x4q, mmsize js .inner_loop @@ -441,24 +444,7 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \ addps xm2, xm3 %endif -%ifidn %1, float - ; val += (v2 - val) * (FELEML) frac / c->src_incr; - cvtsi2ss xm1, fracd - subps xm2, xm0 - mulps xm1, xm4 - shufps xm1, xm1, q0000 - mulps xm2, xm1 - addps xm0, xm2 - - ; horizontal sum & store - movhlps xm1, xm0 - addps xm0, xm1 - shufps xm1, xm0, xm0, q0001 - add fracd, dst_incr_modd - addps xm0, xm1 - add indexd, dst_incr_divd - movss [dstq], xm0 -%else ; int16 +%ifidn %1, int16 %if mmsize == 16 pshufd m3, m2, q0032 pshufd m1, m0, q0032 @@ -491,6 +477,25 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \ ; - 32bit: eax=r0[filter1], edx=r2[filter2] ; - win64: eax=r6[filter1], edx=r1[todo] ; - unix64: eax=r6[filter1], edx=r2[todo] +%else ; float/double + ; val += (v2 - val) * (FELEML) frac / c->src_incr; + cvtsi2s%4 xm1, fracd + subp%4 xm2, xm0 + mulp%4 xm1, xm4 + shufp%4 xm1, xm1, q0000 + mulp%4 xm2, xm1 + addp%4 xm0, xm2 + + ; horizontal sum & store + movhlps xm1, xm0 +%ifidn %1, float + addps xm0, xm1 + shufps xm1, xm0, xm0, q0001 +%endif + add fracd, dst_incr_modd + addp%4 xm0, xm1 + add indexd, dst_incr_divd + movs%4 [dstq], xm0 %endif cmp fracd, src_incrd jl .skip @@ -553,11 +558,11 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \ %endmacro INIT_XMM sse -RESAMPLE_FNS float, 4, 2 +RESAMPLE_FNS float, 4, 2, s, pf_1 %if HAVE_AVX_EXTERNAL INIT_YMM avx -RESAMPLE_FNS float, 4, 2 +RESAMPLE_FNS float, 4, 2, s, pf_1 %endif %if ARCH_X86_32 @@ -567,3 +572,4 @@ RESAMPLE_FNS int16, 2, 1 INIT_XMM sse2 RESAMPLE_FNS int16, 2, 1 +RESAMPLE_FNS double, 8, 3, d, pdbl_1 diff --git a/libswresample/x86/resample_mmx.h b/libswresample/x86/resample_mmx.h deleted file mode 100644 index b0ea496361..0000000000 --- a/libswresample/x86/resample_mmx.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2012 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86/asm.h" -#include "libavutil/cpu.h" -#include "libswresample/swresample_internal.h" - -#define COMMON_CORE_DBL_SSE2 \ - x86_reg len= -8*c->filter_length;\ -__asm__ volatile(\ - "xorpd %%xmm0, %%xmm0 \n\t"\ - "1: \n\t"\ - "movupd (%1, %0), %%xmm1 \n\t"\ - "mulpd (%2, %0), %%xmm1 \n\t"\ - "addpd %%xmm1, %%xmm0 \n\t"\ - "add $16, %0 \n\t"\ - " js 1b \n\t"\ - "movhlps %%xmm0, %%xmm1 \n\t"\ - "addpd %%xmm1, %%xmm0 \n\t"\ - "movsd %%xmm0, (%3) \n\t"\ - : "+r" (len)\ - : "r" (((uint8_t*)(src+sample_index))-len),\ - "r" (((uint8_t*)filter)-len),\ - "r" (dst+dst_index)\ - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\ -); - -#define LINEAR_CORE_DBL_SSE2 \ - x86_reg len= -8*c->filter_length;\ -__asm__ volatile(\ - "xorpd %%xmm0, %%xmm0 \n\t"\ - "xorpd %%xmm2, %%xmm2 \n\t"\ - "1: \n\t"\ - "movupd (%3, %0), %%xmm1 \n\t"\ - "movapd %%xmm1, %%xmm3 \n\t"\ - "mulpd (%4, %0), %%xmm1 \n\t"\ - "mulpd (%5, %0), %%xmm3 \n\t"\ - "addpd %%xmm1, %%xmm0 \n\t"\ - "addpd %%xmm3, %%xmm2 \n\t"\ - "add $16, %0 \n\t"\ - " js 1b \n\t"\ - "movhlps %%xmm0, %%xmm1 \n\t"\ - "movhlps %%xmm2, %%xmm3 \n\t"\ - "addpd %%xmm1, %%xmm0 \n\t"\ - "addpd %%xmm3, %%xmm2 \n\t"\ - "movsd %%xmm0, %1 \n\t"\ - "movsd %%xmm2, %2 \n\t"\ - : "+r" (len),\ - "=m" (val),\ - "=m" (v2)\ - : "r" (((uint8_t*)(src+sample_index))-len),\ - "r" (((uint8_t*)filter)-len),\ - "r" (((uint8_t*)(filter+c->filter_alloc))-len)\ - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\ -); diff --git a/libswresample/x86/resample_x86_dsp.c b/libswresample/x86/resample_x86_dsp.c index 5130ecdd1b..9049da6951 100644 --- a/libswresample/x86/resample_x86_dsp.c +++ b/libswresample/x86/resample_x86_dsp.c @@ -27,21 +27,6 @@ #include "libswresample/resample.h" -int swri_resample_common_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx); -int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx); - -#if HAVE_SSE2_INLINE -#define DO_RESAMPLE_ONE 0 - -#include "resample_mmx.h" - -#define TEMPLATE_RESAMPLE_DBL_SSE2 -#include "libswresample/resample_template.c" -#undef TEMPLATE_RESAMPLE_DBL_SSE2 -#endif - -#undef DO_RESAMPLE_ONE - int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst, const uint8_t *src, int sz, int upd); int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst, @@ -62,6 +47,11 @@ int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst, int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst, const uint8_t *src, int sz, int upd); +int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst, + const uint8_t *src, int sz, int upd); +int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst, + const uint8_t *src, int sz, int upd); + void swresample_dsp_x86_init(ResampleContext *c) { int av_unused mm_flags = av_get_cpu_flags(); @@ -78,10 +68,9 @@ void swresample_dsp_x86_init(ResampleContext *c) if (HAVE_SSE2_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE2) { c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_sse2; c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_sse2; - } - if (HAVE_SSE2_INLINE && mm_flags & AV_CPU_FLAG_SSE2) { - c->dsp.resample_common[FNIDX(DBLP)] = (resample_fn) swri_resample_common_double_sse2; - c->dsp.resample_linear[FNIDX(DBLP)] = (resample_fn) swri_resample_linear_double_sse2; + + c->dsp.resample_common[FNIDX(DBLP)] = ff_resample_common_double_sse2; + c->dsp.resample_linear[FNIDX(DBLP)] = ff_resample_linear_double_sse2; } if (HAVE_AVX_EXTERNAL && mm_flags & AV_CPU_FLAG_AVX) { c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;