diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c index 5495821e5b..6349c239c3 100644 --- a/libavcodec/x86/fft.c +++ b/libavcodec/x86/fft.c @@ -27,15 +27,15 @@ av_cold void ff_fft_init_mmx(FFTContext *s) int has_vectors = av_get_cpu_flags(); if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) { /* 3DNow! for K6-2/3 */ - s->imdct_calc = ff_imdct_calc_3dn; - s->imdct_half = ff_imdct_half_3dn; - s->fft_calc = ff_fft_calc_3dn; + s->imdct_calc = ff_imdct_calc_3dnow; + s->imdct_half = ff_imdct_half_3dnow; + s->fft_calc = ff_fft_calc_3dnow; } if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) { /* 3DNowEx for K7 */ - s->imdct_calc = ff_imdct_calc_3dn2; - s->imdct_half = ff_imdct_half_3dn2; - s->fft_calc = ff_fft_calc_3dn2; + s->imdct_calc = ff_imdct_calc_3dnow2; + s->imdct_half = ff_imdct_half_3dnow2; + s->fft_calc = ff_fft_calc_3dnow2; } if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) { /* SSE for P3/P4/K8 */ diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h index 9d68d5b219..1cefe7a9ee 100644 --- a/libavcodec/x86/fft.h +++ b/libavcodec/x86/fft.h @@ -24,13 +24,13 @@ void ff_fft_permute_sse(FFTContext *s, FFTComplex *z); void ff_fft_calc_avx(FFTContext *s, FFTComplex *z); void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); -void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z); -void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z); +void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z); +void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z); -void ff_imdct_calc_3dn(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_3dn(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); diff --git a/libavcodec/x86/fft_3dn2.c b/libavcodec/x86/fft_3dn2.c index ce3c9daddb..e684cc745f 100644 --- a/libavcodec/x86/fft_3dn2.c +++ b/libavcodec/x86/fft_3dn2.c @@ -30,30 +30,30 @@ DECLARE_ALIGNED(8, static const unsigned int, m1m1)[2] = { 1U<<31, 1U<<31 }; "movq "#s","#d"\n"\ "psrlq $32,"#d"\n"\ "punpckldq "#s","#d"\n" -#define ff_fft_calc_3dn2 ff_fft_calc_3dn -#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn -#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn -#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn -#define ff_imdct_half_3dn2 ff_imdct_half_3dn +#define ff_fft_calc_3dnow2 ff_fft_calc_3dnow +#define ff_fft_dispatch_3dnow2 ff_fft_dispatch_3dnow +#define ff_fft_dispatch_interleave_3dnow2 ff_fft_dispatch_interleave_3dnow +#define ff_imdct_calc_3dnow2 ff_imdct_calc_3dnow +#define ff_imdct_half_3dnow2 ff_imdct_half_3dnow #else #define PSWAPD(s,d) "pswapd "#s","#d"\n" #endif -void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); -void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); +void ff_fft_dispatch_3dnow2(FFTComplex *z, int nbits); +void ff_fft_dispatch_interleave_3dnow2(FFTComplex *z, int nbits); -void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) +void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z) { int n = 1<nbits; int i; - ff_fft_dispatch_interleave_3dn2(z, s->nbits); + ff_fft_dispatch_interleave_3dnow2(z, s->nbits); __asm__ volatile("femms"); if(n <= 8) for(i=0; imdct_size; @@ -101,7 +101,7 @@ void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input ); } - ff_fft_dispatch_3dn2(z, s->nbits); + ff_fft_dispatch_3dnow2(z, s->nbits); #define CMUL(j,mm0,mm1)\ "movq (%2,"#j",2), %%mm6 \n"\ @@ -144,13 +144,13 @@ void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input __asm__ volatile("femms"); } -void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) +void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input) { x86_reg j, k; long n = s->mdct_size; long n4 = n >> 2; - ff_imdct_half_3dn2(s, output+n4, input); + ff_imdct_half_3dnow2(s, output+n4, input); j = -n; k = n-8; diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index 225c66635d..b60d8b0a47 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -297,7 +297,7 @@ IF%1 mova Z(1), m5 %define Z2(x) [r0+mmsize*x] %define ZH(x) [r0+mmsize*x+mmsize/2] -INIT_YMM +INIT_YMM avx %if HAVE_AVX align 16 @@ -390,7 +390,7 @@ fft32_interleave_avx: ret %endif -INIT_XMM +INIT_XMM sse %define movdqa movaps align 16 @@ -439,11 +439,9 @@ fft16_sse: ret -INIT_MMX - -%macro FFT48_3DN 1 +%macro FFT48_3DN 0 align 16 -fft4%1: +fft4 %+ SUFFIX: T2_3DN m0, m1, Z(0), Z(1) mova m2, Z(2) mova m3, Z(3) @@ -457,7 +455,7 @@ fft4%1: ret align 16 -fft8%1: +fft8 %+ SUFFIX: T2_3DN m0, m1, Z(0), Z(1) mova m2, Z(2) mova m3, Z(3) @@ -495,7 +493,8 @@ fft8%1: ret %endmacro -FFT48_3DN _3dn2 +INIT_MMX 3dnow2 +FFT48_3DN %macro pswapd 2 %ifidn %1, %2 @@ -508,7 +507,8 @@ FFT48_3DN _3dn2 %endif %endmacro -FFT48_3DN _3dn +INIT_MMX 3dnow +FFT48_3DN %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)] @@ -532,7 +532,7 @@ DEFINE_ARGS z, w, n, o1, o3 rep ret %endmacro -INIT_YMM +INIT_YMM avx %if HAVE_AVX %macro INTERL_AVX 5 @@ -550,7 +550,7 @@ DECL_PASS pass_avx, PASS_BIG 1 DECL_PASS pass_interleave_avx, PASS_BIG 0 %endif -INIT_XMM +INIT_XMM sse %macro INTERL_SSE 5 mova %3, %2 @@ -565,16 +565,16 @@ INIT_XMM DECL_PASS pass_sse, PASS_BIG 1 DECL_PASS pass_interleave_sse, PASS_BIG 0 -INIT_MMX +INIT_MMX 3dnow %define mulps pfmul %define addps pfadd %define subps pfsub %define unpcklps punpckldq %define unpckhps punpckhdq -DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q] -DECL_PASS pass_interleave_3dn, PASS_BIG 0 -%define pass_3dn2 pass_3dn -%define pass_interleave_3dn2 pass_interleave_3dn +DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q] +DECL_PASS pass_interleave_3dnow, PASS_BIG 0 +%define pass_3dnow2 pass_3dnow +%define pass_interleave_3dnow2 pass_interleave_3dnow %ifdef PIC %define SECTION_REL - $$ @@ -592,67 +592,73 @@ DECL_PASS pass_interleave_3dn, PASS_BIG 0 call r2 %endmacro ; FFT_DISPATCH -%macro DECL_FFT 2-3 ; nbits, cpu, suffix -%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL +%macro DECL_FFT 1-2 ; nbits, suffix +%ifidn %0, 1 +%xdefine fullsuffix SUFFIX +%else +%xdefine fullsuffix %2 %+ SUFFIX +%endif +%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL %if %1>=5 -%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL +%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL %endif %if %1>=6 -%xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL +%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL %endif %assign n 1<<%1 %rep 17-%1 %assign n2 n/2 %assign n4 n/4 -%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL +%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL align 16 -fft %+ n %+ %3%2: - call fft %+ n2 %+ %2 +fft %+ n %+ fullsuffix: + call fft %+ n2 %+ SUFFIX add r0, n*4 - (n&(-2<<%1)) - call fft %+ n4 %+ %2 + call fft %+ n4 %+ SUFFIX add r0, n*2 - (n2&(-2<<%1)) - call fft %+ n4 %+ %2 + call fft %+ n4 %+ SUFFIX sub r0, n*6 + (n2&(-2<<%1)) lea r1, [cos_ %+ n] mov r2d, n4/2 - jmp pass%3%2 + jmp pass %+ fullsuffix %assign n n*2 %endrep %undef n align 8 -dispatch_tab%3%2: pointer list_of_fft +dispatch_tab %+ fullsuffix: pointer list_of_fft section .text ; On x86_32, this function does the register saving and restoring for all of fft. ; The others pass args in registers and don't spill anything. -cglobal fft_dispatch%3%2, 2,5,8, z, nbits - FFT_DISPATCH %3%2, nbits -%ifidn %2, _avx +cglobal fft_dispatch%2, 2,5,8, z, nbits + FFT_DISPATCH fullsuffix, nbits +%if mmsize == 32 vzeroupper %endif RET %endmacro ; DECL_FFT %if HAVE_AVX -INIT_YMM -DECL_FFT 6, _avx -DECL_FFT 6, _avx, _interleave +INIT_YMM avx +DECL_FFT 6 +DECL_FFT 6, _interleave %endif -INIT_XMM -DECL_FFT 5, _sse -DECL_FFT 5, _sse, _interleave -INIT_MMX -DECL_FFT 4, _3dn -DECL_FFT 4, _3dn, _interleave -DECL_FFT 4, _3dn2 -DECL_FFT 4, _3dn2, _interleave - -INIT_XMM +INIT_XMM sse +DECL_FFT 5 +DECL_FFT 5, _interleave +INIT_MMX 3dnow +DECL_FFT 4 +DECL_FFT 4, _interleave +INIT_MMX 3dnow2 +DECL_FFT 4 +DECL_FFT 4, _interleave + +INIT_XMM sse %undef mulps %undef addps %undef subps @@ -748,8 +754,8 @@ INIT_XMM jl .post %endmacro -%macro DECL_IMDCT 2 -cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input +%macro DECL_IMDCT 1 +cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input %if ARCH_X86_64 %define rrevtab r7 %define rtcos r8 @@ -821,7 +827,7 @@ cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample mov r0, r1 mov r1d, [r5+FFTContext.nbits] - FFT_DISPATCH %1, r1 + FFT_DISPATCH SUFFIX, r1 mov r0d, [r5+FFTContext.mdctsize] add r6, r0 @@ -835,20 +841,20 @@ cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample neg r0 mov r1, -mmsize sub r1, r0 - %2 r0, r1, r6, rtcos, rtsin + %1 r0, r1, r6, rtcos, rtsin %if ARCH_X86_64 == 0 add esp, 12 %endif -%ifidn avx_enabled, 1 +%if mmsize == 32 vzeroupper %endif RET %endmacro -DECL_IMDCT _sse, POSROTATESHUF +DECL_IMDCT POSROTATESHUF -INIT_YMM +INIT_YMM avx %if HAVE_AVX -DECL_IMDCT _avx, POSROTATESHUF_AVX +DECL_IMDCT POSROTATESHUF_AVX %endif