FFmpeg/libavcodec/x86/fft_sse.c

/*
 * FFT/MDCT transform with SSE optimizations
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "fft.h"
#include "config.h"

/*
 * Four 32-bit words with only bit 31 set, i.e. a sign-bit mask for each of
 * the four lanes of a packed single-precision vector.  ff_imdct_calc_sse()
 * loads it with movaps and XORs it into a vector to negate all four lanes
 * at once.
 * NOTE(review): the leading 16 is presumably the alignment that movaps load
 * needs -- confirm against the definition of DECLARE_ASM_CONST.
 */
DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
    { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };

/*
 * Transform kernels that are only declared here; their definitions live
 * outside this file (presumably in the x86 assembly sources -- confirm).
 * Each takes the data buffer and nbits, the log2 of the transform length.
 * ff_fft_dispatch_sse() is not called anywhere in this part of the file.
 */
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);

#if HAVE_AVX
/**
 * AVX entry point for the FFT: hands the buffer and the context's nbits to
 * ff_fft_dispatch_interleave_avx() and does nothing else.
 *
 * @param s FFT context; only s->nbits is read here
 * @param z buffer passed straight to the dispatch routine (presumably
 *          transformed in place -- confirm against its definition)
 */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
}
#endif

/**
 * SSE entry point for the FFT.
 *
 * Runs ff_fft_dispatch_interleave_sse() on z and, for transforms of at most
 * 16 points, follows it with an in-place pass that interleaves the floats
 * of every pair of adjacent 16-byte vectors:
 *   {a0 a1 a2 a3} {b0 b1 b2 b3}  ->  {a0 b0 a1 b1} {a2 b2 a3 b3}
 * Presumably the dispatch routine already performs this step itself for
 * larger sizes -- confirm against its definition.
 *
 * @param s FFT context; only s->nbits is read here
 * @param z buffer of 1 << s->nbits elements, modified in place; it is
 *          accessed with movaps and therefore has to be 16-byte aligned
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if(n <= 16) {
        /* Negative byte offset measured from the end of the buffer (z+n).
         * It advances by 32 bytes per iteration and the loop ends once it is
         * no longer negative.  The factor 8 assumes
         * sizeof(FFTComplex) == 8 -- TODO confirm. */
        x86_reg i = -8*n;
        __asm__ volatile(
            "1: \n"
            "movaps     (%0,%1), %%xmm0 \n" /* xmm0 = a0 a1 a2 a3 */
            "movaps      %%xmm0, %%xmm1 \n" /* keep a copy for the high half */
            "unpcklps 16(%0,%1), %%xmm0 \n" /* xmm0 = a0 b0 a1 b1 */
            "unpckhps 16(%0,%1), %%xmm1 \n" /* xmm1 = a2 b2 a3 b3 */
            "movaps      %%xmm0,   (%0,%1) \n"
            "movaps      %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"
            "jl 1b \n"                      /* flags come from the add above */
            :"+r"(i)
            :"r"(z+n)
            /* xmm0 and xmm1 are overwritten by the loop, so declare them in
             * addition to the memory clobber, as ff_imdct_calc_sse() does
             * for the registers it uses. */
            :XMM_CLOBBERS("%xmm0", "%xmm1",) "memory"
        );
    }
}

/**
 * Reorder the elements of z according to s->revtab.
 *
 * Element i of z is written to s->tmp_buf[s->revtab[i]]; once every element
 * has been scattered, the first n elements of s->tmp_buf are copied back
 * over z.  Two elements are moved per iteration with a single 16-byte load.
 *
 * @param s FFT context; s->nbits, s->revtab and s->tmp_buf are used.
 *          s->tmp_buf must have room for 1 << s->nbits elements.
 * @param z buffer of 1 << s->nbits elements, permuted in place; it is read
 *          with movaps and therefore has to be 16-byte aligned
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;
    for(i=0; i<n; i+=2) {
        __asm__ volatile(
            "movaps %2, %%xmm0 \n" /* load z[i] and z[i+1] together */
            "movlps %%xmm0, %0 \n" /* low  8 bytes -> tmp_buf[revtab[i]]   */
            "movhps %%xmm0, %1 \n" /* high 8 bytes -> tmp_buf[revtab[i+1]] */
            :"=m"(s->tmp_buf[s->revtab[i]]),
             "=m"(s->tmp_buf[s->revtab[i+1]])
            /* The movaps reads 16 bytes, so z[i+1] is listed as an input as
             * well even though the template never names it; otherwise the
             * compiler is only told about the first 8 bytes. */
            :"m"(z[i]), "m"(z[i+1])
            XMM_CLOBBERS_ONLY("%xmm0")
        );
    }
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}

/**
 * Full inverse MDCT built on top of the half transform.
 *
 * s->imdct_half() is called with output + n4 as its destination.  The asm
 * loop then fills the outer quarters from the two quarters next to them,
 * four samples at a time:
 *   - the first quarter receives the second quarter reversed and negated,
 *   - the last quarter receives the third quarter reversed.
 *
 * @param s      context; s->mdct_size and s->imdct_half are used
 * @param output destination buffer of s->mdct_size samples; it is accessed
 *               with movaps and therefore has to be 16-byte aligned
 * @param input  passed unchanged to s->imdct_half()
 *
 * NOTE(review): j and k are byte offsets, and -n bytes is used to span n4
 * samples, so the loop assumes 4-byte FFTSample -- confirm.
 */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n = s->mdct_size;
    long n4 = n >> 2;

    s->imdct_half(s, output + n4, input);

    j = -n;      /* counts up to 0 in 16-byte steps; ends the loop */
    k = n-16;    /* counts down in 16-byte steps */
    __asm__ volatile(
        "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n" /* sign-bit mask */
        "1: \n"
        "movaps       (%2,%1), %%xmm0 \n"
        "movaps       (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n" /* reverse the four lanes */
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps         %%xmm7, %%xmm0 \n" /* negate all four lanes */
        "movaps        %%xmm1, (%3,%1) \n"
        "movaps        %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"                        /* flags come from the add on j */
        :"+r"(j), "+r"(k)
        :"r"(output+n4), "r"(output+n4*3)
        /* The loop reads and writes output through plain register pointers,
         * so none of that memory traffic is visible in the operand list.
         * The "memory" clobber keeps the compiler from caching or
         * reordering accesses to the buffer around the asm. */
        :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm7",) "memory"
    );
}
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`/*`
			`* FFT/MDCT transform with SSE optimizations`
optimize imdct_half: remove tmp buffer. skip fft reinterleave pass, leaving data in a format more convenient for simd. merge post-rotate with post-reorder. Originally committed as revision 14700 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`* Copyright (c) 2008 Loren Merritt`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 18 years ago			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 18 years ago			`* version 2.1 of the License, or (at your option) any later version.`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 18 years ago			`* FFmpeg is distributed in the hope that it will be useful,`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 18 years ago			`* License along with FFmpeg; if not, write to the Free Software`
Update licensing information: The FSF changed postal address. Originally committed as revision 4842 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`*/`
Use full path for #includes from another directory. Originally committed as revision 13098 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago
			`#include "libavutil/x86_cpu.h"`
			`#include "libavcodec/dsputil.h"`
Move per-arch fft init bits into the corresponding subdirs Originally committed as revision 19864 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`#include "fft.h"`
x86: Add appropriate ifdefs around certain AVX functions. nasm versions prior to 2.09 have trouble assembling some of our AVX code. Protect these sections by preprocessor macros to allow compilation to pass. 14 years ago			`#include "config.h"`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago
fft: avoid a signed overflow As a signed integer, 1<<31 overflows, so force it to unsigned. Signed-off-by: Alex Converse <alex.converse@gmail.com> 13 years ago			`DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =`
			`{ 1U << 31, 1U << 31, 1U << 31, 1U << 31 };`
sse implementation of imdct. patch mostly by Zuxy Meng (zuxy dot meng at gmail dot com) Originally committed as revision 6311 to svn://svn.ffmpeg.org/ffmpeg/trunk 18 years ago
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`void ff_fft_dispatch_sse(FFTComplex *z, int nbits);`
			`void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);`
Add AVX FFT implementation. Signed-off-by: Reinhard Tartler <siretart@tauware.de> 14 years ago			`void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);`

Fix compilation with YASM/NASM versions not supporting AVX. 14 years ago			`#if HAVE_AVX`
Add AVX FFT implementation. Signed-off-by: Reinhard Tartler <siretart@tauware.de> 14 years ago			`void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)`
			`{`
			`ff_fft_dispatch_interleave_avx(z, s->nbits);`
			`}`
Fix compilation with YASM/NASM versions not supporting AVX. 14 years ago			`#endif`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago
fft_() renamed into ff_fft_() patch by (Gildas Bazin <gbazin at altern dot org>) Originally committed as revision 2882 to svn://svn.ffmpeg.org/ffmpeg/trunk 21 years ago			`void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`{`
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`int n = 1 << s->nbits;`

			`ff_fft_dispatch_interleave_sse(z, s->nbits);`

			`if(n <= 16) {`
			`x86_reg i = -8*n;`
Convert asm keyword into __asm__. Neither the asm() nor the __asm__() keyword is part of the C99 standard, but while GCC accepts the former in C89 syntax, it is not accepted in C99 unless GNU extensions are turned on (with -fasm). The latter form is accepted in any syntax as an extension (without requiring further command-line options). Sun Studio C99 compiler also does not accept asm() while accepting __asm__(), albeit reporting warnings that it's not valid C99 syntax. Originally committed as revision 15627 to svn://svn.ffmpeg.org/ffmpeg/trunk 16 years ago			`__asm__ volatile(`
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`"1: \n"`
			`"movaps (%0,%1), %%xmm0 \n"`
			`"movaps %%xmm0, %%xmm1 \n"`
			`"unpcklps 16(%0,%1), %%xmm0 \n"`
			`"unpckhps 16(%0,%1), %%xmm1 \n"`
			`"movaps %%xmm0, (%0,%1) \n"`
			`"movaps %%xmm1, 16(%0,%1) \n"`
			`"add $32, %0 \n"`
			`"jl 1b \n"`
			`:"+r"(i)`
			`:"r"(z+n)`
			`:"memory"`
			`);`
			`}`
			`}`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)`
			`{`
			`int n = 1 << s->nbits;`
			`int i;`
			`for(i=0; i<n; i+=2) {`
Convert asm keyword into __asm__. Neither the asm() nor the __asm__() keyword is part of the C99 standard, but while GCC accepts the former in C89 syntax, it is not accepted in C99 unless GNU extensions are turned on (with -fasm). The latter form is accepted in any syntax as an extension (without requiring further command-line options). Sun Studio C99 compiler also does not accept asm() while accepting __asm__(), albeit reporting warnings that it's not valid C99 syntax. Originally committed as revision 15627 to svn://svn.ffmpeg.org/ffmpeg/trunk 16 years ago			`__asm__ volatile(`
split-radix FFT c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`"movaps %2, %%xmm0 \n"`
			`"movlps %%xmm0, %0 \n"`
			`"movhps %%xmm0, %1 \n"`
			`:"=m"(s->tmp_buf[s->revtab[i]]),`
			`"=m"(s->tmp_buf[s->revtab[i+1]])`
			`:"m"(z[i])`
			`);`
			`}`
			`memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));`
new generic FFT/MDCT code for audio codecs Originally committed as revision 1088 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`}`
added define for builtins use - inverse fix by Romain Dolbeau Originally committed as revision 1410 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago
Merge FFTContext and MDCTContext Originally committed as revision 19931 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`{`
			`x86_reg j, k;`
imdct/x86: Use "s->mdct_size" instead of "1 << s->mdct_bits". It generates smaller cleaner code. Originally committed as revision 24887 to svn://svn.ffmpeg.org/ffmpeg/trunk 14 years ago			`long n = s->mdct_size;`
optimize imdct_half: remove tmp buffer. skip fft reinterleave pass, leaving data in a format more convenient for simd. merge post-rotate with post-reorder. Originally committed as revision 14700 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`long n4 = n >> 2;`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago
Add AVX FFT implementation. Signed-off-by: Reinhard Tartler <siretart@tauware.de> 14 years ago			`s->imdct_half(s, output + n4, input);`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago
			`j = -n;`
			`k = n-16;`
Convert asm keyword into __asm__. Neither the asm() nor the __asm__() keyword is part of the C99 standard, but while GCC accepts the former in C89 syntax, it is not accepted in C99 unless GNU extensions are turned on (with -fasm). The latter form is accepted in any syntax as an extension (without requiring further command-line options). Sun Studio C99 compiler also does not accept asm() while accepting __asm__(), albeit reporting warnings that it's not valid C99 syntax. Originally committed as revision 15627 to svn://svn.ffmpeg.org/ffmpeg/trunk 16 years ago			`__asm__ volatile(`
Fix ff_imdct_calc_sse() on gcc-4.6 Gcc 4.6 only preserves the first value when using an array with an "m" constraint. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit 770c410fbb8e1b87ce8ad7f3d7eddaa55e2b8295) 14 years ago			`"movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"`
optimize imdct_half: remove tmp buffer. skip fft reinterleave pass, leaving data in a format more convenient for simd. merge post-rotate with post-reorder. Originally committed as revision 14700 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`"1: \n"`
			`"movaps (%2,%1), %%xmm0 \n"`
			`"movaps (%3,%0), %%xmm1 \n"`
			`"shufps $0x1b, %%xmm0, %%xmm0 \n"`
			`"shufps $0x1b, %%xmm1, %%xmm1 \n"`
			`"xorps %%xmm7, %%xmm0 \n"`
			`"movaps %%xmm1, (%3,%1) \n"`
			`"movaps %%xmm0, (%2,%0) \n"`
			`"sub $16, %1 \n"`
			`"add $16, %0 \n"`
			`"jl 1b \n"`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`:"+r"(j), "+r"(k)`
Fix ff_imdct_calc_sse() on gcc-4.6 Gcc 4.6 only preserves the first value when using an array with an "m" constraint. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit 770c410fbb8e1b87ce8ad7f3d7eddaa55e2b8295) 14 years ago			`:"r"(output+n4), "r"(output+n4*3)`
fft: mark xmm registers as clobbered in ff_imdct_calc_sse Originally committed as revision 25363 to svn://svn.ffmpeg.org/ffmpeg/trunk 14 years ago			`XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")`
exploit mdct symmetry 2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`);`
			`}`