Add AVX FFT implementation.

Signed-off-by: Reinhard Tartler <siretart@tauware.de>
14 years ago · 9d35fa520e
parent 13dfce3d44
commit 9d35fa520e
23 changed files with 450 additions and 207 deletions
--- a/2
+++ b/2
@ -5,7 +5,7 @@ releases are sorted from youngest to oldest.
 version <next>:
 - Lots of deprecated API cruft removed
-
+- fft and imdct optimizations for AVX (Sandy Bridge) processors
 version 0.7_beta1:
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@ -223,9 +223,9 @@ typedef struct {
    float sf[120];                                  ///< scalefactors
    int sf_idx[128];                                ///< scalefactor indices (used by encoder)
    uint8_t zeroes[128];                            ///< band is not coded (used by encoder)
-    DECLARE_ALIGNED(16, float,   coeffs)[1024];     ///< coefficients for IMDCT
+    DECLARE_ALIGNED(32, float,   coeffs)[1024];     ///< coefficients for IMDCT
-    DECLARE_ALIGNED(16, float,   saved)[1024];      ///< overlap
+    DECLARE_ALIGNED(32, float,   saved)[1024];      ///< overlap
-    DECLARE_ALIGNED(16, float,   ret)[2048];        ///< PCM output
+    DECLARE_ALIGNED(32, float,   ret)[2048];        ///< PCM output
    DECLARE_ALIGNED(16, int16_t, ltp_state)[3072];  ///< time signal for LTP
    PredictorState predictor_state[MAX_PREDICTORS];
 } SingleChannelElement;
@ -272,7 +272,7 @@ typedef struct {
     * @defgroup temporary aligned temporary buffers (We do not want to have these on the stack.)
     * @{
     */
-    DECLARE_ALIGNED(16, float, buf_mdct)[1024];
+    DECLARE_ALIGNED(32, float, buf_mdct)[1024];
    /** @} */
    /**
@ -296,7 +296,7 @@ typedef struct {
    int sf_offset;                                    ///< offset into pow2sf_tab as appropriate for dsp.float_to_int16
    /** @} */
-    DECLARE_ALIGNED(16, float, temp)[128];
+    DECLARE_ALIGNED(32, float, temp)[128];
    enum OCStatus output_configured;
 } AACContext;
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@ -64,7 +64,7 @@ typedef struct AACEncContext {
    int last_frame;
    float lambda;
    DECLARE_ALIGNED(16, int,   qcoefs)[96];      ///< quantized coefficients
-    DECLARE_ALIGNED(16, float, scoefs)[1024];    ///< scaled coefficients
+    DECLARE_ALIGNED(32, float, scoefs)[1024];    ///< scaled coefficients
 } AACEncContext;
 #endif /* AVCODEC_AACENC_H */
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@ -200,11 +200,11 @@ typedef struct {
 ///@defgroup arrays aligned arrays
    DECLARE_ALIGNED(16, int,   fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];       ///> fixed-point transform coefficients
-    DECLARE_ALIGNED(16, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
+    DECLARE_ALIGNED(32, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
-    DECLARE_ALIGNED(16, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
+    DECLARE_ALIGNED(32, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
-    DECLARE_ALIGNED(16, float, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
+    DECLARE_ALIGNED(32, float, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
-    DECLARE_ALIGNED(16, float, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
+    DECLARE_ALIGNED(32, float, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
-    DECLARE_ALIGNED(16, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
+    DECLARE_ALIGNED(32, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
 ///@}
 } AC3DecodeContext;
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@ -201,7 +201,7 @@ typedef struct AC3EncodeContext {
    uint8_t exp_strategy[AC3_MAX_CHANNELS][AC3_MAX_BLOCKS]; ///< exponent strategies
-    DECLARE_ALIGNED(16, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
+    DECLARE_ALIGNED(32, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
 } AC3EncodeContext;
 typedef struct AC3Mant {
--- a/libavcodec/atrac1.c
+++ b/libavcodec/atrac1.c
@ -60,11 +60,11 @@ typedef struct {
    int                 log2_block_count[AT1_QMF_BANDS];    ///< log2 number of blocks in a band
    int                 num_bfus;                           ///< number of Block Floating Units
    float*              spectrum[2];
-    DECLARE_ALIGNED(16, float, spec1)[AT1_SU_SAMPLES];     ///< mdct buffer
+    DECLARE_ALIGNED(32, float, spec1)[AT1_SU_SAMPLES];     ///< mdct buffer
-    DECLARE_ALIGNED(16, float, spec2)[AT1_SU_SAMPLES];     ///< mdct buffer
+    DECLARE_ALIGNED(32, float, spec2)[AT1_SU_SAMPLES];     ///< mdct buffer
-    DECLARE_ALIGNED(16, float, fst_qmf_delay)[46];         ///< delay line for the 1st stacked QMF filter
+    DECLARE_ALIGNED(32, float, fst_qmf_delay)[46];         ///< delay line for the 1st stacked QMF filter
-    DECLARE_ALIGNED(16, float, snd_qmf_delay)[46];         ///< delay line for the 2nd stacked QMF filter
+    DECLARE_ALIGNED(32, float, snd_qmf_delay)[46];         ///< delay line for the 2nd stacked QMF filter
-    DECLARE_ALIGNED(16, float, last_qmf_delay)[256+23];    ///< delay line for the last stacked QMF filter
+    DECLARE_ALIGNED(32, float, last_qmf_delay)[256+23];    ///< delay line for the last stacked QMF filter
 } AT1SUCtx;
 /**
@ -72,13 +72,13 @@ typedef struct {
 */
 typedef struct {
    AT1SUCtx            SUs[AT1_MAX_CHANNELS];              ///< channel sound unit
-    DECLARE_ALIGNED(16, float, spec)[AT1_SU_SAMPLES];      ///< the mdct spectrum buffer
+    DECLARE_ALIGNED(32, float, spec)[AT1_SU_SAMPLES];      ///< the mdct spectrum buffer
-    DECLARE_ALIGNED(16, float,  low)[256];
+    DECLARE_ALIGNED(32, float,  low)[256];
-    DECLARE_ALIGNED(16, float,  mid)[256];
+    DECLARE_ALIGNED(32, float,  mid)[256];
-    DECLARE_ALIGNED(16, float, high)[512];
+    DECLARE_ALIGNED(32, float, high)[512];
    float*              bands[3];
-    DECLARE_ALIGNED(16, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES];
+    DECLARE_ALIGNED(32, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES];
    FFTContext          mdct_ctx[3];
    int                 channels;
    DSPContext          dsp;
--- a/libavcodec/atrac3.c
+++ b/libavcodec/atrac3.c
@ -74,8 +74,8 @@ typedef struct {
    int               gcBlkSwitch;
    gain_block        gainBlock[2];
-    DECLARE_ALIGNED(16, float, spectrum)[1024];
+    DECLARE_ALIGNED(32, float, spectrum)[1024];
-    DECLARE_ALIGNED(16, float, IMDCT_buf)[1024];
+    DECLARE_ALIGNED(32, float, IMDCT_buf)[1024];
    float             delayBuf1[46]; ///<qmf delay buffers
    float             delayBuf2[46];
@ -122,7 +122,7 @@ typedef struct {
    FFTContext          mdct_ctx;
 } ATRAC3Context;
-static DECLARE_ALIGNED(16, float,mdct_window)[512];
+static DECLARE_ALIGNED(32, float, mdct_window)[512];
 static VLC              spectral_coeff_tab[7];
 static float            gain_tab1[16];
 static float            gain_tab2[31];
--- a/libavcodec/binkaudio.c
+++ b/libavcodec/binkaudio.c
@ -55,7 +55,7 @@ typedef struct {
    int num_bands;
    unsigned int *bands;
    float root;
-    DECLARE_ALIGNED(16, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
+    DECLARE_ALIGNED(32, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
    DECLARE_ALIGNED(16, short, previous)[BINK_BLOCK_MAX_SIZE / 16];  ///< coeffs from previous audio block
    float *coeffs_ptr[MAX_CHANNELS]; ///< pointers to the coeffs arrays for float_to_int16_interleave
    union {
--- a/libavcodec/cook.c
+++ b/libavcodec/cook.c
@ -153,7 +153,7 @@ typedef struct cook {
    /* data buffers */
    uint8_t*            decoded_bytes_buffer;
-    DECLARE_ALIGNED(16, float,mono_mdct_output)[2048];
+    DECLARE_ALIGNED(32, float, mono_mdct_output)[2048];
    float               decode_buffer_1[1024];
    float               decode_buffer_2[1024];
    float               decode_buffer_0[1060]; /* static allocation for joint decode */
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@ -321,16 +321,16 @@ typedef struct {
    /* Subband samples history (for ADPCM) */
    float subband_samples_hist[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][4];
-    DECLARE_ALIGNED(16, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512];
+    DECLARE_ALIGNED(32, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512];
-    DECLARE_ALIGNED(16, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32];
+    DECLARE_ALIGNED(32, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32];
    int hist_index[DCA_PRIM_CHANNELS_MAX];
-    DECLARE_ALIGNED(16, float, raXin)[32];
+    DECLARE_ALIGNED(32, float, raXin)[32];
    int output;                 ///< type of output
    float scale_bias;           ///< output scale
-    DECLARE_ALIGNED(16, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8];
+    DECLARE_ALIGNED(32, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8];
-    DECLARE_ALIGNED(16, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256];
+    DECLARE_ALIGNED(32, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256];
    const float *samples_chanptr[DCA_PRIM_CHANNELS_MAX+1];
    uint8_t dca_buffer[DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE + DCA_BUFFER_PADDING_SIZE];
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@ -93,6 +93,44 @@ av_cold void ff_init_ff_cos_tabs(int index)
 #endif
 }
 /* Offsets (0..15) that fft_perm_avx() adds to the base index of a 16-entry
  * block and stores into revtab, for blocks where is_second_half_of_fft32()
  * returns nonzero. */
 static const int avx_tab[] = {
    0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15
 };
 /**
  * Tell whether index i lands at offset >= 16 of its innermost sub-range.
  *
  * While n is larger than 32, the range [0, n) is treated as three
  * consecutive sub-ranges of sizes n/2, n/4 and n/4; i is rebased to the
  * start of the sub-range that contains it and n becomes that sub-range's
  * size. Once n <= 32 the result is simply (i >= 16).
  *
  * @param i index inside [0, n)
  * @param n size of the enclosing range
  * @return nonzero if the rebased index is >= 16, zero otherwise
  */
 static int is_second_half_of_fft32(int i, int n)
 {
    while (n > 32) {
        const int half           = n / 2;
        const int three_quarters = 3 * n / 4;

        if (i < half) {
            /* first sub-range: keep i, shrink to n/2 */
            n = half;
        } else if (i < three_quarters) {
            /* second sub-range: starts at n/2, size n/4 */
            i -= half;
            n /= 4;
        } else {
            /* third sub-range: starts at 3n/4, size n/4 */
            i -= three_quarters;
            n /= 4;
        }
    }
    return i >= 16;
 }
 /**
  * Fill s->revtab for the FF_FFT_PERM_AVX layout.
  *
  * The 1 << s->nbits entries are processed in blocks of 16. For every source
  * index the slot written is
  *     -split_radix_permutation(index, n, s->inverse) & (n - 1)
  * and the value stored depends on the block:
  *  - blocks for which is_second_half_of_fft32() is nonzero store
  *    block base + avx_tab[k];
  *  - all other blocks store the source index with its three low bits
  *    rearranged (bits 2..1 move down to 1..0, bit 0 moves up to bit 2).
  *
  * @param s FFT context; nbits, inverse and revtab must already be set up
  */
 static av_cold void fft_perm_avx(FFTContext *s)
 {
    const int n = 1 << s->nbits;
    int base, k;

    for (base = 0; base < n; base += 16) {
        /* the decision is per block, so evaluate it once */
        const int second_half = is_second_half_of_fft32(base, n);

        for (k = 0; k < 16; k++) {
            const int src  = base + k;
            const int slot = -split_radix_permutation(src, n, s->inverse) & (n - 1);
            int dst;

            if (second_half)
                dst = base + avx_tab[k];
            else
                dst = (src & ~7) | ((src >> 1) & 3) | ((src << 2) & 4);

            s->revtab[slot] = dst;
        }
    }
 }
 av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 {
    int i, j, n;
@ -132,11 +170,16 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
    for(j=4; j<=nbits; j++) {
        ff_init_ff_cos_tabs(j);
    }
-    for(i=0; i<n; i++) {
+
-        int j = i;
+    if (s->fft_permutation == FF_FFT_PERM_AVX) {
-        if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
+        fft_perm_avx(s);
-            j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
+    } else {
-        s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
+        for(i=0; i<n; i++) {
            int j = i;
            if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
                j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
            s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
        }
    }
    return 0;
--- a/libavcodec/fft.h
+++ b/libavcodec/fft.h
@ -85,6 +85,7 @@ struct FFTContext {
    int fft_permutation;
 #define FF_FFT_PERM_DEFAULT   0
 #define FF_FFT_PERM_SWAP_LSBS 1
 #define FF_FFT_PERM_AVX       2
    int mdct_permutation;
 #define FF_MDCT_PERM_NONE       0
 #define FF_MDCT_PERM_INTERLEAVE 1
@ -97,7 +98,7 @@ struct FFTContext {
 #endif
 #define COSTABLE(size) \
-    COSTABLE_CONST DECLARE_ALIGNED(16, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
+    COSTABLE_CONST DECLARE_ALIGNED(32, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
 extern COSTABLE(16);
 extern COSTABLE(32);
--- a/libavcodec/imc.c
+++ b/libavcodec/imc.c
@ -88,7 +88,7 @@ typedef struct {
    DSPContext dsp;
    FFTContext fft;
-    DECLARE_ALIGNED(16, FFTComplex, samples)[COEFFS/2];
+    DECLARE_ALIGNED(32, FFTComplex, samples)[COEFFS/2];
    float *out_samples;
 } IMCContext;
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@ -47,7 +47,7 @@
 typedef struct NellyMoserDecodeContext {
    AVCodecContext* avctx;
-    DECLARE_ALIGNED(16, float,float_buf)[NELLY_SAMPLES];
+    DECLARE_ALIGNED(32, float, float_buf)[NELLY_SAMPLES];
    float           state[128];
    AVLFG           random_state;
    GetBitContext   gb;
@ -55,7 +55,7 @@ typedef struct NellyMoserDecodeContext {
    DSPContext      dsp;
    FFTContext      imdct_ctx;
    FmtConvertContext fmt_conv;
-    DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
+    DECLARE_ALIGNED(32, float, imdct_out)[NELLY_BUF_LEN * 2];
 } NellyMoserDecodeContext;
 static void overlap_and_window(NellyMoserDecodeContext *s, float *state, float *audio, float *a_in)
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@ -55,9 +55,9 @@ typedef struct NellyMoserEncodeContext {
    int             have_saved;
    DSPContext      dsp;
    FFTContext      mdct_ctx;
-    DECLARE_ALIGNED(16, float, mdct_out)[NELLY_SAMPLES];
+    DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES];
-    DECLARE_ALIGNED(16, float, in_buff)[NELLY_SAMPLES];
+    DECLARE_ALIGNED(32, float, in_buff)[NELLY_SAMPLES];
-    DECLARE_ALIGNED(16, float, buf)[2][3 * NELLY_BUF_LEN];     ///< sample buffer
+    DECLARE_ALIGNED(32, float, buf)[2][3 * NELLY_BUF_LEN];     ///< sample buffer
    float           (*opt )[NELLY_BANDS];
    uint8_t         (*path)[NELLY_BANDS];
 } NellyMoserEncodeContext;
--- a/libavcodec/qdm2.c
+++ b/libavcodec/qdm2.c
@ -120,7 +120,7 @@ typedef struct {
 } FFTCoefficient;
 typedef struct {
-    DECLARE_ALIGNED(16, QDM2Complex, complex)[MPA_MAX_CHANNELS][256];
+    DECLARE_ALIGNED(32, QDM2Complex, complex)[MPA_MAX_CHANNELS][256];
 } QDM2FFT;
 /**
--- a/libavcodec/wma.h
+++ b/libavcodec/wma.h
@ -113,15 +113,15 @@ typedef struct WMACodecContext {
    uint8_t ms_stereo;                      ///< true if mid/side stereo mode
    uint8_t channel_coded[MAX_CHANNELS];    ///< true if channel is coded
    int exponents_bsize[MAX_CHANNELS];      ///< log2 ratio frame/exp. length
-    DECLARE_ALIGNED(16, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE];
+    DECLARE_ALIGNED(32, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE];
    float max_exponent[MAX_CHANNELS];
    WMACoef coefs1[MAX_CHANNELS][BLOCK_MAX_SIZE];
-    DECLARE_ALIGNED(16, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
+    DECLARE_ALIGNED(32, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
-    DECLARE_ALIGNED(16, FFTSample, output)[BLOCK_MAX_SIZE * 2];
+    DECLARE_ALIGNED(32, FFTSample, output)[BLOCK_MAX_SIZE * 2];
    FFTContext mdct_ctx[BLOCK_NB_SIZES];
    float *windows[BLOCK_NB_SIZES];
    /* output buffer for one frame and the last for IMDCT windowing */
-    DECLARE_ALIGNED(16, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
+    DECLARE_ALIGNED(32, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
    /* last frame info */
    uint8_t last_superframe[MAX_CODED_SUPERFRAME_SIZE + 4]; /* padding added */
    int last_bitoffset;
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@ -145,7 +145,7 @@ typedef struct {
    uint8_t  table_idx;                               ///< index in sf_offsets for the scale factor reference block
    float*   coeffs;                                  ///< pointer to the subframe decode buffer
    uint16_t num_vec_coeffs;                          ///< number of vector coded coefficients
-    DECLARE_ALIGNED(16, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
+    DECLARE_ALIGNED(32, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
 } WMAProChannelCtx;
 /**
@ -170,7 +170,7 @@ typedef struct WMAProDecodeCtx {
                      FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
    PutBitContext    pb;                            ///< context for filling the frame_data buffer
    FFTContext       mdct_ctx[WMAPRO_BLOCK_SIZES];  ///< MDCT context per block size
-    DECLARE_ALIGNED(16, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
+    DECLARE_ALIGNED(32, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
    float*           windows[WMAPRO_BLOCK_SIZES];   ///< windows for the different block sizes
    /* frame size dependent frame information (set during initialization) */
--- a/libavcodec/wmavoice.c
+++ b/libavcodec/wmavoice.c
@ -275,11 +275,11 @@ typedef struct {
                                  ///< by postfilter
    float denoise_filter_cache[MAX_FRAMESIZE];
    int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
-    DECLARE_ALIGNED(16, float, tilted_lpcs_pf)[0x80];
+    DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
                                  ///< aligned buffer for LPC tilting
-    DECLARE_ALIGNED(16, float, denoise_coeffs_pf)[0x80];
+    DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
                                  ///< aligned buffer for denoise coefficients
-    DECLARE_ALIGNED(16, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
+    DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
                                  ///< aligned buffer for postfilter speech
                                  ///< synthesis
    /**
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@ -25,7 +25,14 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
 {
 #if HAVE_YASM
    int has_vectors = av_get_cpu_flags();
-    if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
+    if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) {
        /* AVX for SB */
        s->imdct_calc      = ff_imdct_calc_sse;
        s->imdct_half      = ff_imdct_half_avx;
        s->fft_permute     = ff_fft_permute_sse;
        s->fft_calc        = ff_fft_calc_avx;
        s->fft_permutation = FF_FFT_PERM_AVX;
    } else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
        /* SSE for P3/P4/K8 */
        s->imdct_calc  = ff_imdct_calc_sse;
        s->imdct_half  = ff_imdct_half_sse;
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@ -22,6 +22,7 @@
 #include "libavcodec/fft.h"
 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
@ -32,6 +33,7 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
 void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
 #endif
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@ -1,6 +1,7 @@
 ;******************************************************************************
 ;* FFT transform with SSE/3DNow optimizations
 ;* Copyright (c) 2008 Loren Merritt
 ;* Copyright (c) 2011 Vitor Sessak
 ;*
 ;* This algorithm (though not any of the implementation details) is
 ;* based on libdjbfft by D. J. Bernstein.
@ -49,9 +50,21 @@ endstruc
 SECTION_RODATA
 %define M_SQRT1_2 0.70710678118654752440
-ps_root2: times 4 dd M_SQRT1_2
+%define M_COS_PI_1_8 0.923879532511287
-ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+%define M_COS_PI_3_8 0.38268343236509
-ps_p1p1m1p1: dd 0, 0, 1<<31, 0
+
 align 32
 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
 ps_root2: times 8 dd M_SQRT1_2
 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
 ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
 perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
 ps_m1p1: dd 1<<31, 0
 %assign i 16
@ -96,51 +109,80 @@ section .text align=16
    SWAP     %3, %6
 %endmacro
 ; Butterfly macro on two 8-float registers, used by fft8_avx and fft16_avx.
 ; Written with three-operand v* instructions and vperm2f128, so it is only
 ; usable in YMM (AVX) mode. Reads the constants ps_p1p1m1p1root2, perm1,
 ; perm2 and ps_m1m1p1m1p1m1m1m1. %3, %4 and %5 are clobbered.
 ;
 ;  in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
 ;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
 ;      %3, %4, %5 tmp
 ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
 ;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
 %macro T8_AVX 5
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; sign flip: s *= {-1,-1,1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
 %endmacro
 ; In SSE mode do one fft4 transforms
 ; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
 ;
 ; In AVX mode do two fft4 transforms
 ; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
 ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
 %macro T4_SSE 3
-    mova     %3, %1
+    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
-    addps    %1, %2       ; {t1,t2,t6,t5}
+    addps    %1, %1, %2       ; {t1,t2,t6,t5}
-    subps    %3, %2       ; {t3,t4,-t8,t7}
+    xorps    %3, %3, [ps_p1p1m1p1]
-    xorps    %3, [ps_p1p1m1p1]
+    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
-    mova     %2, %1
+    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
-    shufps   %1, %3, 0x44 ; {t1,t2,t3,t4}
+    subps    %3, %1, %2       ; {r2,i2,r3,i3}
-    shufps   %2, %3, 0xbe ; {t6,t5,t7,t8}
+    addps    %1, %1, %2       ; {r0,i0,r1,i1}
-    mova     %3, %1
+    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
-    addps    %1, %2       ; {r0,i0,r1,i1}
+    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
    subps    %3, %2       ; {r2,i2,r3,i3}
    mova     %2, %1
    shufps   %1, %3, 0x88 ; {r0,r1,r2,r3}
    shufps   %2, %3, 0xdd ; {i0,i1,i2,i3}
 %endmacro
 ; In SSE mode do one FFT8
 ; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
 ;
 ; In AVX mode do two FFT8
 ; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
 ;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
 ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
 ;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
 %macro T8_SSE 6
-    mova     %6, %3
+    addps    %6, %3, %4       ; {t1,t2,t3,t4}
-    subps    %3, %4       ; {r5,i5,r7,i7}
+    subps    %3, %3, %4       ; {r5,i5,r7,i7}
-    addps    %6, %4       ; {t1,t2,t3,t4}
+    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
-    mova     %4, %3
+    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
-    shufps   %4, %4, 0xb1 ; {i5,r5,i7,r7}
+    mulps    %4, %4, [ps_root2]
-    mulps    %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
+    addps    %3, %3, %4       ; {t8,t7,ta,t9}
-    mulps    %4, [ps_root2]
+    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
-    addps    %3, %4       ; {t8,t7,ta,t9}
+    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
-    mova     %4, %6
+    subps    %3, %6, %4       ; {t6,t5,tc,tb}
-    shufps   %6, %3, 0x36 ; {t3,t2,t9,t8}
+    addps    %6, %6, %4       ; {t1,t2,t9,ta}
-    shufps   %4, %3, 0x9c ; {t1,t4,t7,ta}
+    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
-    mova     %3, %6
+    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
-    addps    %6, %4       ; {t1,t2,t9,ta}
+    subps    %3, %1, %6       ; {r4,r5,r6,r7}
-    subps    %3, %4       ; {t6,t5,tc,tb}
+    addps    %1, %1, %6       ; {r0,r1,r2,r3}
-    mova     %4, %6
+    subps    %4, %2, %5       ; {i4,i5,i6,i7}
-    shufps   %6, %3, 0xd8 ; {t1,t9,t5,tb}
+    addps    %2, %2, %5       ; {i0,i1,i2,i3}
    shufps   %4, %3, 0x8d ; {t2,ta,t6,tc}
    mova     %3, %1
    mova     %5, %2
    addps    %1, %6       ; {r0,r1,r2,r3}
    addps    %2, %4       ; {i0,i1,i2,i3}
    subps    %3, %6       ; {r4,r5,r6,r7}
    subps    %5, %4       ; {i4,i5,i6,i7}
    SWAP     %4, %5
 %endmacro
 ; scheduled for cpu-bound sizes
@ -148,52 +190,44 @@ section .text align=16
 IF%1 mova    m4, Z(4)
 IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m2, m4
    mova     m1, %3 ; wim
-    mova     m3, m5
+    mulps    m2, m4, m0 ; r2*wre
    mulps    m2, m0 ; r2*wre
 IF%1 mova    m6, Z2(6)
-    mulps    m3, m1 ; i2*wim
+    mulps    m3, m5, m1 ; i2*wim
 IF%1 mova    m7, Z2(7)
-    mulps    m4, m1 ; r2*wim
+    mulps    m4, m4, m1 ; r2*wim
-    mulps    m5, m0 ; i2*wre
+    mulps    m5, m5, m0 ; i2*wre
-    addps    m2, m3 ; r2*wre + i2*wim
+    addps    m2, m2, m3 ; r2*wre + i2*wim
-    mova     m3, m1
+    mulps    m3, m1, m7 ; i3*wim
-    mulps    m1, m6 ; r3*wim
+    subps    m5, m5, m4 ; i2*wre - r2*wim
-    subps    m5, m4 ; i2*wre - r2*wim
+    mulps    m1, m1, m6 ; r3*wim
-    mova     m4, m0
+    mulps    m4, m0, m6 ; r3*wre
-    mulps    m3, m7 ; i3*wim
+    mulps    m0, m0, m7 ; i3*wre
-    mulps    m4, m6 ; r3*wre
+    subps    m4, m4, m3 ; r3*wre - i3*wim
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
-    addps    m0, m1 ; i3*wre + r3*wim
+    addps    m0, m0, m1 ; i3*wre + r3*wim
-    mova     m1, m4
+    subps    m1, m4, m2 ; t3
-    addps    m4, m2 ; t5
+    addps    m4, m4, m2 ; t5
-    subps    m1, m2 ; t3
+    subps    m3, m3, m4 ; r2
-    subps    m3, m4 ; r2
+    addps    m4, m4, Z(0) ; r0
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
-    mova     m3, m5
+    subps    m3, m5, m0 ; t4
-    subps    m5, m0 ; t4
+    subps    m4, m6, m3 ; r3
-    mova     m4, m6
+    addps    m3, m3, m6 ; r1
-    subps    m6, m5 ; r3
+    mova  Z2(6), m4
-    addps    m5, m4 ; r1
+    mova   Z(2), m3
    mova  Z2(6), m6
    mova   Z(2), m5
    mova     m2, Z(3)
-    addps    m3, m0 ; t6
+    addps    m3, m5, m0 ; t6
-    subps    m2, m1 ; i3
+    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
-    addps    m1, Z(3) ; i1
+    addps    m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
-    mova     m4, m7
+    subps    m4, m7, m3 ; i2
-    subps    m7, m3 ; i2
+    addps    m3, m3, m7 ; i0
-    addps    m3, m4 ; i0
+    mova   Z(5), m4
    mova   Z(5), m7
    mova   Z(1), m3
 %endmacro
@ -201,77 +235,55 @@ IF%1 mova    m7, Z2(7)
 %macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m2, m4
    mova     m0, [wq] ; wre
    mova     m3, m5
    mova     m1, [wq+o1q] ; wim
-    mulps    m2, m0 ; r2*wre
+    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
-    mulps    m3, m1 ; i2*wim
+    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
-    mulps    m4, m1 ; r2*wim
+    mulps    m4, m4, m1 ; r2*wim
-    mulps    m5, m0 ; i2*wre
+    mulps    m5, m5, m0 ; i2*wre
-    addps    m2, m3 ; r2*wre + i2*wim
+    addps    m2, m2, m3 ; r2*wre + i2*wim
-    mova     m3, m1
+    mulps    m3, m1, m7 ; i3*wim
-    mulps    m1, m6 ; r3*wim
+    mulps    m1, m1, m6 ; r3*wim
-    subps    m5, m4 ; i2*wre - r2*wim
+    subps    m5, m5, m4 ; i2*wre - r2*wim
-    mova     m4, m0
+    mulps    m4, m0, m6 ; r3*wre
-    mulps    m3, m7 ; i3*wim
+    mulps    m0, m0, m7 ; i3*wre
-    mulps    m4, m6 ; r3*wre
+    subps    m4, m4, m3 ; r3*wre - i3*wim
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
-    addps    m0, m1 ; i3*wre + r3*wim
+    addps    m0, m0, m1 ; i3*wre + r3*wim
-    mova     m1, m4
+    subps    m1, m4, m2 ; t3
-    addps    m4, m2 ; t5
+    addps    m4, m4, m2 ; t5
-    subps    m1, m2 ; t3
+    subps    m3, m3, m4 ; r2
-    subps    m3, m4 ; r2
+    addps    m4, m4, Z(0) ; r0
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
-    mova     m3, m5
+    subps    m3, m5, m0 ; t4
-    subps    m5, m0 ; t4
+    subps    m4, m6, m3 ; r3
-    mova     m4, m6
+    addps    m3, m3, m6 ; r1
-    subps    m6, m5 ; r3
+IF%1 mova Z2(6), m4
-    addps    m5, m4 ; r1
+IF%1 mova  Z(2), m3
 IF%1 mova Z2(6), m6
 IF%1 mova  Z(2), m5
    mova     m2, Z(3)
-    addps    m3, m0 ; t6
+    addps    m5, m5, m0 ; t6
-    subps    m2, m1 ; i3
+    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
-    addps    m1, Z(3) ; i1
+    addps    m1, m1, Z(3) ; i1
 IF%1 mova Z2(7), m2
 IF%1 mova  Z(3), m1
-    mova     m4, m7
+    subps    m6, m7, m5 ; i2
-    subps    m7, m3 ; i2
+    addps    m5, m5, m7 ; i0
-    addps    m3, m4 ; i0
+IF%1 mova  Z(5), m6
-IF%1 mova  Z(5), m7
+IF%1 mova  Z(1), m5
 IF%1 mova  Z(1), m3
 %if %1==0
-    mova     m4, m5 ; r1
+    INTERL m1, m3, m7, Z, 2
-    mova     m0, m6 ; r3
+    INTERL m2, m4, m0, Z2, 6
-    unpcklps m5, m1
+
    unpckhps m4, m1
    unpcklps m6, m2
    unpckhps m0, m2
    mova     m1, Z(0)
    mova     m2, Z(4)
-    mova   Z(2), m5
+
-    mova   Z(3), m4
+    INTERL m5, m1, m3, Z, 0
-    mova  Z2(6), m6
+    INTERL m6, m2, m7, Z, 4
    mova  Z2(7), m0
    mova     m5, m1 ; r0
    mova     m4, m2 ; r2
    unpcklps m1, m3
    unpckhps m5, m3
    unpcklps m2, m7
    unpckhps m4, m7
    mova   Z(0), m1
    mova   Z(1), m5
    mova   Z(4), m2
    mova   Z(5), m4
 %endif
 %endmacro
@ -281,13 +293,106 @@ IF%1 mova  Z(1), m3
    punpckhdq %3, %2
 %endmacro
 INIT_XMM
 %define mova movaps
 %define Z(x) [r0+mmsize*x]
 %define Z2(x) [r0+mmsize*x]
 %define ZH(x) [r0+mmsize*x+mmsize/2]
 INIT_YMM
 ; fft8_avx: in-place transform of the two registers' worth of data at r0.
 ; Loads Z(0) and Z(1), runs T8_AVX on them and stores both back, so the
 ; memory layout before and after is the in/out layout documented on T8_AVX.
 ; Uses m0-m4 (m2-m4 as T8_AVX temporaries).
 align 16
 fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret
 ; fft16_avx: in-place transform of Z(0)..Z(3) at r0.
 ;   1. Z(2)/Z(3) go through T4_SSE (two fft4 transforms in AVX mode, per
 ;      the T4_SSE header).
 ;   2. Z(0)/Z(1) go through T8_AVX.
 ;   3. The T4_SSE results are multiplied by the ps_cos16_1 / ps_cos16_2
 ;      tables and combined with the T8_AVX results by add/sub.
 ;   4. Results are written back as 128-bit halves with vextractf128.
 ; Uses m0-m7.
 align 16
 fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7
    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7
    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    ; m7 = m2*cos16_1 + m3*cos16_2
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    ; m3 = m3*cos16_1 - m2*cos16_2
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    ; m2 = {low half of m7, high half of m3}
    vblendps   m2, m7, m3, 0xf0
    ; m3 = {high half of m7, low half of m3}
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    ; swap the two 128-bit halves of m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    ; Each Z(n) slot receives one 128-bit half from each of two registers:
    ; the first 16 bytes via Z(n), the next 16 bytes via ZH(n).
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    vextractf128   Z(2), m5, 0
    vextractf128  ZH(2), m3, 0
    vextractf128   Z(3), m5, 1
    vextractf128  ZH(3), m3, 1
    ret
 ; fft32_avx: in-place transform of Z(0)..Z(7) at r0.
 ;   1. fft16_avx handles Z(0)..Z(3).
 ;   2. Z(4)..Z(7) go through T4_SSE and T8_SSE (two FFT8 in AVX mode, per
 ;      the T8_SSE header).
 ;   3. vperm2f128 regroups the 128-bit lanes so m4..m7 each hold eight
 ;      consecutive values.
 ;   4. PASS_SMALL combines everything, with [cos_32] as wre and
 ;      [cos_32+32] as wim.
 ; NOTE(review): PASS_SMALL is invoked with 0 as its first argument, which
 ; presumably disables its IF%1-guarded loads so that m4..m7 are taken from
 ; the registers prepared here - confirm against the IF0/IF1 definitions.
 align 16
 fft32_avx:
    call fft16_avx
    mova m0, Z(4)
    mova m1, Z(5)
    T4_SSE      m0, m1, m4
    mova m2, Z(6)
    mova m3, Z(7)
    T8_SSE      m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
    ; 0x20 picks the low halves of both sources, 0x31 the high halves:
    ; m4={r0..r7} m5={i0..i7} m6={r8..r15} m7={i8..i15}
    vperm2f128  m4, m0, m2, 0x20
    vperm2f128  m5, m1, m3, 0x20
    vperm2f128  m6, m0, m2, 0x31
    vperm2f128  m7, m1, m3, 0x31
    PASS_SMALL 0, [cos_32], [cos_32+32]
    ret
 ; fft32_interleave_avx: run fft32_avx, then rewrite its output so that the
 ; elements of each pair of registers Z(0)/Z(1) are interleaved element by
 ; element (a0,b0,a1,b1,...).
 ; r2d counts down from 32 in steps of mmsize/4 and r0 advances by mmsize*2
 ; per iteration; r0 is NOT restored before returning.
 ; (Assumes mmsize == 32 under INIT_YMM, i.e. 4 iterations of 64 bytes.)
 fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
 .deint_loop:
    mova     m2, Z(0)
    mova     m3, Z(1)
    ; per 128-bit lane: m0 = interleaved low pairs, m1 = interleaved high pairs
    vunpcklps      m0, m2, m3
    vunpckhps      m1, m2, m3
    ; store the lanes in the order m0.lo, m1.lo, m0.hi, m1.hi so the result
    ; is contiguous in memory
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret
 INIT_XMM
 %define movdqa  movaps
 align 16
 fft4_avx:
 fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
@ -406,6 +511,8 @@ FFT48_3DN _3dn
 ; Memory-operand macros relative to zq:
 ;   Z(x)   - offset o1q*(x&6) plus mmsize*(x&1)
 ;   Z2(x)  - offset o3q plus mmsize*(x&1)
 ;   ZH/Z2H - the same addresses plus mmsize/2, i.e. the upper half of the
 ;            mmsize-wide slot addressed by Z/Z2.
 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
 %define Z2(x) [zq + o3q + mmsize*(x&1)]
 %define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
 %define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
 %macro DECL_PASS 2+ ; name, payload
 align 16
@ -423,8 +530,34 @@ DEFINE_ARGS z, w, n, o1, o3
    rep ret
 %endmacro
 INIT_YMM
 ; INTERL_AVX %1, %2, %3, %4, %5
 ;   %1, %2 - source registers; %2 is overwritten
 ;   %3     - register that receives the high-pair interleave (overwritten)
 ;   %4     - name of an address macro (e.g. Z); "%4 %+ H" pastes the H variant
 ;   %5     - slot index passed to the address macro
 ; Interleaves the elements of %2 and %1 and stores the result as four 128-bit
 ; pieces to %4(%5), %4H(%5), %4(%5+1) and %4H(%5+1).
 %macro INTERL_AVX 5
    vunpckhps      %3, %2, %1
    vunpcklps      %2, %2, %1
    ; imm 0 stores the low 128-bit lane, imm 1 the high lane.
    vextractf128   %4(%5), %2, 0
    vextractf128  %4 %+ H(%5), %3, 0
    vextractf128   %4(%5 + 1), %2, 1
    vextractf128  %4 %+ H(%5 + 1), %3, 1
 %endmacro
 ; Bind INTERL to the AVX version, then instantiate the two AVX pass routines
 ; with payloads PASS_BIG 1 and PASS_BIG 0.
 %define INTERL INTERL_AVX
 DECL_PASS pass_avx, PASS_BIG 1
 DECL_PASS pass_interleave_avx, PASS_BIG 0
 INIT_XMM
-%define mova movaps
+
 ; INTERL_SSE %1, %2, %3, %4, %5
 ;   %1, %2 - source registers; %2 is overwritten
 ;   %3     - register that receives a copy of %2 and then the high-pair interleave
 ;   %4     - name of an address macro (e.g. Z)
 ;   %5     - slot index passed to the address macro
 ; Interleaves the elements of %2 and %1: the low pairs are stored to %4(%5)
 ; and the high pairs to %4(%5+1).
 %macro INTERL_SSE 5
    ; Two-operand unpck overwrites its destination, so keep a copy of %2 first.
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova  %4(%5), %2
    mova  %4(%5+1), %3
 %endmacro
 ; Bind INTERL to the SSE version, then instantiate the two SSE pass routines
 ; with payloads PASS_BIG 1 and PASS_BIG 0.
 %define INTERL INTERL_SSE
 DECL_PASS pass_sse, PASS_BIG 1
 DECL_PASS pass_interleave_sse, PASS_BIG 0
@ -457,9 +590,12 @@ DECL_PASS pass_interleave_3dn, PASS_BIG 0
 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
-%if %1==5
+%if %1>=5
 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
 %endif
 %if %1>=6
 %xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
 %endif
 %assign n 1<<%1
 %rep 17-%1
@ -492,9 +628,14 @@ section .text
 ; The others pass args in registers and don't spill anything.
 cglobal fft_dispatch%3%2, 2,5,8, z, nbits
    FFT_DISPATCH %3%2, nbits
 %ifidn %2, _avx
    vzeroupper
 %endif
    RET
 %endmacro ; DECL_FFT
 DECL_FFT 6, _avx
 DECL_FFT 6, _avx, _interleave
 DECL_FFT 5, _sse
 DECL_FFT 5, _sse, _interleave
 DECL_FFT 4, _3dn
@ -533,21 +674,53 @@ INIT_XMM
 %endmacro
 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    ; Body after this change (the + lines), with A = [%5+%1] and B = [%6+%1]:
    ;   m6 = %3*A,  m7 = %2*A
    ;   %2 = %2*B - m6
    ;   %3 = %3*B + m7
    ; %2 and %3 must already hold the input data on entry: the loads from %4
    ; (the - lines) are removed, so %4 is no longer referenced inside the macro.
    ; m6 and m7 are clobbered.
-    movaps   xmm6, [%4+%1*2]
+    mulps      m6, %3, [%5+%1]
-    movaps   %2,   [%4+%1*2+0x10]
+    mulps      m7, %2, [%5+%1]
-    movaps   %3,   xmm6
+    mulps      %2, %2, [%6+%1]
-    movaps   xmm7, %2
+    mulps      %3, %3, [%6+%1]
-    mulps    xmm6, [%5+%1]
+    subps      %2, %2, m6
-    mulps    %2,   [%6+%1]
+    addps      %3, %3, m7
-    mulps    %3,   [%6+%1]
+%endmacro
-    mulps    xmm7, [%5+%1]
+
-    subps    %2,   xmm6
+%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
-    addps    %3,   xmm7
+.post:
    ; Each iteration processes 64 bytes at [%3+%1*2] and 64 bytes at [%3+%2*2].
    vmovaps      ymm1,   [%3+%1*2]
    vmovaps      ymm0,   [%3+%1*2+0x20]
    vmovaps      ymm3,   [%3+%2*2]
    vmovaps      ymm2,   [%3+%2*2+0x20]
    ; Multiply both register pairs by the table entries at %4/%5 indexed by %1
    ; and %2 respectively (see CMUL; it clobbers m6/m7).
    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    ; Reverse all eight floats of ymm1 and ymm3: shuffle 0x1b reverses within
    ; each 128-bit lane, vperm2f128 0x01 then swaps the two lanes.
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    ; Interleave ymm2 with reversed ymm1 (-> ymm6/ymm4) and ymm0 with reversed
    ; ymm3 (-> ymm7/ymm5).
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3
    ; Store as 128-bit pieces: ymm7/ymm5 to the %1 side, ymm6/ymm4 to the %2 side.
    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1
    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    ; %2 steps down and %1 steps up by 0x20; loop while the updated %1 is negative
    ; (jl tests the flags of the add).
    sub      %2,   0x20
    add      %1,   0x20
    jl       .post
 %endmacro
 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
 .post:
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
@ -566,7 +739,8 @@ INIT_XMM
    jl       .post
 %endmacro
-cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
+%macro DECL_IMDCT 2
 cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
 %ifdef ARCH_X86_64
 %define rrevtab r10
 %define rtcos   r11
@ -641,7 +815,7 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]
-    FFT_DISPATCH _sse, r1
+    FFT_DISPATCH %1, r1
    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
@ -653,14 +827,24 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
    mov  rtsin, [esp+4]
 %endif
    neg  r0
-    mov  r1, -16
+    mov  r1, -mmsize
    sub  r1, r0
-    POSROTATESHUF r0, r1, r6, rtcos, rtsin
+    %2 r0, r1, r6, rtcos, rtsin
 %ifdef ARCH_X86_64
    pop  r14
    pop  r13
    pop  r12
 %else
    add esp, 12
 %endif
 %ifidn avx_enabled, 1
    vzeroupper
 %endif
    RET
 %endmacro
 DECL_IMDCT _sse, POSROTATESHUF
 INIT_YMM
 DECL_IMDCT _avx, POSROTATESHUF_AVX
--- a/libavcodec/x86/fft_sse.c
+++ b/libavcodec/x86/fft_sse.c
@ -28,6 +28,12 @@ DECLARE_ASM_CONST(16, int, ff_m1m1m1m1)[4] =
 void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
 void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
 void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
 /**
  * AVX variant of the FFT calc entry point.
  *
  * Forwards the buffer to the interleaving AVX dispatcher together with the
  * nbits value stored in the context.
  *
  * @param s FFT context; only s->nbits is read here
  * @param z buffer handed to the dispatcher
  */
 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
 {
    const int nbits = s->nbits;
    ff_fft_dispatch_interleave_avx(z, nbits);
 }
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
 {
@ -77,7 +83,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
    long n = s->mdct_size;
    long n4 = n >> 2;
-    ff_imdct_half_sse(s, output+n4, input);
+    s->imdct_half(s, output + n4, input);
    j = -n;
    k = n-16;