Add AVX FFT implementation.

Signed-off-by: Reinhard Tartler <siretart@tauware.de>
pull/2/head
Vitor Sessak 14 years ago committed by Reinhard Tartler
parent 13dfce3d44
commit 9d35fa520e
  1. 2
      Changelog
  2. 10
      libavcodec/aac.h
  3. 2
      libavcodec/aacenc.h
  4. 10
      libavcodec/ac3dec.h
  5. 2
      libavcodec/ac3enc.c
  6. 20
      libavcodec/atrac1.c
  7. 6
      libavcodec/atrac3.c
  8. 2
      libavcodec/binkaudio.c
  9. 2
      libavcodec/cook.c
  10. 10
      libavcodec/dca.c
  11. 43
      libavcodec/fft.c
  12. 3
      libavcodec/fft.h
  13. 2
      libavcodec/imc.c
  14. 4
      libavcodec/nellymoserdec.c
  15. 6
      libavcodec/nellymoserenc.c
  16. 2
      libavcodec/qdm2.c
  17. 8
      libavcodec/wma.h
  18. 4
      libavcodec/wmaprodec.c
  19. 6
      libavcodec/wmavoice.c
  20. 9
      libavcodec/x86/fft.c
  21. 2
      libavcodec/x86/fft.h
  22. 484
      libavcodec/x86/fft_mmx.asm
  23. 8
      libavcodec/x86/fft_sse.c

@ -5,7 +5,7 @@ releases are sorted from youngest to oldest.
version <next>: version <next>:
- Lots of deprecated API cruft removed - Lots of deprecated API cruft removed
- fft and imdct optimizations for AVX (Sandy Bridge) processors
version 0.7_beta1: version 0.7_beta1:

@ -223,9 +223,9 @@ typedef struct {
float sf[120]; ///< scalefactors float sf[120]; ///< scalefactors
int sf_idx[128]; ///< scalefactor indices (used by encoder) int sf_idx[128]; ///< scalefactor indices (used by encoder)
uint8_t zeroes[128]; ///< band is not coded (used by encoder) uint8_t zeroes[128]; ///< band is not coded (used by encoder)
DECLARE_ALIGNED(16, float, coeffs)[1024]; ///< coefficients for IMDCT DECLARE_ALIGNED(32, float, coeffs)[1024]; ///< coefficients for IMDCT
DECLARE_ALIGNED(16, float, saved)[1024]; ///< overlap DECLARE_ALIGNED(32, float, saved)[1024]; ///< overlap
DECLARE_ALIGNED(16, float, ret)[2048]; ///< PCM output DECLARE_ALIGNED(32, float, ret)[2048]; ///< PCM output
DECLARE_ALIGNED(16, int16_t, ltp_state)[3072]; ///< time signal for LTP DECLARE_ALIGNED(16, int16_t, ltp_state)[3072]; ///< time signal for LTP
PredictorState predictor_state[MAX_PREDICTORS]; PredictorState predictor_state[MAX_PREDICTORS];
} SingleChannelElement; } SingleChannelElement;
@ -272,7 +272,7 @@ typedef struct {
* @defgroup temporary aligned temporary buffers (We do not want to have these on the stack.) * @defgroup temporary aligned temporary buffers (We do not want to have these on the stack.)
* @{ * @{
*/ */
DECLARE_ALIGNED(16, float, buf_mdct)[1024]; DECLARE_ALIGNED(32, float, buf_mdct)[1024];
/** @} */ /** @} */
/** /**
@ -296,7 +296,7 @@ typedef struct {
int sf_offset; ///< offset into pow2sf_tab as appropriate for dsp.float_to_int16 int sf_offset; ///< offset into pow2sf_tab as appropriate for dsp.float_to_int16
/** @} */ /** @} */
DECLARE_ALIGNED(16, float, temp)[128]; DECLARE_ALIGNED(32, float, temp)[128];
enum OCStatus output_configured; enum OCStatus output_configured;
} AACContext; } AACContext;

@ -64,7 +64,7 @@ typedef struct AACEncContext {
int last_frame; int last_frame;
float lambda; float lambda;
DECLARE_ALIGNED(16, int, qcoefs)[96]; ///< quantized coefficients DECLARE_ALIGNED(16, int, qcoefs)[96]; ///< quantized coefficients
DECLARE_ALIGNED(16, float, scoefs)[1024]; ///< scaled coefficients DECLARE_ALIGNED(32, float, scoefs)[1024]; ///< scaled coefficients
} AACEncContext; } AACEncContext;
#endif /* AVCODEC_AACENC_H */ #endif /* AVCODEC_AACENC_H */

@ -200,11 +200,11 @@ typedef struct {
///@defgroup arrays aligned arrays ///@defgroup arrays aligned arrays
DECLARE_ALIGNED(16, int, fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///> fixed-point transform coefficients DECLARE_ALIGNED(16, int, fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///> fixed-point transform coefficients
DECLARE_ALIGNED(16, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///< transform coefficients DECLARE_ALIGNED(32, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///< transform coefficients
DECLARE_ALIGNED(16, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< delay - added to the next block DECLARE_ALIGNED(32, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< delay - added to the next block
DECLARE_ALIGNED(16, float, window)[AC3_BLOCK_SIZE]; ///< window coefficients DECLARE_ALIGNED(32, float, window)[AC3_BLOCK_SIZE]; ///< window coefficients
DECLARE_ALIGNED(16, float, tmp_output)[AC3_BLOCK_SIZE]; ///< temporary storage for output before windowing DECLARE_ALIGNED(32, float, tmp_output)[AC3_BLOCK_SIZE]; ///< temporary storage for output before windowing
DECLARE_ALIGNED(16, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< output after imdct transform and windowing DECLARE_ALIGNED(32, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< output after imdct transform and windowing
///@} ///@}
} AC3DecodeContext; } AC3DecodeContext;

@ -201,7 +201,7 @@ typedef struct AC3EncodeContext {
uint8_t exp_strategy[AC3_MAX_CHANNELS][AC3_MAX_BLOCKS]; ///< exponent strategies uint8_t exp_strategy[AC3_MAX_CHANNELS][AC3_MAX_BLOCKS]; ///< exponent strategies
DECLARE_ALIGNED(16, SampleType, windowed_samples)[AC3_WINDOW_SIZE]; DECLARE_ALIGNED(32, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
} AC3EncodeContext; } AC3EncodeContext;
typedef struct AC3Mant { typedef struct AC3Mant {

@ -60,11 +60,11 @@ typedef struct {
int log2_block_count[AT1_QMF_BANDS]; ///< log2 number of blocks in a band int log2_block_count[AT1_QMF_BANDS]; ///< log2 number of blocks in a band
int num_bfus; ///< number of Block Floating Units int num_bfus; ///< number of Block Floating Units
float* spectrum[2]; float* spectrum[2];
DECLARE_ALIGNED(16, float, spec1)[AT1_SU_SAMPLES]; ///< mdct buffer DECLARE_ALIGNED(32, float, spec1)[AT1_SU_SAMPLES]; ///< mdct buffer
DECLARE_ALIGNED(16, float, spec2)[AT1_SU_SAMPLES]; ///< mdct buffer DECLARE_ALIGNED(32, float, spec2)[AT1_SU_SAMPLES]; ///< mdct buffer
DECLARE_ALIGNED(16, float, fst_qmf_delay)[46]; ///< delay line for the 1st stacked QMF filter DECLARE_ALIGNED(32, float, fst_qmf_delay)[46]; ///< delay line for the 1st stacked QMF filter
DECLARE_ALIGNED(16, float, snd_qmf_delay)[46]; ///< delay line for the 2nd stacked QMF filter DECLARE_ALIGNED(32, float, snd_qmf_delay)[46]; ///< delay line for the 2nd stacked QMF filter
DECLARE_ALIGNED(16, float, last_qmf_delay)[256+23]; ///< delay line for the last stacked QMF filter DECLARE_ALIGNED(32, float, last_qmf_delay)[256+23]; ///< delay line for the last stacked QMF filter
} AT1SUCtx; } AT1SUCtx;
/** /**
@ -72,13 +72,13 @@ typedef struct {
*/ */
typedef struct { typedef struct {
AT1SUCtx SUs[AT1_MAX_CHANNELS]; ///< channel sound unit AT1SUCtx SUs[AT1_MAX_CHANNELS]; ///< channel sound unit
DECLARE_ALIGNED(16, float, spec)[AT1_SU_SAMPLES]; ///< the mdct spectrum buffer DECLARE_ALIGNED(32, float, spec)[AT1_SU_SAMPLES]; ///< the mdct spectrum buffer
DECLARE_ALIGNED(16, float, low)[256]; DECLARE_ALIGNED(32, float, low)[256];
DECLARE_ALIGNED(16, float, mid)[256]; DECLARE_ALIGNED(32, float, mid)[256];
DECLARE_ALIGNED(16, float, high)[512]; DECLARE_ALIGNED(32, float, high)[512];
float* bands[3]; float* bands[3];
DECLARE_ALIGNED(16, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES]; DECLARE_ALIGNED(32, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES];
FFTContext mdct_ctx[3]; FFTContext mdct_ctx[3];
int channels; int channels;
DSPContext dsp; DSPContext dsp;

@ -74,8 +74,8 @@ typedef struct {
int gcBlkSwitch; int gcBlkSwitch;
gain_block gainBlock[2]; gain_block gainBlock[2];
DECLARE_ALIGNED(16, float, spectrum)[1024]; DECLARE_ALIGNED(32, float, spectrum)[1024];
DECLARE_ALIGNED(16, float, IMDCT_buf)[1024]; DECLARE_ALIGNED(32, float, IMDCT_buf)[1024];
float delayBuf1[46]; ///<qmf delay buffers float delayBuf1[46]; ///<qmf delay buffers
float delayBuf2[46]; float delayBuf2[46];
@ -122,7 +122,7 @@ typedef struct {
FFTContext mdct_ctx; FFTContext mdct_ctx;
} ATRAC3Context; } ATRAC3Context;
static DECLARE_ALIGNED(16, float,mdct_window)[512]; static DECLARE_ALIGNED(32, float, mdct_window)[512];
static VLC spectral_coeff_tab[7]; static VLC spectral_coeff_tab[7];
static float gain_tab1[16]; static float gain_tab1[16];
static float gain_tab2[31]; static float gain_tab2[31];

@ -55,7 +55,7 @@ typedef struct {
int num_bands; int num_bands;
unsigned int *bands; unsigned int *bands;
float root; float root;
DECLARE_ALIGNED(16, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE]; DECLARE_ALIGNED(32, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
DECLARE_ALIGNED(16, short, previous)[BINK_BLOCK_MAX_SIZE / 16]; ///< coeffs from previous audio block DECLARE_ALIGNED(16, short, previous)[BINK_BLOCK_MAX_SIZE / 16]; ///< coeffs from previous audio block
float *coeffs_ptr[MAX_CHANNELS]; ///< pointers to the coeffs arrays for float_to_int16_interleave float *coeffs_ptr[MAX_CHANNELS]; ///< pointers to the coeffs arrays for float_to_int16_interleave
union { union {

@ -153,7 +153,7 @@ typedef struct cook {
/* data buffers */ /* data buffers */
uint8_t* decoded_bytes_buffer; uint8_t* decoded_bytes_buffer;
DECLARE_ALIGNED(16, float,mono_mdct_output)[2048]; DECLARE_ALIGNED(32, float, mono_mdct_output)[2048];
float decode_buffer_1[1024]; float decode_buffer_1[1024];
float decode_buffer_2[1024]; float decode_buffer_2[1024];
float decode_buffer_0[1060]; /* static allocation for joint decode */ float decode_buffer_0[1060]; /* static allocation for joint decode */

@ -321,16 +321,16 @@ typedef struct {
/* Subband samples history (for ADPCM) */ /* Subband samples history (for ADPCM) */
float subband_samples_hist[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][4]; float subband_samples_hist[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][4];
DECLARE_ALIGNED(16, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512]; DECLARE_ALIGNED(32, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512];
DECLARE_ALIGNED(16, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32]; DECLARE_ALIGNED(32, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32];
int hist_index[DCA_PRIM_CHANNELS_MAX]; int hist_index[DCA_PRIM_CHANNELS_MAX];
DECLARE_ALIGNED(16, float, raXin)[32]; DECLARE_ALIGNED(32, float, raXin)[32];
int output; ///< type of output int output; ///< type of output
float scale_bias; ///< output scale float scale_bias; ///< output scale
DECLARE_ALIGNED(16, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8]; DECLARE_ALIGNED(32, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8];
DECLARE_ALIGNED(16, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256]; DECLARE_ALIGNED(32, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256];
const float *samples_chanptr[DCA_PRIM_CHANNELS_MAX+1]; const float *samples_chanptr[DCA_PRIM_CHANNELS_MAX+1];
uint8_t dca_buffer[DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE + DCA_BUFFER_PADDING_SIZE]; uint8_t dca_buffer[DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE + DCA_BUFFER_PADDING_SIZE];

@ -93,6 +93,44 @@ av_cold void ff_init_ff_cos_tabs(int index)
#endif #endif
} }
/*
 * Destination offsets inside a 16-entry group, indexed by the source
 * offset k (0..15). fft_perm_avx() applies this table to the groups
 * for which is_second_half_of_fft32() reports true.
 */
static const int avx_tab[] = {
     0,  4,  1,  5,
     8, 12,  9, 13,
     2,  6,  3,  7,
    10, 14, 11, 15
};
/**
 * Tell whether index i ends up in the upper 16 entries of a block of
 * size <= 32 when a transform of size n is split repeatedly into one
 * half-size part followed by two quarter-size parts.
 *
 * @param i index into the transform, 0 <= i < n
 * @param n transform size
 * @return non-zero if the reduced index is >= 16, zero otherwise
 */
static int is_second_half_of_fft32(int i, int n)
{
    /* Walk down the half/quarter/quarter split until the block that
     * contains i is no larger than 32 entries. */
    while (n > 32) {
        const int half          = n / 2;
        const int three_quarter = 3 * n / 4;

        if (i < half) {
            n = half;
        } else if (i < three_quarter) {
            i -= half;
            n /= 4;
        } else {
            i -= three_quarter;
            n /= 4;
        }
    }
    return i >= 16;
}
/**
 * Fill s->revtab with the permutation used when fft_permutation is
 * FF_FFT_PERM_AVX.
 *
 * Indices are handled in groups of 16. For each source index the slot
 * in revtab is chosen by split_radix_permutation(); the value stored
 * there depends on the group:
 *  - groups for which is_second_half_of_fft32() is true are remapped
 *    through avx_tab;
 *  - every other group keeps bits above bit 2 and rearranges the low
 *    three bits (bits 1-2 move down to bits 0-1, bit 0 moves up to bit 2).
 *
 * @param s FFT context; nbits, inverse and revtab are read/written
 */
static av_cold void fft_perm_avx(FFTContext *s)
{
    const int n = 1 << s->nbits;
    int base, k;

    for (base = 0; base < n; base += 16) {
        const int second_half = is_second_half_of_fft32(base, n);

        for (k = 0; k < 16; k++) {
            const int src = base + k;
            int dst;

            if (second_half)
                dst = base + avx_tab[k];
            else
                dst = (src & ~7) | ((src >> 1) & 3) | ((src << 2) & 4);

            s->revtab[-split_radix_permutation(src, n, s->inverse) & (n - 1)] = dst;
        }
    }
}
av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
{ {
int i, j, n; int i, j, n;
@ -132,12 +170,17 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
for(j=4; j<=nbits; j++) { for(j=4; j<=nbits; j++) {
ff_init_ff_cos_tabs(j); ff_init_ff_cos_tabs(j);
} }
if (s->fft_permutation == FF_FFT_PERM_AVX) {
fft_perm_avx(s);
} else {
for(i=0; i<n; i++) { for(i=0; i<n; i++) {
int j = i; int j = i;
if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS) if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
j = (j&~3) | ((j>>1)&1) | ((j<<1)&2); j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j; s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
} }
}
return 0; return 0;
fail: fail:

@ -85,6 +85,7 @@ struct FFTContext {
int fft_permutation; int fft_permutation;
#define FF_FFT_PERM_DEFAULT 0 #define FF_FFT_PERM_DEFAULT 0
#define FF_FFT_PERM_SWAP_LSBS 1 #define FF_FFT_PERM_SWAP_LSBS 1
#define FF_FFT_PERM_AVX 2
int mdct_permutation; int mdct_permutation;
#define FF_MDCT_PERM_NONE 0 #define FF_MDCT_PERM_NONE 0
#define FF_MDCT_PERM_INTERLEAVE 1 #define FF_MDCT_PERM_INTERLEAVE 1
@ -97,7 +98,7 @@ struct FFTContext {
#endif #endif
#define COSTABLE(size) \ #define COSTABLE(size) \
COSTABLE_CONST DECLARE_ALIGNED(16, FFTSample, FFT_NAME(ff_cos_##size))[size/2] COSTABLE_CONST DECLARE_ALIGNED(32, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
extern COSTABLE(16); extern COSTABLE(16);
extern COSTABLE(32); extern COSTABLE(32);

@ -88,7 +88,7 @@ typedef struct {
DSPContext dsp; DSPContext dsp;
FFTContext fft; FFTContext fft;
DECLARE_ALIGNED(16, FFTComplex, samples)[COEFFS/2]; DECLARE_ALIGNED(32, FFTComplex, samples)[COEFFS/2];
float *out_samples; float *out_samples;
} IMCContext; } IMCContext;

@ -47,7 +47,7 @@
typedef struct NellyMoserDecodeContext { typedef struct NellyMoserDecodeContext {
AVCodecContext* avctx; AVCodecContext* avctx;
DECLARE_ALIGNED(16, float,float_buf)[NELLY_SAMPLES]; DECLARE_ALIGNED(32, float, float_buf)[NELLY_SAMPLES];
float state[128]; float state[128];
AVLFG random_state; AVLFG random_state;
GetBitContext gb; GetBitContext gb;
@ -55,7 +55,7 @@ typedef struct NellyMoserDecodeContext {
DSPContext dsp; DSPContext dsp;
FFTContext imdct_ctx; FFTContext imdct_ctx;
FmtConvertContext fmt_conv; FmtConvertContext fmt_conv;
DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2]; DECLARE_ALIGNED(32, float, imdct_out)[NELLY_BUF_LEN * 2];
} NellyMoserDecodeContext; } NellyMoserDecodeContext;
static void overlap_and_window(NellyMoserDecodeContext *s, float *state, float *audio, float *a_in) static void overlap_and_window(NellyMoserDecodeContext *s, float *state, float *audio, float *a_in)

@ -55,9 +55,9 @@ typedef struct NellyMoserEncodeContext {
int have_saved; int have_saved;
DSPContext dsp; DSPContext dsp;
FFTContext mdct_ctx; FFTContext mdct_ctx;
DECLARE_ALIGNED(16, float, mdct_out)[NELLY_SAMPLES]; DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES];
DECLARE_ALIGNED(16, float, in_buff)[NELLY_SAMPLES]; DECLARE_ALIGNED(32, float, in_buff)[NELLY_SAMPLES];
DECLARE_ALIGNED(16, float, buf)[2][3 * NELLY_BUF_LEN]; ///< sample buffer DECLARE_ALIGNED(32, float, buf)[2][3 * NELLY_BUF_LEN]; ///< sample buffer
float (*opt )[NELLY_BANDS]; float (*opt )[NELLY_BANDS];
uint8_t (*path)[NELLY_BANDS]; uint8_t (*path)[NELLY_BANDS];
} NellyMoserEncodeContext; } NellyMoserEncodeContext;

@ -120,7 +120,7 @@ typedef struct {
} FFTCoefficient; } FFTCoefficient;
typedef struct { typedef struct {
DECLARE_ALIGNED(16, QDM2Complex, complex)[MPA_MAX_CHANNELS][256]; DECLARE_ALIGNED(32, QDM2Complex, complex)[MPA_MAX_CHANNELS][256];
} QDM2FFT; } QDM2FFT;
/** /**

@ -113,15 +113,15 @@ typedef struct WMACodecContext {
uint8_t ms_stereo; ///< true if mid/side stereo mode uint8_t ms_stereo; ///< true if mid/side stereo mode
uint8_t channel_coded[MAX_CHANNELS]; ///< true if channel is coded uint8_t channel_coded[MAX_CHANNELS]; ///< true if channel is coded
int exponents_bsize[MAX_CHANNELS]; ///< log2 ratio frame/exp. length int exponents_bsize[MAX_CHANNELS]; ///< log2 ratio frame/exp. length
DECLARE_ALIGNED(16, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE]; DECLARE_ALIGNED(32, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE];
float max_exponent[MAX_CHANNELS]; float max_exponent[MAX_CHANNELS];
WMACoef coefs1[MAX_CHANNELS][BLOCK_MAX_SIZE]; WMACoef coefs1[MAX_CHANNELS][BLOCK_MAX_SIZE];
DECLARE_ALIGNED(16, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE]; DECLARE_ALIGNED(32, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
DECLARE_ALIGNED(16, FFTSample, output)[BLOCK_MAX_SIZE * 2]; DECLARE_ALIGNED(32, FFTSample, output)[BLOCK_MAX_SIZE * 2];
FFTContext mdct_ctx[BLOCK_NB_SIZES]; FFTContext mdct_ctx[BLOCK_NB_SIZES];
float *windows[BLOCK_NB_SIZES]; float *windows[BLOCK_NB_SIZES];
/* output buffer for one frame and the last for IMDCT windowing */ /* output buffer for one frame and the last for IMDCT windowing */
DECLARE_ALIGNED(16, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2]; DECLARE_ALIGNED(32, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
/* last frame info */ /* last frame info */
uint8_t last_superframe[MAX_CODED_SUPERFRAME_SIZE + 4]; /* padding added */ uint8_t last_superframe[MAX_CODED_SUPERFRAME_SIZE + 4]; /* padding added */
int last_bitoffset; int last_bitoffset;

@ -145,7 +145,7 @@ typedef struct {
uint8_t table_idx; ///< index in sf_offsets for the scale factor reference block uint8_t table_idx; ///< index in sf_offsets for the scale factor reference block
float* coeffs; ///< pointer to the subframe decode buffer float* coeffs; ///< pointer to the subframe decode buffer
uint16_t num_vec_coeffs; ///< number of vector coded coefficients uint16_t num_vec_coeffs; ///< number of vector coded coefficients
DECLARE_ALIGNED(16, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer DECLARE_ALIGNED(32, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
} WMAProChannelCtx; } WMAProChannelCtx;
/** /**
@ -170,7 +170,7 @@ typedef struct WMAProDecodeCtx {
FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
PutBitContext pb; ///< context for filling the frame_data buffer PutBitContext pb; ///< context for filling the frame_data buffer
FFTContext mdct_ctx[WMAPRO_BLOCK_SIZES]; ///< MDCT context per block size FFTContext mdct_ctx[WMAPRO_BLOCK_SIZES]; ///< MDCT context per block size
DECLARE_ALIGNED(16, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer DECLARE_ALIGNED(32, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
float* windows[WMAPRO_BLOCK_SIZES]; ///< windows for the different block sizes float* windows[WMAPRO_BLOCK_SIZES]; ///< windows for the different block sizes
/* frame size dependent frame information (set during initialization) */ /* frame size dependent frame information (set during initialization) */

@ -275,11 +275,11 @@ typedef struct {
///< by postfilter ///< by postfilter
float denoise_filter_cache[MAX_FRAMESIZE]; float denoise_filter_cache[MAX_FRAMESIZE];
int denoise_filter_cache_size; ///< samples in #denoise_filter_cache int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
DECLARE_ALIGNED(16, float, tilted_lpcs_pf)[0x80]; DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
///< aligned buffer for LPC tilting ///< aligned buffer for LPC tilting
DECLARE_ALIGNED(16, float, denoise_coeffs_pf)[0x80]; DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
///< aligned buffer for denoise coefficients ///< aligned buffer for denoise coefficients
DECLARE_ALIGNED(16, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16]; DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
///< aligned buffer for postfilter speech ///< aligned buffer for postfilter speech
///< synthesis ///< synthesis
/** /**

@ -25,7 +25,14 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
{ {
#if HAVE_YASM #if HAVE_YASM
int has_vectors = av_get_cpu_flags(); int has_vectors = av_get_cpu_flags();
if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) { if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) {
/* AVX for SB */
s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_avx;
s->fft_permute = ff_fft_permute_sse;
s->fft_calc = ff_fft_calc_avx;
s->fft_permutation = FF_FFT_PERM_AVX;
} else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
/* SSE for P3/P4/K8 */ /* SSE for P3/P4/K8 */
s->imdct_calc = ff_imdct_calc_sse; s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_sse; s->imdct_half = ff_imdct_half_sse;

@ -22,6 +22,7 @@
#include "libavcodec/fft.h" #include "libavcodec/fft.h"
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z); void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z); void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z); void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
@ -32,6 +33,7 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in); void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
#endif #endif

@ -1,6 +1,7 @@
;****************************************************************************** ;******************************************************************************
;* FFT transform with SSE/3DNow optimizations ;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt ;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;* ;*
;* This algorithm (though not any of the implementation details) is ;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein. ;* based on libdjbfft by D. J. Bernstein.
@ -49,9 +50,21 @@ endstruc
SECTION_RODATA SECTION_RODATA
%define M_SQRT1_2 0.70710678118654752440 %define M_SQRT1_2 0.70710678118654752440
ps_root2: times 4 dd M_SQRT1_2 %define M_COS_PI_1_8 0.923879532511287
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 %define M_COS_PI_3_8 0.38268343236509
ps_p1p1m1p1: dd 0, 0, 1<<31, 0
align 32
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0 ps_m1p1: dd 1<<31, 0
%assign i 16 %assign i 16
@ -96,51 +109,80 @@ section .text align=16
SWAP %3, %6 SWAP %3, %6
%endmacro %endmacro
; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
; %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
vsubps %5, %1, %2 ; v = %1 - %2
vaddps %3, %1, %2 ; w = %1 + %2
vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= {1,1,-1,1,sqrt(1/2),sqrt(1/2),sqrt(1/2),sqrt(1/2)}
vpermilps %2, %2, [perm1] ; reorder v inside each 128-bit lane, indices taken from perm1
vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
vsubps %4, %5, %1 ; s = r - q
vaddps %1, %5, %1 ; u = r + q
vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
vshufps %5, %4, %1, 0xbb
vshufps %3, %4, %1, 0xee
vperm2f128 %3, %3, %5, 0x13
; XOR with the sign-bit mask (1<<31 in elements 0,1,3,5,6,7) negates those elements
vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {-1,-1,1,-1,1,-1,-1,-1}
vshufps %2, %1, %4, 0xdd
vshufps %1, %1, %4, 0x88
vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
vsubps %5, %1, %3
vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
vsubps %2, %4, %1 ; %2 = v - w
vaddps %1, %4, %1 ; %1 = v + w
%endmacro
; In SSE mode do one fft4 transform
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3 %macro T4_SSE 3
mova %3, %1 subps %3, %1, %2 ; {t3,t4,-t8,t7}
addps %1, %2 ; {t1,t2,t6,t5} addps %1, %1, %2 ; {t1,t2,t6,t5}
subps %3, %2 ; {t3,t4,-t8,t7} xorps %3, %3, [ps_p1p1m1p1]
xorps %3, [ps_p1p1m1p1] shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
mova %2, %1 shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
shufps %1, %3, 0x44 ; {t1,t2,t3,t4} subps %3, %1, %2 ; {r2,i2,r3,i3}
shufps %2, %3, 0xbe ; {t6,t5,t7,t8} addps %1, %1, %2 ; {r0,i0,r1,i1}
mova %3, %1 shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
addps %1, %2 ; {r0,i0,r1,i1} shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
subps %3, %2 ; {r2,i2,r3,i3}
mova %2, %1
shufps %1, %3, 0x88 ; {r0,r1,r2,r3}
shufps %2, %3, 0xdd ; {i0,i1,i2,i3}
%endmacro %endmacro
; In SSE mode do one FFT8
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7} ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6 %macro T8_SSE 6
mova %6, %3 addps %6, %3, %4 ; {t1,t2,t3,t4}
subps %3, %4 ; {r5,i5,r7,i7} subps %3, %3, %4 ; {r5,i5,r7,i7}
addps %6, %4 ; {t1,t2,t3,t4} shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
mova %4, %3 mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
shufps %4, %4, 0xb1 ; {i5,r5,i7,r7} mulps %4, %4, [ps_root2]
mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} addps %3, %3, %4 ; {t8,t7,ta,t9}
mulps %4, [ps_root2] shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
addps %3, %4 ; {t8,t7,ta,t9} shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
mova %4, %6 subps %3, %6, %4 ; {t6,t5,tc,tb}
shufps %6, %3, 0x36 ; {t3,t2,t9,t8} addps %6, %6, %4 ; {t1,t2,t9,ta}
shufps %4, %3, 0x9c ; {t1,t4,t7,ta} shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
mova %3, %6 shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
addps %6, %4 ; {t1,t2,t9,ta} subps %3, %1, %6 ; {r4,r5,r6,r7}
subps %3, %4 ; {t6,t5,tc,tb} addps %1, %1, %6 ; {r0,r1,r2,r3}
mova %4, %6 subps %4, %2, %5 ; {i4,i5,i6,i7}
shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} addps %2, %2, %5 ; {i0,i1,i2,i3}
shufps %4, %3, 0x8d ; {t2,ta,t6,tc}
mova %3, %1
mova %5, %2
addps %1, %6 ; {r0,r1,r2,r3}
addps %2, %4 ; {i0,i1,i2,i3}
subps %3, %6 ; {r4,r5,r6,r7}
subps %5, %4 ; {i4,i5,i6,i7}
SWAP %4, %5
%endmacro %endmacro
; scheduled for cpu-bound sizes ; scheduled for cpu-bound sizes
@ -148,52 +190,44 @@ section .text align=16
IF%1 mova m4, Z(4) IF%1 mova m4, Z(4)
IF%1 mova m5, Z(5) IF%1 mova m5, Z(5)
mova m0, %2 ; wre mova m0, %2 ; wre
mova m2, m4
mova m1, %3 ; wim mova m1, %3 ; wim
mova m3, m5 mulps m2, m4, m0 ; r2*wre
mulps m2, m0 ; r2*wre
IF%1 mova m6, Z2(6) IF%1 mova m6, Z2(6)
mulps m3, m1 ; i2*wim mulps m3, m5, m1 ; i2*wim
IF%1 mova m7, Z2(7) IF%1 mova m7, Z2(7)
mulps m4, m1 ; r2*wim mulps m4, m4, m1 ; r2*wim
mulps m5, m0 ; i2*wre mulps m5, m5, m0 ; i2*wre
addps m2, m3 ; r2*wre + i2*wim addps m2, m2, m3 ; r2*wre + i2*wim
mova m3, m1 mulps m3, m1, m7 ; i3*wim
mulps m1, m6 ; r3*wim subps m5, m5, m4 ; i2*wre - r2*wim
subps m5, m4 ; i2*wre - r2*wim mulps m1, m1, m6 ; r3*wim
mova m4, m0 mulps m4, m0, m6 ; r3*wre
mulps m3, m7 ; i3*wim mulps m0, m0, m7 ; i3*wre
mulps m4, m6 ; r3*wre subps m4, m4, m3 ; r3*wre - i3*wim
mulps m0, m7 ; i3*wre
subps m4, m3 ; r3*wre - i3*wim
mova m3, Z(0) mova m3, Z(0)
addps m0, m1 ; i3*wre + r3*wim addps m0, m0, m1 ; i3*wre + r3*wim
mova m1, m4 subps m1, m4, m2 ; t3
addps m4, m2 ; t5 addps m4, m4, m2 ; t5
subps m1, m2 ; t3 subps m3, m3, m4 ; r2
subps m3, m4 ; r2 addps m4, m4, Z(0) ; r0
addps m4, Z(0) ; r0
mova m6, Z(2) mova m6, Z(2)
mova Z(4), m3 mova Z(4), m3
mova Z(0), m4 mova Z(0), m4
mova m3, m5 subps m3, m5, m0 ; t4
subps m5, m0 ; t4 subps m4, m6, m3 ; r3
mova m4, m6 addps m3, m3, m6 ; r1
subps m6, m5 ; r3 mova Z2(6), m4
addps m5, m4 ; r1 mova Z(2), m3
mova Z2(6), m6
mova Z(2), m5
mova m2, Z(3) mova m2, Z(3)
addps m3, m0 ; t6 addps m3, m5, m0 ; t6
subps m2, m1 ; i3 subps m2, m2, m1 ; i3
mova m7, Z(1) mova m7, Z(1)
addps m1, Z(3) ; i1 addps m1, m1, Z(3) ; i1
mova Z2(7), m2 mova Z2(7), m2
mova Z(3), m1 mova Z(3), m1
mova m4, m7 subps m4, m7, m3 ; i2
subps m7, m3 ; i2 addps m3, m3, m7 ; i0
addps m3, m4 ; i0 mova Z(5), m4
mova Z(5), m7
mova Z(1), m3 mova Z(1), m3
%endmacro %endmacro
@ -201,77 +235,55 @@ IF%1 mova m7, Z2(7)
%macro PASS_BIG 1 ; (!interleave) %macro PASS_BIG 1 ; (!interleave)
mova m4, Z(4) ; r2 mova m4, Z(4) ; r2
mova m5, Z(5) ; i2 mova m5, Z(5) ; i2
mova m2, m4
mova m0, [wq] ; wre mova m0, [wq] ; wre
mova m3, m5
mova m1, [wq+o1q] ; wim mova m1, [wq+o1q] ; wim
mulps m2, m0 ; r2*wre mulps m2, m4, m0 ; r2*wre
mova m6, Z2(6) ; r3 mova m6, Z2(6) ; r3
mulps m3, m1 ; i2*wim mulps m3, m5, m1 ; i2*wim
mova m7, Z2(7) ; i3 mova m7, Z2(7) ; i3
mulps m4, m1 ; r2*wim mulps m4, m4, m1 ; r2*wim
mulps m5, m0 ; i2*wre mulps m5, m5, m0 ; i2*wre
addps m2, m3 ; r2*wre + i2*wim addps m2, m2, m3 ; r2*wre + i2*wim
mova m3, m1 mulps m3, m1, m7 ; i3*wim
mulps m1, m6 ; r3*wim mulps m1, m1, m6 ; r3*wim
subps m5, m4 ; i2*wre - r2*wim subps m5, m5, m4 ; i2*wre - r2*wim
mova m4, m0 mulps m4, m0, m6 ; r3*wre
mulps m3, m7 ; i3*wim mulps m0, m0, m7 ; i3*wre
mulps m4, m6 ; r3*wre subps m4, m4, m3 ; r3*wre - i3*wim
mulps m0, m7 ; i3*wre
subps m4, m3 ; r3*wre - i3*wim
mova m3, Z(0) mova m3, Z(0)
addps m0, m1 ; i3*wre + r3*wim addps m0, m0, m1 ; i3*wre + r3*wim
mova m1, m4 subps m1, m4, m2 ; t3
addps m4, m2 ; t5 addps m4, m4, m2 ; t5
subps m1, m2 ; t3 subps m3, m3, m4 ; r2
subps m3, m4 ; r2 addps m4, m4, Z(0) ; r0
addps m4, Z(0) ; r0
mova m6, Z(2) mova m6, Z(2)
mova Z(4), m3 mova Z(4), m3
mova Z(0), m4 mova Z(0), m4
mova m3, m5 subps m3, m5, m0 ; t4
subps m5, m0 ; t4 subps m4, m6, m3 ; r3
mova m4, m6 addps m3, m3, m6 ; r1
subps m6, m5 ; r3 IF%1 mova Z2(6), m4
addps m5, m4 ; r1 IF%1 mova Z(2), m3
IF%1 mova Z2(6), m6
IF%1 mova Z(2), m5
mova m2, Z(3) mova m2, Z(3)
addps m3, m0 ; t6 addps m5, m5, m0 ; t6
subps m2, m1 ; i3 subps m2, m2, m1 ; i3
mova m7, Z(1) mova m7, Z(1)
addps m1, Z(3) ; i1 addps m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2 IF%1 mova Z2(7), m2
IF%1 mova Z(3), m1 IF%1 mova Z(3), m1
mova m4, m7 subps m6, m7, m5 ; i2
subps m7, m3 ; i2 addps m5, m5, m7 ; i0
addps m3, m4 ; i0 IF%1 mova Z(5), m6
IF%1 mova Z(5), m7 IF%1 mova Z(1), m5
IF%1 mova Z(1), m3
%if %1==0 %if %1==0
mova m4, m5 ; r1 INTERL m1, m3, m7, Z, 2
mova m0, m6 ; r3 INTERL m2, m4, m0, Z2, 6
unpcklps m5, m1
unpckhps m4, m1
unpcklps m6, m2
unpckhps m0, m2
mova m1, Z(0) mova m1, Z(0)
mova m2, Z(4) mova m2, Z(4)
mova Z(2), m5
mova Z(3), m4 INTERL m5, m1, m3, Z, 0
mova Z2(6), m6 INTERL m6, m2, m7, Z, 4
mova Z2(7), m0
mova m5, m1 ; r0
mova m4, m2 ; r2
unpcklps m1, m3
unpckhps m5, m3
unpcklps m2, m7
unpckhps m4, m7
mova Z(0), m1
mova Z(1), m5
mova Z(4), m2
mova Z(5), m4
%endif %endif
%endmacro %endmacro
@ -281,13 +293,106 @@ IF%1 mova Z(1), m3
punpckhdq %3, %2 punpckhdq %3, %2
%endmacro %endmacro
INIT_XMM
%define mova movaps
%define Z(x) [r0+mmsize*x] %define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x] %define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]
INIT_YMM
; fft8_avx: transforms, in place, the two vectors stored at Z(0) and Z(1).
; Z(x) expands to [r0+mmsize*x], so r0 is the pointer to the data.
; The arithmetic itself is in the T8_AVX macro, which is defined outside this
; part of the file; m2, m3 and m4 are handed to it as additional registers
; (presumably scratch -- confirm against the macro definition).
; NOTE(review): by its name this is the 8-point kernel; the point count is not
; visible from this routine alone.
align 16
fft8_avx:
; load both input vectors
mova m0, Z(0)
mova m1, Z(1)
T8_AVX m0, m1, m2, m3, m4
; write the results back over the input
mova Z(0), m0
mova Z(1), m1
ret
; fft16_avx: transforms, in place, the four vectors stored at Z(0)..Z(3)
; (Z(x) is [r0+mmsize*x], ZH(x) is the upper half of the same vector,
; [r0+mmsize*x+mmsize/2]).
; Uses m0-m7. T4_SSE, T8_AVX, ps_cos16_1 and ps_cos16_2 are defined outside
; this part of the file.
; NOTE(review): by its name this is the 16-point kernel -- confirm.
align 16
fft16_avx:
; upper two vectors go through T4_SSE (m7 is its extra register)
mova m2, Z(2)
mova m3, Z(3)
T4_SSE m2, m3, m7
; lower two vectors go through T8_AVX (m4, m5, m7 are its extra registers)
mova m0, Z(0)
mova m1, Z(1)
T8_AVX m0, m1, m4, m5, m7
; Multiply m2/m3 by the two constant tables:
;   m7 = m3*[ps_cos16_2] + m2*[ps_cos16_1]
;   m3 = m3*[ps_cos16_1] - m2*[ps_cos16_2]
mova m4, [ps_cos16_1]
mova m5, [ps_cos16_2]
vmulps m6, m2, m4
vmulps m7, m3, m5
vaddps m7, m7, m6
vmulps m2, m2, m5
vmulps m3, m3, m4
vsubps m3, m3, m2
; Regroup 128-bit lanes:
;   m2 = { low lane of m7, high lane of m3 }
;   m3 = { high lane of m7, low lane of m3 }
vblendps m2, m7, m3, 0xf0
vperm2f128 m3, m7, m3, 0x21
; m4 = sum, m2 = difference (m3 - m2) with its two lanes swapped afterwards
vaddps m4, m2, m3
vsubps m2, m3, m2
vperm2f128 m2, m2, m2, 0x01
; combine with the T8_AVX results: m0/m1 get the sums, m5/m3 the differences
vsubps m3, m1, m2
vaddps m1, m1, m2
vsubps m5, m0, m4
vaddps m0, m0, m4
; Store 128 bits at a time, alternating between the two registers of each
; pair: low lane of m0, low lane of m1, high lane of m0, high lane of m1
; fill Z(0)..Z(1); m5/m3 fill Z(2)..Z(3) the same way.
vextractf128 Z(0), m0, 0
vextractf128 ZH(0), m1, 0
vextractf128 Z(1), m0, 1
vextractf128 ZH(1), m1, 1
vextractf128 Z(2), m5, 0
vextractf128 ZH(2), m3, 0
vextractf128 Z(3), m5, 1
vextractf128 ZH(3), m3, 1
ret
; fft32_avx: runs fft16_avx on the data at r0, pushes the vectors Z(4)..Z(7)
; through T4_SSE and T8_SSE, regroups their 128-bit lanes and finishes with
; PASS_SMALL using the cos_32 table.
; T4_SSE, T8_SSE, PASS_SMALL and cos_32 are defined outside this part of the
; file.
; NOTE(review): PASS_SMALL presumably consumes m4-m7 as set up below and
; combines them with Z(0)..Z(3) -- confirm against the macro.
align 16
fft32_avx:
call fft16_avx
mova m0, Z(4)
mova m1, Z(5)
T4_SSE m0, m1, m4
mova m2, Z(6)
mova m3, Z(7)
T8_SSE m0, m1, m2, m3, m4, m6
; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
; 0x20 pairs the two low lanes, 0x31 pairs the two high lanes, so with the
; layout above: m4={r0..r7} m5={i0..i7} m6={r8..r15} m7={i8..i15}
vperm2f128 m4, m0, m2, 0x20
vperm2f128 m5, m1, m3, 0x20
vperm2f128 m6, m0, m2, 0x31
vperm2f128 m7, m1, m3, 0x31
PASS_SMALL 0, [cos_32], [cos_32+32]
ret
; fft32_interleave_avx: calls fft32_avx, then walks over the buffer at r0 and
; interleaves the elements of each pair of consecutive vectors.
; Clobbers r2d (loop counter) and m0-m3 in addition to whatever fft32_avx
; uses. r0 is advanced by the loop and is not restored before returning.
fft32_interleave_avx:
call fft32_avx
; r2d counts down from 32 in steps of mmsize/4
; (4 iterations if mmsize is 32, as INIT_YMM presumably sets it -- confirm)
mov r2d, 32
.deint_loop:
mova m2, Z(0)
mova m3, Z(1)
; interleave m2 with m3: low element pairs of each 128-bit lane into m0,
; high element pairs into m1
vunpcklps m0, m2, m3
vunpckhps m1, m2, m3
; store 128 bits at a time: low lanes of m0 and m1 first, then the high lanes
vextractf128 Z(0), m0, 0
vextractf128 ZH(0), m1, 0
vextractf128 Z(1), m0, 1
vextractf128 ZH(1), m1, 1
; step to the next pair of vectors
add r0, mmsize*2
sub r2d, mmsize/4
jg .deint_loop
ret
INIT_XMM
%define movdqa movaps
align 16 align 16
fft4_avx:
fft4_sse: fft4_sse:
mova m0, Z(0) mova m0, Z(0)
mova m1, Z(1) mova m1, Z(1)
@ -406,6 +511,8 @@ FFT48_3DN _3dn
%define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)] %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zq + o3q + mmsize*(x&1)] %define Z2(x) [zq + o3q + mmsize*(x&1)]
%define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
%macro DECL_PASS 2+ ; name, payload %macro DECL_PASS 2+ ; name, payload
align 16 align 16
@ -423,8 +530,34 @@ DEFINE_ARGS z, w, n, o1, o3
rep ret rep ret
%endmacro %endmacro
INIT_YMM
; INTERL_AVX: interleave two registers and store the result (AVX version).
;   arg1: register whose elements take the odd positions (left unchanged)
;   arg2: register whose elements take the even positions (overwritten with
;         the low-element interleave)
;   arg3: register that receives the high-element interleave
;   arg4: name of the addressing macro to store through (Z or Z2); the name
;         with an H appended (ZH / Z2H) addresses the upper half of a vector
;   arg5: index passed to that addressing macro
; The four 128-bit stores go to arg4(arg5) and arg4(arg5+1), each written as
; a lower half followed by an upper half: low lane of arg2, low lane of arg3,
; high lane of arg2, high lane of arg3.
%macro INTERL_AVX 5
vunpckhps %3, %2, %1
vunpcklps %2, %2, %1
vextractf128 %4(%5), %2, 0
vextractf128 %4 %+ H(%5), %3, 0
vextractf128 %4(%5 + 1), %2, 1
vextractf128 %4 %+ H(%5 + 1), %3, 1
%endmacro
%define INTERL INTERL_AVX
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0
INIT_XMM INIT_XMM
%define mova movaps
; INTERL_SSE: interleave two registers and store the result (SSE version).
;   arg1: register whose elements take the odd positions (left unchanged)
;   arg2: register whose elements take the even positions (overwritten with
;         the low-element interleave)
;   arg3: register that receives a copy of arg2 and then the high-element
;         interleave
;   arg4: name of the addressing macro to store through (Z or Z2)
;   arg5: index passed to that addressing macro
; The low interleave is stored to arg4(arg5), the high one to arg4(arg5+1).
%macro INTERL_SSE 5
mova %3, %2
unpcklps %2, %1
unpckhps %3, %1
mova %4(%5), %2
mova %4(%5+1), %3
%endmacro
%define INTERL INTERL_SSE
DECL_PASS pass_sse, PASS_BIG 1 DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0 DECL_PASS pass_interleave_sse, PASS_BIG 0
@ -457,9 +590,12 @@ DECL_PASS pass_interleave_3dn, PASS_BIG 0
%macro DECL_FFT 2-3 ; nbits, cpu, suffix %macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
%if %1==5 %if %1>=5
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
%endif %endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
%endif
%assign n 1<<%1 %assign n 1<<%1
%rep 17-%1 %rep 17-%1
@ -492,9 +628,14 @@ section .text
; The others pass args in registers and don't spill anything. ; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits cglobal fft_dispatch%3%2, 2,5,8, z, nbits
FFT_DISPATCH %3%2, nbits FFT_DISPATCH %3%2, nbits
%ifidn %2, _avx
vzeroupper
%endif
RET RET
%endmacro ; DECL_FFT %endmacro ; DECL_FFT
DECL_FFT 6, _avx
DECL_FFT 6, _avx, _interleave
DECL_FFT 5, _sse DECL_FFT 5, _sse
DECL_FFT 5, _sse, _interleave DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn DECL_FFT 4, _3dn
@ -533,21 +674,53 @@ INIT_XMM
%endmacro %endmacro
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
movaps xmm6, [%4+%1*2] mulps m6, %3, [%5+%1]
movaps %2, [%4+%1*2+0x10] mulps m7, %2, [%5+%1]
movaps %3, xmm6 mulps %2, %2, [%6+%1]
movaps xmm7, %2 mulps %3, %3, [%6+%1]
mulps xmm6, [%5+%1] subps %2, %2, m6
mulps %2, [%6+%1] addps %3, %3, m7
mulps %3, [%6+%1] %endmacro
mulps xmm7, [%5+%1]
subps %2, xmm6 %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
addps %3, xmm7 .post:
vmovaps ymm1, [%3+%1*2]
vmovaps ymm0, [%3+%1*2+0x20]
vmovaps ymm3, [%3+%2*2]
vmovaps ymm2, [%3+%2*2+0x20]
CMUL %1, ymm0, ymm1, %3, %4, %5
CMUL %2, ymm2, ymm3, %3, %4, %5
vshufps ymm1, ymm1, ymm1, 0x1b
vshufps ymm3, ymm3, ymm3, 0x1b
vperm2f128 ymm1, ymm1, ymm1, 0x01
vperm2f128 ymm3, ymm3, ymm3, 0x01
vunpcklps ymm6, ymm2, ymm1
vunpckhps ymm4, ymm2, ymm1
vunpcklps ymm7, ymm0, ymm3
vunpckhps ymm5, ymm0, ymm3
vextractf128 [%3+%1*2], ymm7, 0
vextractf128 [%3+%1*2+0x10], ymm5, 0
vextractf128 [%3+%1*2+0x20], ymm7, 1
vextractf128 [%3+%1*2+0x30], ymm5, 1
vextractf128 [%3+%2*2], ymm6, 0
vextractf128 [%3+%2*2+0x10], ymm4, 0
vextractf128 [%3+%2*2+0x20], ymm6, 1
vextractf128 [%3+%2*2+0x30], ymm4, 1
sub %2, 0x20
add %1, 0x20
jl .post
%endmacro %endmacro
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post: .post:
movaps xmm1, [%3+%1*2]
movaps xmm0, [%3+%1*2+0x10]
CMUL %1, xmm0, xmm1, %3, %4, %5 CMUL %1, xmm0, xmm1, %3, %4, %5
movaps xmm5, [%3+%2*2]
movaps xmm4, [%3+%2*2+0x10]
CMUL %2, xmm4, xmm5, %3, %4, %5 CMUL %2, xmm4, xmm5, %3, %4, %5
shufps xmm1, xmm1, 0x1b shufps xmm1, xmm1, 0x1b
shufps xmm5, xmm5, 0x1b shufps xmm5, xmm5, 0x1b
@ -566,7 +739,8 @@ INIT_XMM
jl .post jl .post
%endmacro %endmacro
cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input %macro DECL_IMDCT 2
cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
%ifdef ARCH_X86_64 %ifdef ARCH_X86_64
%define rrevtab r10 %define rrevtab r10
%define rtcos r11 %define rtcos r11
@ -641,7 +815,7 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
mov r0, r1 mov r0, r1
mov r1d, [r5+FFTContext.nbits] mov r1d, [r5+FFTContext.nbits]
FFT_DISPATCH _sse, r1 FFT_DISPATCH %1, r1
mov r0d, [r5+FFTContext.mdctsize] mov r0d, [r5+FFTContext.mdctsize]
add r6, r0 add r6, r0
@ -653,14 +827,24 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
mov rtsin, [esp+4] mov rtsin, [esp+4]
%endif %endif
neg r0 neg r0
mov r1, -16 mov r1, -mmsize
sub r1, r0 sub r1, r0
POSROTATESHUF r0, r1, r6, rtcos, rtsin %2 r0, r1, r6, rtcos, rtsin
%ifdef ARCH_X86_64 %ifdef ARCH_X86_64
pop r14 pop r14
pop r13 pop r13
pop r12 pop r12
%else %else
add esp, 12 add esp, 12
%endif
%ifidn avx_enabled, 1
vzeroupper
%endif %endif
RET RET
%endmacro
DECL_IMDCT _sse, POSROTATESHUF
INIT_YMM
DECL_IMDCT _avx, POSROTATESHUF_AVX

@ -28,6 +28,12 @@ DECLARE_ASM_CONST(16, int, ff_m1m1m1m1)[4] =
void ff_fft_dispatch_sse(FFTComplex *z, int nbits); void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits); void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
/**
 * AVX entry point for the FFT: forwards the buffer to the interleaving
 * assembly dispatcher.
 *
 * @param s  FFT context; only its nbits field is read here
 * @param z  buffer handed unchanged to ff_fft_dispatch_interleave_avx()
 */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
}
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{ {
@ -77,7 +83,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
long n = s->mdct_size; long n = s->mdct_size;
long n4 = n >> 2; long n4 = n >> 2;
ff_imdct_half_sse(s, output+n4, input); s->imdct_half(s, output + n4, input);
j = -n; j = -n;
k = n-16; k = n-16;

Loading…
Cancel
Save