FFmpeg/libavcodec/x86/ac3dsp_init.c

/*
 * x86-optimized AC-3 DSP utils
 * Copyright (c) 2011 Justin Ruggles
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "dsputil_x86.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"

/*
 * Prototypes for the x86 SIMD routines that ff_ac3dsp_init_x86() installs
 * into AC3DSPContext. None of them is defined in this file. The suffix of
 * each name (mmx, mmxext, 3dnow, sse, sse2, ssse3, ssse3_atom) matches the
 * CPU-feature check under which the init function selects it.
 */

/* Candidates for AC3DSPContext.ac3_exponent_min. */
void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);

/*
 * Candidates for AC3DSPContext.ac3_max_msb_abs_int16.
 * NOTE(review): going by the name, this reports the highest set bit over the
 * absolute values of src[0..len-1] -- confirm against the generic C version.
 */
int ff_ac3_max_msb_abs_int16_mmx  (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);

/* Candidates for AC3DSPContext.ac3_lshift_int16 (operate on src in place). */
void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);

/* Candidates for AC3DSPContext.ac3_rshift_int32 (operate on src in place). */
void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);

/*
 * Candidates for AC3DSPContext.float_to_fixed24. The 3dnow variant is only
 * installed when bit-exact output is not requested.
 */
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);

/* Candidate for AC3DSPContext.compute_mantissa_size. */
int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);

/*
 * Candidates for AC3DSPContext.extract_exponents.
 * NOTE(review): the 3dnow variant is declared here but is not assigned
 * anywhere in the visible part of this file.
 */
void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);

/*
 * Candidates for AC3DSPContext.apply_window_int16. The "_round" variants are
 * chosen when bit-exact output is NOT requested; the plain variants are used
 * in bit-exact mode. The ssse3 variants are installed regardless of that
 * setting, with "_atom" selected when the Atom CPU flag is set.
 */
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

/*
 * Intel compiler on 32-bit x86: force HAVE_7REGS to 0 so that everything
 * guarded by "HAVE_SSE_INLINE && HAVE_7REGS" in this file (the inline-asm
 * downmix and its registration) is compiled out.
 * NOTE(review): presumably a workaround for ICC rejecting the many-operand
 * inline asm on x86_32 -- confirm against the commit that introduced it.
 */
#if ARCH_X86_32 && defined(__INTEL_COMPILER)
#       undef HAVE_7REGS
#       define HAVE_7REGS 0
#endif

#if HAVE_SSE_INLINE && HAVE_7REGS

/*
 * Selector macros used to switch individual asm lines on or off at
 * preprocessing time: IF1(x) expands to its argument, IF0(x) to nothing.
 * They are passed as the mono/stereo arguments of MIX5 and MIX_MISC.
 */
#define IF1(x) x
#define IF0(x)

/*
 * MIX5(mono, stereo): fixed-shape 5-channel downmix, done in place.
 *
 * Expands to one asm statement that uses these names from the expansion
 * site: i (negative byte offset, counted up towards 0), matrix, samples and
 * len. Exactly one of the two arguments should be IF1 and the other IF0.
 *
 * Setup: the floats at byte offsets 0, 8 and 24 from matrix (matrix[0][0],
 * matrix[1][0] and matrix[3][0] for a float[][2] array) are broadcast to all
 * four lanes of xmm5, xmm6 and xmm7.
 *
 * Each loop iteration handles 4 floats per channel (ch0..ch4 below are the
 * samples[0..4] buffers):
 *   xmm0 = ch0 * xmm5, xmm1 = ch1 * xmm6, xmm2 = ch2 * xmm5,
 *   xmm3 = ch3 * xmm7, xmm4 = ch4 * xmm7
 *   stereo: samples[0] = xmm0 + xmm1 + xmm3
 *           samples[1] = xmm2 + xmm1 + xmm4
 *   mono:   samples[0] = xmm0 + xmm3 + (xmm2 + xmm1 + xmm4)
 *
 * Only three coefficients are read, so the caller must already have checked
 * that the matrix has the matching symmetric shape.
 *
 * The loop is bottom-tested ("add $16 / jl"), so the body runs at least once
 * and steps in units of 16 bytes. movaps is the alignment-checking move, so
 * every accessed address must be 16-byte aligned.
 * NOTE(review): this implies len > 0 and a multiple of 4 -- confirm callers.
 */
#define MIX5(mono, stereo)                                      \
    __asm__ volatile (                                          \
        "movss           0(%1), %%xmm5          \n"             \
        "movss           8(%1), %%xmm6          \n"             \
        "movss          24(%1), %%xmm7          \n"             \
        "shufps     $0, %%xmm5, %%xmm5          \n"             \
        "shufps     $0, %%xmm6, %%xmm6          \n"             \
        "shufps     $0, %%xmm7, %%xmm7          \n"             \
        "1:                                     \n"             \
        "movaps       (%0, %2), %%xmm0          \n"             \
        "movaps       (%0, %3), %%xmm1          \n"             \
        "movaps       (%0, %4), %%xmm2          \n"             \
        "movaps       (%0, %5), %%xmm3          \n"             \
        "movaps       (%0, %6), %%xmm4          \n"             \
        "mulps          %%xmm5, %%xmm0          \n"             \
        "mulps          %%xmm6, %%xmm1          \n"             \
        "mulps          %%xmm5, %%xmm2          \n"             \
        "mulps          %%xmm7, %%xmm3          \n"             \
        "mulps          %%xmm7, %%xmm4          \n"             \
 stereo("addps          %%xmm1, %%xmm0          \n")            \
        "addps          %%xmm1, %%xmm2          \n"             \
        "addps          %%xmm3, %%xmm0          \n"             \
        "addps          %%xmm4, %%xmm2          \n"             \
   mono("addps          %%xmm2, %%xmm0          \n")            \
        "movaps         %%xmm0, (%0, %2)        \n"             \
 stereo("movaps         %%xmm2, (%0, %3)        \n")            \
        "add               $16, %0              \n"             \
        "jl                 1b                  \n"             \
        : "+&r"(i)                                              \
        : "r"(matrix),                                          \
          "r"(samples[0] + len),                                \
          "r"(samples[1] + len),                                \
          "r"(samples[2] + len),                                \
          "r"(samples[3] + len),                                \
          "r"(samples[4] + len)                                 \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
                      "%xmm4", "%xmm5", "%xmm6", "%xmm7",)      \
         "memory"                                               \
    );

/*
 * MIX_MISC(stereo): generic downmix for an arbitrary channel count, done in
 * place. Pass IF1 for two output channels, IF0 for one.
 *
 * Expands to one asm statement that uses these names from the expansion
 * site: i (negative byte offset, counted up towards 0), scratch variables
 * j, k, m, the pre-broadcast coefficient table matrix_simd, the array of
 * end-of-buffer pointers samp, and in_ch.
 *
 * Operand map: %0=i, %1=j, %2=k, %3=m, %4=matrix_simd + in_ch,
 * %5=-4*(in_ch-1), %6=samp + in_ch, %7=sizeof(float *), %8=sizeof(float *)/4.
 * k runs from -4*(in_ch-1) up to 0 in steps of 4, so "(%6, %2, %c8)"
 * addresses samp[1..in_ch-1] and "(%4, %2, 8)" addresses the matching
 * 32-byte matrix_simd[ch] entry (offset +16 is the second output column).
 *
 * Per outer iteration (4 floats):
 *   - load channel 0 via samp[0] and multiply by xmm4 (and xmm5 for stereo);
 *   - inner loop "2:" accumulates channels 1..in_ch-1 times their
 *     matrix_simd coefficients into xmm0 (and xmm1 for stereo);
 *   - store xmm0 to channel 0 and, for stereo, xmm1 to channel 1.
 *
 * NOTE(review): xmm4/xmm5 are read here but never loaded in this statement.
 * The code relies on them still holding the broadcast matrix[0][0] and
 * matrix[0][1] left by the preceding asm statement in ac3_downmix_sse(), and
 * no XMM registers are listed as clobbers. Keep the two statements adjacent.
 * NOTE(review): both loops are bottom-tested, so with in_ch == 1 the inner
 * loop would still run once and read samp[1] -- confirm callers never pass
 * in_ch == 1. movaps also requires 16-byte aligned addresses.
 */
#define MIX_MISC(stereo)                                        \
    __asm__ volatile (                                          \
        "mov              %5, %2            \n"                 \
        "1:                                 \n"                 \
        "mov -%c7(%6, %2, %c8), %3          \n"                 \
        "movaps     (%3, %0), %%xmm0        \n"                 \
 stereo("movaps       %%xmm0, %%xmm1        \n")                \
        "mulps        %%xmm4, %%xmm0        \n"                 \
 stereo("mulps        %%xmm5, %%xmm1        \n")                \
        "2:                                 \n"                 \
        "mov   (%6, %2, %c8), %1            \n"                 \
        "movaps     (%1, %0), %%xmm2        \n"                 \
 stereo("movaps       %%xmm2, %%xmm3        \n")                \
        "mulps   (%4, %2, 8), %%xmm2        \n"                 \
 stereo("mulps 16(%4, %2, 8), %%xmm3        \n")                \
        "addps        %%xmm2, %%xmm0        \n"                 \
 stereo("addps        %%xmm3, %%xmm1        \n")                \
        "add              $4, %2            \n"                 \
        "jl               2b                \n"                 \
        "mov              %5, %2            \n"                 \
 stereo("mov   (%6, %2, %c8), %1            \n")                \
        "movaps       %%xmm0, (%3, %0)      \n"                 \
 stereo("movaps       %%xmm1, (%1, %0)      \n")                \
        "add             $16, %0            \n"                 \
        "jl               1b                \n"                 \
        : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m)                \
        : "r"(matrix_simd + in_ch),                             \
          "g"((intptr_t) - 4 * (in_ch - 1)),                    \
          "r"(samp + in_ch),                                    \
          "i"(sizeof(float *)), "i"(sizeof(float *)/4)          \
        : "memory"                                              \
    );

/**
 * SSE inline-asm downmix, performed in place on the channel buffers.
 *
 * @param samples per-channel float buffers; the result is written back to
 *                samples[0] (and samples[1] for two output channels)
 * @param matrix  downmix coefficients, indexed [input channel][output channel]
 * @param out_ch  number of output channels; in the generic path 2 selects
 *                the stereo code and any other value the mono code
 * @param in_ch   number of input channels
 * @param len     number of floats per channel
 *
 * Two fast paths handle 5 input channels with a symmetric matrix (MIX5);
 * every other case goes through the generic MIX_MISC loop.
 *
 * NOTE(review): all asm loops are bottom-tested, step 16 bytes at a time and
 * use movaps, so this presumably requires len > 0, len a multiple of 4 and
 * 16-byte aligned buffers -- confirm against callers.
 */
static void ac3_downmix_sse(float **samples, float (*matrix)[2],
                            int out_ch, int in_ch, int len)
{
    /* Integer view of the coefficients so they can be compared bit-for-bit
     * and tested for all-zero bits with plain integer operators. */
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    intptr_t i, j, k, m;

    /* Negative byte offset from the end of each buffer; the asm loops add 16
     * per iteration and stop once it is no longer negative. */
    i = -len * sizeof(float);
    /* 5 -> 2 fast path: requires matrix[0][1], matrix[2][0], matrix[3][1]
     * and matrix[4][0] to be all-zero bits, matrix[1][0] == matrix[1][1] and
     * matrix[0][0] == matrix[2][1] (bitwise).
     * NOTE(review): MIX5 scales channel 4 by matrix[3][0], yet
     * matrix[4][1] == matrix[3][0] is not checked here -- confirm that
     * callers guarantee it. */
    if (in_ch == 5 && out_ch == 2 &&
        !(matrix_cmp[0][1] | matrix_cmp[2][0]   |
          matrix_cmp[3][1] | matrix_cmp[4][0]   |
          (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
          (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
        MIX5(IF0, IF1);
    /* 5 -> 1 fast path: channels 0/2 and channels 3/4 must share a
     * coefficient. */
    } else if (in_ch == 5 && out_ch == 1 &&
               matrix_cmp[0][0] == matrix_cmp[2][0] &&
               matrix_cmp[3][0] == matrix_cmp[4][0]) {
        MIX5(IF1, IF0);
    } else {
        /* Generic path: matrix_simd[ch][out] holds matrix[ch][out] replicated
         * into 4 lanes; samp[ch] points one past the end of each buffer. */
        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
        float *samp[AC3_MAX_CHANNELS];

        for (j = 0; j < in_ch; j++)
            samp[j] = samples[j] + len;

        /* j is a byte offset into matrix (8 bytes per input channel), walked
         * from the last channel down to 0. The same offset scaled by 4
         * addresses the 32-byte matrix_simd[ch] entry. When the loop ends,
         * xmm4/xmm5 hold the broadcast matrix[0][0] / matrix[0][1]; MIX_MISC
         * reads them, so nothing may be placed between this statement and
         * MIX_MISC that could disturb those registers. */
        j = 2 * in_ch * sizeof(float);
        __asm__ volatile (
            "1:                                 \n"
            "sub             $8, %0             \n"
            "movss     (%2, %0), %%xmm4         \n"
            "movss    4(%2, %0), %%xmm5         \n"
            "shufps          $0, %%xmm4, %%xmm4 \n"
            "shufps          $0, %%xmm5, %%xmm5 \n"
            "movaps      %%xmm4,   (%1, %0, 4)  \n"
            "movaps      %%xmm5, 16(%1, %0, 4)  \n"
            "jg              1b                 \n"
            : "+&r"(j)
            : "r"(matrix_simd), "r"(matrix)
            : "memory"
        );
        if (out_ch == 2) {
            MIX_MISC(IF1);
        } else {
            MIX_MISC(IF0);
        }
    }
}

#endif /* HAVE_SSE_INLINE && HAVE_7REGS */

/**
 * Install the x86-optimized routines into an AC-3 DSP context.
 *
 * The feature checks run from the oldest instruction set to the newest, and
 * each one overwrites pointers set by an earlier check, so the most capable
 * supported implementation ends up in the context. Pointers for which no
 * check matches are left as the caller initialized them.
 *
 * @param c         context whose function pointers are updated
 * @param bit_exact non-zero selects the non-rounding apply_window_int16
 *                  variants and skips the 3DNow! float_to_fixed24
 */
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
    int cpu_flags = av_get_cpu_flags();
    const int sse2_slow = cpu_flags & AV_CPU_FLAG_SSE2SLOW;
    const int is_atom   = cpu_flags & AV_CPU_FLAG_ATOM;

    if (EXTERNAL_MMX(cpu_flags)) {
        c->ac3_exponent_min      = ff_ac3_exponent_min_mmx;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
        c->ac3_lshift_int16      = ff_ac3_lshift_int16_mmx;
        c->ac3_rshift_int32      = ff_ac3_rshift_int32_mmx;
    }

    /* The 3DNow! conversion is only used when bit-exactness is not needed. */
    if (!bit_exact && EXTERNAL_AMD3DNOW(cpu_flags)) {
        c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->ac3_exponent_min      = ff_ac3_exponent_min_mmxext;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
        c->apply_window_int16    = bit_exact
                                 ? ff_apply_window_int16_mmxext
                                 : ff_apply_window_int16_round_mmxext;
    }

    if (EXTERNAL_SSE(cpu_flags)) {
        c->float_to_fixed24 = ff_float_to_fixed24_sse;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->ac3_exponent_min      = ff_ac3_exponent_min_sse2;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
        c->float_to_fixed24      = ff_float_to_fixed24_sse2;
        c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
        c->extract_exponents     = ff_ac3_extract_exponents_sse2;

        /* The bit-exact window is installed even on CPUs with slow SSE2. */
        if (bit_exact) {
            c->apply_window_int16 = ff_apply_window_int16_sse2;
        }
        /* Everything else SSE2-based is skipped on slow-SSE2 CPUs, leaving
         * the earlier (MMX/MMXEXT) choices in place. */
        if (!sse2_slow) {
            c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
            c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
            if (!bit_exact) {
                c->apply_window_int16 = ff_apply_window_int16_round_sse2;
            }
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
        /* On Atom the SSE2 extract_exponents is kept and a dedicated window
         * routine is used instead of the regular SSSE3 one. */
        if (!is_atom) {
            c->extract_exponents = ff_ac3_extract_exponents_ssse3;
        }
        c->apply_window_int16 = is_atom ? ff_apply_window_int16_ssse3_atom
                                        : ff_apply_window_int16_ssse3;
    }

#if HAVE_SSE_INLINE && HAVE_7REGS
    /* Inline-asm downmix; only compiled when ac3_downmix_sse() exists. */
    if (INLINE_SSE(cpu_flags)) {
        c->downmix = ac3_downmix_sse;
    }
#endif
}
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`/*`
			`* x86-optimized AC-3 DSP utils`
			`* Copyright (c) 2011 Justin Ruggles`
			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with FFmpeg; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

x86: avcodec: Add a bunch of missing #includes for av_cold 11 years ago			`#include "libavutil/attributes.h"`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`#include "libavutil/mem.h"`
x86: rename libavutil/x86_cpu.h to libavutil/x86/asm.h This puts x86-specific things in the x86/ subdirectory where they belong. Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`#include "libavutil/x86/asm.h"`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 13 years ago			`#include "libavutil/x86/cpu.h"`
x86: dsputil: Rename dsputil_mmx.h --> dsputil_x86.h The header is not (anymore) MMX-specific. 12 years ago			`#include "dsputil_x86.h"`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`#include "libavcodec/ac3.h"`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`#include "libavcodec/ac3dsp.h"`

cosmetics: Remove unnecessary extern keywords from function declarations 12 years ago			`void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs);`
			`void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);`
			`void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago
cosmetics: Remove unnecessary extern keywords from function declarations 12 years ago			`int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);`
			`int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);`
			`int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);`
			`int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);`
ac3enc: Add x86-optimized function to speed up log2_tab(). AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute value of each element in an array of int16_t. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago
cosmetics: Remove unnecessary extern keywords from function declarations 12 years ago			`void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);`
			`void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);`
ac3enc: add SIMD-optimized shifting functions for use with the fixed-point AC3 encoder. 14 years ago
cosmetics: Remove unnecessary extern keywords from function declarations 12 years ago			`void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);`
			`void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);`
ac3enc: add SIMD-optimized shifting functions for use with the fixed-point AC3 encoder. 14 years ago
cosmetics: Remove unnecessary extern keywords from function declarations 12 years ago			`void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);`
			`void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);`
			`void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);`
ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder. 14 years ago
cosmetics: Remove unnecessary extern keywords from function declarations 12 years ago			`int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);`
ac3enc: modify mantissa bit counting to keep bap counts for all values of bap instead of just 0 to 4. This does all the actual bit counting as a final step. 14 years ago
cosmetics: Remove unnecessary extern keywords from function declarations 12 years ago			`void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);`
			`void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);`
			`void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);`
ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents(). 14 years ago
dsputil: Move apply_window_int16 to ac3dsp The (optimized) functions are used nowhere else. 11 years ago			`void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,`
			`const int16_t *window, unsigned int len);`
			`void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,`
			`const int16_t *window, unsigned int len);`
			`void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,`
			`const int16_t *window, unsigned int len);`
			`void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,`
			`const int16_t *window, unsigned int len);`
			`void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,`
			`const int16_t *window, unsigned int len);`
			`void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,`
			`const int16_t *window, unsigned int len);`

x86/ac3dsp_init: try to workaround ICC failure. The asm code is not valid for older compilers as it uses too many operands, ICC on x86_32 seems affected by this. This patch disables the affected code for ICC on x86_32 and should make it compileable again. A better fix would be to use fewer operands or to change this code to yasm, later is being worked on AFAIK so this is a temporary solution. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 12 years ago			`#if ARCH_X86_32 && defined(__INTEL_COMPILER)`
			`# undef HAVE_7REGS`
			`# define HAVE_7REGS 0`
			`#endif`

ac3dec: make downmix() take array of pointers to channel data 13 years ago			`#if HAVE_SSE_INLINE && HAVE_7REGS`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago
			`#define IF1(x) x`
			`#define IF0(x)`

			`#define MIX5(mono, stereo) \`
			`__asm__ volatile ( \`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`"movss 0(%1), %%xmm5 \n" \`
			`"movss 8(%1), %%xmm6 \n" \`
			`"movss 24(%1), %%xmm7 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`"shufps $0, %%xmm5, %%xmm5 \n" \`
			`"shufps $0, %%xmm6, %%xmm6 \n" \`
			`"shufps $0, %%xmm7, %%xmm7 \n" \`
			`"1: \n" \`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`"movaps (%0, %2), %%xmm0 \n" \`
			`"movaps (%0, %3), %%xmm1 \n" \`
			`"movaps (%0, %4), %%xmm2 \n" \`
			`"movaps (%0, %5), %%xmm3 \n" \`
			`"movaps (%0, %6), %%xmm4 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`"mulps %%xmm5, %%xmm0 \n" \`
			`"mulps %%xmm6, %%xmm1 \n" \`
			`"mulps %%xmm5, %%xmm2 \n" \`
			`"mulps %%xmm7, %%xmm3 \n" \`
			`"mulps %%xmm7, %%xmm4 \n" \`
			`stereo("addps %%xmm1, %%xmm0 \n") \`
			`"addps %%xmm1, %%xmm2 \n" \`
			`"addps %%xmm3, %%xmm0 \n" \`
			`"addps %%xmm4, %%xmm2 \n" \`
			`mono("addps %%xmm2, %%xmm0 \n") \`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`"movaps %%xmm0, (%0, %2) \n" \`
			`stereo("movaps %%xmm2, (%0, %3) \n") \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`"add $16, %0 \n" \`
			`"jl 1b \n" \`
			`: "+&r"(i) \`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`: "r"(matrix), \`
			`"r"(samples[0] + len), \`
			`"r"(samples[1] + len), \`
			`"r"(samples[2] + len), \`
			`"r"(samples[3] + len), \`
			`"r"(samples[4] + len) \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \`
			`"%xmm4", "%xmm5", "%xmm6", "%xmm7",) \`
			`"memory" \`
			`);`

			`#define MIX_MISC(stereo) \`
			`__asm__ volatile ( \`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`"mov %5, %2 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`"1: \n" \`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`"mov -%c7(%6, %2, %c8), %3 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`"movaps (%3, %0), %%xmm0 \n" \`
			`stereo("movaps %%xmm0, %%xmm1 \n") \`
			`"mulps %%xmm4, %%xmm0 \n" \`
			`stereo("mulps %%xmm5, %%xmm1 \n") \`
			`"2: \n" \`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`"mov (%6, %2, %c8), %1 \n" \`
			`"movaps (%1, %0), %%xmm2 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`stereo("movaps %%xmm2, %%xmm3 \n") \`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`"mulps (%4, %2, 8), %%xmm2 \n" \`
			`stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`"addps %%xmm2, %%xmm0 \n" \`
			`stereo("addps %%xmm3, %%xmm1 \n") \`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`"add $4, %2 \n" \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`"jl 2b \n" \`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`"mov %5, %2 \n" \`
			`stereo("mov (%6, %2, %c8), %1 \n") \`
			`"movaps %%xmm0, (%3, %0) \n" \`
			`stereo("movaps %%xmm1, (%1, %0) \n") \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`"add $16, %0 \n" \`
			`"jl 1b \n" \`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`: "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \`
			`: "r"(matrix_simd + in_ch), \`
			`"g"((intptr_t) - 4 * (in_ch - 1)), \`
			`"r"(samp + in_ch), \`
			`"i"(sizeof(float *)), "i"(sizeof(float *)/4) \`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`: "memory" \`
			`);`

ac3dec: make downmix() take array of pointers to channel data 13 years ago			`static void ac3_downmix_sse(float **samples, float (*matrix)[2],`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`int out_ch, int in_ch, int len)`
			`{`
			`int (*matrix_cmp)[2] = (int(*)[2])matrix;`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`intptr_t i, j, k, m;`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago
			`i = -len * sizeof(float);`
			`if (in_ch == 5 && out_ch == 2 &&`
			`!(matrix_cmp[0][1] \| matrix_cmp[2][0] \|`
			`matrix_cmp[3][1] \| matrix_cmp[4][0] \|`
			`(matrix_cmp[1][0] ^ matrix_cmp[1][1]) \|`
			`(matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {`
			`MIX5(IF0, IF1);`
			`} else if (in_ch == 5 && out_ch == 1 &&`
			`matrix_cmp[0][0] == matrix_cmp[2][0] &&`
			`matrix_cmp[3][0] == matrix_cmp[4][0]) {`
			`MIX5(IF1, IF0);`
			`} else {`
			`DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];`
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`float *samp[AC3_MAX_CHANNELS];`

			`for (j = 0; j < in_ch; j++)`
			`samp[j] = samples[j] + len;`

ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`j = 2 * in_ch * sizeof(float);`
			`__asm__ volatile (`
			`"1: \n"`
			`"sub $8, %0 \n"`
			`"movss (%2, %0), %%xmm4 \n"`
			`"movss 4(%2, %0), %%xmm5 \n"`
			`"shufps $0, %%xmm4, %%xmm4 \n"`
			`"shufps $0, %%xmm5, %%xmm5 \n"`
			`"movaps %%xmm4, (%1, %0, 4) \n"`
			`"movaps %%xmm5, 16(%1, %0, 4) \n"`
			`"jg 1b \n"`
			`: "+&r"(j)`
			`: "r"(matrix_simd), "r"(matrix)`
			`: "memory"`
			`);`
			`if (out_ch == 2) {`
			`MIX_MISC(IF1);`
			`} else {`
			`MIX_MISC(IF0);`
			`}`
			`}`
			`}`

ac3dec: make downmix() take array of pointers to channel data 13 years ago			`#endif /* HAVE_SSE_INLINE && HAVE_7REGS */`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago
ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder. 14 years ago			`av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`{`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`int cpu_flags = av_get_cpu_flags();`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (EXTERNAL_MMX(cpu_flags)) {`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`c->ac3_exponent_min = ff_ac3_exponent_min_mmx;`
ac3enc: Add x86-optimized function to speed up log2_tab(). AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute value of each element in an array of int16_t. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;`
ac3enc: add SIMD-optimized shifting functions for use with the fixed-point AC3 encoder. 14 years ago			`c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;`
			`c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`}`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (EXTERNAL_AMD3DNOW(cpu_flags)) {`
ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder. 14 years ago			`if (!bit_exact) {`
			`c->float_to_fixed24 = ff_float_to_fixed24_3dnow;`
			`}`
			`}`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (EXTERNAL_MMXEXT(cpu_flags)) {`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;`
x86: mmx2 ---> mmxext in asm constructs 13 years ago			`c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;`
dsputil: Move apply_window_int16 to ac3dsp The (optimized) functions are used nowhere else. 11 years ago			`if (bit_exact) {`
			`c->apply_window_int16 = ff_apply_window_int16_mmxext;`
			`} else {`
			`c->apply_window_int16 = ff_apply_window_int16_round_mmxext;`
			`}`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`}`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (EXTERNAL_SSE(cpu_flags)) {`
ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder. 14 years ago			`c->float_to_fixed24 = ff_float_to_fixed24_sse;`
			`}`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (EXTERNAL_SSE2(cpu_flags)) {`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`c->ac3_exponent_min = ff_ac3_exponent_min_sse2;`
ac3enc: Add x86-optimized function to speed up log2_tab(). AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute value of each element in an array of int16_t. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;`
ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder. 14 years ago			`c->float_to_fixed24 = ff_float_to_fixed24_sse2;`
ac3enc: modify mantissa bit counting to keep bap counts for all values of bap instead of just 0 to 4. This does all the actual bit counting as a final step. 14 years ago			`c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;`
ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents(). 14 years ago			`c->extract_exponents = ff_ac3_extract_exponents_sse2;`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {`
ac3enc: add SIMD-optimized shifting functions for use with the fixed-point AC3 encoder. 14 years ago			`c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;`
			`c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;`
			`}`
dsputil: Move apply_window_int16 to ac3dsp The (optimized) functions are used nowhere else. 11 years ago			`if (bit_exact) {`
			`c->apply_window_int16 = ff_apply_window_int16_sse2;`
			`} else if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {`
			`c->apply_window_int16 = ff_apply_window_int16_round_sse2;`
			`}`
ac3enc: Add x86-optimized function to speed up log2_tab(). AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute value of each element in an array of int16_t. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`}`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (EXTERNAL_SSSE3(cpu_flags)) {`
ac3enc: Add x86-optimized function to speed up log2_tab(). AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute value of each element in an array of int16_t. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;`
dsputil: Move apply_window_int16 to ac3dsp The (optimized) functions are used nowhere else. 11 years ago			`if (cpu_flags & AV_CPU_FLAG_ATOM) {`
			`c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;`
			`} else {`
ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents(). 14 years ago			`c->extract_exponents = ff_ac3_extract_exponents_ssse3;`
dsputil: Move apply_window_int16 to ac3dsp The (optimized) functions are used nowhere else. 11 years ago			`c->apply_window_int16 = ff_apply_window_int16_ssse3;`
ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents(). 14 years ago			`}`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`}`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago
ac3dec: make downmix() take array of pointers to channel data 13 years ago			`#if HAVE_SSE_INLINE && HAVE_7REGS`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (INLINE_SSE(cpu_flags)) {`
ac3: move ac3_downmix() from dsputil to ac3dsp Signed-off-by: Mans Rullgard <mans@mansr.com> 13 years ago			`c->downmix = ac3_downmix_sse;`
			`}`
x86: ac3dsp: Only refer to the ac3_downmix_sse symbol if it has been declared This fixes building without inline assembly. Signed-off-by: Martin Storsjö <martin@martin.st> 13 years ago			`#endif`
Add x86-optimized versions of exponent_min(). Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 14 years ago			`}`