FFmpeg/libavutil/x86/float_dsp_init.c

/*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"
#include "cpu.h"
#include "asm.h"

/*
 * Prototypes of the SSE / SSE2 / AVX routines that ff_float_dsp_init_x86()
 * stores into the AVFloatDSPContext function pointers.
 *
 * They are only declared in this file; no definition is visible here.
 * NOTE(review): presumably they live in separately assembled objects, since
 * they are selected under the EXTERNAL_* CPU checks - confirm against the
 * build.
 */
void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
                        int len);
void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
                        int len);

void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
                               int len);
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
                               int len);

/* Only an SSE variant is declared for vector_fmul_scalar. */
void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
                               int len);

/* Double-precision variants: SSE2 and AVX. */
void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
                                double mul, int len);
void ff_vector_dmul_scalar_avx(double *dst, const double *src,
                               double mul, int len);

void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);

void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                const float *src1, int len);

/* Only SSE variants are declared for the two routines below. */
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_butterflies_float_sse(float *src0, float *src1, int len);

#if HAVE_6REGS && HAVE_INLINE_ASM
/**
 * Windowed overlap multiply, 3DNow!ext inline-asm version.
 *
 * Two byte offsets walk the buffers from opposite ends: i starts at
 * -len * 4 and moves up towards 0, j starts at len * 4 - 8 and moves down.
 * Each iteration handles two floats at either end and computes
 *
 *     dst[len + i] = src0[len + i] * win[len + j] - src1[j] * win[len + i]
 *     dst[len + j] = src0[len + i] * win[len + i] + src1[j] * win[len + j]
 *
 * where the j-side operands are loaded with pswapd (element order swapped)
 * so that they line up with the i-side ones, and the j-side result is
 * swapped back before it is stored.
 *
 * @param dst  output; elements [0, 2 * len) are written
 * @param src0 first input; elements [0, len) are read
 * @param src1 second input; elements [0, len) are read
 * @param win  window; elements [0, 2 * len) are read
 * @param len  element count per half; the loop body runs before its
 *             condition is tested and steps by 8 bytes, so len must be
 *             positive and a multiple of 2
 *
 * The "memory" clobber is required: the asm reads and writes memory through
 * pointers that are passed only as register inputs, so without it the
 * compiler may move or cache surrounding loads and stores across the
 * statement.
 *
 * NOTE(review): the MMX registers used here are not listed as clobbers;
 * check whether asm.h offers a clobber helper that should be applied.
 */
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 8;
    __asm__ volatile (
        "1:                             \n"
        "pswapd (%5, %1), %%mm1         \n"
        "movq   (%5, %0), %%mm0         \n"
        "pswapd (%4, %1), %%mm5         \n"
        "movq   (%3, %0), %%mm4         \n"
        "movq      %%mm0, %%mm2         \n"
        "movq      %%mm1, %%mm3         \n"
        "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
        "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
        "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
        "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
        "pfadd     %%mm3, %%mm2         \n"
        "pfsub     %%mm0, %%mm1         \n"
        "pswapd    %%mm2, %%mm2         \n"
        "movq      %%mm1, (%2, %0)      \n"
        "movq      %%mm2, (%2, %1)      \n"
        "sub          $8, %1            \n"
        "add          $8, %0            \n"
        "jl           1b                \n" // flags come from the add: loop while i < 0
        "femms                          \n" // leave MMX state before returning
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
        : "memory"
    );
}

/**
 * Windowed overlap multiply, SSE inline-asm version.
 *
 * Two byte offsets walk the buffers from opposite ends: i starts at
 * -len * 4 and moves up towards 0, j starts at len * 4 - 16 and moves down.
 * Each iteration handles four floats at either end and computes
 *
 *     dst[len + i] = src0[len + i] * win[len + j] - src1[j] * win[len + i]
 *     dst[len + j] = src0[len + i] * win[len + i] + src1[j] * win[len + j]
 *
 * The j-side vectors are reversed with shufps $0x1b after loading so that
 * they line up with the i-side ones, and the j-side result is reversed
 * again before it is stored.
 *
 * @param dst  output; elements [0, 2 * len) are written
 * @param src0 first input; elements [0, len) are read
 * @param src1 second input; elements [0, len) are read
 * @param win  window; elements [0, 2 * len) are read
 * @param len  element count per half; the loop body runs before its
 *             condition is tested and steps by 16 bytes, so len must be
 *             positive and a multiple of 4
 *
 * All loads and stores use movaps, so every address touched must be
 * 16-byte aligned.
 *
 * The "memory" clobber is required: the asm reads and writes memory through
 * pointers that are passed only as register inputs, so without it the
 * compiler may move or cache surrounding loads and stores across the
 * statement.
 *
 * NOTE(review): xmm0-xmm5 are not listed as clobbers; check whether asm.h
 * offers a clobber helper that should be applied.
 */
static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win, int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 16;
    __asm__ volatile (
        "1:                             \n"
        "movaps      (%5, %1), %%xmm1   \n"
        "movaps      (%5, %0), %%xmm0   \n"
        "movaps      (%4, %1), %%xmm5   \n"
        "movaps      (%3, %0), %%xmm4   \n"
        "shufps $0x1b, %%xmm1, %%xmm1   \n" // reverse the four window floats
        "shufps $0x1b, %%xmm5, %%xmm5   \n" // reverse the four src1 floats
        "movaps        %%xmm0, %%xmm2   \n"
        "movaps        %%xmm1, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
        "addps         %%xmm3, %%xmm2   \n"
        "subps         %%xmm0, %%xmm1   \n"
        "shufps $0x1b, %%xmm2, %%xmm2   \n" // back to memory order for the j side
        "movaps        %%xmm1, (%2, %0) \n"
        "movaps        %%xmm2, (%2, %1) \n"
        "sub              $16, %1       \n"
        "add              $16, %0       \n"
        "jl                1b           \n" // flags come from the add: loop while i < 0
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
        : "memory"
    );
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM */

/**
 * Fill in the x86-specific entries of a float DSP context.
 *
 * The CPU flags are queried once with av_get_cpu_flags(). The checks below
 * run in a fixed order and each one simply overwrites the pointers it
 * covers, so when several instruction sets are reported the assignment made
 * by the last matching check is the one that remains. Pointers not covered
 * by any matching check are left as they were on entry.
 *
 * @param fdsp context whose function pointers are updated in place
 */
av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
    const int flags = av_get_cpu_flags();

#if HAVE_6REGS && HAVE_INLINE_ASM
    /* Inline-asm window kernels: SSE is tested second, so it takes
     * precedence over 3DNow!ext when both checks pass. */
    if (INLINE_AMD3DNOWEXT(flags)) {
        fdsp->vector_fmul_window = vector_fmul_window_3dnowext;
    }
    if (INLINE_SSE(flags)) {
        fdsp->vector_fmul_window = vector_fmul_window_sse;
    }
#endif

    /* Single-precision SSE kernels. */
    if (EXTERNAL_SSE(flags)) {
        fdsp->butterflies_float   = ff_butterflies_float_sse;
        fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
        fdsp->vector_fmul_add     = ff_vector_fmul_add_sse;
        fdsp->vector_fmul_scalar  = ff_vector_fmul_scalar_sse;
        fdsp->vector_fmac_scalar  = ff_vector_fmac_scalar_sse;
        fdsp->vector_fmul         = ff_vector_fmul_sse;
    }

    /* The double-precision multiply has an SSE2 variant. */
    if (EXTERNAL_SSE2(flags)) {
        fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
    }

    /* AVX variants replace the SSE/SSE2 ones for the routines that have
     * them; the remaining pointers keep whatever was set above. */
    if (EXTERNAL_AVX_FAST(flags)) {
        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
        fdsp->vector_fmul_add     = ff_vector_fmul_add_avx;
        fdsp->vector_dmul_scalar  = ff_vector_dmul_scalar_avx;
        fdsp->vector_fmac_scalar  = ff_vector_fmac_scalar_avx;
        fdsp->vector_fmul         = ff_vector_fmul_avx;
    }
}
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 13 years ago			`/*`
			`* This file is part of Libav.`
			`*`
			`* Libav is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* Libav is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with Libav; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#include "config.h"`

avutil: Add av_cold attributes to init functions missing them 12 years ago			`#include "libavutil/attributes.h"`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 13 years ago			`#include "libavutil/cpu.h"`
			`#include "libavutil/float_dsp.h"`
x86: Replace checks for CPU extensions and flags by convenience macros This separates code relying on inline from that relying on external assembly and fixes instances where the coalesced check was incorrect. 13 years ago			`#include "cpu.h"`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 12 years ago			`#include "asm.h"`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 13 years ago
cosmetics: Remove unnecessary extern keywords from function declarations 12 years ago			`void ff_vector_fmul_sse(float dst, const float src0, const float *src1,`
			`int len);`
			`void ff_vector_fmul_avx(float dst, const float src0, const float *src1,`
			`int len);`

			`void ff_vector_fmac_scalar_sse(float dst, const float src, float mul,`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 13 years ago			`int len);`
cosmetics: Remove unnecessary extern keywords from function declarations 12 years ago			`void ff_vector_fmac_scalar_avx(float dst, const float src, float mul,`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 13 years ago			`int len);`

cosmetics: Remove unnecessary extern keywords from function declarations 12 years ago			`void ff_vector_fmul_scalar_sse(float dst, const float src, float mul,`
			`int len);`
x86: float_dsp: add SSE version of vector_fmul_scalar() 13 years ago
cosmetics: Remove unnecessary extern keywords from function declarations 12 years ago			`void ff_vector_dmul_scalar_sse2(double dst, const double src,`
			`double mul, int len);`
			`void ff_vector_dmul_scalar_avx(double dst, const double src,`
			`double mul, int len);`
float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 13 years ago
floatdsp: move vector_fmul_add from dsputil to avfloatdsp. 12 years ago			`void ff_vector_fmul_add_sse(float dst, const float src0, const float *src1,`
			`const float *src2, int len);`
			`void ff_vector_fmul_add_avx(float dst, const float src0, const float *src1,`
			`const float *src2, int len);`

floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp. Now, nellymoserenc and aacenc no longer depends on dsputil. Independent of this patch, wmaprodec also does not depend on dsputil, so I removed it from there also. 12 years ago			`void ff_vector_fmul_reverse_sse(float dst, const float src0,`
			`const float *src1, int len);`
			`void ff_vector_fmul_reverse_avx(float dst, const float src0,`
			`const float *src1, int len);`

floatdsp: move scalarproduct_float from dsputil to avfloatdsp. This makes the aac decoder and all voice codecs independent of dsputil. 12 years ago			`float ff_scalarproduct_float_sse(const float v1, const float v2, int order);`

x86: float dsp: butterflies_float SSE 97c -> 49c Some codecs could benefit from more unrolling, but AAC doesn't. 12 years ago			`void ff_butterflies_float_sse(float src0, float src1, int len);`

float_dsp: Add #ifdef HAVE_INLINE_ASM around vector_fmul_window This fixes builds on 64bit MSVC. Signed-off-by: Martin Storsjö <martin@martin.st> 12 years ago			`#if HAVE_6REGS && HAVE_INLINE_ASM`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 12 years ago			`static void vector_fmul_window_3dnowext(float dst, const float src0,`
			`const float src1, const float win,`
			`int len)`
			`{`
			`x86_reg i = -len * 4;`
			`x86_reg j = len * 4 - 8;`
			`__asm__ volatile (`
			`"1: \n"`
			`"pswapd (%5, %1), %%mm1 \n"`
			`"movq (%5, %0), %%mm0 \n"`
			`"pswapd (%4, %1), %%mm5 \n"`
			`"movq (%3, %0), %%mm4 \n"`
			`"movq %%mm0, %%mm2 \n"`
			`"movq %%mm1, %%mm3 \n"`
			`"pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]`
			`"pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]`
			`"pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]`
			`"pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]`
			`"pfadd %%mm3, %%mm2 \n"`
			`"pfsub %%mm0, %%mm1 \n"`
			`"pswapd %%mm2, %%mm2 \n"`
			`"movq %%mm1, (%2, %0) \n"`
			`"movq %%mm2, (%2, %1) \n"`
			`"sub $8, %1 \n"`
			`"add $8, %0 \n"`
			`"jl 1b \n"`
			`"femms \n"`
			`: "+r"(i), "+r"(j)`
			`: "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)`
			`);`
			`}`

			`static void vector_fmul_window_sse(float dst, const float src0,`
			`const float src1, const float win, int len)`
			`{`
			`x86_reg i = -len * 4;`
			`x86_reg j = len * 4 - 16;`
			`__asm__ volatile (`
			`"1: \n"`
			`"movaps (%5, %1), %%xmm1 \n"`
			`"movaps (%5, %0), %%xmm0 \n"`
			`"movaps (%4, %1), %%xmm5 \n"`
			`"movaps (%3, %0), %%xmm4 \n"`
			`"shufps $0x1b, %%xmm1, %%xmm1 \n"`
			`"shufps $0x1b, %%xmm5, %%xmm5 \n"`
			`"movaps %%xmm0, %%xmm2 \n"`
			`"movaps %%xmm1, %%xmm3 \n"`
			`"mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]`
			`"mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]`
			`"mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]`
			`"mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]`
			`"addps %%xmm3, %%xmm2 \n"`
			`"subps %%xmm0, %%xmm1 \n"`
			`"shufps $0x1b, %%xmm2, %%xmm2 \n"`
			`"movaps %%xmm1, (%2, %0) \n"`
			`"movaps %%xmm2, (%2, %1) \n"`
			`"sub $16, %1 \n"`
			`"add $16, %0 \n"`
			`"jl 1b \n"`
			`: "+r"(i), "+r"(j)`
			`: "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)`
			`);`
			`}`
float_dsp: Add #ifdef HAVE_INLINE_ASM around vector_fmul_window This fixes builds on 64bit MSVC. Signed-off-by: Martin Storsjö <martin@martin.st> 12 years ago			`#endif /* HAVE_6REGS && HAVE_INLINE_ASM */`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 12 years ago
avutil: Add av_cold attributes to init functions missing them 12 years ago			`av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 13 years ago			`{`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`int cpu_flags = av_get_cpu_flags();`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 13 years ago
float_dsp: Add #ifdef HAVE_INLINE_ASM around vector_fmul_window This fixes builds on 64bit MSVC. Signed-off-by: Martin Storsjö <martin@martin.st> 12 years ago			`#if HAVE_6REGS && HAVE_INLINE_ASM`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (INLINE_AMD3DNOWEXT(cpu_flags)) {`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 12 years ago			`fdsp->vector_fmul_window = vector_fmul_window_3dnowext;`
			`}`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (INLINE_SSE(cpu_flags)) {`
lavc: Move vector_fmul_window to AVFloatDSPContext Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 12 years ago			`fdsp->vector_fmul_window = vector_fmul_window_sse;`
			`}`
			`#endif`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (EXTERNAL_SSE(cpu_flags)) {`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 13 years ago			`fdsp->vector_fmul = ff_vector_fmul_sse;`
float_dsp: add x86-optimized functions for vector_fmac_scalar() 13 years ago			`fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;`
x86: float_dsp: add SSE version of vector_fmul_scalar() 13 years ago			`fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;`
floatdsp: move vector_fmul_add from dsputil to avfloatdsp. 12 years ago			`fdsp->vector_fmul_add = ff_vector_fmul_add_sse;`
floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp. Now, nellymoserenc and aacenc no longer depends on dsputil. Independent of this patch, wmaprodec also does not depend on dsputil, so I removed it from there also. 12 years ago			`fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;`
floatdsp: move scalarproduct_float from dsputil to avfloatdsp. This makes the aac decoder and all voice codecs independent of dsputil. 12 years ago			`fdsp->scalarproduct_float = ff_scalarproduct_float_sse;`
x86: float dsp: butterflies_float SSE 97c -> 49c Some codecs could benefit from more unrolling, but AAC doesn't. 12 years ago			`fdsp->butterflies_float = ff_butterflies_float_sse;`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 13 years ago			`}`
Consistently use "cpu_flags" as variable/parameter name for CPU flags 12 years ago			`if (EXTERNAL_SSE2(cpu_flags)) {`
float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 13 years ago			`fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;`
			`}`
x86: check for AV_CPU_FLAG_AVXSLOW where useful Signed-off-by: James Almer <jamrial@gmail.com> Signed-off-by: Luca Barbato <lu_zero@gentoo.org> 10 years ago			`if (EXTERNAL_AVX_FAST(cpu_flags)) {`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 13 years ago			`fdsp->vector_fmul = ff_vector_fmul_avx;`
float_dsp: add x86-optimized functions for vector_fmac_scalar() 13 years ago			`fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;`
float_dsp: add vector_dmul_scalar() to multiply a vector of doubles Include x86-optimized versions for SSE2 and AVX. 13 years ago			`fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;`
floatdsp: move vector_fmul_add from dsputil to avfloatdsp. 12 years ago			`fdsp->vector_fmul_add = ff_vector_fmul_add_avx;`
floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp. Now, nellymoserenc and aacenc no longer depends on dsputil. Independent of this patch, wmaprodec also does not depend on dsputil, so I removed it from there also. 12 years ago			`fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;`
Add a float DSP framework to libavutil Move vector_fmul() from DSPContext to AVFloatDSPContext. 13 years ago			`}`
			`}`