; FFmpeg/libavcodec/x86/audiodsp.asm

;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
;
; Dot product of two int16 vectors, returned in eax.
;   v1    - loaded with movu, so no alignment is required by this code
;   v2    - used directly as a pmaddwd memory operand
;   order - number of int16 elements; one pass consumes 2*mmsize bytes
;           (16 elements) and the loop is bottom-tested, so the body always
;           runs at least once
; Partial sums are kept in 32-bit lanes and added with paddd.
INIT_XMM sse2
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    pxor    m2, m2                      ; m2 = running 32-bit partial sums
    add     orderd, orderd              ; elements -> bytes; the 32-bit write also clears the upper half of orderq on 64-bit
    add     v1q, orderq                 ; point both inputs one past their last element ...
    add     v2q, orderq
    neg     orderq                      ; ... and count a negative byte offset up towards zero
.next_block:
    ; first vector of this pass
    movu    m0, [v1q + orderq]
    pmaddwd m0, [v2q + orderq]          ; multiply int16 pairs, add adjacent products into dwords
    paddd   m2, m0
    ; second vector of this pass
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m1
    add     orderq, mmsize*2
    jl      .next_block                 ; offset still negative -> more input left
    HADDD   m2, m0                      ; x86util helper: horizontal add of the dword lanes (m0 is scratch)
    movd    eax, m2
    RET

; AVX2 build of scalarproduct_int16 (same C prototype as the SSE2 version).
; One pass consumes a single mmsize-byte vector (mmsize is 32 under INIT_YMM,
; i.e. 16 int16 elements); the loop is bottom-tested, so the body always runs
; at least once. Only assembled when HAVE_AVX2_EXTERNAL is set.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal scalarproduct_int16, 3,3,2, v1, v2, order
    pxor    m1, m1                      ; m1 = running 32-bit partial sums
    add     orderd, orderd              ; elements -> bytes; the 32-bit write also clears the upper half of orderq
    add     v1q, orderq                 ; point both inputs one past their last element ...
    add     v2q, orderq
    neg     orderq                      ; ... and count a negative byte offset up towards zero
.next_block:
    movu    m0, [v1q + orderq]
    pmaddwd m0, [v2q + orderq]          ; multiply int16 pairs, add adjacent products into dwords
    paddd   m1, m0
    add     orderq, mmsize
    jl      .next_block                 ; offset still negative -> more input left
    HADDD   m1, m0                      ; x86util helper: horizontal add of the dword lanes (m0 is scratch)
    movd    eax, xm1                    ; result is read from the low xmm half
    RET
%endif

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;
; Generator for the vector_clip_int32 variants. Each emitted function loads
; vectors of 32-bit integers from src, passes every vector through CLIPD
; together with the broadcast min and max, and stores the results to dst.
;
; All loads and stores use mova (the x86inc aligned move), so src and dst are
; accessed as aligned vectors.
;
; One pass of the loop handles %2 groups of 4*(1+%3) vectors, i.e.
; mmsize*4*%2*(1+%3) bytes or mmsize*%2*(1+%3) int32 elements. The loop is
; bottom-tested, so the body always runs once; len is presumably expected to
; be a positive multiple of the per-pass element count -- confirm against the
; callers.
;
; NOTE(review): CLIPD and SPLATD are defined outside this file. m6 is handed
; to CLIPD as a fourth operand even when %1 declares only 6 registers; confirm
; that CLIPD does not write it for the instruction sets instantiated here.
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
    ; Bring min and max into m4 and m5, as floats or as raw integers.
%if %4
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm
    movd      m5, maxm
%endif
    ; Replicate the scalar bounds so they apply to every lane.
    SPLATD    m4
    SPLATD    m5
.loop:
%assign %%i 0
%rep %2
    ; Load one group: four vectors, or eight in the wide layout.
    mova      m0,  [srcq + mmsize * (0 + %%i)]
    mova      m1,  [srcq + mmsize * (1 + %%i)]
    mova      m2,  [srcq + mmsize * (2 + %%i)]
    mova      m3,  [srcq + mmsize * (3 + %%i)]
%if %3
    mova      m7,  [srcq + mmsize * (4 + %%i)]
    mova      m8,  [srcq + mmsize * (5 + %%i)]
    mova      m9,  [srcq + mmsize * (6 + %%i)]
    mova      m10, [srcq + mmsize * (7 + %%i)]
%endif
    ; Clip every loaded vector against m4 (min) and m5 (max).
    CLIPD  m0,  m4, m5, m6
    CLIPD  m1,  m4, m5, m6
    CLIPD  m2,  m4, m5, m6
    CLIPD  m3,  m4, m5, m6
%if %3
    CLIPD  m7,  m4, m5, m6
    CLIPD  m8,  m4, m5, m6
    CLIPD  m9,  m4, m5, m6
    CLIPD  m10, m4, m5, m6
%endif
    ; Store the group at the same offsets it was loaded from.
    mova  [dstq + mmsize * (0 + %%i)], m0
    mova  [dstq + mmsize * (1 + %%i)], m1
    mova  [dstq + mmsize * (2 + %%i)], m2
    mova  [dstq + mmsize * (3 + %%i)], m3
%if %3
    mova  [dstq + mmsize * (4 + %%i)], m7
    mova  [dstq + mmsize * (5 + %%i)], m8
    mova  [dstq + mmsize * (6 + %%i)], m9
    mova  [dstq + mmsize * (7 + %%i)], m10
%endif
    ; Step the vector index past the group just handled.
%assign %%i (%%i + 4 * (1 + %3))
%endrep
    ; Advance by exactly what the unrolled body consumed: groups per pass
    ; times vectors per group. This stays correct for every combination of
    ; group count and group width, not only the ones instantiated today.
    add     srcq, mmsize*4*%2*(1+%3)
    add     dstq, mmsize*4*%2*(1+%3)
    sub     lend, mmsize*%2*(1+%3)
    jg .loop
    RET
%endmacro

; SSE2 builds. Two functions are emitted:
;   - suffix "_int": 6 declared registers, one load/clip/store group per loop
;     pass, min/max moved in as integers (%4 = 0)
;   - no suffix:     6 declared registers, two groups per loop pass, min/max
;     converted to float before the loop (%4 = 1)
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
VECTOR_CLIP_INT32 6, 2, 0, 1
; SSE4 build. When m8 is defined the wide layout is used: 11 registers
; (m0-m10) and eight vectors per group; otherwise the 6-register, four-vector
; layout is emitted.
; NOTE(review): m8 is presumably only defined by x86inc when 16 vector
; registers are available (x86-64) -- confirm in x86inc.asm.
INIT_XMM sse4
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

; void ff_vector_clipf_sse(float *dst, const float *src,
;                          int len, float min, float max)
;
; Writes min(max(src[i], min), max) to dst[i].
;   dst, src - accessed with mova (the x86inc aligned move)
;   len      - element count; one pass handles four vectors (4*mmsize bytes,
;              i.e. mmsize floats), starting from the end of the buffers and
;              moving towards the start. The loop is bottom-tested, so the
;              body always runs at least once.
;   min, max - arrive on the stack (x86-32), in xmm3 and on the stack (Win64),
;              or in xmm0 and xmm1 (64-bit SysV)
INIT_XMM sse
cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
    movsxdifnidn lenq, lend             ; widen len so it can be used in addressing

    ; m0 = min in every lane, m1 = max in every lane
    ; (VBROADCASTSS is an x86util helper, expected to replicate the scalar).
%if ARCH_X86_32
    VBROADCASTSS m0, minm
    VBROADCASTSS m1, maxm
%elif WIN64
    SWAP 0, 3                           ; min is the fourth argument, so it sits in xmm3: rename it to m0
    VBROADCASTSS m0, m0
    VBROADCASTSS m1, maxm
%else ; 64bit sysv
    VBROADCASTSS m0, m0
    VBROADCASTSS m1, m1
%endif

.next_block:
    ; Load the four vectors that end at element index len.
    mova m2, [srcq + 4 * lenq - 4 * mmsize]
    mova m3, [srcq + 4 * lenq - 3 * mmsize]
    mova m4, [srcq + 4 * lenq - 2 * mmsize]
    mova m5, [srcq + 4 * lenq - 1 * mmsize]

    ; Raise to the lower bound first, then cap at the upper bound.
    maxps m2, m0
    minps m2, m1

    maxps m3, m0
    minps m3, m1

    maxps m4, m0
    minps m4, m1

    maxps m5, m0
    minps m5, m1

    ; All stores happen after all loads of this pass.
    mova [dstq + 4 * lenq - 4 * mmsize], m2
    mova [dstq + 4 * lenq - 3 * mmsize], m3
    mova [dstq + 4 * lenq - 2 * mmsize], m4
    mova [dstq + 4 * lenq - 1 * mmsize], m5

    sub lenq, mmsize                    ; mmsize floats were consumed (4 vectors of mmsize/4 floats)
    jg .next_block

    RET
dsputil: Split audio operations off into a separate context 11 years ago			`;******************************************************************************`
			`;* optimized audio functions`
			`;* Copyright (c) 2008 Loren Merritt`
			`;*`
Merge commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2' * commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2': dsputil: Split audio operations off into a separate context Conflicts: configure libavcodec/takdec.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`;* This file is part of FFmpeg.`
dsputil: Split audio operations off into a separate context 11 years ago			`;*`
Merge commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2' * commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2': dsputil: Split audio operations off into a separate context Conflicts: configure libavcodec/takdec.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`;* FFmpeg is free software; you can redistribute it and/or`
dsputil: Split audio operations off into a separate context 11 years ago			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
Merge commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2' * commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2': dsputil: Split audio operations off into a separate context Conflicts: configure libavcodec/takdec.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`;* FFmpeg is distributed in the hope that it will be useful,`
dsputil: Split audio operations off into a separate context 11 years ago			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
Merge commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2' * commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2': dsputil: Split audio operations off into a separate context Conflicts: configure libavcodec/takdec.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`;* License along with FFmpeg; if not, write to the Free Software`
dsputil: Split audio operations off into a separate context 11 years ago			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`;******************************************************************************`

			`%include "libavutil/x86/x86util.asm"`

x86inc: Drop SECTION_TEXT macro The .text section is already 16-byte aligned by default on all supported platforms so `SECTION_TEXT` isn't any different from `SECTION .text`. 10 years ago			`SECTION .text`
dsputil: Split audio operations off into a separate context 11 years ago
			`; int ff_scalarproduct_int16(int16_t v1, int16_t v2, int order)`
avcodec/x86/audiodsp_init: Remove obsolete MMX(EXT) functions x64 always has MMX, MMXEXT, SSE and SSE2 and this means that some functions for MMX, MMXEXT and 3dnow are always overridden by other functions (unless one e.g. explicitly disables SSE2) for x64. So given that the only systems that benefit from these functions are truely ancient 32bit x86s they are removed. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> 2 years ago			`INIT_XMM sse2`
dsputil: Split audio operations off into a separate context 11 years ago			`cglobal scalarproduct_int16, 3,3,3, v1, v2, order`
audiodsp/x86: clear the high bits of the order parameter on 64bit Also change shl to add, since it can be faster on some CPUs. CC: libav-stable@libav.org 8 years ago			`add orderd, orderd`
dsputil: Split audio operations off into a separate context 11 years ago			`add v1q, orderq`
			`add v2q, orderq`
			`neg orderq`
			`pxor m2, m2`
			`.loop:`
			`movu m0, [v1q + orderq]`
			`movu m1, [v1q + orderq + mmsize]`
			`pmaddwd m0, [v2q + orderq]`
			`pmaddwd m1, [v2q + orderq + mmsize]`
			`paddd m2, m0`
			`paddd m2, m1`
			`add orderq, mmsize*2`
			`jl .loop`
Merge commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2' * commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2': dsputil: Split audio operations off into a separate context Conflicts: configure libavcodec/takdec.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`HADDD m2, m0`
dsputil: Split audio operations off into a separate context 11 years ago			`movd eax, m2`
			`RET`

avcodec/x86/audiodsp: add scalarproduct avx2 2 years ago			`%if HAVE_AVX2_EXTERNAL`
			`INIT_YMM avx2`
			`cglobal scalarproduct_int16, 3,3,2, v1, v2, order`
			`add orderd, orderd`
			`add v1q, orderq`
			`add v2q, orderq`
			`neg orderq`
			`pxor m1, m1`
			`.loop:`
			`movu m0, [v1q + orderq]`
			`pmaddwd m0, [v2q + orderq]`
			`paddd m1, m0`
			`add orderq, mmsize`
			`jl .loop`
			`HADDD m1, m0`
			`movd eax, xm1`
			`RET`
			`%endif`
dsputil: Split audio operations off into a separate context 11 years ago
			`;-----------------------------------------------------------------------------`
			`; void ff_vector_clip_int32(int32_t dst, const int32_t src, int32_t min,`
			`; int32_t max, unsigned int len)`
			`;-----------------------------------------------------------------------------`

			`; %1 = number of xmm registers used`
			`; %2 = number of inline load/process/store loops per asm loop`
			`; %3 = process 4mmsize (%3=0) or 8mmsize (%3=1) bytes per loop`
x86util: Port all macros to cpuflags Also do some small cosmetic changes: Drop pointless _MMX suffix from ABSD2 macro name, drop pointless check for MMX support, we always assume MMX is available in our SIMD code, fix spelling. 13 years ago			`; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)`
dsputil: Split audio operations off into a separate context 11 years ago			`; %5 = suffix`
			`%macro VECTOR_CLIP_INT32 4-5`
			`cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len`
			`%if %4`
			`cvtsi2ss m4, minm`
			`cvtsi2ss m5, maxm`
			`%else`
			`movd m4, minm`
			`movd m5, maxm`
			`%endif`
			`SPLATD m4`
			`SPLATD m5`
			`.loop:`
Merge commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2' * commit '9a9e2f1c8aa4539a261625145e5c1f46a8106ac2': dsputil: Split audio operations off into a separate context Conflicts: configure libavcodec/takdec.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c libavcodec/x86/dsputil_x86.h Merged-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`%assign %%i 0`
dsputil: Split audio operations off into a separate context 11 years ago			`%rep %2`
audiodsp/x86: fix ff_vector_clip_int32_sse2 This version, which is the only one doing two processing cycles per loop iteration, computes the load/store indices incorrectly for the second cycle. CC: libav-stable@libav.org 8 years ago			`mova m0, [srcq + mmsize * (0 + %%i)]`
			`mova m1, [srcq + mmsize * (1 + %%i)]`
			`mova m2, [srcq + mmsize * (2 + %%i)]`
			`mova m3, [srcq + mmsize * (3 + %%i)]`
dsputil: Split audio operations off into a separate context 11 years ago			`%if %3`
audiodsp/x86: fix ff_vector_clip_int32_sse2 This version, which is the only one doing two processing cycles per loop iteration, computes the load/store indices incorrectly for the second cycle. CC: libav-stable@libav.org 8 years ago			`mova m7, [srcq + mmsize * (4 + %%i)]`
			`mova m8, [srcq + mmsize * (5 + %%i)]`
			`mova m9, [srcq + mmsize * (6 + %%i)]`
			`mova m10, [srcq + mmsize * (7 + %%i)]`
dsputil: Split audio operations off into a separate context 11 years ago			`%endif`
			`CLIPD m0, m4, m5, m6`
			`CLIPD m1, m4, m5, m6`
			`CLIPD m2, m4, m5, m6`
			`CLIPD m3, m4, m5, m6`
			`%if %3`
			`CLIPD m7, m4, m5, m6`
			`CLIPD m8, m4, m5, m6`
			`CLIPD m9, m4, m5, m6`
			`CLIPD m10, m4, m5, m6`
			`%endif`
audiodsp/x86: fix ff_vector_clip_int32_sse2 This version, which is the only one doing two processing cycles per loop iteration, computes the load/store indices incorrectly for the second cycle. CC: libav-stable@libav.org 8 years ago			`mova [dstq + mmsize * (0 + %%i)], m0`
			`mova [dstq + mmsize * (1 + %%i)], m1`
			`mova [dstq + mmsize * (2 + %%i)], m2`
			`mova [dstq + mmsize * (3 + %%i)], m3`
dsputil: Split audio operations off into a separate context 11 years ago			`%if %3`
audiodsp/x86: fix ff_vector_clip_int32_sse2 This version, which is the only one doing two processing cycles per loop iteration, computes the load/store indices incorrectly for the second cycle. CC: libav-stable@libav.org 8 years ago			`mova [dstq + mmsize * (4 + %%i)], m7`
			`mova [dstq + mmsize * (5 + %%i)], m8`
			`mova [dstq + mmsize * (6 + %%i)], m9`
			`mova [dstq + mmsize * (7 + %%i)], m10`
dsputil: Split audio operations off into a separate context 11 years ago			`%endif`
audiodsp/x86: fix ff_vector_clip_int32_sse2 This version, which is the only one doing two processing cycles per loop iteration, computes the load/store indices incorrectly for the second cycle. CC: libav-stable@libav.org 8 years ago			`%assign %%i (%%i + 4 * (1 + %3))`
dsputil: Split audio operations off into a separate context 11 years ago			`%endrep`
			`add srcq, mmsize4(%2+%3)`
			`add dstq, mmsize4(%2+%3)`
			`sub lend, mmsize*(%2+%3)`
			`jg .loop`
x86: replace explicit REP_RETs with RETs From x86inc: > On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either > a branch or a branch target. So switch to a 2-byte form of ret in that case. > We can automatically detect "follows a branch", but not a branch target. > (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) x86inc can automatically determine whether to use REP_RET rather than REP in most of these cases, so impact is minimal. Additionally, a few REP_RETs were used unnecessary, despite the return being nowhere near a branch. The only CPUs affected were AMD K10s, made between 2007 and 2011, 16 years ago and 12 years ago, respectively. In the future, everyone involved with x86inc should consider dropping REP_RETs altogether. 2 years ago			`RET`
dsputil: Split audio operations off into a separate context 11 years ago			`%endmacro`

			`INIT_XMM sse2`
			`VECTOR_CLIP_INT32 6, 1, 0, 0, _int`
			`VECTOR_CLIP_INT32 6, 2, 0, 1`
			`INIT_XMM sse4`
			`%ifdef m8`
			`VECTOR_CLIP_INT32 11, 1, 1, 0`
			`%else`
			`VECTOR_CLIP_INT32 6, 1, 0, 0`
			`%endif`
x86/audiodsp: move asm code out of dsputil Signed-off-by: James Almer <jamrial@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago
audiodsp/x86: yasmify vector_clipf_sse 8 years ago			`; void ff_vector_clipf_sse(float dst, const float src,`
			`; int len, float min, float max)`
x86/audiodsp: move asm code out of dsputil Signed-off-by: James Almer <jamrial@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`INIT_XMM sse`
audiodsp/x86: yasmify vector_clipf_sse 8 years ago			`cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max`
			`%if ARCH_X86_32`
			`VBROADCASTSS m0, minm`
			`VBROADCASTSS m1, maxm`
			`%elif WIN64`
x86/audiodsp: remove an unnecessary movss 8 years ago			`SWAP 0, 3`
			`VBROADCASTSS m0, m0`
audiodsp/x86: yasmify vector_clipf_sse 8 years ago			`VBROADCASTSS m1, maxm`
			`%else ; 64bit sysv`
			`VBROADCASTSS m0, m0`
			`VBROADCASTSS m1, m1`
x86/audiodsp: move asm code out of dsputil Signed-off-by: James Almer <jamrial@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`%endif`
audiodsp/x86: yasmify vector_clipf_sse 8 years ago
			`movsxdifnidn lenq, lend`

x86: Add missing colons after assembly labels This fixes many warnings of the sort warning: label alone on a line without a colon might be in error 9 years ago			`.loop:`
audiodsp/x86: yasmify vector_clipf_sse 8 years ago			`mova m2, [srcq + 4 * lenq - 4 * mmsize]`
			`mova m3, [srcq + 4 * lenq - 3 * mmsize]`
			`mova m4, [srcq + 4 * lenq - 2 * mmsize]`
			`mova m5, [srcq + 4 * lenq - 1 * mmsize]`

			`maxps m2, m0`
			`maxps m3, m0`
			`maxps m4, m0`
			`maxps m5, m0`

			`minps m2, m1`
			`minps m3, m1`
			`minps m4, m1`
			`minps m5, m1`

			`mova [dstq + 4 * lenq - 4 * mmsize], m2`
			`mova [dstq + 4 * lenq - 3 * mmsize], m3`
			`mova [dstq + 4 * lenq - 2 * mmsize], m4`
			`mova [dstq + 4 * lenq - 1 * mmsize], m5`

			`sub lenq, mmsize`
			`jg .loop`

			`RET`