FFmpeg/libswscale/x86/scale_avx2.asm

;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright 2020 Google LLC
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; Read-only constants. Both are loaded as full ymm vectors with mova in
; SCALE_FUNC, hence the 32-byte section alignment requested here.
SECTION_RODATA 32

; Dword permutation indices for vpermd (kept in m15 by SCALE_FUNC). Applied to
; the result of the per-lane vpackssdw in the 16-pixel loop to interleave the
; dwords of the low and high 128-bit lanes before the store.
swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7
; Eight dwords equal to 4 (kept in m14 by the X4 variant). Added to the gather
; offsets after every inner-loop pass, i.e. each pass moves on by 4 source bytes.
four: times 8 dd 4

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale8to15_<filterSize>_<opt>
;                   (SwsInternal *c, int16_t *dst,
;                    int dstW, const uint8_t *src,
;                    const int16_t *filter,
;                    const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. The input is 8 bits wide, the filter is 14 bits,
; and the output is 15 bits (stored in int16_t). Each output pixel is generated
; from $filterSize input pixels; the position of the first pixel is given in
; filterPos[nOutputPixel].
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; SCALE_FUNC <variant>
;
; Emits hscale8to15_<variant>. The single macro argument selects the variant:
;   4  - one pass per output pixel: 4 source bytes times 4 filter words.
;        fltsize is never read in this variant.
;   X4 - same pass wrapped in an inner loop that runs (fltsize >> 2) times,
;        accumulating 4 taps per pass. The inner loop is bottom-tested, so it
;        always runs at least once, and any fltsize remainder modulo 4 is
;        ignored - presumably fltsize is a multiple of 4; confirm with caller.
;
; Named arguments / GPRs (x86inc cglobal: 7 args, 9 GPRs, 16 vector regs):
;   pos0    first argument (context pointer); not referenced in the body
;   dst     int16_t output line
;   w       number of output pixels (sign-extended from 32 bits on entry)
;   srcmem  8-bit source line, base address of the gathers
;   filter  int16_t coefficients, consumed strictly sequentially
;   fltpos  int32_t byte offsets into srcmem, one per gather slot
;   fltsize taps per pixel (X4 only; converted to a count of 4-tap groups)
;   count   index of the next output pixel to store
;   inner   X4 inner-loop counter
;
; Vector registers:
;   m0        zero, used to widen bytes to words
;   m1, m2    gather offsets (advanced by 4 per pass in X4)
;   m3, m4    gathered source dwords (4 consecutive bytes per slot)
;   m5..m8    products / per-pixel sums
;   m9..m12   X4 accumulators
;   m13       gather mask
;   m14       X4 only: vector of dword 4
;   m15       swizzle permutation for vpermd
;
; The main loop handles 16 output pixels per iteration, the tail loop 4.
;
; NOTE(review): filter is read linearly (0x80 bytes per 16-pixel pass, 0x20
; bytes per 4-pixel tail pass), so the coefficient layout in memory has to
; match that consumption order - presumably arranged by the C setup code;
; confirm against the caller.
;-----------------------------------------------------------------------------
%macro SCALE_FUNC 1
cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner
    ; m0 = 0 (zero source for the byte->word unpacks), m15 = output permutation
    pxor m0, m0
    mova m15, [swizzle]
    ; count = 0; widen the 32-bit dstW so it can be compared with 64-bit count
    xor countq, countq
    movsxd wq, wd
%ifidn %1, X4
    ; m14 = per-pass offset increment; fltsize = number of 4-tap groups
    mova m14, [four]
    shr fltsized, 2
%endif
    ; Fewer than 16 pixels in total: go straight to the 4-pixel tail.
    ; NOTE(review): there is no w <= 0 check, and the tail body is bottom-tested,
    ; so it runs once even for w <= 0 - callers presumably guarantee w > 0.
    cmp wq, 0x10
    jl .tail_loop
    ; Bias w by -16 so that "count <= w" below means "16 more pixels remain"
    sub wq, 0x10
.loop:
    ; Load 16 gather offsets (slots 0-7 in m1, slots 8-15 in m2)
    movu m1, [fltposq]
    movu m2, [fltposq+32]
%ifidn %1, X4
    ; Clear the four accumulators and the inner-loop counter
    pxor m9, m9
    pxor m10, m10
    pxor m11, m11
    pxor m12, m12
    xor innerq, innerq
.innerloop:
%endif
    ; The gather consumes (clears) its mask register, so m13 is reset to
    ; all-ones before each gather. Each slot fetches the dword at
    ; srcmem + offset, i.e. 4 consecutive source bytes.
    vpcmpeqd  m13, m13
    vpgatherdd m3,[srcmemq + m1], m13
    vpcmpeqd  m13, m13
    vpgatherdd m4,[srcmemq + m2], m13
    ; Zero-extend bytes to words. The unpacks work within each 128-bit lane:
    ;   m5 = slots 0,1 | 4,5     m6 = slots 2,3 | 6,7
    ;   m7 = slots 8,9 | 12,13   m8 = slots 10,11 | 14,15
    vpunpcklbw m5, m3, m0
    vpunpckhbw m6, m3, m0
    vpunpcklbw m7, m4, m0
    vpunpckhbw m8, m4, m0
    ; Multiply by the coefficients and add adjacent pairs: two dword partial
    ; sums per slot
    vpmaddwd m5, m5, [filterq]
    vpmaddwd m6, m6, [filterq + 32]
    vpmaddwd m7, m7, [filterq + 64]
    vpmaddwd m8, m8, [filterq + 96]
    ; 16 slots * 4 taps * 2 bytes of coefficients consumed
    add filterq, 0x80
%ifidn %1, X4
    ; Accumulate this 4-tap group, then move every gather offset on by 4 bytes
    paddd m9, m5
    paddd m10, m6
    paddd m11, m7
    paddd m12, m8
    paddd m1, m14
    paddd m2, m14
    add innerq, 1
    cmp innerq, fltsizeq
    jl .innerloop
    ; Fold the two partial sums of each slot: m5 = slots 0-7, m6 = slots 8-15
    vphaddd m5, m9, m10
    vphaddd m6, m11, m12
%else
    ; Fold the two partial sums of each slot: m5 = slots 0-7, m6 = slots 8-15
    vphaddd m5, m5, m6
    vphaddd m6, m7, m8
%endif
    ; Arithmetic shift right by 7, then pack to int16 with signed saturation.
    ; The pack is per lane, so vpermd with the swizzle indices is needed to
    ; regroup the dwords (each holding two pixels) before storing.
    ; NOTE(review): tracing the lanes, the 16 stored words come from gather
    ; slots 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15 in that order, so fltpos
    ; entries 2,3<->4,5 and 10,11<->12,13 of each 16-entry group presumably
    ; arrive pre-swapped from the caller - confirm against the C setup code.
    vpsrad  m5, 7
    vpsrad  m6, 7
    vpackssdw m5, m5, m6
    vpermd m5, m15, m5
    vmovdqu [dstq + countq * 2], m5
    ; Advance past 16 offsets (64 bytes) and 16 output pixels
    add fltposq, 0x40
    add countq, 0x10
    ; w is still biased by -16 here: loop while a full group of 16 remains
    cmp countq, wq
    jle .loop

    ; Undo the bias; finish if every pixel has been produced
    add wq, 0x10
    cmp countq, wq
    jge .end

    ; Tail: 4 pixels per iteration using 128-bit registers. Everything stays in
    ; one lane, so results are stored in gather-slot order without a permute.
    ; NOTE(review): each pass reads 4 offsets and stores 4 pixels even when
    ; fewer remain, so a w that is not a multiple of 4 reads and writes past
    ; w - presumably the buffers are padded or w is a multiple of 4; confirm.
.tail_loop:
    movu xm1, [fltposq]
%ifidn %1, X4
    pxor xm9, xm9
    pxor xm10, xm10
    xor innerq, innerq
.tail_innerloop:
%endif
    ; Same sequence as the main loop for 4 slots: reset mask, gather, widen
    vpcmpeqd  xm13, xm13
    vpgatherdd xm3,[srcmemq + xm1], xm13
    vpunpcklbw xm5, xm3, xm0
    vpunpckhbw xm6, xm3, xm0
    vpmaddwd xm5, xm5, [filterq]
    vpmaddwd xm6, xm6, [filterq + 0x10]
    ; 4 slots * 4 taps * 2 bytes of coefficients consumed
    add filterq, 0x20
%ifidn %1, X4
    paddd xm9, xm5
    paddd xm10, xm6
    paddd xm1, xm14
    add innerq, 1
    cmp innerq, fltsizeq
    jl .tail_innerloop
    vphaddd xm5, xm9, xm10
%else
    vphaddd xm5, xm5, xm6
%endif
    ; Shift, saturate to int16 and store the low 8 bytes (4 pixels)
    vpsrad  xm5, 7
    vpackssdw xm5, xm5, xm5
    vmovq [dstq + countq * 2], xm5
    ; Advance past 4 offsets (16 bytes) and 4 output pixels
    add fltposq, 0x10
    add countq, 0x4
    cmp countq, wq
    jl .tail_loop
.end:
RET
%endmacro

; Instantiate the scalers. SCALE_FUNC uses vector registers up to m15 and nine
; GPRs, so it is only assembled for x86-64. HAVE_AVX2_EXTERNAL presumably
; reflects whether the external assembler supports AVX2 - confirm in configure.
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
; 256-bit (ymm) register mapping, function names suffixed with _avx2
INIT_YMM avx2
; hscale8to15_4: fixed 4-tap variant (fltsize is not read)
SCALE_FUNC 4
; hscale8to15_X4: variant that loops over fltsize in groups of 4 taps
SCALE_FUNC X4
%endif
%endif
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`;******************************************************************************`
			`;* x86-optimized horizontal line scaling functions`
			`;* Copyright 2020 Google LLC`
			`;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>`
			`;*`
			`;* This file is part of FFmpeg.`
			`;*`
			`;* FFmpeg is free software; you can redistribute it and/or`
			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
			`;* FFmpeg is distributed in the hope that it will be useful,`
			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
			`;* License along with FFmpeg; if not, write to the Free Software`
			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`;******************************************************************************`

			`%include "libavutil/x86/x86util.asm"`

			`SECTION_RODATA 32`

			`swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7`
			`four: times 8 dd 4`

			`SECTION .text`

			`;-----------------------------------------------------------------------------`
			`; horizontal line scaling`
			`;`
			`; void hscale8to15_<filterSize>_<opt>`
swscale: rename SwsContext to SwsInternal And preserve the public SwsContext as separate name. The motivation here is that I want to turn SwsContext into a public struct, while keeping the internal implementation hidden. Additionally, I also want to be able to use multiple internal implementations, e.g. for GPU devices. This commit does not include any functional changes. For the most part, it is a simple rename. The only complications arise from the public facing API functions, which preserve their current type (and hence require an additional unwrapping step internally), and the checkasm test framework, which directly accesses SwsInternal. For consistency, the affected functions that need to maintain a distionction have generally been changed to refer to the SwsContext as sws, and the SwsInternal as c. In an upcoming commit, I will provide a backing definition for the public SwsContext, and update `sws_internal()` to dereference the internal struct instead of merely casting it. Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <git@haasn.dev> 3 months ago			`; (SwsInternal c, int16_t dst,`
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`; int dstW, const uint8_t *src,`
			`; const int16_t *filter,`
			`; const int32_t *filterPos, int filterSize);`
			`;`
			`; Scale one horizontal line. Input is 8-bit width Filter is 14 bits. Output is`
			`; 15 bits (in int16_t). Each output pixel is generated from $filterSize input`
			`; pixels, the position of the first pixel is given in filterPos[nOutputPixel].`
			`;-----------------------------------------------------------------------------`

			`%macro SCALE_FUNC 1`
			`cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner`
x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`pxor m0, m0`
			`mova m15, [swizzle]`
x86/scale_avx2: don't use $ for hex literals Fixes compilation with AVX2 enabled yasm. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`xor countq, countq`
x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`movsxd wq, wd`
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`%ifidn %1, X4`
x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`mova m14, [four]`
			`shr fltsized, 2`
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`%endif`
sws: allow avx2 hscale to process inputs of any size. The main loop processes blocks of 16 pixels. The tail processes blocks of size 4. Signed-off-by: Anton Khirnov <anton@khirnov.net> 3 years ago			`cmp wq, 0x10`
			`jl .tail_loop`
			`sub wq, 0x10`
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`.loop:`
x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`movu m1, [fltposq]`
			`movu m2, [fltposq+32]`
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`%ifidn %1, X4`
x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`pxor m9, m9`
			`pxor m10, m10`
			`pxor m11, m11`
			`pxor m12, m12`
x86/scale_avx2: don't use $ for hex literals Fixes compilation with AVX2 enabled yasm. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`xor innerq, innerq`
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`.innerloop:`
			`%endif`
x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`vpcmpeqd m13, m13`
			`vpgatherdd m3,[srcmemq + m1], m13`
			`vpcmpeqd m13, m13`
			`vpgatherdd m4,[srcmemq + m2], m13`
			`vpunpcklbw m5, m3, m0`
			`vpunpckhbw m6, m3, m0`
			`vpunpcklbw m7, m4, m0`
			`vpunpckhbw m8, m4, m0`
			`vpmaddwd m5, m5, [filterq]`
			`vpmaddwd m6, m6, [filterq + 32]`
			`vpmaddwd m7, m7, [filterq + 64]`
			`vpmaddwd m8, m8, [filterq + 96]`
x86/scale_avx2: don't use $ for hex literals Fixes compilation with AVX2 enabled yasm. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`add filterq, 0x80`
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`%ifidn %1, X4`
x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`paddd m9, m5`
			`paddd m10, m6`
			`paddd m11, m7`
			`paddd m12, m8`
			`paddd m1, m14`
			`paddd m2, m14`
x86/scale_avx2: don't use $ for hex literals Fixes compilation with AVX2 enabled yasm. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`add innerq, 1`
x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`cmp innerq, fltsizeq`
			`jl .innerloop`
			`vphaddd m5, m9, m10`
			`vphaddd m6, m11, m12`
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`%else`
x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`vphaddd m5, m5, m6`
			`vphaddd m6, m7, m8`
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`%endif`
x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`vpsrad m5, 7`
			`vpsrad m6, 7`
			`vpackssdw m5, m5, m6`
			`vpermd m5, m15, m5`
			`vmovdqu [dstq + countq * 2], m5`
x86/scale_avx2: don't use $ for hex literals Fixes compilation with AVX2 enabled yasm. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`add fltposq, 0x40`
			`add countq, 0x10`
x86/scale_avx2: Change asm indent from 2 to 4 spaces. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`cmp countq, wq`
sws: allow avx2 hscale to process inputs of any size. The main loop processes blocks of 16 pixels. The tail processes blocks of size 4. Signed-off-by: Anton Khirnov <anton@khirnov.net> 3 years ago			`jle .loop`

			`add wq, 0x10`
			`cmp countq, wq`
			`jge .end`

			`.tail_loop:`
			`movu xm1, [fltposq]`
			`%ifidn %1, X4`
			`pxor xm9, xm9`
			`pxor xm10, xm10`
			`xor innerq, innerq`
			`.tail_innerloop:`
			`%endif`
			`vpcmpeqd xm13, xm13`
			`vpgatherdd xm3,[srcmemq + xm1], xm13`
			`vpunpcklbw xm5, xm3, xm0`
			`vpunpckhbw xm6, xm3, xm0`
			`vpmaddwd xm5, xm5, [filterq]`
			`vpmaddwd xm6, xm6, [filterq + 0x10]`
			`add filterq, 0x20`
			`%ifidn %1, X4`
			`paddd xm9, xm5`
			`paddd xm10, xm6`
			`paddd xm1, xm14`
			`add innerq, 1`
			`cmp innerq, fltsizeq`
			`jl .tail_innerloop`
			`vphaddd xm5, xm9, xm10`
			`%else`
			`vphaddd xm5, xm5, xm6`
			`%endif`
			`vpsrad xm5, 7`
			`vpackssdw xm5, xm5, xm5`
			`vmovq [dstq + countq * 2], xm5`
			`add fltposq, 0x10`
			`add countq, 0x4`
			`cmp countq, wq`
			`jl .tail_loop`
			`.end:`
x86: replace explicit REP_RETs with RETs From x86inc: > On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either > a branch or a branch target. So switch to a 2-byte form of ret in that case. > We can automatically detect "follows a branch", but not a branch target. > (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) x86inc can automatically determine whether to use REP_RET rather than REP in most of these cases, so impact is minimal. Additionally, a few REP_RETs were used unnecessary, despite the return being nowhere near a branch. The only CPUs affected were AMD K10s, made between 2007 and 2011, 16 years ago and 12 years ago, respectively. In the future, everyone involved with x86inc should consider dropping REP_RETs altogether. 2 years ago			`RET`
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`%endmacro`

			`%if ARCH_X86_64`
x86/scale_avx2: add missing check for AVX2 assembler support Should fix compilation with old yasm. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`%if HAVE_AVX2_EXTERNAL`
libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes. Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`INIT_YMM avx2`
			`SCALE_FUNC 4`
			`SCALE_FUNC X4`
			`%endif`
x86/scale_avx2: add missing check for AVX2 assembler support Should fix compilation with old yasm. Signed-off-by: James Almer <jamrial@gmail.com> 3 years ago			`%endif`