|
|
|
;******************************************************************************
|
|
|
|
;* x86-optimized horizontal line scaling functions
|
|
|
|
;* Copyright 2020 Google LLC
|
|
|
|
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
|
|
|
|
;*
|
|
|
|
;* This file is part of FFmpeg.
|
|
|
|
;*
|
|
|
|
;* FFmpeg is free software; you can redistribute it and/or
|
|
|
|
;* modify it under the terms of the GNU Lesser General Public
|
|
|
|
;* License as published by the Free Software Foundation; either
|
|
|
|
;* version 2.1 of the License, or (at your option) any later version.
|
|
|
|
;*
|
|
|
|
;* FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
;* Lesser General Public License for more details.
|
|
|
|
;*
|
|
|
|
;* You should have received a copy of the GNU Lesser General Public
|
|
|
|
;* License along with FFmpeg; if not, write to the Free Software
|
|
|
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
;******************************************************************************
|
|
|
|
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
|
|
|
|
SECTION_RODATA 32
|
|
|
|
|
|
|
|
; Dword permutation. SCALE_FUNC loads it into m15 and uses it as the index
; vector of vpermd on every packed group of 16 results before storing them.
swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7

; Eight dwords holding 4. The X4 variant of SCALE_FUNC loads this into m14 and
; adds it to the gather offsets after each group of four taps.
four:    dd 4, 4, 4, 4, 4, 4, 4, 4
|
|
|
|
|
|
|
|
SECTION .text
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------------
|
|
|
|
; horizontal line scaling
|
|
|
|
;
|
|
|
|
; void hscale8to15_<filterSize>_<opt>
|
swscale: rename SwsContext to SwsInternal
And preserve the public SwsContext as separate name. The motivation here
is that I want to turn SwsContext into a public struct, while keeping the
internal implementation hidden. Additionally, I also want to be able to
use multiple internal implementations, e.g. for GPU devices.
This commit does not include any functional changes. For the most part, it is
a simple rename. The only complications arise from the public facing API
functions, which preserve their current type (and hence require an additional
unwrapping step internally), and the checkasm test framework, which directly
accesses SwsInternal.
For consistency, the affected functions that need to maintain a distinction
have generally been changed to refer to the SwsContext as *sws, and the
SwsInternal as *c.
In an upcoming commit, I will provide a backing definition for the public
SwsContext, and update `sws_internal()` to dereference the internal struct
instead of merely casting it.
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
3 months ago
|
|
|
; (SwsInternal *c, int16_t *dst,
|
|
|
|
; int dstW, const uint8_t *src,
|
|
|
|
; const int16_t *filter,
|
|
|
|
; const int32_t *filterPos, int filterSize);
|
|
|
|
;
|
|
|
|
; Scale one horizontal line. Input is 8-bit width. Filter is 14 bits. Output is
|
|
|
|
; 15 bits (in int16_t). Each output pixel is generated from $filterSize input
|
|
|
|
; pixels, the position of the first pixel is given in filterPos[nOutputPixel].
|
|
|
|
;-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------------
; SCALE_FUNC <taps>
;
; Emits hscale8to15_<taps>. <taps> is either 4 (one group of four filter taps
; per output pixel) or X4 (fltsize / 4 groups of four taps, summed in an inner
; loop).
;
; cglobal declares 7 arguments, 9 general-purpose and 16 vector registers:
;   pos0     first argument, not referenced by the body
;   dst      output, 16 bits per pixel
;   w        number of output pixels (sign-extended from 32 bits on entry)
;   srcmem   base address of the byte gathers
;   filter   16-bit coefficients, consumed sequentially
;   fltpos   32-bit offsets into srcmem, one per output pixel
;   fltsize  tap count (only read by the X4 variant)
;   count    index of the next output pixel
;   inner    tap-group counter (X4 variant only)
;
; Vector register roles:
;   m0       zero, interleaved with source bytes to widen them to words
;   m1, m2   gather offsets for pixels 0-7 / 8-15 of the current group
;   m3, m4   gathered source bytes (one dword = 4 bytes per pixel)
;   m5-m8    products and partial sums
;   m9-m12   accumulators (X4 only)
;   m13      gather mask; the gather clears it, so it is set again each time
;   m14      constant 4 in every dword (X4 only)
;   m15      output permutation (swizzle)
;
; Pixels are produced 16 at a time while at least 16 remain, then 4 at a time.
; The 4-wide loop always stores a full group of four results.
;-----------------------------------------------------------------------------
%macro SCALE_FUNC 1
cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner
    movsxd          wq, wd
    xor             countd, countd              ; count = 0 (32-bit write clears the full register)
    pxor            m0, m0
    mova            m15, [swizzle]
%ifidn %1, X4
    mova            m14, [four]
    shr             fltsized, 2                 ; tap count -> number of four-tap groups
%endif
    cmp             wq, 16
    jl              .loop4                      ; fewer than 16 pixels in total
    sub             wq, 16                      ; w = last start index of a full 16-pixel group

    ; ---- 16 output pixels per iteration ----
.loop16:
    movu            m1, [fltposq]
    movu            m2, [fltposq + 32]
%ifidn %1, X4
    pxor            m9, m9
    pxor            m10, m10
    pxor            m11, m11
    pxor            m12, m12
    xor             innerd, innerd
.taps16:
%endif
    vpcmpeqd        m13, m13                    ; all-ones mask: gather every element
    vpgatherdd      m3, [srcmemq + m1], m13
    vpcmpeqd        m13, m13
    vpgatherdd      m4, [srcmemq + m2], m13
    vpunpcklbw      m5, m3, m0                  ; bytes -> words
    vpunpckhbw      m6, m3, m0
    vpunpcklbw      m7, m4, m0
    vpunpckhbw      m8, m4, m0
    vpmaddwd        m5, m5, [filterq]           ; multiply by coefficients, add adjacent pairs
    vpmaddwd        m6, m6, [filterq + 32]
    vpmaddwd        m7, m7, [filterq + 64]
    vpmaddwd        m8, m8, [filterq + 96]
    add             filterq, 128                ; 16 pixels * 4 taps * 2 bytes
%ifidn %1, X4
    paddd           m9, m5
    paddd           m10, m6
    paddd           m11, m7
    paddd           m12, m8
    paddd           m1, m14                     ; next four source bytes of each pixel
    paddd           m2, m14
    add             innerq, 1
    cmp             innerq, fltsizeq
    jl              .taps16
    vphaddd         m5, m9, m10
    vphaddd         m6, m11, m12
%else
    vphaddd         m5, m5, m6
    vphaddd         m6, m7, m8
%endif
    vpsrad          m5, 7
    vpsrad          m6, 7
    vpackssdw       m5, m5, m6                  ; dwords -> signed-saturated words
    vpermd          m5, m15, m5
    vmovdqu         [dstq + countq * 2], m5
    add             fltposq, 64                 ; 16 offsets * 4 bytes
    add             countq, 16
    cmp             countq, wq
    jle             .loop16                     ; another full group of 16 still fits

    add             wq, 16                      ; restore the real width
    cmp             countq, wq
    jge             .done

    ; ---- 4 output pixels per iteration ----
    ; NOTE(review): this loop tests its condition at the bottom and is entered
    ; directly when w < 16, so it runs at least once; callers presumably never
    ; pass w <= 0 -- confirm.
.loop4:
    movu            xm1, [fltposq]
%ifidn %1, X4
    pxor            xm9, xm9
    pxor            xm10, xm10
    xor             innerd, innerd
.taps4:
%endif
    vpcmpeqd        xm13, xm13
    vpgatherdd      xm3, [srcmemq + xm1], xm13
    vpunpcklbw      xm5, xm3, xm0
    vpunpckhbw      xm6, xm3, xm0
    vpmaddwd        xm5, xm5, [filterq]
    vpmaddwd        xm6, xm6, [filterq + 16]
    add             filterq, 32                 ; 4 pixels * 4 taps * 2 bytes
%ifidn %1, X4
    paddd           xm9, xm5
    paddd           xm10, xm6
    paddd           xm1, xm14
    add             innerq, 1
    cmp             innerq, fltsizeq
    jl              .taps4
    vphaddd         xm5, xm9, xm10
%else
    vphaddd         xm5, xm5, xm6
%endif
    vpsrad          xm5, 7
    vpackssdw       xm5, xm5, xm5
    vmovq           [dstq + countq * 2], xm5    ; store 4 results (8 bytes)
    add             fltposq, 16                 ; 4 offsets * 4 bytes
    add             countq, 4
    cmp             countq, wq
    jl              .loop4
.done:
    RET
%endmacro
|
|
|
|
|
|
|
|
; Instantiate the kernels. Both are guarded by the build-time flags
; ARCH_X86_64 and HAVE_AVX2_EXTERNAL; SCALE_FUNC declares 9 general-purpose
; and 16 vector registers in its cglobal line.
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; hscale8to15_4: one group of four taps per output pixel.
SCALE_FUNC 4
; hscale8to15_X4: fltsize / 4 groups of four taps per output pixel.
SCALE_FUNC X4
%endif ; HAVE_AVX2_EXTERNAL
%endif ; ARCH_X86_64
|