You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

135 lines
3.8 KiB

; /*
; * Provide SIMD DMVR SAD functions for VVC decoding
; *
; * Copyright (c) 2024 Stone Chen
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"
; Row pitch of the sample buffers, in 16-bit samples. The code below steps
; between rows in multiples of MAX_PB_SIZE * 2 bytes.
%define MAX_PB_SIZE 128
; Distance, in rows, between two rows that contribute to the SAD.
%define ROWS 2

SECTION_RODATA

; Two words of 1 (one dword). Broadcast and used as the pmaddwd multiplier
; so that adjacent 16-bit differences are added into 32-bit sums.
pw_1: dw 1, 1
; DMVR SAD is only calculated on even rows to reduce complexity
; Additionally the only valid sizes are 8x16, 16x8, and 16x16
SECTION .text
; MIN_MAX_SAD a, b, tmp
; Per-word absolute difference of unsigned 16-bit values: a = |a - b|.
;   a   (%1)  first input, overwritten with the result
;   b   (%2)  second input, left unmodified
;   tmp (%3)  scratch register, must differ from a and b; holds min(a, b)
;             when the macro finishes
%macro MIN_MAX_SAD 3
    pminuw          %3, %1, %2      ; tmp = min(a, b)
    pmaxuw          %1, %1, %2      ; a   = max(a, b)
    psubusw         %1, %1, %3      ; a   = max - min (never underflows)
%endmacro
; HORIZ_ADD dst, lo, acc
; Horizontal sum of the eight dwords held in a ymm accumulator.
;   dst (%1)  xmm register; dword 0 receives the total, the other dwords are
;             left holding partial sums
;   lo  (%2)  xmm register that is added to the upper lane of acc; the call
;             sites in this file pass the low half of acc. Clobbered.
;   acc (%3)  ymm register with dwords 7..0
%macro HORIZ_ADD 3
    vextracti128    %1, %3, q0001   ; dst = upper lane of acc:  7     6     5     4
    paddd           %1, %1, %2      ; dst = upper + lo:        7+3   6+2   5+1   4+0
    pshufd          %2, %1, q0032   ; lo  = dst dwords 3,2 moved down to 1,0
    paddd           %1, %1, %2      ; dst dword 1 = 5+1+7+3, dword 0 = 4+0+6+2
    pshufd          %2, %1, q0001   ; lo dword 0 = dst dword 1
    paddd           %1, %1, %2      ; dst dword 0 = sum of all eight dwords
%endmacro
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL

INIT_YMM avx2

;------------------------------------------------------------------------------
; vvc_sad(src1, src2, dx, dy, block_w, block_h)
;
; Sum of absolute differences between two blocks of 16-bit samples, sampling
; every ROWS-th row only.
;
; Arguments (as named in the cglobal line):
;   src1, src2   base pointers of the two sample buffers; consecutive rows are
;                MAX_PB_SIZE samples (MAX_PB_SIZE * 2 bytes) apart
;   dx, dy       32-bit signed displacement. After both are reduced by 2, the
;                block in src1 starts at sample offset
;                    (2 + dy) * MAX_PB_SIZE + dx + 2
;                and the block in src2 at
;                    (2 - dy) * MAX_PB_SIZE - dx + 2
;   block_w      block width in samples: 8, or a multiple of 16
;   block_h      block height in rows: a multiple of 4 on the 8-wide path,
;                a multiple of 2 on the 16-wide path
; Returns:
;   eax          the accumulated SAD
;
; Register use:
;   m0, m1, m2   per-iteration scratch
;   m3           eight dword partial sums
;   m4           words of 1 (pmaddwd multiplier)
;   off1, off2   offset temporaries during setup, then saved row pointers on
;                the 16-wide path
;   row_idx      counter of 16-sample chunks left in the current row
;
; NOTE(review): the C prototype is presumably
;   int (const int16_t *src1, const int16_t *src2, int dx, int dy,
;        int block_w, int block_h) -- confirm against the C declaration.
;------------------------------------------------------------------------------
cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
    movsxdifnidn    dxq, dxd
    movsxdifnidn    dyq, dyd

    ; Re-centre the displacement.
    sub             dxq, 2
    sub             dyq, 2

    ; off1 = (2 + dy) * MAX_PB_SIZE + dx
    ; off2 = (2 - dy) * MAX_PB_SIZE - dx      (shift by 7 == * MAX_PB_SIZE)
    mov             off1q, 2
    mov             off2q, 2
    add             off1q, dyq
    sub             off2q, dyq
    shl             off1q, 7
    shl             off2q, 7
    add             off1q, dxq
    sub             off2q, dxq

    ; Offsets are in samples: scale by 2 for bytes and skip 2 more samples.
    lea             src1q, [src1q + off1q * 2 + 2 * 2]
    lea             src2q, [src2q + off2q * 2 + 2 * 2]

    pxor            m3, m3                  ; clear the accumulator
    vpbroadcastd    m4, [pw_1]

    ; Widths of 16 and above all use the chunked path, which walks each row
    ; in 16-sample steps. Branching only on equality would send wider blocks
    ; into the 8-wide path and silently sum just their first 8 columns.
    cmp             block_wd, 16
    jge             vvc_sad_16

; ---- 8-sample-wide blocks ---------------------------------------------------
; Each iteration packs two sampled rows (row r in the low lane, row r + ROWS in
; the high lane) into one ymm register, then moves down 2 * ROWS rows.
vvc_sad_8:
.loop_height:
    movu            xm0, [src1q]
    vinserti128     m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
    movu            xm1, [src2q]
    vinserti128     m1, m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1

    MIN_MAX_SAD     m1, m0, m2              ; m1 = |m1 - m0| per word
    ; pmaddwd treats the words as signed, so each difference is expected to
    ; stay below 0x8000.
    pmaddwd         m1, m4                  ; add word pairs into dwords
    paddd           m3, m1

    add             src1q, 2 * MAX_PB_SIZE * ROWS * 2
    add             src2q, 2 * MAX_PB_SIZE * ROWS * 2

    sub             block_hd, 4             ; 4 rows consumed (2 of them sampled)
    jg              .loop_height

    HORIZ_ADD       xm0, xm3, m3
    movd            eax, xm0
    RET

; ---- blocks whose width is a multiple of 16 --------------------------------
; One ymm load covers 16 samples; every sampled row is walked in block_w / 16
; chunks before moving down ROWS rows.
vvc_sad_16:
    sar             block_wd, 4             ; block_w = number of chunks per row
.loop_height:
    mov             off1q, src1q            ; remember the start of this row
    mov             off2q, src2q
    mov             row_idxd, block_wd

.loop_width:
    movu            m0, [src1q]
    movu            m1, [src2q]
    MIN_MAX_SAD     m1, m0, m2              ; m1 = |m1 - m0| per word
    pmaddwd         m1, m4                  ; add word pairs into dwords
    paddd           m3, m1

    add             src1q, 32               ; next 16 samples
    add             src2q, 32
    dec             row_idxd
    jg              .loop_width

    lea             src1q, [off1q + ROWS * MAX_PB_SIZE * 2]
    lea             src2q, [off2q + ROWS * MAX_PB_SIZE * 2]

    sub             block_hd, 2             ; ROWS rows consumed (1 sampled)
    jg              .loop_height

    HORIZ_ADD       xm0, xm3, m3
    movd            eax, xm0
    RET

%endif
%endif