; /*
; * Provide SIMD DMVR SAD functions for VVC decoding
; *
; * Copyright (c) 2024 Stone Chen
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */

%include "libavutil/x86/x86util.asm"

; Row pitch of both source buffers, counted in 16-bit elements.
%define MAX_PB_SIZE      128
; log2(MAX_PB_SIZE); used to turn a row count into an element offset by shifting.
%define MAX_PB_SIZE_LOG2 7
; DMVR SAD is only calculated on even rows to reduce complexity
%define ROWS             2

%if (1 << MAX_PB_SIZE_LOG2) != MAX_PB_SIZE
    %error "MAX_PB_SIZE_LOG2 is out of sync with MAX_PB_SIZE"
%endif

SECTION_RODATA

; A pair of word 1s. Broadcast to a full register and used as the pmaddwd
; multiplier so that adjacent word differences are summed into dwords.
pw_1: times 2 dw 1

SECTION .text

;-----------------------------------------------------------------------
; MIN_MAX_SAD a, b, tmp
; Per-word absolute difference of a and b, treating words as unsigned.
;   %1 (in/out): first operand, overwritten with |a - b|
;   %2 (in)    : second operand, preserved
;   %3 (out)   : scratch, left holding min(a, b)
;-----------------------------------------------------------------------
%macro MIN_MAX_SAD 3
    pminuw           %3, %2, %1     ; tmp = min(a, b)
    pmaxuw           %1, %2, %1     ; a   = max(a, b)
    psubusw          %1, %1, %3     ; a   = max - min
%endmacro

;-----------------------------------------------------------------------
; HORIZ_ADD dst_xmm, acc_xmm, acc_ymm
; Sum the eight dwords of a ymm accumulator into dword 0 of dst_xmm.
;   %1 (out)   : xmm register receiving the total in its lowest dword
;   %2 (in)    : xmm view (low half) of the accumulator, clobbered
;   %3 (in)    : ymm accumulator
; Lane diagrams below number the accumulator dwords 0..7 (0 = lowest).
;-----------------------------------------------------------------------
%macro HORIZ_ADD 3
    vextracti128     %1, %3, q0001  ; %1     7        6        5         4
    paddd            %1, %2         ; %1  (7 + 3)  (6 + 2)  (5 + 1)   (4 + 0)
    pshufd           %2, %1, q0032  ; %2     -        -     (7 + 3)   (6 + 2)
    paddd            %1, %1, %2     ; %1     _        _     (5 1 7 3) (4 0 6 2)
    pshufd           %2, %1, q0001  ; %2     _        _     (5 1 7 3) (5 1 7 3)
    paddd            %1, %1, %2     ; %1                              (01234567)
%endmacro

%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL

INIT_YMM avx2

;-----------------------------------------------------------------------
; vvc_sad(src1, src2, dx, dy, block_w, block_h)
;
; Sum of absolute differences between two blocks of 16-bit samples, taken
; on every second row only (ROWS = 2).
;
; In:    src1, src2  base pointers; rows are MAX_PB_SIZE words apart
;        dx, dy      32-bit signed displacement (sign-extended on entry)
;        block_w     32-bit width in samples
;        block_h     32-bit height in rows
; Out:   eax = accumulated SAD
; Uses:  m0-m4, plus GPRs off1 / off2 / row_idx as scratch
;
; Start positions, as computed below (in 16-bit elements):
;        src1 += dy * MAX_PB_SIZE + dx
;        src2 += (4 - dy) * MAX_PB_SIZE + (4 - dx)
;
; Notes derived from the code:
;  - block_w < 16 takes the 8-wide path, which always reads 8 words per
;    row and consumes 4 rows of height per iteration (rows r and r + 2).
;  - block_w >= 16 is processed in 16-word chunks; block_w is shifted
;    right by 4, so a remainder that is not a multiple of 16 is ignored.
;  - Both height loops test their counter after the body, so at least
;    one iteration always runs.
;  - Samples are compared as unsigned words, and pmaddwd treats the
;    differences as signed words, so each |a - b| must stay below 0x8000.
;    NOTE(review): presumably guaranteed by the caller's sample range -
;    confirm.
;-----------------------------------------------------------------------
cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
    movsxdifnidn     dxq, dxd
    movsxdifnidn     dyq, dyd

    ; Re-centre the displacement: dx -= 2, dy -= 2
    sub              dxq, 2
    sub              dyq, 2

    ; off1 = (2 + dy) * MAX_PB_SIZE + dx
    ; off2 = (2 - dy) * MAX_PB_SIZE - dx      (mirrored displacement)
    mov            off1q, 2
    mov            off2q, 2

    add            off1q, dyq
    sub            off2q, dyq

    shl            off1q, MAX_PB_SIZE_LOG2
    shl            off2q, MAX_PB_SIZE_LOG2

    add            off1q, dxq
    sub            off2q, dxq

    ; Element offsets are scaled by 2 bytes; the extra 2 * 2 bytes skip two
    ; more samples horizontally.
    lea            src1q, [src1q + off1q * 2 + 2 * 2]
    lea            src2q, [src2q + off2q * 2 + 2 * 2]

    pxor              m3, m3                    ; m3 = dword SAD accumulator
    vpbroadcastd      m4, [pw_1]                ; m4 = all-ones words for pmaddwd

    cmp         block_wd, 16
    jge         .sad_16_128

    ; ---- 8-wide path: two sampled rows (r, r + ROWS) per iteration ----
.loop_height_8:
    movu             xm0, [src1q]
    vinserti128       m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
    movu             xm1, [src2q]
    vinserti128       m1, m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1

    MIN_MAX_SAD       m1, m0, m2                ; m1 = |src2 - src1| per word
    pmaddwd           m1, m4                    ; pairwise word -> dword sums
    paddd             m3, m1

    add            src1q, 2 * MAX_PB_SIZE * ROWS * 2   ; advance 4 rows
    add            src2q, 2 * MAX_PB_SIZE * ROWS * 2

    sub         block_hd, 4
    jg          .loop_height_8

    HORIZ_ADD        xm0, xm3, m3
    movd             eax, xm0
    RET

    ; ---- 16..128-wide path: one sampled row per outer iteration ----
.sad_16_128:
    sar         block_wd, 4                     ; block_w = number of 16-word chunks

.loop_height_16:
    mov            off1q, src1q                 ; remember row starts
    mov            off2q, src2q
    mov         row_idxd, block_wd              ; chunks left in this row

.loop_width:
    movu              m0, [src1q]
    movu              m1, [src2q]

    MIN_MAX_SAD       m1, m0, m2                ; m1 = |src2 - src1| per word
    pmaddwd           m1, m4                    ; pairwise word -> dword sums
    paddd             m3, m1

    add            src1q, 32                    ; next 16-word chunk
    add            src2q, 32

    dec         row_idxd
    jg          .loop_width

    ; Step from the saved row start to the next sampled row.
    lea            src1q, [off1q + ROWS * MAX_PB_SIZE * 2]
    lea            src2q, [off2q + ROWS * MAX_PB_SIZE * 2]

    sub         block_hd, 2
    jg          .loop_height_16

    HORIZ_ADD        xm0, xm3, m3
    movd             eax, xm0
    RET

%endif ; HAVE_AVX2_EXTERNAL
%endif ; ARCH_X86_64