mirror of https://github.com/FFmpeg/FFmpeg.git
Decoder-Side Motion Vector Refinement accounts for about 4~8% of CPU usage for some clips.

Here are the test results from a single run:

clips                                     | before | after | delta
------------------------------------------|--------|-------|------
RitualDance_1920x1080_60_10_420_37_RA.266 | 338.7  | 354.3 | 4.61%
NovosobornayaSquare_1920x1080.bin         | 320.3  | 329.3 | 2.81%
Tango2_3840x2160_60_10_420_27_LD.266      | 83.3   | 83.7  | 0.48%
RitualDance_1920x1080_60_10_420_32_LD.266 | 320.7  | 327.3 | 2.06%
Chimera_8bit_1080P_1000_frames.vvc        | 360.7  | 381.0 | 5.63%
BQTerrace_1920x1080_60_10_420_22_RA.vvc   | 161.7  | 163.0 | 0.80%

release/7.1
parent
1c17520e79
commit
6464c157d5
3 changed files with 399 additions and 0 deletions
@ -0,0 +1,373 @@ |
||||
; /* |
||||
; * Provide AVX2 luma dmvr functions for VVC decoding |
||||
; * Copyright (c) 2024 Nuo Mi |
||||
; * |
||||
; * This file is part of FFmpeg. |
||||
; * |
||||
; * FFmpeg is free software; you can redistribute it and/or |
||||
; * modify it under the terms of the GNU Lesser General Public |
||||
; * License as published by the Free Software Foundation; either |
||||
; * version 2.1 of the License, or (at your option) any later version. |
||||
; * |
||||
; * FFmpeg is distributed in the hope that it will be useful, |
||||
; * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
; * Lesser General Public License for more details. |
||||
; * |
||||
; * You should have received a copy of the GNU Lesser General Public |
||||
; * License along with FFmpeg; if not, write to the Free Software |
||||
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
; */ |
||||
%include "libavutil/x86/x86util.asm" |
||||
|
||||
%define MAX_PB_SIZE 128 |
||||
|
||||
SECTION_RODATA 32 |
||||
|
||||
; pmulhrsw computes (a * b + (1 << 14)) >> 15 on signed words, so multiplying
; by 1 << (15 - n) performs a rounded right shift by n:
;     (a + (1 << (n - 1))) >> n
%define DMVR_RND_SHR(n) (1 << (15 - (n)))

; Each constant is one dword (two identical words) so that it can be splatted
; with vpbroadcastd.
shift_12        times 2 dw DMVR_RND_SHR(12 - 10)  ; plain copy, 12-bit input
shift3_8        times 2 dw DMVR_RND_SHR(8 - 6)    ; bilinear round, 8-bit input
shift3_10       times 2 dw DMVR_RND_SHR(10 - 6)   ; bilinear round, 10-bit input
shift3_12       times 2 dw DMVR_RND_SHR(12 - 6)   ; bilinear round, 12-bit input (doubled by LOAD_SHIFT)
pw_16           times 2 dw 16                     ; sum of the two bilinear weights
||||
|
||||
%if ARCH_X86_64 |
||||
|
||||
%if HAVE_AVX2_EXTERNAL |
||||
|
||||
SECTION .text |
||||
|
||||
; Size in bytes of one source sample: 1 when bd is 8, 2 when bd is 10 or 12.
; Expanded lazily, so bd must be defined at the point of use.
%define pstride (1 + bd / 10)
||||
|
||||
; LOAD_W16(dst, src)
; Load one vector of source samples from the memory operand src into dst as
; 16-bit words. 8-bit input is zero-extended from bytes; any other bit depth is
; read as-is with an unaligned load.
%macro LOAD_W16 2
%if bd != 8
    movu                       %1, %2
%else
    pmovzxbw                   %1, %2
%endif
%endmacro
||||
|
||||
; SHIFT_W16(reg, mul)
; Scale the words in reg to 10-bit precision:
;   bd == 8  : shift left by 10 - bd
;   bd == 10 : leave untouched
;   other    : pmulhrsw by mul (the caller provides a rounded-shift multiplier)
; mul is only referenced in the last case.
%macro SHIFT_W16 2
%if bd == 8
    psllw                      %1, (10 - bd)
%elif bd != 10
    pmulhrsw                   %1, %2
%endif
%endmacro
||||
|
||||
; SAVE_W16(dst, src)
; Store the vector register src to the memory operand dst (unaligned store).
%macro SAVE_W16 2
    movu                       %1, %2
%endmacro
||||
|
||||
; NEXT_4_LINES(is_h)
; Step srcq and dstq forward by four rows. When is_h is non-zero, src1q is
; recomputed as the address of the sample to the right of srcq, as needed by
; the horizontal filters.
%macro NEXT_4_LINES 1
    lea                      srcq, [srcq + ssq*4]
    lea                      dstq, [dstq + dsq*4]
%if %1
    lea                     src1q, [srcq + pstride]    ; must follow the srcq update
%endif
%endmacro
||||
|
||||
|
||||
; DMVR_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3)
; Copy a block of four rows by one vector of samples from src to dst, scaling
; each sample with SHIFT_W16. The *_stride3 arguments are registers holding
; three times the corresponding stride.
; Clobbers m0-m3; m4 must hold the SHIFT_W16 multiplier (only read when bd is
; neither 8 nor 10).
%macro DMVR_4xW16 6
    ; rows 0..3 -> m0..m3
    LOAD_W16                   m0, [%4]
    LOAD_W16                   m1, [%4 + %5]
    LOAD_W16                   m2, [%4 + 2 * %5]
    LOAD_W16                   m3, [%4 + %6]

    SHIFT_W16                  m0, m4
    SHIFT_W16                  m1, m4
    SHIFT_W16                  m2, m4
    SHIFT_W16                  m3, m4

    SAVE_W16     [%1]         , m0
    SAVE_W16     [%1 + %2]    , m1
    SAVE_W16     [%1 + 2 * %2], m2
    SAVE_W16     [%1 + %3]    , m3
%endmacro
||||
|
||||
; OFFSET_TO_W4(buf, stride, off)
; buf = buf - stride * h + off
; Rewinds buf by h rows and then moves it off bytes to the right, i.e. to the
; top of the next column strip. Reads hd, clobbers iq and the flags.
%macro OFFSET_TO_W4 3
    mov                        id, hd
    imul                       iq, %2              ; iq = h * stride
    sub                        %1, iq
    add                        %1, %3
%endmacro
||||
|
||||
; OFFSET_TO_W4()
; Move both dstq and srcq back to the first row and 16 samples to the right.
; dst holds 2-byte samples; src holds pstride-byte samples.
%macro OFFSET_TO_W4 0
    OFFSET_TO_W4             dstq, dsq, 16 * 2
    OFFSET_TO_W4             srcq, ssq, 16 * pstride
%endmacro
||||
|
||||
; void ff_vvc_dmvr_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
;     int height, intptr_t mx, intptr_t my, int width);
;
; DMVR_AVX2(bit_depth)
; Emits the unfiltered DMVR function for the given bit depth: samples are copied
; from src to dst and scaled by SHIFT_W16. mx and my are not read; their
; argument registers are reused as ds and ds3.
;
; The block is processed as 16-column strips, four rows per iteration:
;   - the first strip (columns 0..15) is always processed;
;   - a second strip (columns 16..31) is processed only when width > 16.
; A remainder of height modulo 4 is not processed.
%macro DMVR_AVX2 1
cglobal vvc_dmvr_%1, 4, 9, 5, dst, src, ss, h, ds, ds3, w, ss3, i
%define bd %1

    LOAD_STRIDES

%if %1 > 10
    vpbroadcastd               m4, [shift_%1]      ; multiplier for SHIFT_W16
%endif

    mov                        wd, wm
    mov                        id, hd              ; id counts rows; hd is kept for the second strip
.w16:
    sub                        id, 4
    jl                   .w16_end
    DMVR_4xW16               dstq, dsq, ds3q, srcq, ssq, ss3q
    NEXT_4_LINES                0
    jmp                      .w16
.w16_end:

    ; Nothing is left once the first strip has covered the whole width. Using
    ; jle (not jl) also skips the second strip for width == 16, which would
    ; otherwise touch columns 16..31 that are outside the requested block.
    sub                        wd, 16
    jle                   .w4_end

    OFFSET_TO_W4
.w4:
    sub                        hd, 4
    jl                    .w4_end
    DMVR_4xW16               dstq, dsq, ds3q, srcq, ssq, ss3q
    NEXT_4_LINES                0
    jmp                       .w4
.w4_end:

    RET
%endmacro
||||
|
||||
; LOAD_COEFFS(coeffs0, coeffs1, src)
; Build the two bilinear weights as broadcast words:
;   m<coeffs1> = low word of src (the fractional position)
;   m<coeffs0> = 16 - m<coeffs1>
; coeffs0 and coeffs1 are register NUMBERS (e.g. 10, 11), not register names;
; src is a 32-bit register or memory operand.
%macro LOAD_COEFFS 3
    vpbroadcastd             m%1, [pw_16]
    movd                    xm%2, %3
    vpbroadcastw             m%2, xm%2
    psubw                    m%1, m%2
%endmacro
||||
|
||||
; LOAD_SHIFT(shift, src)
; Broadcast the dword at label src into the register shift, for use as the
; pmulhrsw multiplier in BILINEAR. For bd == 12 the multiplier is doubled
; because BILINEAR halves its sum before the multiply in that case.
%macro LOAD_SHIFT 2
    vpbroadcastd               %1, [%2]
%if bd == 12
    psllw                      %1, 1               ; compensates the psrlw in BILINEAR
%endif
%endmacro
||||
|
||||
; LOAD_STRIDES()
; Takes no arguments. Sets up the stride registers used by the row kernels:
;   dsq  = destination stride in bytes (MAX_PB_SIZE 16-bit samples)
;   ds3q = 3 * dsq
;   ss3q = 3 * ssq
; Overwrites dsq and ds3q, so any argument passed in those registers must be
; consumed before this macro is invoked.
%macro LOAD_STRIDES 0
    lea                      ss3q, [ssq*3]
    mov                       dsq, MAX_PB_SIZE * 2
    lea                      ds3q, [dsq*3]
%endmacro
||||
|
||||
; BILINEAR(dst/src0, src1, coeff0, coeff1, round, tmp)
; dst = pmulhrsw(src0 * coeff0 + src1 * coeff1, round)
; All operands are registers of 16-bit words. tmp is overwritten and may be
; the same register as src1 (src1 is then destroyed).
;
; For bd == 12 the weighted sum is first halved with a logical shift so that
; pmulhrsw, which is a signed multiply, sees a non-negative value; LOAD_SHIFT
; doubles the multiplier to compensate.
%macro BILINEAR 6
    pmullw                     %1, %3
    pmullw                     %6, %2, %4
    paddw                      %1, %6
%if bd == 12
    psrlw                      %1, 1               ; keep the sum within the signed range
%endif
    pmulhrsw                   %1, %5
%endmacro
||||
|
||||
; DMVR_H_1xW16(dst, src0, src1, offset, tmp)
; Horizontally filter one row: dst = BILINEAR of the samples at src0 + offset
; and src1 + offset. Callers pass src1 = src0 + one sample.
; Implicit inputs: m10 / m11 = weights, m12 = pmulhrsw multiplier.
; tmp is clobbered.
%macro DMVR_H_1xW16 5
    LOAD_W16                   %1, [%2 + %4]
    LOAD_W16                   %5, [%3 + %4]
    BILINEAR                   %1, %5, m10, m11, m12, %5
%endmacro
||||
|
||||
; DMVR_H_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3, src1)
; Horizontally filter four rows by one vector of samples and store them to dst.
; Results are built in m0-m3 with m4-m7 as scratch; m10-m12 are read by
; DMVR_H_1xW16.
%macro DMVR_H_4xW16 7
    DMVR_H_1xW16               m0, %4, %7, 0, m4
    DMVR_H_1xW16               m1, %4, %7, %5, m5
    DMVR_H_1xW16               m2, %4, %7, 2 * %5, m6
    DMVR_H_1xW16               m3, %4, %7, %6, m7

    SAVE_W16     [%1]         , m0
    SAVE_W16     [%1 + %2]    , m1
    SAVE_W16     [%1 + 2 * %2], m2
    SAVE_W16     [%1 + %3]    , m3
%endmacro
||||
|
||||
; void ff_vvc_dmvr_h_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
;     int height, intptr_t mx, intptr_t my, int width);
;
; DMVR_H_AVX2(bit_depth)
; Emits the horizontal-only DMVR function: every output sample is the bilinear
; blend of a source sample and its right-hand neighbour, weighted by mx.
; my is not read.
;
; The block is processed as 16-column strips, four rows per iteration:
;   - the first strip (columns 0..15) is always processed;
;   - a second strip (columns 16..31) is processed only when width > 16.
; A remainder of height modulo 4 is not processed.
%macro DMVR_H_AVX2 1
cglobal vvc_dmvr_h_%1, 4, 10, 13, dst, src, ss, h, ds, ds3, w, ss3, src1, i
%define bd %1

    ; mx arrives in the argument slot that is named ds here; read it through
    ; dsm before LOAD_STRIDES overwrites dsq.
    LOAD_COEFFS                10, 11, dsm
    LOAD_SHIFT                m12, shift3_%1

    LOAD_STRIDES
    lea                     src1q, [srcq + pstride]    ; right-hand neighbour

    mov                        wd, wm
    mov                        id, hd              ; id counts rows; hd is kept for the second strip
.w16:
    sub                        id, 4
    jl                   .w16_end
    DMVR_H_4xW16             dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
    NEXT_4_LINES                1
    jmp                      .w16
.w16_end:

    ; Nothing is left once the first strip has covered the whole width. Using
    ; jle (not jl) also skips the second strip for width == 16, which would
    ; otherwise touch columns 16..31 that are outside the requested block.
    sub                        wd, 16
    jle                   .w4_end

    OFFSET_TO_W4
    lea                     src1q, [srcq + pstride]
.w4:
    sub                        hd, 4
    jl                    .w4_end
    DMVR_H_4xW16             dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
    NEXT_4_LINES                1
    jmp                       .w4
.w4_end:

    RET
%endmacro
||||
|
||||
; DMVR_V_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3)
; Vertically filter four rows by one vector of samples and store them to dst.
; Output row r is the bilinear blend of source rows r and r + 1.
;
; On entry m0 must already hold source row 0 (loaded by the caller or carried
; over from the previous invocation). On exit m0 holds source row 4, which is
; row 0 of the next group of four.
; Implicit inputs: m8 / m9 = weights, m10 = pmulhrsw multiplier.
; Clobbers m1-m4 and the scratch registers m11-m14.
%macro DMVR_V_4xW16 6
    LOAD_W16                   m1, [%4 + %5]
    LOAD_W16                   m2, [%4 + 2 * %5]
    LOAD_W16                   m3, [%4 + %6]
    LOAD_W16                   m4, [%4 + 4 * %5]

    BILINEAR                   m0, m1, m8, m9, m10, m11
    BILINEAR                   m1, m2, m8, m9, m10, m12
    BILINEAR                   m2, m3, m8, m9, m10, m13
    BILINEAR                   m3, m4, m8, m9, m10, m14

    SAVE_W16     [%1]         , m0
    SAVE_W16     [%1 + %2]    , m1
    SAVE_W16     [%1 + 2 * %2], m2
    SAVE_W16     [%1 + %3]    , m3

    ; Carry the last loaded row into the next iteration. SWAP m0, m4 cannot be
    ; used: SWAP only renames registers at assembly time, while this macro is
    ; expanded inside a loop that jumps back to code already assembled with the
    ; un-swapped names, so the value must really be moved at run time.
    ; mova (integer move) is used because the data are integer words.
    mova                       m0, m4
%endmacro
||||
|
||||
; void ff_vvc_dmvr_v_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
;     int height, intptr_t mx, intptr_t my, int width);
;
; DMVR_V_AVX2(bit_depth)
; Emits the vertical-only DMVR function: every output sample is the bilinear
; blend of a source sample and the one below it, weighted by my.
; mx is not read. Source row `height` is read in addition to rows
; 0..height-1.
;
; The block is processed as 16-column strips, four rows per iteration:
;   - the first strip (columns 0..15) is always processed;
;   - a second strip (columns 16..31) is processed only when width > 16.
; A remainder of height modulo 4 is not processed.
%macro DMVR_V_AVX2 1
cglobal vvc_dmvr_v_%1, 4, 9, 15, dst, src, ss, h, ds, ds3, w, ss3, i
%define bd %1

    ; my arrives in the argument slot that is named ds3 here; read it through
    ; ds3m before LOAD_STRIDES overwrites ds3q.
    LOAD_COEFFS                 8, 9, ds3m
    LOAD_SHIFT                m10, shift3_%1

    LOAD_STRIDES

    mov                        wd, wm
    mov                        id, hd              ; id counts rows; hd is kept for the second strip
    LOAD_W16                   m0, [srcq]          ; prime row 0 for DMVR_V_4xW16
.w16:
    sub                        id, 4
    jl                   .w16_end
    DMVR_V_4xW16             dstq, dsq, ds3q, srcq, ssq, ss3q
    NEXT_4_LINES                0
    jmp                      .w16
.w16_end:

    ; Nothing is left once the first strip has covered the whole width. Using
    ; jle (not jl) also skips the second strip for width == 16, which would
    ; otherwise touch columns 16..31 that are outside the requested block.
    sub                        wd, 16
    jle                   .w4_end

    OFFSET_TO_W4
    LOAD_W16                   m0, [srcq]          ; prime row 0 of the second strip
.w4:
    sub                        hd, 4
    jl                    .w4_end
    DMVR_V_4xW16             dstq, dsq, ds3q, srcq, ssq, ss3q
    NEXT_4_LINES                0
    jmp                       .w4
.w4_end:

    RET
%endmacro
||||
|
||||
; DMVR_HV_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3, src1)
; Filter four rows by one vector of samples horizontally and then vertically,
; and store them to dst. Source rows 1..4 are filtered horizontally into
; m1-m4; output row r is then the vertical blend of the horizontally filtered
; rows r and r + 1.
;
; On entry m0 must already hold the horizontally filtered row 0 (computed by
; the caller or carried over from the previous invocation). On exit m0 holds
; the horizontally filtered row 4, which is row 0 of the next group of four.
; Implicit inputs: m10-m12 (horizontal stage, read by DMVR_H_1xW16) and
; m13 / m14 = vertical weights, m15 = vertical pmulhrsw multiplier.
; Clobbers m1-m4 and the scratch registers m6-m9.
%macro DMVR_HV_4xW16 7
    DMVR_H_1xW16               m1, %4, %7, %5, m6
    DMVR_H_1xW16               m2, %4, %7, 2 * %5, m7
    DMVR_H_1xW16               m3, %4, %7, %6, m8
    DMVR_H_1xW16               m4, %4, %7, 4 * %5, m9

    BILINEAR                   m0, m1, m13, m14, m15, m6
    BILINEAR                   m1, m2, m13, m14, m15, m7
    BILINEAR                   m2, m3, m13, m14, m15, m8
    BILINEAR                   m3, m4, m13, m14, m15, m9

    SAVE_W16     [%1]         , m0
    SAVE_W16     [%1 + %2]    , m1
    SAVE_W16     [%1 + 2 * %2], m2
    SAVE_W16     [%1 + %3]    , m3

    ; Carry the last filtered row into the next iteration. SWAP m0, m4 cannot
    ; be used: SWAP only renames registers at assembly time, while this macro
    ; is expanded inside a loop that jumps back to code already assembled with
    ; the un-swapped names, so the value must really be moved at run time.
    ; mova (integer move) is used because the data are integer words.
    mova                       m0, m4
%endmacro
||||
|
||||
; void ff_vvc_dmvr_hv_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
;     int height, intptr_t mx, intptr_t my, int width);
;
; DMVR_HV_AVX2(bit_depth)
; Emits the two-dimensional DMVR function: each source row is blended
; horizontally with weight mx, then consecutive filtered rows are blended
; vertically with weight my. Source row `height` is read in addition to rows
; 0..height-1.
;
; The block is processed as 16-column strips, four rows per iteration:
;   - the first strip (columns 0..15) is always processed;
;   - a second strip (columns 16..31) is processed only when width > 16.
; A remainder of height modulo 4 is not processed.
%macro DMVR_HV_AVX2 1
cglobal vvc_dmvr_hv_%1, 7, 10, 16, dst, src, ss, h, ds, ds3, w, ss3, src1, i
%define bd %1

    ; mx / my arrive in the argument slots named ds / ds3 here; read them
    ; before LOAD_STRIDES overwrites those registers.
    LOAD_COEFFS                10, 11, dsm
    LOAD_SHIFT                m12, shift3_%1

    ; The vertical stage uses the shift3_10 constant for every bit depth. For
    ; bd == 12 LOAD_SHIFT doubles it and BILINEAR halves the sum, which cancel.
    LOAD_COEFFS                13, 14, ds3m
    LOAD_SHIFT                m15, shift3_10

    LOAD_STRIDES
    lea                     src1q, [srcq + pstride]    ; right-hand neighbour

    mov                        id, hd              ; id counts rows; hd is kept for the second strip
    DMVR_H_1xW16               m0, srcq, src1q, 0, m5  ; prime filtered row 0
.w16:
    sub                        id, 4
    jl                   .w16_end
    DMVR_HV_4xW16            dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
    NEXT_4_LINES                1
    jmp                      .w16
.w16_end:

    ; Nothing is left once the first strip has covered the whole width. Using
    ; jle (not jl) also skips the second strip for width == 16, which would
    ; otherwise touch columns 16..31 that are outside the requested block.
    sub                        wd, 16
    jle                   .w4_end

    OFFSET_TO_W4
    lea                     src1q, [srcq + pstride]

    DMVR_H_1xW16               m0, srcq, src1q, 0, m5  ; prime filtered row 0 of the second strip
.w4:
    sub                        hd, 4
    jl                    .w4_end
    DMVR_HV_4xW16            dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
    NEXT_4_LINES                1
    jmp                       .w4
.w4_end:

    RET
%endmacro
||||
|
||||
; VVC_DMVR_AVX2(bit_depth)
; Emit the complete set of DMVR functions for one bit depth:
; copy, horizontal, vertical and horizontal + vertical.
%macro VVC_DMVR_AVX2 1
    DMVR_AVX2                  %1
    DMVR_H_AVX2                %1
    DMVR_V_AVX2                %1
    DMVR_HV_AVX2               %1
%endmacro
||||
|
||||
; Instantiate every function with 256-bit registers for the three supported
; bit depths.
INIT_YMM avx2

VVC_DMVR_AVX2                   8
VVC_DMVR_AVX2                  10
VVC_DMVR_AVX2                  12
||||
|
||||
%endif ; HAVE_AVX2_EXTERNAL |
||||
|
||||
%endif ; ARCH_X86_64 |
Loading…
Reference in new issue