; /*
; * Provide SIMD MC functions for VVC decoding
; *
; * Copyright © 2021, VideoLAN and dav1d authors
; * Copyright © 2021, Two Orioles, LLC
; * All rights reserved.
; *
; * Copyright (c) 2023-2024 Nuo Mi
; * Copyright (c) 2023-2024 Wu Jianhua
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */

%include "libavutil/x86/x86util.asm"

%define MAX_PB_SIZE 128

SECTION_RODATA 32

%if ARCH_X86_64

%if HAVE_AVX2_EXTERNAL

pw_0    times 2 dw 0
pw_1    times 2 dw 1
pw_4    times 2 dw 4
pw_12   times 2 dw 12
pw_256  times 2 dw 256

; Build a per-width jump table for one function/bit-depth/ISA combination;
; the trailing variadic arguments list the supported block widths.
%macro AVG_JMP_TABLE 3-* ; fn, bpc, opt, widths...
    %xdefine %1_%2_%3_table (%%table - 2*%4)
    %xdefine %%base %1_%2_%3_table
    %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3)
    %%table:
    %rep %0 - 3
        dd %%prefix %+ .w%4 - %%base
        %rotate 1
    %endrep
%endmacro
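
; How a table is consumed (a sketch mirroring the dispatch done below in
; VVC_AVG_AVX2/VVC_W_AVG_AVX2; wq and r6 are the registers it actually uses):
;     tzcnt  wd, wm                ; width 2 -> 1, 4 -> 2, ..., 128 -> 7
;     movsxd wq, dword [r6 + wq*4] ; r6 = biased base (%%table - 2*%4)
;     add    wq, r6                ; absolute address of the .w%d label
;     jmp    wq
; The -2*%4 bias (-4 here, since the first width is 2 and tzcnt(2) == 1)
; makes the scaled index of the smallest width land on the first dd entry.
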
AVG_JMP_TABLE avg,   8,  avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE avg,   16, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 8,  avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE w_avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128

SECTION .text

; Process %3 horizontal 16-pixel (32-byte) chunks of two rows with op %2.
%macro AVG_W16_FN 3 ; bpc, op, count
    %assign %%i 0
    %rep %3
        %define off %%i
        AVG_LOAD_W16 0, off
        %2
        AVG_SAVE_W16 %1, 0, off

        AVG_LOAD_W16 1, off
        %2
        AVG_SAVE_W16 %1, 1, off

        %assign %%i %%i+1
    %endrep
%endmacro

; Width-dispatched body shared by avg and w_avg: each .w%d label handles one
; block width, two rows per iteration, applying op %2 to the loaded samples.
%macro AVG_FN 2 ; bpc, op
    jmp wq

.w2:
    movd   xm0, [src0q]
    pinsrd xm0, [src0q + AVG_SRC_STRIDE], 1
    movd   xm1, [src1q]
    pinsrd xm1, [src1q + AVG_SRC_STRIDE], 1
    %2
    AVG_SAVE_W2 %1
    AVG_LOOP_END .w2

.w4:
    movq   xm0, [src0q]
    pinsrq xm0, [src0q + AVG_SRC_STRIDE], 1
    movq   xm1, [src1q]
    pinsrq xm1, [src1q + AVG_SRC_STRIDE], 1
    %2
    AVG_SAVE_W4 %1
    AVG_LOOP_END .w4

.w8:
    vinserti128 m0, m0, [src0q], 0
    vinserti128 m0, m0, [src0q + AVG_SRC_STRIDE], 1
    vinserti128 m1, m1, [src1q], 0
    vinserti128 m1, m1, [src1q + AVG_SRC_STRIDE], 1
    %2
    AVG_SAVE_W8 %1
    AVG_LOOP_END .w8

.w16:
    AVG_W16_FN %1, %2, 1
    AVG_LOOP_END .w16

.w32:
    AVG_W16_FN %1, %2, 2
    AVG_LOOP_END .w32

.w64:
    AVG_W16_FN %1, %2, 4
    AVG_LOOP_END .w64

.w128:
    AVG_W16_FN %1, %2, 8
    AVG_LOOP_END .w128

.ret:
    RET
%endmacro

%macro AVG 0
    paddsw   m0, m1     ; src0 + src1, saturating
    pmulhrsw m0, m2     ; rounded shift by (15 - bd); m2 = 1 << bd
    CLIPW    m0, m3, m4 ; clamp to [0, pixel_max]
%endmacro
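
; Per-sample model of AVG (a sketch for the 8-12 bit depths handled here;
; bd is the bit depth derived from pixel_max at runtime):
;     shift  = 15 - bd   ; 7 for 8bpc, 5 for 10bpc, 3 for 12bpc
;     dst[x] = clip((src0[x] + src1[x] + (1 << (shift - 1))) >> shift,
;                   0, pixel_max)
; The rounded shift comes from pmulhrsw against m2 = 1 << bd, since
; pmulhrsw(a, 1 << bd) == (a + (1 << (14 - bd))) >> (15 - bd).
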
%macro W_AVG 0
    ; high halves: interleave src0/src1 words, then src0*w0 + src1*w1
    punpckhwd m5, m0, m1
    pmaddwd   m5, m3
    paddd     m5, m4    ; + offset
    psrad     m5, xm2   ; >> shift

    ; same for the low halves
    punpcklwd m0, m0, m1
    pmaddwd   m0, m3
    paddd     m0, m4
    psrad     m0, xm2

    packssdw  m0, m5
    CLIPW     m0, m6, m7 ; clamp to [0, pixel_max]
%endmacro
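
; Per-sample model of W_AVG (a sketch; w0/w1, o0/o1 and denom are the
; explicit weighted-prediction parameters passed in by the caller):
;     shift  = denom + max(3, 15 - bd)                        ; kept in xm2
;     offset = ((((o0 + o1) << (bd - 8)) + 1) << (shift - 1)) ; broadcast in m4
;     dst[x] = clip((src0[x]*w0 + src1[x]*w1 + offset) >> shift,
;                   0, pixel_max)
; punpck{l,h}wd interleaves src0/src1 words so that one pmaddwd against
; m3 = packed (w0, w1) pairs produces src0*w0 + src1*w1 per dword.
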
%macro AVG_LOAD_W16 2 ; line, offset
    movu m0, [src0q + %1 * AVG_SRC_STRIDE + %2 * 32]
    movu m1, [src1q + %1 * AVG_SRC_STRIDE + %2 * 32]
%endmacro

%macro AVG_SAVE_W2 1 ; bpc
%if %1 == 16
    pextrd [dstq], xm0, 0
    pextrd [dstq + strideq], xm0, 1
%else
    packuswb m0, m0
    pextrw [dstq], xm0, 0
    pextrw [dstq + strideq], xm0, 1
%endif
%endmacro

%macro AVG_SAVE_W4 1 ; bpc
%if %1 == 16
    pextrq [dstq], xm0, 0
    pextrq [dstq + strideq], xm0, 1
%else
    packuswb m0, m0
    pextrd [dstq], xm0, 0
    pextrd [dstq + strideq], xm0, 1
%endif
%endmacro

%macro AVG_SAVE_W8 1 ; bpc
%if %1 == 16
    vextracti128 [dstq], m0, 0
    vextracti128 [dstq + strideq], m0, 1
%else
    packuswb m0, m0      ; packs within each 128-bit lane
    vpermq m0, m0, 1000b ; gather the two lane-local qwords into xm0
    pextrq [dstq], xm0, 0
    pextrq [dstq + strideq], xm0, 1
%endif
%endmacro

%macro AVG_SAVE_W16 3 ; bpc, line, offset
%if %1 == 16
    movu [dstq + %2 * strideq + %3 * 32], m0
%else
    packuswb m0, m0      ; packs within each 128-bit lane
    vpermq m0, m0, 1000b ; merge the lane-local halves into the low 128 bits
    vextracti128 [dstq + %2 * strideq + %3 * 16], m0, 0
%endif
%endmacro

%macro AVG_LOOP_END 1
    sub hd, 2 ; two rows per iteration
    je .ret

    lea src0q, [src0q + 2 * AVG_SRC_STRIDE]
    lea src1q, [src1q + 2 * AVG_SRC_STRIDE]
    lea dstq,  [dstq + 2 * strideq]
    jmp %1
%endmacro

%define AVG_SRC_STRIDE MAX_PB_SIZE*2 ; sources are 16-bit, MAX_PB_SIZE samples per row

;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
;                           const int16_t *src0, const int16_t *src1,
;                           intptr_t width, intptr_t height, intptr_t pixel_max);
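;
; A hedged usage sketch from the C side (FFmpeg normally reaches this through
; the VVC dsp function pointers; the direct call and buffer names below are
; illustrative only):
;     // int16_t tmp0[MAX_PB_SIZE * MAX_PB_SIZE], tmp1[MAX_PB_SIZE * MAX_PB_SIZE]
;     // hold the two 16-bit prediction intermediates; pixel_max selects the
;     // bit depth (1023 -> 10-bit, served by the 16bpc variant).
;     // ff_vvc_avg_16bpc_avx2(dst, dst_stride, tmp0, tmp1, w, h, 1023);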
%macro VVC_AVG_AVX2 1
cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
    movifnidn hd, hm

    pxor         m3, m3  ; pixel min
    vpbroadcastw m4, bdm ; pixel max

    movifnidn bdd, bdm
    inc   bdd            ; 1 << bd
    tzcnt bdd, bdd       ; bit depth

    sub  bdd, 8
    movd xm0, bdd
    vpbroadcastd m1, [pw_4]
    pminuw       m0, m1  ; min(bd - 8, 4)
    vpbroadcastd m2, [pw_256]
    psllw        m2, xm0 ; shift: m2 = 1 << bd

    lea    r6, [avg_%1 %+ SUFFIX %+ _table]
    tzcnt  wd, wm
    movsxd wq, dword [r6+wq*4]
    add    wq, r6
    AVG_FN %1, AVG
%endmacro

;void ff_vvc_w_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
;                             const int16_t *src0, const int16_t *src1,
;                             intptr_t width, intptr_t height,
;                             intptr_t denom, intptr_t w0, intptr_t w1,
;                             intptr_t o0, intptr_t o1, intptr_t pixel_max);
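;
; A hedged worked example of the extra arguments (values illustrative):
; with denom = 2, w0 = 3, w1 = 1, o0 = o1 = 0 at 8-bit (pixel_max = 255),
; shift = denom + max(3, 15 - 8) = 9 and
; offset = (((0 + 0) << 0) + 1) << 8 = 256, so each output sample is
;     clip((src0*3 + src1*1 + 256) >> 9, 0, 255)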
%macro VVC_W_AVG_AVX2 1
cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1
    movifnidn hd, hm

    movifnidn t0d, r8m   ; w1
    shl t0d, 16
    mov t0w, r7m         ; w0
    movd xm3, t0d
    vpbroadcastd m3, xm3 ; each dword = (w1 << 16) | w0

    pxor         m6, m6   ; pixel min
    vpbroadcastw m7, r11m ; pixel max

    mov t1q, rcx          ; save rcx, cl is needed for the variable shifts
    mov ecx, r11m
    inc ecx               ; 1 << bd
    tzcnt ecx, ecx        ; bd
    sub ecx, 8            ; bd - 8
    mov t0d, r9m          ; o0
    add t0d, r10m         ; o1
    shl t0d, cl
    inc t0d               ; ((o0 + o1) << (bd - 8)) + 1

    neg ecx               ; 8 - bd
    add ecx, 4            ; 12 - bd
    cmovl ecx, [pw_0]     ; max(12 - bd, 0)
    add ecx, 3
    add ecx, r6m          ; shift = denom + max(3, 15 - bd)
    movd xm2, ecx         ; shift

    dec ecx               ; shift - 1
    shl t0d, cl
    movd xm4, t0d
    vpbroadcastd m4, xm4  ; offset = (((o0 + o1) << (bd - 8)) + 1) << (shift - 1)
    mov rcx, t1q          ; restore rcx

    lea r6, [w_avg_%1 %+ SUFFIX %+ _table]
    tzcnt wd, wm
    movsxd wq, dword [r6+wq*4]
    add wq, r6
    AVG_FN %1, W_AVG
%endmacro

INIT_YMM avx2

VVC_AVG_AVX2 16
VVC_AVG_AVX2 8

VVC_W_AVG_AVX2 16
VVC_W_AVG_AVX2 8

%endif ; HAVE_AVX2_EXTERNAL

%endif ; ARCH_X86_64