mirror of https://github.com/FFmpeg/FFmpeg.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Tag:
Branch:
Tree:
2336fc44ac
master
oldabi
release/0.10
release/0.11
release/0.5
release/0.6
release/0.7
release/0.8
release/0.9
release/1.0
release/1.1
release/1.2
release/2.0
release/2.1
release/2.2
release/2.3
release/2.4
release/2.5
release/2.6
release/2.7
release/2.8
release/3.0
release/3.1
release/3.2
release/3.3
release/3.4
release/4.0
release/4.1
release/4.2
release/4.3
release/4.4
release/5.0
release/5.1
release/6.0
release/6.1
release/7.0
release/7.1
N
ffmpeg-0.6.3
n0.10
n0.10.1
n0.10.10
n0.10.11
n0.10.12
n0.10.13
n0.10.14
n0.10.15
n0.10.16
n0.10.2
n0.10.3
n0.10.4
n0.10.5
n0.10.6
n0.10.7
n0.10.8
n0.10.9
n0.11
n0.11-dev
n0.11.1
n0.11.2
n0.11.3
n0.11.4
n0.11.5
n0.12-dev
n0.5.10
n0.5.11
n0.5.12
n0.5.13
n0.5.14
n0.5.15
n0.5.5
n0.5.6
n0.5.7
n0.5.8
n0.5.9
n0.6.4
n0.6.5
n0.6.6
n0.6.7
n0.7.1
n0.7.10
n0.7.11
n0.7.12
n0.7.13
n0.7.14
n0.7.15
n0.7.16
n0.7.17
n0.7.2
n0.7.3
n0.7.4
n0.7.5
n0.7.6
n0.7.7
n0.7.8
n0.7.9
n0.8
n0.8.1
n0.8.10
n0.8.11
n0.8.12
n0.8.13
n0.8.14
n0.8.15
n0.8.2
n0.8.3
n0.8.4
n0.8.5
n0.8.6
n0.8.7
n0.8.8
n0.8.9
n0.9
n0.9.1
n0.9.2
n0.9.3
n0.9.4
n1.0
n1.0.1
n1.0.10
n1.0.2
n1.0.3
n1.0.4
n1.0.5
n1.0.6
n1.0.7
n1.0.8
n1.0.9
n1.1
n1.1-dev
n1.1.1
n1.1.10
n1.1.11
n1.1.12
n1.1.13
n1.1.14
n1.1.15
n1.1.16
n1.1.2
n1.1.3
n1.1.4
n1.1.5
n1.1.6
n1.1.7
n1.1.8
n1.1.9
n1.2
n1.2-dev
n1.2.1
n1.2.10
n1.2.11
n1.2.12
n1.2.2
n1.2.3
n1.2.4
n1.2.5
n1.2.6
n1.2.7
n1.2.8
n1.2.9
n1.3-dev
n2.0
n2.0.1
n2.0.2
n2.0.3
n2.0.4
n2.0.5
n2.0.6
n2.0.7
n2.1
n2.1-dev
n2.1.1
n2.1.2
n2.1.3
n2.1.4
n2.1.5
n2.1.6
n2.1.7
n2.1.8
n2.2
n2.2-dev
n2.2-rc1
n2.2-rc2
n2.2.1
n2.2.10
n2.2.11
n2.2.12
n2.2.13
n2.2.14
n2.2.15
n2.2.16
n2.2.2
n2.2.3
n2.2.4
n2.2.5
n2.2.6
n2.2.7
n2.2.8
n2.2.9
n2.3
n2.3-dev
n2.3.1
n2.3.2
n2.3.3
n2.3.4
n2.3.5
n2.3.6
n2.4
n2.4-dev
n2.4.1
n2.4.10
n2.4.11
n2.4.12
n2.4.13
n2.4.14
n2.4.2
n2.4.3
n2.4.4
n2.4.5
n2.4.6
n2.4.7
n2.4.8
n2.4.9
n2.5
n2.5-dev
n2.5.1
n2.5.10
n2.5.11
n2.5.2
n2.5.3
n2.5.4
n2.5.5
n2.5.6
n2.5.7
n2.5.8
n2.5.9
n2.6
n2.6-dev
n2.6.1
n2.6.2
n2.6.3
n2.6.4
n2.6.5
n2.6.6
n2.6.7
n2.6.8
n2.6.9
n2.7
n2.7-dev
n2.7.1
n2.7.2
n2.7.3
n2.7.4
n2.7.5
n2.7.6
n2.7.7
n2.8
n2.8-dev
n2.8.1
n2.8.10
n2.8.11
n2.8.12
n2.8.13
n2.8.14
n2.8.15
n2.8.16
n2.8.17
n2.8.18
n2.8.19
n2.8.2
n2.8.20
n2.8.21
n2.8.22
n2.8.3
n2.8.4
n2.8.5
n2.8.6
n2.8.7
n2.8.8
n2.8.9
n2.9-dev
n3.0
n3.0.1
n3.0.10
n3.0.11
n3.0.12
n3.0.2
n3.0.3
n3.0.4
n3.0.5
n3.0.6
n3.0.7
n3.0.8
n3.0.9
n3.1
n3.1-dev
n3.1.1
n3.1.10
n3.1.11
n3.1.2
n3.1.3
n3.1.4
n3.1.5
n3.1.6
n3.1.7
n3.1.8
n3.1.9
n3.2
n3.2-dev
n3.2.1
n3.2.10
n3.2.11
n3.2.12
n3.2.13
n3.2.14
n3.2.15
n3.2.16
n3.2.17
n3.2.18
n3.2.19
n3.2.2
n3.2.3
n3.2.4
n3.2.5
n3.2.6
n3.2.7
n3.2.8
n3.2.9
n3.3
n3.3-dev
n3.3.1
n3.3.2
n3.3.3
n3.3.4
n3.3.5
n3.3.6
n3.3.7
n3.3.8
n3.3.9
n3.4
n3.4-dev
n3.4.1
n3.4.10
n3.4.11
n3.4.12
n3.4.13
n3.4.2
n3.4.3
n3.4.4
n3.4.5
n3.4.6
n3.4.7
n3.4.8
n3.4.9
n3.5-dev
n4.0
n4.0.1
n4.0.2
n4.0.3
n4.0.4
n4.0.5
n4.0.6
n4.1
n4.1-dev
n4.1.1
n4.1.10
n4.1.11
n4.1.2
n4.1.3
n4.1.4
n4.1.5
n4.1.6
n4.1.7
n4.1.8
n4.1.9
n4.2
n4.2-dev
n4.2.1
n4.2.10
n4.2.2
n4.2.3
n4.2.4
n4.2.5
n4.2.6
n4.2.7
n4.2.8
n4.2.9
n4.3
n4.3-dev
n4.3.1
n4.3.2
n4.3.3
n4.3.4
n4.3.5
n4.3.6
n4.3.7
n4.3.8
n4.4
n4.4-dev
n4.4.1
n4.4.2
n4.4.3
n4.4.4
n4.4.5
n4.5-dev
n5.0
n5.0.1
n5.0.2
n5.0.3
n5.1
n5.1-dev
n5.1.1
n5.1.2
n5.1.3
n5.1.4
n5.1.5
n5.1.6
n5.2-dev
n6.0
n6.0.1
n6.1
n6.1-dev
n6.1.1
n6.1.2
n6.2-dev
n7.0
n7.0.1
n7.0.2
n7.1
n7.1-dev
n7.2-dev
v0.5
v0.5.1
v0.5.2
v0.5.3
v0.6
v0.6.1
${ noResults }
FFmpeg/libavcodec/x86/vvc/alf.asm
877 lines
24 KiB
877 lines
24 KiB
;******************************************************************************
|
|||
;* VVC Adaptive Loop Filter SIMD optimizations
|
|||
;*
|
|||
;* Copyright (c) 2023-2024 Nuo Mi <nuomi2021@gmail.com>
|
|||
;* Copyright (c) 2023-2024 Wu Jianhua <toqsxw@outlook.com>
|
|||
;*
|
|||
;* This file is part of FFmpeg.
|
|||
;*
|
|||
;* FFmpeg is free software; you can redistribute it and/or
|
|||
;* modify it under the terms of the GNU Lesser General Public
|
|||
;* License as published by the Free Software Foundation; either
|
|||
;* version 2.1 of the License, or (at your option) any later version.
|
|||
;*
|
|||
;* FFmpeg is distributed in the hope that it will be useful,
|
|||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|||
;* Lesser General Public License for more details.
|
|||
;*
|
|||
;* You should have received a copy of the GNU Lesser General Public
|
|||
;* License along with FFmpeg; if not, write to the Free Software
|
|||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|||
;******************************************************************************
|
|||
|
|||
%include "libavutil/x86/x86util.asm"
|
|||
|
|||
SECTION_RODATA
|
|||
|
|||
; PARAM_SHUFFE(idx)
; Emits the 32-byte table param_shuffe_<idx>, which FILTER uses as a pshufb
; control.  j is a 16-bit value whose low byte is 2 * idx and whose high byte
; is 2 * idx + 1, i.e. the two byte positions of word <idx>.  Each 16-byte
; half of the table holds four copies of j followed by four copies of
; j + 0x0808 (the byte positions of word <idx> + 4).
%macro PARAM_SHUFFE 1
%assign i (%1 * 2)
%assign j ((i + 1) << 8) + (i)
param_shuffe_ %+ %1:
    ; low 16 bytes
    times 4 dw j
    times 4 dw (j + 0x0808)
    ; high 16 bytes (same pattern)
    times 4 dw j
    times 4 dw (j + 0x0808)
%endmacro
|
|||
|
|||
; pshufb controls selecting parameter word 0..3 (and word 4..7 in the upper
; eight bytes of every 16-byte half), see PARAM_SHUFFE.
PARAM_SHUFFE 0
PARAM_SHUFFE 1
PARAM_SHUFFE 2
PARAM_SHUFFE 3

; pshufb control that swaps every pair of adjacent 16-bit words.
CLASSIFY_SHUFFE:    times 2 db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
; NOTE(review): not referenced by the code visible in this file.
TRANSPOSE_PERMUTE:  dd 0, 1, 4, 5, 2, 3, 6, 7
; 16-entry byte lookup table, indexed through pshufb in ALF_CLASSIFY_H8.
ARG_VAR_SHUFFE:     times 2 db 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4

; Dword constants.  NOTE: dw64, dw3 and dw5 are stored as dwords (dd) even
; though their labels start with "dw"; all users load them with
; vpbroadcastd.
dd448:  times 8 dd 448      ; 512 - 64
dw64:   times 8 dd 64
dd2:    times 8 dd 2
dw3:    times 8 dd 3
dw5:    times 8 dd 5
dd15:   times 8 dd 15
|
|||
|
|||
SECTION .text
|
|||
|
|||
|
|||
; Coefficient counts (names suggest luma / chroma / presumably cross-component
; ALF).  In the code visible here they are only mentioned in comments.
%define ALF_NUM_COEFF_LUMA      12
%define ALF_NUM_COEFF_CHROMA    6
%define ALF_NUM_COEFF_CC        7
|
|||
|
|||
; LOAD_LUMA_PARAMS_W16(out0, out1, out2, base)
; %1-%3: numbers of the ymm registers receiving the data
; %4:    pointer name without suffix, "clip" or "filter"
; Reads 3 * 32 consecutive bytes starting 6 * xq bytes past the pointer.
; Clobbers offsetq.
%macro LOAD_LUMA_PARAMS_W16 4
    lea             offsetq, [3 * xq]               ; xq * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE
    movu            m%1, [%4q + 2 * offsetq]        ; 2 * for sizeof(int16_t)
    movu            m%2, [%4q + 2 * offsetq + 32]
    movu            m%3, [%4q + 2 * offsetq + 64]
%endmacro
|
|||
|
|||
; LOAD_LUMA_PARAMS_W16(out0, out1, out2, base, tmp0, tmp1)
; Loads 12 qwords through the 4-argument variant and regroups them so that
; out0 holds qwords 0/3/6/9, out1 holds 1/4/7/10 and out2 holds 2/5/8/11.
; The two-digit numbers in the comments are qword indices within the loaded
; data, listed from the most significant to the least significant qword.
; Clobbers offsetq, m%5 and m%6.
%macro LOAD_LUMA_PARAMS_W16 6
    LOAD_LUMA_PARAMS_W16 %1, %2, %3, %4
    ; m%1 = 03 02 01 00
    ; m%2 = 07 06 05 04
    ; m%3 = 11 10 09 08

    shufpd          m%5, m%1, m%2, 0011b        ; 06 02 05 01
    shufpd          m%6, m%3, m%5, 1001b        ; 06 10 01 09

    shufpd          m%1, m%1, m%6, 1100b        ; 06 03 09 00
    shufpd          m%2, m%2, m%6, 0110b        ; 10 07 01 04
    shufpd          m%3, m%3, m%5, 0110b        ; 02 11 05 08

    vpermpd         m%1, m%1, 01111000b         ; 09 06 03 00
    shufpd          m%2, m%2, m%2, 1001b        ; 10 07 04 01
    vpermpd         m%3, m%3, 10000111b         ; 11 08 05 02
%endmacro
|
|||
|
|||
; LOAD_LUMA_PARAMS(out0, out1, out2, base, tmp0, tmp1)
; %1-%3: output register numbers
; %4:    "clip" or "filter"
; %5-%6: temporary register numbers
; Thin wrapper around the 6-argument LOAD_LUMA_PARAMS_W16.
%macro LOAD_LUMA_PARAMS 6
    LOAD_LUMA_PARAMS_W16 %1, %2, %3, %4, %5, %6
%endmacro
|
|||
|
|||
; LOAD_CHROMA_PARAMS(out0, out1, base, unused)
; %1-%2: output register numbers (must differ)
; %3:    pointer name without suffix, "clip" or "filter"
; %4:    accepted but not used by the macro body
; Reads 8 bytes at [base] and 4 bytes at [base + 8] and replicates each
; into every qword of the respective output register (movd zeroes the upper
; half of the qword that is broadcast into m%2).
%macro LOAD_CHROMA_PARAMS 4
    ; LOAD_CHROMA_PARAMS_W %+ WIDTH %1, %2, %3, %4
    movq            xm%1, [%3q]
    vpbroadcastq    m%1, xm%1
    movd            xm%2, [%3q + 8]
    vpbroadcastq    m%2, xm%2
%endmacro
|
|||
|
|||
; LOAD_PARAMS()
; Loads the filter values into m3.. and the clip values into m6.. for the
; block at the current xq; these are the registers FILTER reads.
;   luma:   filter -> m3-m5 (m6, m7 as scratch), then clip -> m6-m8
;           (m9, m10 as scratch)
;   chroma: filter -> m3-m4, clip -> m6-m7
%macro LOAD_PARAMS 0
%if LUMA
    LOAD_LUMA_PARAMS    3, 4, 5, filter, 6, 7
    LOAD_LUMA_PARAMS    6, 7, 8, clip,   9, 10
%else
    LOAD_CHROMA_PARAMS  3, 4, filter, 5
    LOAD_CHROMA_PARAMS  6, 7, clip,   8
%endif
%endmacro
|
|||
|
|||
; FILTER(param_idx)
; Accumulates one symmetric tap pair.
; input:  m2       current pixels (words)
;         m9, m10  the two neighbour rows for this tap (words)
;         m3.., m6..  filter / clip parameters as loaded by LOAD_PARAMS
; output: m0, m1   dword accumulators, updated in place
; tmp:    m11-m13  (m9, m10 are destroyed as well)
; Parameter param_idx is taken from register (param_idx / 4 + 3) for the
; filter and (param_idx / 4 + 6) for the clip, word (param_idx % 4).
%macro FILTER 1
    %assign i (%1 % 4)
    %assign j (%1 / 4 + 3)
    %assign k (%1 / 4 + 6)
    %define filters m %+ j
    %define clips   m %+ k

    pshufb          m12, clips, [param_shuffe_ %+ i]    ; clip
    pxor            m11, m11
    psubw           m11, m12                            ; -clip

    ; differences to the current pixel, clamped to [-clip, clip]
    psubw           m9, m2
    psubw           m10, m2
    CLIPW           m9, m11, m12
    CLIPW           m10, m11, m12

    ; interleave both differences so pmaddwd sums them per pixel
    punpckhwd       m13, m9, m10
    punpcklwd       m9, m9, m10

    pshufb          m12, filters, [param_shuffe_ %+ i]  ; filter
    punpcklwd       m10, m12, m12
    punpckhwd       m12, m12, m12

    pmaddwd         m9, m10
    pmaddwd         m12, m13

    paddd           m0, m9
    paddd           m1, m12
%endmacro
|
|||
|
|||
; FILTER(param_idx, bottom, top, byte_offset)
; Loads the tap pair [bottom + byte_offset] / [top - byte_offset] and feeds
; it to the 1-argument FILTER.
; output: m0, m1 (accumulators)
; temp:   m9, m10 (plus m11-m13 inside FILTER)
%macro FILTER 4
    LOAD_PIXELS     m9,  [%3 - %4]
    LOAD_PIXELS     m10, [%2 + %4]
    FILTER          %1
%endmacro
|
|||
|
|||
; GET_SRCS(line)
; brief:  compute the row pointers around srcq and fold them towards srcq
;         when the row is next to the virtual boundary
; input:  srcq, src_strideq, vb_posq, line (%1, a register)
; output: s1q..s6q for luma, s1q..s4q for chroma
;         odd  = rows below src (+1, +2, +3 strides)
;         even = rows above src (-1, -2, -3 strides)
; vb_posq and src_strideq are modified temporarily and restored.
%macro GET_SRCS 1
    lea             s1q, [srcq + src_strideq]
    lea             s3q, [s1q + src_strideq]
%if LUMA
    lea             s5q, [s3q + src_strideq]
%endif
    neg             src_strideq                 ; walk upwards
    lea             s2q, [srcq + src_strideq]
    lea             s4q, [s2q + src_strideq]
%if LUMA
    lea             s6q, [s4q + src_strideq]
%endif
    neg             src_strideq                 ; restore

%if LUMA
    ; only vb_pos == 0 (below the boundary) and vb_pos == 4 (above it)
    ; need any adjustment
    test            vb_posq, vb_posq
    jz              %%vb_bottom
    cmp             vb_posq, 4
    jne             %%vb_end
%else
    ; only vb_pos == 2 matters; lines 0-1 are above, lines 2-3 below
    cmp             vb_posq, 2
    jne             %%vb_end
    cmp             %1, 2
    jge             %%vb_bottom
%endif

%%vb_above:
    ; above
    ; p1 = (y + i == vb_pos - 1) ? p0 : p1;
    ; p2 = (y + i == vb_pos - 1) ? p0 : p2;
    ; p3 = (y + i >= vb_pos - 2) ? p1 : p3;
    ; p4 = (y + i >= vb_pos - 2) ? p2 : p4;
    ; p5 = (y + i >= vb_pos - 3) ? p3 : p5;
    ; p6 = (y + i >= vb_pos - 3) ? p4 : p6;
    dec             vb_posq                     ; vb_pos - 1
    cmp             vb_posq, %1
    cmove           s1q, srcq
    cmove           s2q, srcq

    dec             vb_posq                     ; vb_pos - 2
    cmp             vb_posq, %1
    cmovbe          s3q, s1q
    cmovbe          s4q, s2q

    dec             vb_posq                     ; vb_pos - 3
%if LUMA
    cmp             vb_posq, %1
    cmovbe          s5q, s3q
    cmovbe          s6q, s4q
%endif
    add             vb_posq, 3                  ; restore vb_pos
    jmp             %%vb_end

%%vb_bottom:
    ; bottom
    ; p1 = (y + i == vb_pos    ) ? p0 : p1;
    ; p2 = (y + i == vb_pos    ) ? p0 : p2;
    ; p3 = (y + i <= vb_pos + 1) ? p1 : p3;
    ; p4 = (y + i <= vb_pos + 1) ? p2 : p4;
    ; p5 = (y + i <= vb_pos + 2) ? p3 : p5;
    ; p6 = (y + i <= vb_pos + 2) ? p4 : p6;
    cmp             vb_posq, %1
    cmove           s1q, srcq
    cmove           s2q, srcq

    inc             vb_posq                     ; vb_pos + 1
    cmp             vb_posq, %1
    cmovae          s3q, s1q
    cmovae          s4q, s2q

    inc             vb_posq                     ; vb_pos + 2
%if LUMA
    cmp             vb_posq, %1
    cmovae          s5q, s3q
    cmovae          s6q, s4q
%endif
    sub             vb_posq, 2                  ; restore vb_pos
%%vb_end:
%endmacro
|
|||
|
|||
; SHIFT_VB(line)
; brief:  scale the dword accumulators back to words
; input:  m0, m1 (dword sums), vb_posq, line (%1, a register)
; output: m0 (m0 and m1 packed to signed words)
; temp:   m9
; Normal rows are shifted right by SHIFT (7).  Rows next to the virtual
; boundary get 448 added and are shifted by SHIFT + 3 instead:
;   luma:   (line == 3 && vb_pos == 4) || (line == 0 && vb_pos == 0)
;   chroma: (line == 1 || line == 2) && vb_pos == 2
%macro SHIFT_VB 1
%define SHIFT 7
%if LUMA
    cmp             %1, 3
    je              %%near_above
    cmp             %1, 0
    jne             %%no_vb
    ; line 0: first row below a boundary at vb_pos == 0
    cmp             vb_posq, 0
    je              %%near_vb
    jmp             %%no_vb
%%near_above:
    ; line 3: last row above a boundary at vb_pos == 4
    cmp             vb_posq, 4
    je              %%near_vb
%else
    cmp             %1, 0
    je              %%no_vb
    cmp             %1, 3
    je              %%no_vb
    cmp             vb_posq, 2
    je              %%near_vb
%endif
%%no_vb:
    psrad           m0, SHIFT
    psrad           m1, SHIFT
    jmp             %%shift_end
%%near_vb:
    vpbroadcastd    m9, [dd448]
    paddd           m0, m9
    paddd           m1, m9
    psrad           m0, SHIFT + 3
    psrad           m1, SHIFT + 3
%%shift_end:
    packssdw        m0, m0, m1
%endmacro
|
|||
|
|||
; FILTER_VB(line)
; brief:  run every filter tap for one row of pixels, luma or chroma
; input:  line (%1, a register), m2 = current pixels, srcq,
;         parameters in m3.. / m6.. (see LOAD_PARAMS)
; output: m0 = filtered correction as packed words (m1 is consumed)
; temp:   s1q..s6q (set by GET_SRCS), m9-m13
; Both accumulators start at 64, the rounding term for the final shift.
%macro FILTER_VB 1
    vpbroadcastd    m0, [dw64]
    mova            m1, m0

    GET_SRCS        %1
%if LUMA
    ; rows +-3
    FILTER           0,  s5q,  s6q,  0 * ps
    ; rows +-2
    FILTER           1,  s3q,  s4q,  1 * ps
    FILTER           2,  s3q,  s4q,  0 * ps
    FILTER           3,  s3q,  s4q, -1 * ps
    ; rows +-1
    FILTER           4,  s1q,  s2q,  2 * ps
    FILTER           5,  s1q,  s2q,  1 * ps
    FILTER           6,  s1q,  s2q,  0 * ps
    FILTER           7,  s1q,  s2q, -1 * ps
    FILTER           8,  s1q,  s2q, -2 * ps
    ; current row
    FILTER           9, srcq, srcq,  3 * ps
    FILTER          10, srcq, srcq,  2 * ps
    FILTER          11, srcq, srcq,  1 * ps
%else
    ; rows +-2
    FILTER           0,  s3q,  s4q,  0 * ps
    ; rows +-1
    FILTER           1,  s1q,  s2q,  1 * ps
    FILTER           2,  s1q,  s2q,  0 * ps
    FILTER           3,  s1q,  s2q, -1 * ps
    ; current row
    FILTER           4, srcq, srcq,  2 * ps
    FILTER           5, srcq, srcq,  1 * ps
%endif
    SHIFT_VB        %1
%endmacro
|
|||
|
|||
; LOAD_PIXELS(dest, src)
; Loads one register of pixels as 16-bit words: a plain unaligned load when
; pixels are two bytes wide (ps == 2), otherwise bytes zero-extended to words.
%macro LOAD_PIXELS 2
%if ps == 2
    movu            %1, %2
%else
    pmovzxbw        %1, %2
%endif
%endmacro
|
|||
|
|||
; STORE_PIXELS_W16(dst, src)
; Stores 16 pixels from register number %2 to [%1]:
; the full ymm register for ps == 2, the low xmm half otherwise.
%macro STORE_PIXELS_W16 2
%if ps == 2
    movu            [%1], m%2
%else
    movu            [%1], xm%2
%endif
%endmacro
|
|||
|
|||
; STORE_PIXELS_W8(dst, src)
; Stores 8 pixels from register number %2 to [%1]:
; 16 bytes for ps == 2, 8 bytes otherwise.
%macro STORE_PIXELS_W8 2
%if ps == 2
    movu            [%1], xm%2
%else
    movq            [%1], xm%2
%endif
%endmacro
|
|||
|
|||
; STORE_PIXELS_W4(dst, src, offset)
; Stores 4 pixels from the bottom of register number %2 at pixel offset %3
; from %1 (8 bytes for ps == 2, 4 bytes otherwise).
%macro STORE_PIXELS_W4 3
%if ps == 2
    movq            [%1 + %3 * ps], xm%2
%else
    movd            [%1 + %3], xm%2
%endif
%endmacro
|
|||
|
|||
; STORE_PIXELS_W8LE(dst, src, width)
; Stores fewer than 16 pixels from register number %2 to [%1]; width (%3) is
; a run-time value:
;   width <  8 : 4 pixels
;   width < 12 : 8 pixels
;   otherwise  : 8 pixels followed by 4 more at pixel offset 8
; m%2 is permuted on the 8-pixel path so that pixels 8.. end up at the
; bottom of the register for the trailing 4-pixel store.
; The labels are macro-local (%%) so the macro can be expanded more than once
; inside a function and cannot clash with .w4 / .end labels of its user.
%macro STORE_PIXELS_W8LE 3
    cmp             %3, 8
    jl              %%w4
    STORE_PIXELS_W8 %1, %2
    cmp             %3, 12
    ; vpermq leaves the flags alone, so the jl below still tests width < 12
%if ps == 2
    vpermq          m%2, m%2, q0302
%else
    vpermq          m%2, m%2, q0101
%endif
    jl              %%end
    STORE_PIXELS_W4 %1, %2, 8
    jmp             %%end
%%w4:
    STORE_PIXELS_W4 %1, %2, 0
%%end:
%endmacro
|
|||
|
|||
; STORE_PIXELS(dst, src, width)
; Writes the word pixels in register number %2 to [%1].
; For ps == 1 the words are first packed to unsigned bytes and gathered in
; the low 128 bits.  width (%3) is compared textually: the literal 16 selects
; the 16-pixel store; anything else stores 8 pixels for luma or goes through
; the run-time width dispatch of STORE_PIXELS_W8LE for chroma.
%macro STORE_PIXELS 3
%if ps == 1
    packuswb        m%2, m%2
    vpermq          m%2, m%2, 0x8
%endif

%ifidn %3, 16
    STORE_PIXELS_W16    %1, %2
%else
    %if LUMA
        STORE_PIXELS_W8     %1, %2
    %else
        STORE_PIXELS_W8LE   %1, %2, %3
    %endif
%endif
%endmacro
|
|||
|
|||
; FILTER_16x4(width)
; Filters four consecutive rows of one block and leaves srcq / dstq where
; they were on entry.
; %1: the literal 16 or a register holding the remaining width; it is only
;     forwarded to STORE_PIXELS.
; input:  srcq, dstq, src_strideq, dst_strideq, vb_posq,
;         m14 / m15 = lower / upper clamp used for the final CLIPW,
;         parameters as loaded by LOAD_PARAMS
; Register use: xq is saved on the stack and reused as the row counter.
; s3q / s4q alias pixel_maxq / offsetq, whose contents are overwritten.
; For luma s1q / s2q alias clipq / strideq, which are saved and restored
; around the loop; for chroma they alias s5q / s6q.
%macro FILTER_16x4 1
%if LUMA
    push            clipq
    push            strideq
    %define s1q clipq
    %define s2q strideq
%else
    %define s1q s5q
    %define s2q s6q
%endif

    %define s3q pixel_maxq
    %define s4q offsetq
    push            xq

    xor             xq, xq                      ; row = 0
%%filter_16x4_loop:
    LOAD_PIXELS     m2, [srcq]                  ; p0

    FILTER_VB       xq

    ; sum += curr
    paddsw          m0, m2

    ; clip to pixel
    CLIPW           m0, m14, m15

    STORE_PIXELS    dstq, 0, %1

    ; next row
    add             srcq, src_strideq
    add             dstq, dst_strideq
    inc             xq
    cmp             xq, 4
    jl              %%filter_16x4_loop

    ; step back the four rows that were just processed
    mov             xq, src_strideq
    neg             xq
    lea             srcq, [srcq + xq * 4]
    mov             xq, dst_strideq
    neg             xq
    lea             dstq, [dstq + xq * 4]

    pop             xq

%if LUMA
    pop             strideq
    pop             clipq
%endif
%endmacro
|
|||
|
|||
; ALF_FILTER(bpc, luma/chroma)
; Instantiates one filter function for the given bit depth class and plane
; type.  Defines BPC, LUMA and ps for the helper macros.
%macro ALF_FILTER 2
%xdefine BPC %1
%ifidn %2, luma
    %xdefine LUMA 1
%else
    %xdefine LUMA 0
%endif

; ******************************
; void vvc_alf_filter_%2_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
;      const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, const ptrdiff_t height,
;      const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max);
;
; Works on bands of four rows; inside a band on 16-pixel columns, followed
; by one narrower column when width is not a multiple of 16.
; m14 = 0 and m15 = pixel_max (broadcast words) stay live for the whole call.
; ******************************
cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \
                                 offset, x, s5, s6
%define ps (%1 / 8) ; pixel size
    movd            xm15, pixel_maxd
    vpbroadcastw    m15, xm15
    pxor            m14, m14

.loop:
    ; the column loop advances src / dst and consumes width
    push            srcq
    push            dstq
    push            widthq
    xor             xq, xq

    .loop_w:
        cmp             widthq, 16
        jl              .loop_w_end

        LOAD_PARAMS
        FILTER_16x4     16

        add             srcq, 16 * ps
        add             dstq, 16 * ps
        add             xq, 16
        sub             widthq, 16
        jmp             .loop_w

    .loop_w_end:
        ; remaining columns, fewer than 16
        test            widthq, widthq
        jz              .w_end

        LOAD_PARAMS
        FILTER_16x4     widthq

    .w_end:

    pop             widthq
    pop             dstq
    pop             srcq

    ; next band of four rows
    lea             srcq, [srcq + 4 * src_strideq]
    lea             dstq, [dstq + 4 * dst_strideq]

    lea             filterq, [filterq + 2 * strideq]
    lea             clipq, [clipq + 2 * strideq]

    sub             vb_posq, 4
    sub             heightq, 4
    jg              .loop
    RET
%endmacro
|
|||
|
|||
; ALF_FILTER(bpc)
; Instantiates both the luma and the chroma filter function for one bpc.
%macro ALF_FILTER 1
    ALF_FILTER  %1, luma
    ALF_FILTER  %1, chroma
%endmacro
|
|||
|
|||
; Border sizes used by the classification code below:
; ALF_GRADIENT_BORDER pads width / height and offsets vb_pos,
; ALF_BORDER_LUMA moves the source pointer up and to the left.
%define ALF_GRADIENT_BORDER 2
%define ALF_BORDER_LUMA     3
|
|||
|
|||
; ******************************
; void ff_vvc_alf_classify_grad(int *gradient_sum, const uint8_t *src,
;      ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos);
;
; ALF_CLASSIFY_GRAD(bpc) emits vvc_alf_classify_grad_<bpc>bpc.
; Two source rows are consumed per outer iteration and 16 columns per inner
; iteration; every inner iteration writes 32 bytes to gradient_sum.
; width and height are enlarged by 2 * ALF_GRADIENT_BORDER and src is moved
; back by ALF_BORDER_LUMA rows and pixels before the loops start.
; m15 carries the upper half of the previous column's sums ("prev") and is
; cleared at the start of every row pair.
; ******************************
%macro ALF_CLASSIFY_GRAD 1
cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, width, height, vb_pos, \
                                     x, y, s0, s1, s2, s3, vb_pos_below, src_stride3

    lea             src_stride3q, [src_strideq * 2 + src_strideq]

    lea             vb_pos_belowd, [vb_posd + ALF_GRADIENT_BORDER]

    ; src = src - ALF_BORDER_LUMA * src_stride - ALF_BORDER_LUMA
    sub             srcq, src_stride3q
    sub             srcq, ALF_BORDER_LUMA * ps

    add             widthd, ALF_GRADIENT_BORDER * 2
    add             heightd, ALF_GRADIENT_BORDER * 2

    xor             yd, yd

.loop_h:
    xor             xd, xd
    pxor            m15, m15                    ; prev
    .loop_w:
        ; four consecutive source rows
        lea             s0q, [srcq + xq * ps]
        lea             s1q, [s0q + src_strideq]
        lea             s2q, [s0q + 2 * src_strideq]
        lea             s3q, [s0q + src_stride3q]

        ; at the virtual boundary reuse the neighbouring row instead of
        ; reading across it
        cmp             yd, vb_pos_belowd
        cmove           s0q, s1q

        cmp             yd, vb_posd
        cmove           s3q, s2q

        LOAD_PIXELS     m0, [s0q]
        LOAD_PIXELS     m1, [s1q]
        LOAD_PIXELS     m2, [s2q]
        LOAD_PIXELS     m3, [s3q]

        ; the same rows, two pixels to the right
        LOAD_PIXELS     m4, [s0q + 2 * ps]
        LOAD_PIXELS     m5, [s1q + 2 * ps]
        LOAD_PIXELS     m6, [s2q + 2 * ps]
        LOAD_PIXELS     m7, [s3q + 2 * ps]

        pblendw         m8,  m0, m1, 0xaa       ; nw
        pblendw         m9,  m0, m5, 0x55       ; n
        pblendw         m10, m4, m5, 0xaa       ; ne
        pblendw         m11, m1, m2, 0xaa       ; w
        pblendw         m12, m5, m6, 0xaa       ; e
        pblendw         m13, m2, m3, 0xaa       ; sw
        pblendw         m14, m2, m7, 0x55       ; s

        pblendw         m0, m1, m6, 0x55
        paddw           m0, m0                  ; c

        movu            m1, [CLASSIFY_SHUFFE]
        pshufb          m1, m0, m1              ; d

        paddw           m9, m14                 ; n + s
        psubw           m9, m0                  ; (n + s) - c
        pabsw           m9, m9                  ; ver

        paddw           m11, m12                ; w + e
        psubw           m11, m1                 ; (w + e) - d
        pabsw           m11, m11                ; hor

        pblendw         m14, m6, m7, 0xaa       ; se
        paddw           m8, m14                 ; nw + se
        psubw           m8, m1                  ; (nw + se) - d
        pabsw           m8, m8                  ; di0

        paddw           m10, m13                ; ne + sw
        psubw           m10, m1                 ; (ne + sw) - d
        pabsw           m10, m10                ; di1

        phaddw          m9, m11                 ; vh,  each word represent 2x2 pixels
        phaddw          m8, m10                 ; di,  each word represent 2x2 pixels
        phaddw          m0, m9, m8              ; all = each word represent 4x2 pixels, order is v_h_d0_d1 x 4

        vinserti128     m15, m15, xm0, 1
        pblendw         m1, m0, m15, 0xaa       ; t

        phaddw          m1, m0                  ; each word represent 8x2 pixels, adjacent word share 4x2 pixels

        vextracti128    xm15, m0, 1             ; prev

        movu            [gradient_sumq], m1

        add             gradient_sumq, 32
        add             xd, 16
        cmp             xd, widthd
        jl              .loop_w

    lea             srcq, [srcq + 2 * src_strideq]
    add             yd, 2
    cmp             yd, heightd
    jl              .loop_h
    RET
%endmacro
|
|||
|
|||
; SAVE_CLASSIFY_PARAM_W16(dest, src)
; %1: pointer name without suffix, %2: register number.
; Writes the low 16 bytes of m%2 to [dest + x] and its high 16 bytes to
; [dest + x + width].  The two halves of m%2 are swapped; tempq is clobbered.
%macro SAVE_CLASSIFY_PARAM_W16 2
    lea             tempq, [%1q + xq]
    movu            [tempq], xm%2
    vperm2i128      m%2, m%2, m%2, 1            ; swap the 128-bit halves
    movu            [tempq + widthq], xm%2
%endmacro
|
|||
|
|||
; SAVE_CLASSIFY_PARAM_W8(dest, src)
; %1: address expression, %2: register number.
; Writes the low 8 bytes of each 128-bit half of m%2: the low half to
; [dest], the high half to [dest + width].  Leaves the halves of m%2 swapped.
%macro SAVE_CLASSIFY_PARAM_W8 2
    movq            [%1], xm%2
    vperm2i128      m%2, m%2, m%2, 1            ; swap the 128-bit halves
    movq            [%1 + widthq], xm%2
%endmacro
|
|||
|
|||
; SAVE_CLASSIFY_PARAM_W4(dest, src)
; %1: address expression, %2: register number.
; Writes the low 4 bytes of each 128-bit half of m%2: the low half to
; [dest], the high half to [dest + width].  Leaves the halves of m%2 swapped.
%macro SAVE_CLASSIFY_PARAM_W4 2
    movd            [%1], xm%2
    vperm2i128      m%2, m%2, m%2, 1            ; swap the 128-bit halves
    movd            [%1 + widthq], xm%2
%endmacro
|
|||
|
|||
; SAVE_CLASSIFY_PARAM_W(dest, src)
; Store for the last, narrower column; wd holds the remaining width.
;   wd <  8 : one dword per output row
;   wd == 8 : one qword per output row
;   wd >  8 : one qword per output row, then one more dword 8 bytes further
; %1: pointer name without suffix, %2: register number.
; Clobbers tempq and permutes m%2.
%macro SAVE_CLASSIFY_PARAM_W 2
    lea             tempq, [%1q + xq]
    cmp             wd, 8
    jl              %%w4
    SAVE_CLASSIFY_PARAM_W8 tempq, %2
    ; move the qwords that were not stored yet to the bottom of each half
    vpermq          m%2, m%2, 00010011b
    add             tempq, 8
    cmp             wd, 8                       ; add changed the flags, compare again
    je              %%end
%%w4:
    SAVE_CLASSIFY_PARAM_W4 tempq, %2
%%end:
%endmacro
|
|||
|
|||
; ALF_CLASSIFY_H8()
; Classifies one column of blocks (two output rows) from the gradient sums
; and stores transpose_idx and class_idx through SAVE_CLASSIFY_PARAM, which
; the caller must %define before expanding this macro.
; input:  gradient_sumq, sum_strideq, sum_stride3q, xd, yd, vb_posd,
;         bit_depthd (used directly as the shift count),
;         m15 = 0 (cleared by ALF_CLASSIFY and never written here)
; clobbers: gradq, m0-m14, plus whatever SAVE_CLASSIFY_PARAM clobbers
%macro ALF_CLASSIFY_H8 0
    ; first line, sum of 16x4 pixels (includes borders)
    lea             gradq, [gradient_sumq + 2 * xq]
    movu            m0, [gradq]
    movu            m1, [gradq + sum_strideq]
    movu            m2, [gradq + 2 * sum_strideq]

    ; m13 = all ones when y != vb_pos, zero otherwise
    pcmpeqb         m11, m11
    movd            xm13, yd
    vpbroadcastd    m13, xm13
    movd            xm12, vb_posd
    vpbroadcastd    m12, xm12
    pcmpeqd         m13, m12                    ; y == vb_pos
    pandn           m13, m11                    ; y != vb_pos

    ; ac = 2 when y != vb_pos, 3 otherwise
    vpbroadcastd    m14, [dw3]
    pblendvb        m14, m14, [dd2], m13        ; ac

    ; 4th line of the first row: zero when y == vb_pos
    pblendvb        m3, m15, [gradq + sum_stride3q], m13

    ; extent to dword to avoid overflow
    punpcklwd       m4,  m0, m15
    punpckhwd       m5,  m0, m15
    punpcklwd       m6,  m1, m15
    punpckhwd       m7,  m1, m15
    punpcklwd       m8,  m2, m15
    punpckhwd       m9,  m2, m15
    punpcklwd       m10, m3, m15
    punpckhwd       m11, m3, m15

    paddd           m0, m4, m6
    paddd           m1, m5, m7
    paddd           m2, m8, m10
    paddd           m3, m9, m11

    ; sum of the first row
    paddd           m0, m2                      ; low
    paddd           m1, m3                      ; high

    lea             gradq, [gradq + 2 * sum_strideq]

    ; 1st line of the second row: zero when y == vb_pos
    pblendvb        m10, m15, [gradq], m13

    movu            m11, [gradq + sum_strideq]
    movu            m12, [gradq + 2 * sum_strideq]
    movu            m13, [gradq + sum_stride3q]

    punpcklwd       m4,  m10, m15
    punpckhwd       m5,  m10, m15
    punpcklwd       m6,  m11, m15
    punpckhwd       m7,  m11, m15
    punpcklwd       m8,  m12, m15
    punpckhwd       m9,  m12, m15
    punpcklwd       m10, m13, m15
    punpckhwd       m11, m13, m15

    paddd           m2, m4, m6
    paddd           m3, m5, m7
    paddd           m4, m8, m10
    paddd           m5, m9, m11

    ; sum of the second row
    paddd           m2, m4                      ; low
    paddd           m3, m5                      ; high

    punpckldq       m4, m0, m2
    punpckhdq       m5, m0, m2
    punpckldq       m6, m1, m3
    punpckhdq       m7, m1, m3

    ; each dword represent 4x2 alf blocks
    ; the order is 01452367
    punpckldq       m0, m4, m6                  ; sum_v
    punpckhdq       m1, m4, m6                  ; sum_h
    punpckldq       m2, m5, m7                  ; sum_d0
    punpckhdq       m3, m5, m7                  ; sum_d1

    pcmpgtd         m4, m0, m1                  ; dir_hv - 1
    pmaxsd          m5, m0, m1                  ; hv1
    pminsd          m6, m0, m1                  ; hv0

    paddd           m0, m1                      ; sum_hv

    pcmpgtd         m7, m2, m3                  ; dir_d - 1
    pmaxsd          m8, m2, m3                  ; d1
    pminsd          m9, m2, m3                  ; d0

    ; *transpose_idx = dir_d * 2 + dir_hv;
    vpbroadcastd    m10, [dw3]
    paddd           m11, m7, m7
    paddd           m11, m4
    paddd           m10, m11
    vpermq          m10, m10, 11011000b
    SAVE_CLASSIFY_PARAM transpose_idx, 10

    ; 64-bit products, odd dwords first ("high"), then even dwords ("low")
    psrlq           m10, m8, 32
    psrlq           m11, m6, 32
    pmuldq          m12, m10, m11               ; d1 * hv0 high
    psrlq           m1, m9, 32
    psrlq           m2, m5, 32
    pmuldq          m3, m1, m2                  ; d0 * hv1 high
    pcmpgtq         m10, m12, m3                ; dir1 - 1 high

    pmuldq          m1, m8, m6                  ; d1 * hv0 low
    pmuldq          m2, m9, m5                  ; d0 * hv1 low
    pcmpgtq         m1, m2                      ; dir1 - 1 low

    vpblendd        m1, m1, m10, 0xaa           ; dir1 - 1

    pblendvb        m2, m5, m8, m1              ; hvd1
    pblendvb        m3, m6, m9, m1              ; hvd0

    movd            xm5, bit_depthd
    vpbroadcastd    m5, xm5

    ;*class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)];
    pmulld          m0, m14                     ; sum_hv * ac
    vpsrlvd         m0, m0, m5
    pminsd          m0, [dd15]
    movu            m6, [ARG_VAR_SHUFFE]
    pshufb          m6, m0                      ; class_idx

    vpbroadcastd    m10, [dw5]

    ; if (hvd1 * 2 > 9 * hvd0)
    ;     *class_idx += ((dir1 << 1) + 2) * 5;
    ; else if (hvd1 > 2 * hvd0)
    ;     *class_idx += ((dir1 << 1) + 1) * 5;
    paddd           m7, m3, m3
    pcmpgtd         m7, m2, m7                  ; hvd1 > 2 * hvd0
    pand            m7, m10
    paddd           m6, m7                      ; class_idx

    paddd           m8, m2, m2
    pslld           m9, m3, 3
    paddd           m9, m3
    pcmpgtd         m8, m9                      ; hvd1 * 2 > 9 * hvd0
    pand            m8, m10
    paddd           m6, m8                      ; class_idx

    pandn           m1, m7                      ; m1 = ~m1 & m7
    paddd           m1, m1                      ; dir1 << 1
    paddd           m6, m1                      ; class_idx
    vpermq          m6, m6, 11011000b

    SAVE_CLASSIFY_PARAM class_idx, 6
%endmacro
|
|||
|
|||
; ALF_CLASSIFY_16x8()
; ALF_CLASSIFY_H8 with the full-width (16 byte per row) store routine.
%macro ALF_CLASSIFY_16x8 0
    %define SAVE_CLASSIFY_PARAM SAVE_CLASSIFY_PARAM_W16
    ALF_CLASSIFY_H8
    %undef SAVE_CLASSIFY_PARAM
%endmacro
|
|||
|
|||
; ALF_CLASSIFY_Wx8()
; ALF_CLASSIFY_H8 with the store routine that dispatches on the remaining
; width in wd.
%macro ALF_CLASSIFY_Wx8 0
    %define SAVE_CLASSIFY_PARAM SAVE_CLASSIFY_PARAM_W
    ALF_CLASSIFY_H8
    %undef SAVE_CLASSIFY_PARAM
%endmacro
|
|||
|
|||
; ******************************
; void ff_vvc_alf_classify(int *class_idx, int *transpose_idx, const int *gradient_sum,
;      intptr_t width, intptr_t height, intptr_t vb_pos, intptr_t bit_depth);
;
; NOTE(review): an earlier version of this comment also listed an
; "int *gradient_tmp" argument before bit_depth; cglobal below declares seven
; arguments without it -- confirm against the C prototype.
;
; ALF_CLASSIFY(bpc) defines ps, emits the gradient function through
; ALF_CLASSIFY_GRAD and then the classify function.  The classify function
; handles 8 rows per outer iteration and 16 columns per inner iteration,
; followed by one narrower column when width is not a multiple of 16.
; ******************************
%macro ALF_CLASSIFY 1
%define ps (%1 / 8)
ALF_CLASSIFY_GRAD %1
cglobal vvc_alf_classify_%1bpc, 7, 15, 16, class_idx, transpose_idx, gradient_sum, width, height, vb_pos, bit_depth, \
                                x, y, grad, sum_stride, sum_stride3, temp, w

    sub             bit_depthq, 1               ; shift count used by ALF_CLASSIFY_H8

    ; now we can use gradient to get class idx and transpose idx
    lea             sum_strideq, [widthd + ALF_GRADIENT_BORDER * 2]
    add             sum_strideq, 15
    and             sum_strideq, ~15            ; align to 16
    add             sum_strideq, sum_strideq    ; two rows a time

    add             gradient_sumq, 8            ; first 4 words are garbage

    lea             sum_stride3q, [3 * sum_strideq]

    xor             yd, yd
    and             vb_posd, ~7                 ; floor align to 8
    pxor            m15, m15                    ; zero register for ALF_CLASSIFY_H8

.loop_sum_h:
    xor             xd, xd
    .loop_sum_w16:
        ; w = width - x, the columns still to do
        mov             wd, widthd
        sub             wd, xd
        cmp             wd, 16
        jl              .loop_sum_w16_end

        ALF_CLASSIFY_16x8

        add             xd, 16
        jmp             .loop_sum_w16
    .loop_sum_w16_end:

    test            wd, wd
    jz              .loop_sum_w_end

    ALF_CLASSIFY_Wx8

.loop_sum_w_end:
    ; advance the input by four gradient lines and both outputs by the two
    ; rows that were just written
    lea             gradient_sumq, [gradient_sumq + 4 * sum_strideq]
    lea             transpose_idxq, [transpose_idxq + 2 * widthq]
    lea             class_idxq, [class_idxq + 2 * widthq]

    add             yd, 8
    cmp             yd, heightd
    jl              .loop_sum_h

    RET
%endmacro
|
|||
|
|||
; Instantiate the AVX2 functions for the 16 and 8 bpc variants.  The code is
; only assembled for x86-64 builds with external AVX2 support enabled.
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ALF_FILTER   16
ALF_FILTER   8
ALF_CLASSIFY 16
ALF_CLASSIFY 8
%endif
%endif
|