x86/vvc_alf: avoid overwriting for non-16 aligned widths

Previously, the code was allowed to overwrite pixels up to the next 16-aligned boundary. That was
safe when there were no picture virtual boundaries, because both CTU sizes and strides are
16-aligned. With picture virtual boundaries, however, each CTU is split into four ALF blocks, so
such overwriting can clobber pixels that belong to later CTUs.

When picture virtual boundaries are involved, each ALF block is 8-pixel aligned.
For luma the block width is therefore always a multiple of 8. For chroma in 4:2:0
format the width is halved by the horizontal subsampling, so a 4-aligned width has
to be handled as well.
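
A minimal C sketch of the required store pattern (not part of the commit); store16/store8/store4
and store_tail are hypothetical helpers standing in for the movu/movq/movd stores used in the asm:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-ins for the asm's movu / movq / movd stores. */
    static void store16(uint8_t *dst, const uint8_t *px) { memcpy(dst, px, 16); }
    static void store8 (uint8_t *dst, const uint8_t *px) { memcpy(dst, px,  8); }
    static void store4 (uint8_t *dst, const uint8_t *px) { memcpy(dst, px,  4); }

    /* Write exactly `width` pixels (4, 8, 12 or 16) so nothing spills into
     * the pixels of the following ALF block / CTU. */
    static void store_tail(uint8_t *dst, const uint8_t *px, int width)
    {
        if (width == 16) {
            store16(dst, px);               /* full block: STORE_PIXELS_W16   */
        } else if (width >= 8) {            /* width 8 or 12                  */
            store8(dst, px);                /* STORE_PIXELS_W8                */
            if (width == 12)
                store4(dst + 8, px + 8);    /* STORE_PIXELS_W4 at offset 8    */
        } else {                            /* width 4: 4:2:0 chroma only     */
            store4(dst, px);
        }
    }

Since a non-16 luma block is always exactly 8 pixels wide, the asm routes luma straight through
STORE_PIXELS_W8 and only chroma goes through the width dispatch in STORE_PIXELS_W8LE.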
Branch: release/7.1
Author: Nuo Mi
parent 1fa9f5b17f
commit 6b0e6a98b5
      libavcodec/x86/vvc/vvc_alf.asm

@@ -324,18 +324,69 @@ SECTION .text
     %endif
 %endmacro
 
-; STORE_PIXELS(dst, src)
-%macro STORE_PIXELS 2
+; STORE_PIXELS_W16(dst, src)
+%macro STORE_PIXELS_W16 2
     %if ps == 2
-        movu              %1, m%2
+        movu            [%1], m%2
     %else
+        movu            [%1], xm%2
+    %endif
+%endmacro
+
+%macro STORE_PIXELS_W8 2
+    %if ps == 2
+        movu            [%1], xm%2
+    %else
+        movq            [%1], xm%2
+    %endif
+%endmacro
+
+; STORE_PIXELS_W4(dst, src, offset)
+%macro STORE_PIXELS_W4 3
+    %if ps == 2
+        movq            [%1 + %3 * ps], xm%2
+    %else
+        movd            [%1 + %3], xm%2
+    %endif
+%endmacro
+
+%macro STORE_PIXELS_W8LE 3
+    cmp               %3, 8
+    jl .w4
+    STORE_PIXELS_W8   %1, %2
+    cmp               %3, 12
+    %if ps == 2
+        vpermq       m%2, m%2, q0302
+    %else
+        vpermq       m%2, m%2, q0101
+    %endif
+    jl .end
+    STORE_PIXELS_W4   %1, %2, 8
+    jmp .end
+.w4:
+    STORE_PIXELS_W4   %1, %2, 0
+.end:
+%endmacro
+
+; STORE_PIXELS(dst, src, width)
+%macro STORE_PIXELS 3
+    %if ps == 1
         packuswb         m%2, m%2
         vpermq           m%2, m%2, 0x8
-        movu              %1, xm%2
+    %endif
+    %ifidn %3, 16
+        STORE_PIXELS_W16   %1, %2
+    %else
+        %if LUMA
+            STORE_PIXELS_W8   %1, %2
+        %else
+            STORE_PIXELS_W8LE %1, %2, %3
+        %endif
     %endif
 %endmacro
 
-%macro FILTER_16x4 0
+%macro FILTER_16x4 1
     %if LUMA
         push            clipq
         push          strideq
@@ -362,7 +413,7 @@ SECTION .text
 
         ; clip to pixel
         CLIPW             m0, m14, m15
-        STORE_PIXELS [dstq], 0
+        STORE_PIXELS    dstq, 0, %1
 
         lea             srcq, [srcq + src_strideq]
         lea             dstq, [dstq + dst_strideq]
@@ -399,7 +450,7 @@ SECTION .text
 ;      const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt ptr_diff_t height,
 ;      const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max);
 ; ******************************
-cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \
+cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \
     offset, x, s5, s6
 %define ps (%1 / 8) ; pixel size
     movd            xm15, pixel_maxd
@@ -409,18 +460,32 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, dst_stride, src, src_s
 .loop:
     push            srcq
     push            dstq
+    push          widthq
 
     xor               xq, xq
     .loop_w:
+        cmp       widthq, 16
+        jl .loop_w_end
+
         LOAD_PARAMS
-        FILTER_16x4
+        FILTER_16x4 16
 
         add             srcq, 16 * ps
         add             dstq, 16 * ps
         add               xq, 16
-        cmp               xq, widthq
-        jl .loop_w
+        sub           widthq, 16
+        jmp .loop_w
+
+    .loop_w_end:
+        cmp           widthq, 0
+        je .w_end
+        LOAD_PARAMS
+        FILTER_16x4 widthq
+
+    .w_end:
+    pop           widthq
     pop             dstq
     pop             srcq
 
     lea             srcq, [srcq + 4 * src_strideq]
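
In C terms the restructured width loop amounts to the sketch below: full 16-wide columns are
filtered first, then at most one partial column of width 4, 8 or 12. This is an illustration only;
alf_filter_16x4 is a hypothetical stand-in for LOAD_PARAMS + FILTER_16x4, and ps is the pixel size
as in the asm.

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical stand-in for LOAD_PARAMS + FILTER_16x4: filters a block
     * that is `w` pixels wide (w <= 16) and 4 rows high. */
    typedef void (*alf_block_fn)(uint8_t *dst, const uint8_t *src, int w);

    static void filter_row(uint8_t *dst, const uint8_t *src, ptrdiff_t width,
                           int ps, alf_block_fn alf_filter_16x4)
    {
        /* Full 16-wide columns first... */
        while (width >= 16) {
            alf_filter_16x4(dst, src, 16);
            dst   += 16 * ps;
            src   += 16 * ps;
            width -= 16;
        }
        /* ...then at most one partial column, never writing past its width. */
        if (width > 0)
            alf_filter_16x4(dst, src, (int)width);
    }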
