|
|
|
@ -843,6 +843,88 @@ DEBLOCK_LUMA_INTRA |
|
|
|
|
mova [r0+2*r1], m2 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; expands to [base],...,[base+7*stride] |
|
|
|
|
%define PASS8ROWS(base, base3, stride, stride3) \ |
|
|
|
|
[base], [base+stride], [base+stride*2], [base3], \ |
|
|
|
|
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] |
|
|
|
|
|
|
|
|
|
; in: 8 rows of 4 words in %4..%11 |
|
|
|
|
; out: 4 rows of 8 words in m0..m3 |
|
|
|
|
%macro TRANSPOSE4x8W_LOAD 8 |
|
|
|
|
movq m0, %1 |
|
|
|
|
movq m2, %2 |
|
|
|
|
movq m1, %3 |
|
|
|
|
movq m3, %4 |
|
|
|
|
|
|
|
|
|
punpcklwd m0, m2 |
|
|
|
|
punpcklwd m1, m3 |
|
|
|
|
punpckhdq m2, m0, m1 |
|
|
|
|
punpckldq m0, m1 |
|
|
|
|
|
|
|
|
|
movq m4, %5 |
|
|
|
|
movq m6, %6 |
|
|
|
|
movq m5, %7 |
|
|
|
|
movq m3, %8 |
|
|
|
|
|
|
|
|
|
punpcklwd m4, m6 |
|
|
|
|
punpcklwd m5, m3 |
|
|
|
|
punpckhdq m6, m4, m5 |
|
|
|
|
punpckldq m4, m5 |
|
|
|
|
|
|
|
|
|
punpckhqdq m1, m0, m4 |
|
|
|
|
punpcklqdq m0, m4 |
|
|
|
|
punpckhqdq m3, m2, m6 |
|
|
|
|
punpcklqdq m2, m6 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; in: 4 rows of 8 words in m0..m3 |
|
|
|
|
; out: 8 rows of 4 words in %1..%8 |
|
|
|
|
%macro TRANSPOSE8x4W_STORE 8 |
|
|
|
|
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
|
|
|
|
movq %1, m0 |
|
|
|
|
movhps %2, m0 |
|
|
|
|
movq %3, m1 |
|
|
|
|
movhps %4, m1 |
|
|
|
|
movq %5, m2 |
|
|
|
|
movhps %6, m2 |
|
|
|
|
movq %7, m3 |
|
|
|
|
movhps %8, m3 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; %1 = base + 3*stride |
|
|
|
|
; %2 = 3*stride (unused on mmx) |
|
|
|
|
; %3, %4 = place to store p1 and q1 values |
|
|
|
|
%macro CHROMA_H_LOAD 4 |
|
|
|
|
%if mmsize == 8 |
|
|
|
|
movq m0, [pix_q - 4] |
|
|
|
|
movq m1, [pix_q + stride_q - 4] |
|
|
|
|
movq m2, [pix_q + 2*stride_q - 4] |
|
|
|
|
movq m3, [%1 - 4] |
|
|
|
|
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
|
|
|
|
%else |
|
|
|
|
TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2) |
|
|
|
|
%endif |
|
|
|
|
mova %3, m0 |
|
|
|
|
mova %4, m3 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; %1 = base + 3*stride |
|
|
|
|
; %2 = 3*stride (unused on mmx) |
|
|
|
|
; %3, %4 = place to load p1 and q1 values |
|
|
|
|
%macro CHROMA_H_STORE 4 |
|
|
|
|
mova m0, %3 |
|
|
|
|
mova m3, %4 |
|
|
|
|
%if mmsize == 8 |
|
|
|
|
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
|
|
|
|
movq [pix_q - 4], m0 |
|
|
|
|
movq [pix_q + stride_q - 4], m1 |
|
|
|
|
movq [pix_q + 2*stride_q - 4], m2 |
|
|
|
|
movq [%1 - 4], m3 |
|
|
|
|
%else |
|
|
|
|
TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2) |
|
|
|
|
%endif |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%macro CHROMA_V_LOAD_TC 2 |
|
|
|
|
movd %1, [%2] |
|
|
|
|
punpcklbw %1, %1 |
|
|
|
@ -914,6 +996,42 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16) |
|
|
|
|
%else |
|
|
|
|
RET |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
;----------------------------------------------------------------------------- |
|
|
|
|
; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta, |
|
|
|
|
; int8_t *tc0) |
|
|
|
|
;----------------------------------------------------------------------------- |
|
|
|
|
cglobal deblock_h_chroma_10, 5, 7, 8, 2*mmsize, pix_, stride_, alpha_, beta_, tc0_ |
|
|
|
|
shl alpha_d, 2 |
|
|
|
|
shl beta_d, 2 |
|
|
|
|
mov r5, pix_q |
|
|
|
|
lea r6, [3*stride_q] |
|
|
|
|
add r5, r6 |
|
|
|
|
%if mmsize == 8 |
|
|
|
|
mov r6d, 2 |
|
|
|
|
.loop: |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize] |
|
|
|
|
LOAD_AB m4, m5, alpha_d, beta_d |
|
|
|
|
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 |
|
|
|
|
pxor m4, m4 |
|
|
|
|
CHROMA_V_LOAD_TC m6, tc0_q |
|
|
|
|
psubw m6, [pw_3] |
|
|
|
|
pmaxsw m6, m4 |
|
|
|
|
pand m7, m6 |
|
|
|
|
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 |
|
|
|
|
CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize] |
|
|
|
|
|
|
|
|
|
%if mmsize == 8 |
|
|
|
|
lea pix_q, [pix_q + 4*stride_q] |
|
|
|
|
lea r5, [r5 + 4*stride_q] |
|
|
|
|
add tc0_q, 2 |
|
|
|
|
dec r6d |
|
|
|
|
jg .loop |
|
|
|
|
%endif |
|
|
|
|
RET |
|
|
|
|
|
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%if ARCH_X86_64 == 0 |
|
|
|
|