|
|
|
@ -99,8 +99,6 @@ section .text align=16 |
|
|
|
|
%ifidn %1, row |
|
|
|
|
psubw m10,[row_round] |
|
|
|
|
%endif |
|
|
|
|
SIGNEXTEND m8, m9, m14 ; { row[2] }[0-3] / [4-7] |
|
|
|
|
SIGNEXTEND m10, m11, m14 ; { row[0] }[0-3] / [4-7] |
|
|
|
|
pmaddwd m2, m0, [w4_plus_w6] |
|
|
|
|
pmaddwd m3, m1, [w4_plus_w6] |
|
|
|
|
pmaddwd m4, m0, [w4_min_w6] |
|
|
|
@ -114,54 +112,28 @@ section .text align=16 |
|
|
|
|
; a1: -1*row[0] |
|
|
|
|
; a2: -1*row[0] |
|
|
|
|
; a3: -1*row[0]+1*row[2] |
|
|
|
|
psubd m2, m10 ; a1[0-3] |
|
|
|
|
psubd m3, m11 ; a1[4-7] |
|
|
|
|
psubd m4, m10 ; a2[0-3] |
|
|
|
|
psubd m5, m11 ; a2[4-7] |
|
|
|
|
psubd m0, m10 |
|
|
|
|
psubd m1, m11 |
|
|
|
|
psubd m6, m10 |
|
|
|
|
psubd m7, m11 |
|
|
|
|
psubd m0, m8 ; a0[0-3] |
|
|
|
|
psubd m1, m9 ; a0[4-7] |
|
|
|
|
paddd m6, m8 ; a3[0-3] |
|
|
|
|
paddd m7, m9 ; a3[4-7] |
|
|
|
|
|
|
|
|
|
; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4] |
|
|
|
|
; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6] |
|
|
|
|
; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6] |
|
|
|
|
; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4] |
|
|
|
|
SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7] |
|
|
|
|
SIGNEXTEND m13, m14, m10 ; { row[4] }[0-3] / [4-7] |
|
|
|
|
pmaddwd m10, m8, [w4_plus_w6] |
|
|
|
|
pmaddwd m11, m9, [w4_plus_w6] |
|
|
|
|
psubd m10, m13 |
|
|
|
|
psubd m11, m14 |
|
|
|
|
paddd m0, m10 ; a0[0-3] |
|
|
|
|
paddd m1, m11 ; a0[4-7] |
|
|
|
|
pmaddwd m10, m8, [w4_min_w6] |
|
|
|
|
pmaddwd m11, m9, [w4_min_w6] |
|
|
|
|
psubd m10, m13 |
|
|
|
|
psubd m11, m14 |
|
|
|
|
paddd m6, m10 ; a3[0-3] |
|
|
|
|
paddd m7, m11 ; a3[4-7] |
|
|
|
|
pmaddwd m10, m8, [w4_min_w2] |
|
|
|
|
pmaddwd m11, m9, [w4_min_w2] |
|
|
|
|
pmaddwd m8, [w4_plus_w2] |
|
|
|
|
pmaddwd m9, [w4_plus_w2] |
|
|
|
|
psubd m10, m13 |
|
|
|
|
psubd m11, m14 |
|
|
|
|
psubd m8, m13 |
|
|
|
|
psubd m9, m14 |
|
|
|
|
psubd m4, m10 ; a2[0-3] intermediate |
|
|
|
|
psubd m5, m11 ; a2[4-7] intermediate |
|
|
|
|
psubd m2, m8 ; a1[0-3] intermediate |
|
|
|
|
psubd m3, m9 ; a1[4-7] intermediate |
|
|
|
|
SIGNEXTEND m12, m13, m10 ; { row[6] }[0-3] / [4-7] |
|
|
|
|
psubd m4, m12 ; a2[0-3] |
|
|
|
|
psubd m5, m13 ; a2[4-7] |
|
|
|
|
paddd m2, m12 ; a1[0-3] |
|
|
|
|
paddd m3, m13 ; a1[4-7] |
|
|
|
|
|
|
|
|
|
; load/store |
|
|
|
|
mova [r2+ 0], m0 |
|
|
|
@ -192,8 +164,6 @@ section .text align=16 |
|
|
|
|
; b3 = MUL(W7, row[1]); |
|
|
|
|
; MAC(b3, -W5, row[3]); |
|
|
|
|
SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7] |
|
|
|
|
SIGNEXTEND m10, m11, m12 ; { row[1] }[0-3] / [4-7] |
|
|
|
|
SIGNEXTEND m8, m9, m12 ; { row[3] }[0-3] / [4-7] |
|
|
|
|
pmaddwd m2, m0, [w3_min_w7] |
|
|
|
|
pmaddwd m3, m1, [w3_min_w7] |
|
|
|
|
pmaddwd m4, m0, [w5_min_w1] |
|
|
|
@ -207,22 +177,6 @@ section .text align=16 |
|
|
|
|
; b1: +2*row[1]-1*row[3] |
|
|
|
|
; b2: -1*row[1]-1*row[3] |
|
|
|
|
; b3: +1*row[1]+1*row[3] |
|
|
|
|
psubd m2, m8 |
|
|
|
|
psubd m3, m9 |
|
|
|
|
paddd m0, m8 |
|
|
|
|
paddd m1, m9 |
|
|
|
|
paddd m8, m10 ; { row[1] + row[3] }[0-3] |
|
|
|
|
paddd m9, m11 ; { row[1] + row[3] }[4-7] |
|
|
|
|
paddd m10, m10 |
|
|
|
|
paddd m11, m11 |
|
|
|
|
paddd m0, m8 ; b0[0-3] |
|
|
|
|
paddd m1, m9 ; b0[4-7] |
|
|
|
|
paddd m2, m10 ; b1[0-3] |
|
|
|
|
paddd m3, m11 ; b1[4-7] |
|
|
|
|
psubd m4, m8 ; b2[0-3] |
|
|
|
|
psubd m5, m9 ; b2[4-7] |
|
|
|
|
paddd m6, m8 ; b3[0-3] |
|
|
|
|
paddd m7, m9 ; b3[4-7] |
|
|
|
|
|
|
|
|
|
; MAC(b0, W5, row[5]); |
|
|
|
|
; MAC(b0, W7, row[7]); |
|
|
|
@ -233,29 +187,11 @@ section .text align=16 |
|
|
|
|
; MAC(b3, W3, row[5]); |
|
|
|
|
; MAC(b3, -W1, row[7]); |
|
|
|
|
SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7] |
|
|
|
|
SIGNEXTEND m13, m12, m11 ; { row[5] }[0-3] / [4-7] |
|
|
|
|
SIGNEXTEND m14, m11, m10 ; { row[7] }[0-3] / [4-7] |
|
|
|
|
|
|
|
|
|
; b0: -1*row[5]+1*row[7] |
|
|
|
|
; b1: -1*row[5]+1*row[7] |
|
|
|
|
; b2: +1*row[5]+2*row[7] |
|
|
|
|
; b3: +2*row[5]-1*row[7] |
|
|
|
|
paddd m4, m13 |
|
|
|
|
paddd m5, m12 |
|
|
|
|
paddd m6, m13 |
|
|
|
|
paddd m7, m12 |
|
|
|
|
psubd m13, m14 ; { row[5] - row[7] }[0-3] |
|
|
|
|
psubd m12, m11 ; { row[5] - row[7] }[4-7] |
|
|
|
|
paddd m14, m14 |
|
|
|
|
paddd m11, m11 |
|
|
|
|
psubd m0, m13 |
|
|
|
|
psubd m1, m12 |
|
|
|
|
psubd m2, m13 |
|
|
|
|
psubd m3, m12 |
|
|
|
|
paddd m4, m14 |
|
|
|
|
paddd m5, m11 |
|
|
|
|
paddd m6, m13 |
|
|
|
|
paddd m7, m12 |
|
|
|
|
|
|
|
|
|
pmaddwd m10, m8, [w1_plus_w5] |
|
|
|
|
pmaddwd m11, m9, [w1_plus_w5] |
|
|
|
@ -374,25 +310,9 @@ cglobal prores_idct_put_10_%1, 4, 4, %2 |
|
|
|
|
RET |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; signextend_sse2 dstlow, dsthigh, tmp |
; Widens the eight signed words held in %1 to signed dwords without pmovsxwd: |
;   in:  %1 = 8 packed signed words |
;   out: %1 = words 0-3 as dwords, %2 = words 4-7 as dwords |
;   clobbers: %3 (left holding the per-word sign mask) |
; The three operands are expected to be distinct registers. |
%macro signextend_sse2 3 ; dstlow, dsthigh, tmp |
|
|
|
|
; %3 = 0, then (0 > word) per lane: 0xFFFF for negative words, 0 otherwise |
pxor %3, %3 |
|
|
|
|
pcmpgtw %3, %1 |
|
|
|
|
; keep a copy of the source, because the low-half unpack overwrites %1 |
mova %2, %1 |
|
|
|
|
; interleave each word with its sign mask word -> sign-extended dwords |
punpcklwd %1, %3 |
|
|
|
|
punpckhwd %2, %3 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; signextend_sse4 dstlow, dsthigh[, tmp] |
; Same result as signextend_sse2, using pmovsxwd: |
;   in:  %1 = 8 packed signed words |
;   out: %1 = words 0-3 as dwords, %2 = words 4-7 as dwords |
; An optional third argument is accepted but never referenced, so call sites |
; written for the three-operand sse2 variant work unchanged. |
%macro signextend_sse4 2-3 ; dstlow, dsthigh |
|
|
|
|
; move the upper 64 bits of %1 into the low half of %2; this has to happen |
; before %1 is overwritten by its own widening below |
movhlps %2, %1 |
|
|
|
|
; pmovsxwd sign-extends the low four words of its source to four dwords |
pmovsxwd %1, %1 |
|
|
|
|
pmovsxwd %2, %2 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; Instantiate idct_put_fn once per instruction set. The first argument is |
; pasted into the function name (prores_idct_put_10_%1) and the second is |
; forwarded as the last cglobal argument. |
; NOTE(review): INIT_XMM / INIT_AVX are not defined in this view - presumably |
; they come from x86inc; confirm. |
; SSE2 build: SIGNEXTEND expands to the pxor/pcmpgtw/punpck sequence. |
INIT_XMM |
|
|
|
|
%define SIGNEXTEND signextend_sse2 |
|
|
|
|
idct_put_fn sse2, 16 |
|
|
|
|
; SSE4 build: SIGNEXTEND is redefined to the pmovsxwd-based variant. |
INIT_XMM |
|
|
|
|
%define SIGNEXTEND signextend_sse4 |
|
|
|
|
idct_put_fn sse4, 16 |
|
|
|
|
; AVX build: no new %define precedes this instantiation here, so SIGNEXTEND |
; is still signextend_sse4 from the lines above. |
INIT_AVX |
|
|
|
|
idct_put_fn avx, 16 |
|
|
|
|