|
|
|
@ -1170,6 +1170,62 @@ DR_FUNCS 2 |
|
|
|
|
INIT_XMM avx |
|
|
|
|
DR_FUNCS 2 |
|
|
|
|
|
|
|
|
|
%if HAVE_AVX2_EXTERNAL |
|
|
|
|
INIT_YMM avx2 |
|
|
|
|
cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a |
|
|
|
|
mova m0, [lq] ; klmnopqrstuvwxyz |
|
|
|
|
movu m1, [aq-2] ; *abcdefghijklmno |
|
|
|
|
mova m2, [aq] ; abcdefghijklmnop |
|
|
|
|
vperm2i128 m4, m2, m2, q2001 ; ijklmnop........ |
|
|
|
|
vpalignr m5, m4, m2, 2 ; bcdefghijklmnop. |
|
|
|
|
vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg |
|
|
|
|
LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO. |
|
|
|
|
vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz* |
|
|
|
|
vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a |
|
|
|
|
LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ# |
|
|
|
|
vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH |
|
|
|
|
DEFINE_ARGS dst, stride, stride3, stride5, dst3 |
|
|
|
|
lea dst3q, [dstq+strideq*4] |
|
|
|
|
lea stride3q, [strideq*3] |
|
|
|
|
lea stride5q, [stride3q+strideq*2] |
|
|
|
|
|
|
|
|
|
vpalignr m3, m5, m0, 2 |
|
|
|
|
vpalignr m4, m1, m5, 2 |
|
|
|
|
mova [dst3q+stride5q*2], m3 ; 14 |
|
|
|
|
mova [ dstq+stride3q*2], m4 ; 6 |
|
|
|
|
vpalignr m3, m5, m0, 4 |
|
|
|
|
vpalignr m4, m1, m5, 4 |
|
|
|
|
sub dst3q, strideq |
|
|
|
|
mova [dst3q+stride5q*2], m3 ; 13 |
|
|
|
|
mova [dst3q+strideq*2 ], m4 ; 5 |
|
|
|
|
mova [dst3q+stride3q*4], m0 ; 15 |
|
|
|
|
vpalignr m3, m5, m0, 6 |
|
|
|
|
vpalignr m4, m1, m5, 6 |
|
|
|
|
mova [dstq+stride3q*4], m3 ; 12 |
|
|
|
|
mova [dst3q+strideq*1], m4 ; 4 |
|
|
|
|
vpalignr m3, m5, m0, 8 |
|
|
|
|
vpalignr m4, m1, m5, 8 |
|
|
|
|
mova [dst3q+strideq*8], m3 ; 11 |
|
|
|
|
mova [dst3q+strideq*0], m4 ; 3 |
|
|
|
|
vpalignr m3, m5, m0, 10 |
|
|
|
|
vpalignr m4, m1, m5, 10 |
|
|
|
|
mova [dstq+stride5q*2], m3 ; 10 |
|
|
|
|
mova [dstq+strideq*2 ], m4 ; 2 |
|
|
|
|
vpalignr m3, m5, m0, 12 |
|
|
|
|
vpalignr m4, m1, m5, 12 |
|
|
|
|
mova [dst3q+stride3q*2], m3 ; 9 |
|
|
|
|
mova [dstq+strideq*1 ], m4 ; 1 |
|
|
|
|
vpalignr m3, m5, m0, 14 |
|
|
|
|
vpalignr m4, m1, m5, 14 |
|
|
|
|
mova [dstq+strideq*8], m3 ; 8 |
|
|
|
|
mova [dstq+strideq*0], m4 ; 0 |
|
|
|
|
sub dstq, strideq |
|
|
|
|
mova [dst3q+strideq*4], m5 ; 7 |
|
|
|
|
mova [ dstq+strideq*0], m1 ; -1 |
|
|
|
|
RET |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function |
|
|
|
|
cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a |
|
|
|
|
movifnidn aq, amp |
|
|
|
|