|
|
|
@ -37,125 +37,133 @@ ex_b: db 2,-1,-1,-1,5,-1,-1,-1,8,-1,-1,-1,11,-1,-1,-1 |
|
|
|
|
SECTION .text |
|
|
|
|
|
|
|
|
|
INIT_XMM sse4 |
|
|
|
|
cglobal anaglyph, 11, 13, 16, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, ana_matrix_r, ana_matrix_g, ana_matrix_b |
|
|
|
|
movu m13, [ana_matrix_rq+ 0] |
|
|
|
|
movq m15, [ana_matrix_rq+16] |
|
|
|
|
pshufd m10, m13, q0000 |
|
|
|
|
pshufd m11, m13, q1111 |
|
|
|
|
pshufd m12, m13, q2222 |
|
|
|
|
pshufd m13, m13, q3333 |
|
|
|
|
pshufd m14, m15, q0000 |
|
|
|
|
pshufd m15, m15, q1111 |
|
|
|
|
mova [rsp+mmsize*0], m10 |
|
|
|
|
mova [rsp+mmsize*1], m11 |
|
|
|
|
mova [rsp+mmsize*2], m12 |
|
|
|
|
mova [rsp+mmsize*3], m13 |
|
|
|
|
mova [rsp+mmsize*4], m14 |
|
|
|
|
mova [rsp+mmsize*5], m15 |
|
|
|
|
|
|
|
|
|
movu m13, [ana_matrix_gq+ 0] |
|
|
|
|
movq m15, [ana_matrix_gq+16] |
|
|
|
|
pshufd m10, m13, q0000 |
|
|
|
|
pshufd m11, m13, q1111 |
|
|
|
|
pshufd m12, m13, q2222 |
|
|
|
|
pshufd m13, m13, q3333 |
|
|
|
|
pshufd m14, m15, q0000 |
|
|
|
|
pshufd m15, m15, q1111 |
|
|
|
|
mova [rsp+mmsize*6 ], m10 |
|
|
|
|
mova [rsp+mmsize*7 ], m11 |
|
|
|
|
mova [rsp+mmsize*8 ], m12 |
|
|
|
|
mova [rsp+mmsize*9 ], m13 |
|
|
|
|
mova [rsp+mmsize*10], m14 |
|
|
|
|
mova [rsp+mmsize*11], m15 |
|
|
|
|
|
|
|
|
|
movu m13, [ana_matrix_bq+ 0] |
|
|
|
|
movq m15, [ana_matrix_bq+16] |
|
|
|
|
pshufd m10, m13, q0000 |
|
|
|
|
pshufd m11, m13, q1111 |
|
|
|
|
pshufd m12, m13, q2222 |
|
|
|
|
pshufd m13, m13, q3333 |
|
|
|
|
pshufd m14, m15, q0000 |
|
|
|
|
pshufd m15, m15, q1111 |
|
|
|
|
cglobal anaglyph, 6, 10, 14, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, o, cnt |
|
|
|
|
%define ana_matrix_rq r6q |
|
|
|
|
%define ana_matrix_gq r7q |
|
|
|
|
%define ana_matrix_bq r8q |
|
|
|
|
mov ana_matrix_rq, r8m |
|
|
|
|
mov ana_matrix_gq, r9m |
|
|
|
|
mov ana_matrix_bq, r10m |
|
|
|
|
movu m3, [ana_matrix_rq+ 0] |
|
|
|
|
movq m5, [ana_matrix_rq+16] |
|
|
|
|
pshufd m0, m3, q0000 |
|
|
|
|
pshufd m1, m3, q1111 |
|
|
|
|
pshufd m2, m3, q2222 |
|
|
|
|
pshufd m3, m3, q3333 |
|
|
|
|
pshufd m4, m5, q0000 |
|
|
|
|
pshufd m5, m5, q1111 |
|
|
|
|
mova [rsp+mmsize*0], m0 |
|
|
|
|
mova [rsp+mmsize*1], m1 |
|
|
|
|
mova [rsp+mmsize*2], m2 |
|
|
|
|
mova [rsp+mmsize*3], m3 |
|
|
|
|
mova [rsp+mmsize*4], m4 |
|
|
|
|
mova [rsp+mmsize*5], m5 |
|
|
|
|
|
|
|
|
|
movu m3, [ana_matrix_gq+ 0] |
|
|
|
|
movq m5, [ana_matrix_gq+16] |
|
|
|
|
pshufd m0, m3, q0000 |
|
|
|
|
pshufd m1, m3, q1111 |
|
|
|
|
pshufd m2, m3, q2222 |
|
|
|
|
pshufd m3, m3, q3333 |
|
|
|
|
pshufd m4, m5, q0000 |
|
|
|
|
pshufd m5, m5, q1111 |
|
|
|
|
mova [rsp+mmsize*6 ], m0 |
|
|
|
|
mova [rsp+mmsize*7 ], m1 |
|
|
|
|
mova [rsp+mmsize*8 ], m2 |
|
|
|
|
mova [rsp+mmsize*9 ], m3 |
|
|
|
|
mova [rsp+mmsize*10], m4 |
|
|
|
|
mova [rsp+mmsize*11], m5 |
|
|
|
|
|
|
|
|
|
movu m11, [ana_matrix_bq+ 0] |
|
|
|
|
movq m13, [ana_matrix_bq+16] |
|
|
|
|
pshufd m8, m11, q0000 |
|
|
|
|
pshufd m9, m11, q1111 |
|
|
|
|
pshufd m10, m11, q2222 |
|
|
|
|
pshufd m11, m11, q3333 |
|
|
|
|
pshufd m12, m13, q0000 |
|
|
|
|
pshufd m13, m13, q1111 |
|
|
|
|
mov widthd, dword widthm |
|
|
|
|
mov heightd, dword heightm |
|
|
|
|
|
|
|
|
|
.nextrow: |
|
|
|
|
mov r11q, widthq |
|
|
|
|
mov r12q, 0 |
|
|
|
|
%define o r12q |
|
|
|
|
mov od, widthd |
|
|
|
|
xor cntd, cntd |
|
|
|
|
|
|
|
|
|
.loop: |
|
|
|
|
movu m0, [lsrcq+o+0] |
|
|
|
|
movu m0, [lsrcq+cntq] |
|
|
|
|
pshufb m1, m0, [ex_r] |
|
|
|
|
pshufb m2, m0, [ex_g] |
|
|
|
|
pshufb m3, m0, [ex_b] |
|
|
|
|
movu m0, [rsrcq+o+0] |
|
|
|
|
movu m0, [rsrcq+cntq] |
|
|
|
|
pshufb m4, m0, [ex_r] |
|
|
|
|
pshufb m5, m0, [ex_g] |
|
|
|
|
pshufb m6, m0, [ex_b] |
|
|
|
|
pshufb m0, [ex_b] |
|
|
|
|
pmulld m1, [rsp+mmsize*0] |
|
|
|
|
pmulld m2, [rsp+mmsize*1] |
|
|
|
|
pmulld m3, [rsp+mmsize*2] |
|
|
|
|
pmulld m4, [rsp+mmsize*3] |
|
|
|
|
pmulld m5, [rsp+mmsize*4] |
|
|
|
|
pmulld m6, [rsp+mmsize*5] |
|
|
|
|
pmulld m0, [rsp+mmsize*5] |
|
|
|
|
paddd m1, m2 |
|
|
|
|
paddd m3, m4 |
|
|
|
|
paddd m5, m6 |
|
|
|
|
paddd m5, m0 |
|
|
|
|
paddd m1, m3 |
|
|
|
|
paddd m1, m5 |
|
|
|
|
|
|
|
|
|
movu m0, [lsrcq+o+0] |
|
|
|
|
movu m0, [lsrcq+cntq] |
|
|
|
|
pshufb m7, m0, [ex_r] |
|
|
|
|
pshufb m2, m0, [ex_g] |
|
|
|
|
pshufb m3, m0, [ex_b] |
|
|
|
|
movu m0, [rsrcq+o+0] |
|
|
|
|
movu m0, [rsrcq+cntq] |
|
|
|
|
pshufb m4, m0, [ex_r] |
|
|
|
|
pshufb m5, m0, [ex_g] |
|
|
|
|
pshufb m6, m0, [ex_b] |
|
|
|
|
pshufb m0, [ex_b] |
|
|
|
|
pmulld m7, [rsp+mmsize*6] |
|
|
|
|
pmulld m2, [rsp+mmsize*7] |
|
|
|
|
pmulld m3, [rsp+mmsize*8] |
|
|
|
|
pmulld m4, [rsp+mmsize*9] |
|
|
|
|
pmulld m5, [rsp+mmsize*10] |
|
|
|
|
pmulld m6, [rsp+mmsize*11] |
|
|
|
|
pmulld m0, [rsp+mmsize*11] |
|
|
|
|
paddd m7, m2 |
|
|
|
|
paddd m3, m4 |
|
|
|
|
paddd m5, m6 |
|
|
|
|
paddd m5, m0 |
|
|
|
|
paddd m7, m3 |
|
|
|
|
paddd m7, m5 |
|
|
|
|
|
|
|
|
|
movu m0, [lsrcq+o+0] |
|
|
|
|
pshufb m8, m0, [ex_r] |
|
|
|
|
pshufb m2, m0, [ex_g] |
|
|
|
|
pshufb m3, m0, [ex_b] |
|
|
|
|
movu m0, [rsrcq+o+0] |
|
|
|
|
pshufb m4, m0, [ex_r] |
|
|
|
|
pshufb m5, m0, [ex_g] |
|
|
|
|
pshufb m6, m0, [ex_b] |
|
|
|
|
pmulld m8, m10 |
|
|
|
|
pmulld m2, m11 |
|
|
|
|
pmulld m3, m12 |
|
|
|
|
pmulld m4, m13 |
|
|
|
|
pmulld m5, m14 |
|
|
|
|
pmulld m6, m15 |
|
|
|
|
paddd m8, m2 |
|
|
|
|
paddd m3, m4 |
|
|
|
|
paddd m5, m6 |
|
|
|
|
paddd m8, m3 |
|
|
|
|
paddd m8, m5 |
|
|
|
|
movu m0, [lsrcq+cntq] |
|
|
|
|
pshufb m2, m0, [ex_r] |
|
|
|
|
pshufb m3, m0, [ex_g] |
|
|
|
|
pshufb m4, m0, [ex_b] |
|
|
|
|
movu m0, [rsrcq+cntq] |
|
|
|
|
pshufb m5, m0, [ex_r] |
|
|
|
|
pshufb m6, m0, [ex_g] |
|
|
|
|
pshufb m0, [ex_b] |
|
|
|
|
pmulld m2, m8 |
|
|
|
|
pmulld m3, m9 |
|
|
|
|
pmulld m4, m10 |
|
|
|
|
pmulld m5, m11 |
|
|
|
|
pmulld m6, m12 |
|
|
|
|
pmulld m0, m13 |
|
|
|
|
paddd m2, m3 |
|
|
|
|
paddd m4, m5 |
|
|
|
|
paddd m6, m0 |
|
|
|
|
paddd m2, m4 |
|
|
|
|
paddd m2, m6 |
|
|
|
|
|
|
|
|
|
psrld m1, 16 |
|
|
|
|
psrld m7, 16 |
|
|
|
|
psrld m8, 16 |
|
|
|
|
psrld m2, 16 |
|
|
|
|
|
|
|
|
|
packusdw m1, m7 |
|
|
|
|
packusdw m8, m8 |
|
|
|
|
packuswb m1, m8 |
|
|
|
|
packusdw m2, m2 |
|
|
|
|
packuswb m1, m2 |
|
|
|
|
pshufb m1, [shuf] |
|
|
|
|
|
|
|
|
|
movq [dstq+o+0], m1 |
|
|
|
|
movq [dstq+cntq+0], m1 |
|
|
|
|
psrldq m1, 8 |
|
|
|
|
movd [dstq+o+8], m1 |
|
|
|
|
add r12d, 12 |
|
|
|
|
sub r11d, 4 |
|
|
|
|
movd [dstq+cntq+8], m1 |
|
|
|
|
add cntd, 12 |
|
|
|
|
sub od, 4 |
|
|
|
|
jg .loop |
|
|
|
|
|
|
|
|
|
add dstq, dst_linesizeq |
|
|
|
|