|
|
@ -266,29 +266,20 @@ MIX_1_TO_2_S16P_FLT |
|
|
|
%else |
|
|
|
%else |
|
|
|
%assign matrix_elements_stack 0 |
|
|
|
%assign matrix_elements_stack 0 |
|
|
|
%endif |
|
|
|
%endif |
|
|
|
|
|
|
|
%assign matrix_stack_size matrix_elements_stack * mmsize |
|
|
|
|
|
|
|
|
|
|
|
cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, src0, src1, len, src2, src3, src4, src5, src6, src7 |
|
|
|
%assign needed_stack_size -1 * matrix_stack_size |
|
|
|
|
|
|
|
%if ARCH_X86_32 && in_channels >= 7 |
|
|
|
|
|
|
|
%assign needed_stack_size needed_stack_size - 16 |
|
|
|
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
|
|
; get aligned stack space if needed |
|
|
|
cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, needed_stack_size, src0, src1, len, src2, src3, src4, src5, src6, src7 |
|
|
|
%if matrix_elements_stack > 0 |
|
|
|
|
|
|
|
%if mmsize == 32 |
|
|
|
; define src pointers on stack if needed |
|
|
|
%assign bkpreg %1 + 1 |
|
|
|
%if matrix_elements_stack > 0 && ARCH_X86_32 && in_channels >= 7 |
|
|
|
%define bkpq r %+ bkpreg %+ q |
|
|
|
|
|
|
|
mov bkpq, rsp |
|
|
|
|
|
|
|
and rsp, ~(mmsize-1) |
|
|
|
|
|
|
|
sub rsp, matrix_elements_stack * mmsize |
|
|
|
|
|
|
|
%else |
|
|
|
|
|
|
|
%assign matrix_stack_size matrix_elements_stack * mmsize |
|
|
|
|
|
|
|
%assign pad matrix_stack_size + (mmsize - gprsize) - (stack_offset & (mmsize - gprsize)) |
|
|
|
|
|
|
|
; on x86-32 for 7 and 8 channels we need more stack space for src pointers |
|
|
|
|
|
|
|
%if ARCH_X86_32 && in_channels >= 7 |
|
|
|
|
|
|
|
%assign pad pad + 0x10 |
|
|
|
|
|
|
|
%define src5m [rsp+matrix_stack_size+0] |
|
|
|
%define src5m [rsp+matrix_stack_size+0] |
|
|
|
%define src6m [rsp+matrix_stack_size+4] |
|
|
|
%define src6m [rsp+matrix_stack_size+4] |
|
|
|
%define src7m [rsp+matrix_stack_size+8] |
|
|
|
%define src7m [rsp+matrix_stack_size+8] |
|
|
|
%endif |
|
|
|
|
|
|
|
SUB rsp, pad |
|
|
|
|
|
|
|
%endif |
|
|
|
|
|
|
|
%endif |
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
|
|
; load matrix pointers |
|
|
|
; load matrix pointers |
|
|
@ -469,14 +460,6 @@ cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, s |
|
|
|
|
|
|
|
|
|
|
|
add lenq, mmsize |
|
|
|
add lenq, mmsize |
|
|
|
jl .loop |
|
|
|
jl .loop |
|
|
|
; restore stack pointer |
|
|
|
|
|
|
|
%if matrix_elements_stack > 0 |
|
|
|
|
|
|
|
%if mmsize == 32 |
|
|
|
|
|
|
|
mov rsp, bkpq |
|
|
|
|
|
|
|
%else |
|
|
|
|
|
|
|
ADD rsp, pad |
|
|
|
|
|
|
|
%endif |
|
|
|
|
|
|
|
%endif |
|
|
|
|
|
|
|
; zero ymm high halves |
|
|
|
; zero ymm high halves |
|
|
|
%if mmsize == 32 |
|
|
|
%if mmsize == 32 |
|
|
|
vzeroupper |
|
|
|
vzeroupper |
|
|
|