; NOTE(review): stray unified-diff hunk header pasted into the file; kept as a
; comment so the assembler does not choke on it. Original text:
; @@ -175,3 +175,155 @@ cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
; Instantiate the LFE FIR filter (DCA_LFE_FIR macro, defined above) for both
; decimation variants under SSE, then switch the SIMD context to SSE2 for the
; synth filter below. Fixed: cpuflag names were mangled ("ss e" / "ss e2"),
; which x86inc's INIT_XMM cannot parse.
INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1

INIT_XMM sse2
;-----------------------------------------------------------------------
; INNER_LOOP  %1 = extra byte offset into the window table (0 or 64 * 4)
;
; One step of the synth-filter inner loop, accumulating:
;   a -= window[i + j]      * synth_buf[15 - i + j]   (m1; m7  on x86-64)
;   b += window[i + j + 16] * synth_buf[i + j]        (m2; m8)
;   c += window[i + j + 32] * synth_buf[16 + i + j]   (m3; m9)
;   d += window[i + j + 48] * synth_buf[31 - i + j]   (m4; m10)
; ptr1 points at the forward-read samples, ptr2 at the backward-read
; samples (lane order reversed with pshufd q0123). On x86-64 a second
; mmsize chunk is processed per call using m11/m12 as scratch.
; Ends with "sub j, 64 * 4": the flags it sets drive the caller's jge.
;-----------------------------------------------------------------------
%macro INNER_LOOP   1
    ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
    ;~ b += window[i + j + 16] * (synth_buf[i + j])
    pshufd        m5, [ptr2 + j + (15 - 3) * 4], q0123
    mova          m6, [ptr1 + j]
%if ARCH_X86_64
    pshufd       m11, [ptr2 + j + (15 - 3) * 4 - mmsize], q0123
    mova         m12, [ptr1 + j + mmsize]
%endif
    mulps         m6, [win + %1 + j + 16 * 4]
    mulps         m5, [win + %1 + j]
%if ARCH_X86_64
    mulps        m12, [win + %1 + j + mmsize + 16 * 4]
    mulps        m11, [win + %1 + j + mmsize]
%endif
    addps         m2, m6
    subps         m1, m5              ; subtraction implements the negated term
%if ARCH_X86_64
    addps         m8, m12
    subps         m7, m11
%endif
    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
    pshufd        m6, [ptr2 + j + (31 - 3) * 4], q0123
    mova          m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
    pshufd       m12, [ptr2 + j + (31 - 3) * 4 - mmsize], q0123
    mova         m11, [ptr1 + j + mmsize + 16 * 4]
%endif
    mulps         m5, [win + %1 + j + 32 * 4]
    mulps         m6, [win + %1 + j + 48 * 4]
%if ARCH_X86_64
    mulps        m11, [win + %1 + j + mmsize + 32 * 4]
    mulps        m12, [win + %1 + j + mmsize + 48 * 4]
%endif
    addps         m3, m5
    addps         m4, m6
%if ARCH_X86_64
    addps         m9, m11
    addps        m10, m12
%endif
    sub            j, 64 * 4          ; step backwards; leaves flags for caller's jge
%endmacro
;-----------------------------------------------------------------------
; void ff_synth_filter_inner_sse2(float *synth_buf, float synth_buf2[32],
;                                 const float window[512], float out[32],
;                                 intptr_t offset, float scale)
;
; DCA synthesis filter inner loop. For each output lane group i:
;   out[i]             = a * scale
;   out[i + 16]        = b * scale
;   synth_buf2[i]      = c
;   synth_buf2[i + 16] = d
; with a/b/c/d accumulated over the window table and synth_buf by
; INNER_LOOP. Resources: 6 GPRs (+4 on x86-64) and 7 XMM regs (+6 on
; x86-64, where the main loop is unrolled by two mmsize chunks).
;-----------------------------------------------------------------------
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                            synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
    movd       scale, scalem          ; scale arrives on the stack on these ABIs
; Make sure offset is in a register and not on the stack
%define OFFQ r4q
%else
%define OFFQ offq
%endif
    pshufd        m0, m0, 0           ; broadcast scale to all 4 float lanes
    ; prepare inner counter limit 1: ((480 - offset) & -64) * sizeof(float)
    mov          r5q, 480
    sub          r5q, offmp
    and          r5q, -64
    shl          r5q, 2
    mov         OFFQ, r5q
%define i r5q
    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
%define buf2 synth_buf2q
%if ARCH_X86_32
    mov         buf2, synth_buf2mp
%endif
.mainloop:
    ; m1 = a  m2 = b  m3 = c  m4 = d
    pxor          m3, m3
    pxor          m4, m4
    mova          m1, [buf2 + i]
    mova          m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
%define ptr1 r0q
%define ptr2 r1q
%define win  r2q
%define j    r3q
    mov          win, windowm
    mov         ptr1, synth_bufm
    add          win, i
    add         ptr1, i
%else ; ARCH_X86_64
%define ptr1 r6q
%define ptr2 r7q ; must be loaded
%define win  r8q
%define j    r9q
    pxor          m9, m9
    pxor         m10, m10
    mova          m7, [buf2 + i + mmsize]
    mova          m8, [buf2 + i + mmsize + 16 * 4]
    lea          win, [windowq + i]
    lea         ptr1, [synth_bufq + i]
%endif
    mov         ptr2, synth_bufmp
    ; prepare the inner loop counter
    mov            j, OFFQ
    sub         ptr2, i
.loop1:
    INNER_LOOP 0
    jge .loop1                        ; INNER_LOOP's trailing sub sets the flags

    mov            j, 448 * 4
    sub            j, OFFQ
    jz .end                           ; nothing left for the second pass
    sub         ptr1, j
    sub         ptr2, j
    add          win, OFFQ ; now at j-64, so define OFFSET
    sub            j, 64 * 4
.loop2:
    INNER_LOOP 64 * 4
    jge .loop2
.end:
%if ARCH_X86_32
    mov         buf2, synth_buf2m ; needed for next iteration anyway
    mov         outq, outmp       ; j, which will be set again during it
%endif
    ;~ out[i]      = a * scale;
    ;~ out[i + 16] = b * scale;
    mulps         m1, scale
    mulps         m2, scale
%if ARCH_X86_64
    mulps         m7, scale
    mulps         m8, scale
%endif
    ;~ synth_buf2[i]      = c;
    ;~ synth_buf2[i + 16] = d;
    mova   [buf2 + i +  0 * 4], m3
    mova   [buf2 + i + 16 * 4], m4
%if ARCH_X86_64
    mova   [buf2 + i +  0 * 4 + mmsize], m9
    mova   [buf2 + i + 16 * 4 + mmsize], m10
%endif
    ;~ out[i]      = a;
    ;~ out[i + 16] = b;
    mova      [outq + i +  0 * 4], m1
    mova      [outq + i + 16 * 4], m2
%if ARCH_X86_64
    mova      [outq + i +  0 * 4 + mmsize], m7
    mova      [outq + i + 16 * 4 + mmsize], m8
%endif
    sub            i, (ARCH_X86_64 + 1) * mmsize
    jge .mainloop
    RET