@ -200,18 +200,22 @@ DCA_LFE_FIR 0
DCA_LFE_FIR 1
% macro SETZERO 1
% if cpuflag(sse2)
% if cpuflag(sse2) && notcpuflag(avx)
pxor % 1 , % 1
% else
xorps % 1 , % 1 , % 1
% endif
% endmacro
% macro SHUF 2
% if cpuflag(sse2)
pshufd % 1 , % 2 , q0123
% macro SHUF 3
% if cpuflag(avx)
mova % 3 , [ % 2 - 16 ]
vperm2f128 % 1 , % 3 , % 3 , 1
vshufps % 1 , % 1 , % 1 , q0123
% elif cpuflag(sse2)
pshufd % 1 , [ % 2 ], q0123
% else
mova % 1 , % 2
mova % 1 , [ % 2 ]
shufps % 1 , % 1 , q0123
% endif
% endmacro
@ -220,43 +224,43 @@ DCA_LFE_FIR 1
; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
;~ a += window[i + j] * (-synth_buf[15 - i + j])
;~ b += window[i + j + 16] * (synth_buf[i + j])
SHUF m5 , [ ptr2 + j + ( 15 - 3 ) * 4 ]
SHUF m5 , ptr2 + j + ( 15 - 3 ) * 4 , m6
mova m6 , [ ptr1 + j ]
% if ARCH_X86_64
SHUF m11 , [ ptr2 + j + ( 15 - 3 ) * 4 - mmsize ]
SHUF m11 , ptr2 + j + ( 15 - 3 ) * 4 - mmsize , m12
mova m12 , [ ptr1 + j + mmsize ]
% endif
mulps m6 , [ win + % 1 + j + 16 * 4 ]
mulps m5 , [ win + % 1 + j ]
mulps m6 , m6 , [ win + % 1 + j + 16 * 4 ]
mulps m5 , m5 , [ win + % 1 + j ]
% if ARCH_X86_64
mulps m12 , [ win + % 1 + j + mmsize + 16 * 4 ]
mulps m11 , [ win + % 1 + j + mmsize ]
mulps m12 , m12 , [ win + % 1 + j + mmsize + 16 * 4 ]
mulps m11 , m11 , [ win + % 1 + j + mmsize ]
% endif
addps m2 , m6
subps m1 , m5
addps m2 , m2 , m 6
subps m1 , m1 , m 5
% if ARCH_X86_64
addps m8 , m12
subps m7 , m11
addps m8 , m8 , m 12
subps m7 , m7 , m 11
% endif
;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
SHUF m6 , [ ptr2 + j + ( 31 - 3 ) * 4 ]
SHUF m6 , ptr2 + j + ( 31 - 3 ) * 4 , m5
mova m5 , [ ptr1 + j + 16 * 4 ]
% if ARCH_X86_64
SHUF m12 , [ ptr2 + j + ( 31 - 3 ) * 4 - mmsize ]
SHUF m12 , ptr2 + j + ( 31 - 3 ) * 4 - mmsize , m11
mova m11 , [ ptr1 + j + mmsize + 16 * 4 ]
% endif
mulps m5 , [ win + % 1 + j + 32 * 4 ]
mulps m6 , [ win + % 1 + j + 48 * 4 ]
mulps m5 , m5 , [ win + % 1 + j + 32 * 4 ]
mulps m6 , m6 , [ win + % 1 + j + 48 * 4 ]
% if ARCH_X86_64
mulps m11 , [ win + % 1 + j + mmsize + 32 * 4 ]
mulps m12 , [ win + % 1 + j + mmsize + 48 * 4 ]
mulps m11 , m11 , [ win + % 1 + j + mmsize + 32 * 4 ]
mulps m12 , m12 , [ win + % 1 + j + mmsize + 48 * 4 ]
% endif
addps m3 , m5
addps m4 , m6
addps m3 , m3 , m 5
addps m4 , m4 , m 6
% if ARCH_X86_64
addps m9 , m11
addps m10 , m12
addps m9 , m9 , m 11
addps m10 , m10 , m1 2
% endif
sub j , 64 * 4
% endmacro
@ -269,25 +273,34 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
synth_buf , synth_buf2 , window , out , off , scale
% define scale m0
% if ARCH_X86_32 || WIN64
% if cpuflag(sse2)
% if cpuflag(sse2) && notcpuflag(avx)
movd scale , scalem
SPLATD m0
% else
movss scale , scalem
VBROADCASTSS m0 , scalem
% endif
; Make sure offset is in a register and not on the stack
% define OFFQ r4q
% else
SPLATD xmm0
% if cpuflag(avx)
vinsertf128 m0 , m0 , xmm0 , 1
% endif
% define OFFQ offq
% endif
SPLATD m0
; prepare inner counter limit 1
mov r5q , 480
sub r5q , offmp
and r5q , - 64
shl r5q , 2
% if ARCH_X86_32 || notcpuflag(avx)
mov OFFQ , r5q
% define i r5q
mov i , 16 * 4 - ( ARCH_X86_64 + 1 ) * mmsize ; main loop counter
% else
% define i 0
% define OFFQ r5q
% endif
% define buf2 synth_buf2q
% if ARCH_X86_32
@ -306,8 +319,10 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
% define j r3q
mov win , windowm
mov ptr1 , synth_bufm
% if ARCH_X86_32 || notcpuflag(avx)
add win , i
add ptr1 , i
% endif
% else ; ARCH_X86_64
% define ptr1 r6q
% define ptr2 r7q ; must be loaded
@ -323,7 +338,9 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
mov ptr2 , synth_bufmp
; prepare the inner loop counter
mov j , OFFQ
% if ARCH_X86_32 || notcpuflag(avx)
sub ptr2 , i
% endif
.loop1:
INNER_LOOP 0
jge .loop1
@ -346,11 +363,11 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
% endif
;~ out[i] = a * scale;
;~ out[i + 16] = b * scale;
mulps m1 , scale
mulps m2 , scale
mulps m1 , m1 , scale
mulps m2 , m2 , scale
% if ARCH_X86_64
mulps m7 , scale
mulps m8 , scale
mulps m7 , m7 , scale
mulps m8 , m8 , scale
% endif
;~ synth_buf2[i] = c;
;~ synth_buf2[i + 16] = d;
@ -368,8 +385,10 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
mova [ outq + i + 0 * 4 + mmsize ], m7
mova [ outq + i + 16 * 4 + mmsize ], m8
% endif
% if ARCH_X86_32 || notcpuflag(avx)
sub i , ( ARCH_X86_64 + 1 ) * mmsize
jge .mainloop
% endif
RET
% endmacro
@ -379,3 +398,5 @@ SYNTH_FILTER
% endif
INIT_XMM ss e2
SYNTH_FILTER
INIT_YMM avx
SYNTH_FILTER