@@ -199,11 +199,15 @@ INIT_XMM sse
 DCA_LFE_FIR 0
 DCA_LFE_FIR 1
 
-%macro SHUF 2
-%if cpuflag(sse2)
-    pshufd      %1, %2, q0123
+%macro SHUF 3
+%if cpuflag(avx)
+    mova        %3, [%2 - 16]
+    vperm2f128  %1, %3, %3, 1
+    vshufps     %1, %1, %1, q0123
+%elif cpuflag(sse2)
+    pshufd      %1, [%2], q0123
 %else
-    mova        %1, %2
+    mova        %1, [%2]
     shufps      %1, %1, q0123
 %endif
 %endmacro
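The SHUF change is the core of the AVX port: vshufps can only permute within each 128-bit lane, so reversing eight floats in a ymm register needs an extra vperm2f128 lane swap, a scratch register (the new third argument), and a load displaced back by 16 bytes so the wider reversed vector starts at the right element. A minimal intrinsics sketch of the same trick, for illustration only (not FFmpeg code):

    #include <immintrin.h>

    /* Reverse 8 packed floats [0..7] -> [7..0], as SHUF's AVX branch does:
     * first swap the 128-bit lanes, then reverse the 4 floats in each lane. */
    static __m256 reverse8(__m256 v)
    {
        v = _mm256_permute2f128_ps(v, v, 1);  /* vperm2f128 ..., 1 */
        return _mm256_shuffle_ps(v, v, 0x1b); /* vshufps with q0123 == 0x1b */
    }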
@@ -212,43 +216,43 @@ DCA_LFE_FIR 1
     ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
     ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
     ;~ b += window[i + j + 16] * (synth_buf[i + j])
-    SHUF        m5, [ptr2 + j + (15 - 3) * 4]
+    SHUF        m5, ptr2 + j + (15 - 3) * 4, m6
     mova        m6, [ptr1 + j]
 %if ARCH_X86_64
-    SHUF       m11, [ptr2 + j + (15 - 3) * 4 - mmsize]
+    SHUF       m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
     mova       m12, [ptr1 + j + mmsize]
 %endif
-    mulps       m6, [win + %1 + j + 16 * 4]
-    mulps       m5, [win + %1 + j]
+    mulps       m6, m6, [win + %1 + j + 16 * 4]
+    mulps       m5, m5, [win + %1 + j]
 %if ARCH_X86_64
-    mulps      m12, [win + %1 + j + mmsize + 16 * 4]
-    mulps      m11, [win + %1 + j + mmsize]
+    mulps      m12, m12, [win + %1 + j + mmsize + 16 * 4]
+    mulps      m11, m11, [win + %1 + j + mmsize]
 %endif
-    addps       m2, m6
-    subps       m1, m5
+    addps       m2, m2, m6
+    subps       m1, m1, m5
 %if ARCH_X86_64
-    addps       m8, m12
-    subps       m7, m11
+    addps       m8, m8, m12
+    subps       m7, m7, m11
 %endif
     ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
     ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
-    SHUF        m6, [ptr2 + j + (31 - 3) * 4]
+    SHUF        m6, ptr2 + j + (31 - 3) * 4, m5
     mova        m5, [ptr1 + j + 16 * 4]
 %if ARCH_X86_64
-    SHUF       m12, [ptr2 + j + (31 - 3) * 4 - mmsize]
+    SHUF       m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
     mova       m11, [ptr1 + j + mmsize + 16 * 4]
 %endif
-    mulps       m5, [win + %1 + j + 32 * 4]
-    mulps       m6, [win + %1 + j + 48 * 4]
+    mulps       m5, m5, [win + %1 + j + 32 * 4]
+    mulps       m6, m6, [win + %1 + j + 48 * 4]
 %if ARCH_X86_64
-    mulps      m11, [win + %1 + j + mmsize + 32 * 4]
-    mulps      m12, [win + %1 + j + mmsize + 48 * 4]
+    mulps      m11, m11, [win + %1 + j + mmsize + 32 * 4]
+    mulps      m12, m12, [win + %1 + j + mmsize + 48 * 4]
 %endif
-    addps       m3, m5
-    addps       m4, m6
+    addps       m3, m3, m5
+    addps       m4, m4, m6
 %if ARCH_X86_64
-    addps       m9, m11
-    addps      m10, m12
+    addps       m9, m9, m11
+    addps      m10, m10, m12
 %endif
     sub          j, 64 * 4
 %endmacro
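Note the pattern in the rest of the hunk: every two-operand arithmetic op (mulps m6, [mem]) becomes three-operand (mulps m6, m6, [mem]). AVX takes a separate destination register, and x86inc folds the dst == src1 form back to the legacy two-operand encoding for the SSE builds, so one macro body now serves both instruction sets. For orientation, a hedged C model of what this loop accumulates for one output index i, with names taken from the ;~ comments rather than from FFmpeg's actual C source (the loop bounds are illustrative; the asm walks j downwards from an offset-dependent limit):

    /* Scalar model of the inner loop for one output index i. */
    static void accumulate(const float *window, const float *synth_buf,
                           int i, float *a, float *b, float *c, float *d)
    {
        for (int j = 0; j < 480; j += 64) {
            *a += window[i + j]      * -synth_buf[15 - i + j];
            *b += window[i + j + 16] *  synth_buf[i + j];
            *c += window[i + j + 32] *  synth_buf[16 + i + j];
            *d += window[i + j + 48] *  synth_buf[31 - i + j];
        }
    }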
@@ -261,13 +265,16 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                             synth_buf, synth_buf2, window, out, off, scale
 %define scale m0
 %if ARCH_X86_32 || WIN64
-    movd       scale, scalem
+    VBROADCASTSS m0, scalem
     ; Make sure offset is in a register and not on the stack
 %define OFFQ  r4q
 %else
+    SPLATD      xmm0
+%if cpuflag(avx)
+    vinsertf128  m0, m0, xmm0, 1
+%endif
 %define OFFQ  offq
 %endif
-    SPLATD      m0
     ; prepare inner counter limit 1
     mov          r5q, 480
     sub          r5q, offmp
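On x86_32 and Win64 the scale argument arrives in memory, so VBROADCASTSS fills the whole register from the argument slot in one step; on Unix x86_64 it is already in xmm0, so SPLATD replicates it across the low 128 bits and, in the AVX build, vinsertf128 mirrors that lane into the upper half. The same construction in intrinsics (a sketch, not FFmpeg code):

    #include <immintrin.h>

    /* Build an 8-wide scale vector the way the asm does on Unix x86_64:
     * splat within the low lane, then duplicate the lane upwards. */
    static __m256 broadcast_scale(float scale)
    {
        __m128 lo = _mm_set1_ps(scale);  /* SPLATD xmm0 */
        return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), lo, 1);
    }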
@@ -283,8 +290,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
 %endif
 .mainloop
     ; m1 = a  m2 = b  m3 = c  m4 = d
-    xorps       m3, m3
-    xorps       m4, m4
+    xorps       m3, m3, m3
+    xorps       m4, m4, m4
     mova        m1, [buf2 + i]
     mova        m2, [buf2 + i + 16 * 4]
 %if ARCH_X86_32
@@ -301,8 +308,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
 %define ptr2 r7q ; must be loaded
 %define win  r8q
 %define j    r9q
-    xorps       m9, m9
-    xorps      m10, m10
+    xorps       m9, m9, m9
+    xorps      m10, m10, m10
     mova        m7, [buf2 + i + mmsize]
     mova        m8, [buf2 + i + mmsize + 16 * 4]
     lea        win, [windowq + i]
@@ -334,11 +341,11 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
 %endif
     ;~ out[i]      = a * scale;
     ;~ out[i + 16] = b * scale;
-    mulps       m1, scale
-    mulps       m2, scale
+    mulps       m1, m1, scale
+    mulps       m2, m2, scale
 %if ARCH_X86_64
-    mulps       m7, scale
-    mulps       m8, scale
+    mulps       m7, m7, scale
+    mulps       m8, m8, scale
 %endif
     ;~ synth_buf2[i]      = c;
     ;~ synth_buf2[i + 16] = d;
@@ -367,3 +374,7 @@ SYNTH_FILTER
 %endif
 INIT_XMM sse2
 SYNTH_FILTER
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+SYNTH_FILTER
+%endif
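INIT_YMM avx assembles a third instance of SYNTH_FILTER with mmsize = 32 and VEX-encoded instructions, and the HAVE_AVX_EXTERNAL guard keeps builds whose assembler lacks AVX support compiling. Runtime selection would follow FFmpeg's usual EXTERNAL_* cpu-flag pattern; the sketch below assumes the init function and wrapper names, which are illustrative rather than copied from the tree:

    /* Hypothetical init-time dispatch; EXTERNAL_* are FFmpeg's cpu-flag
     * checks (libavutil/x86/cpu.h), while the field and function names
     * here are assumptions. */
    static void synth_filter_select(SynthFilterContext *s)
    {
        int cpu_flags = av_get_cpu_flags();
        if (EXTERNAL_SSE2(cpu_flags))
            s->synth_filter_float = synth_filter_sse2;
        if (EXTERNAL_AVX(cpu_flags))
            s->synth_filter_float = synth_filter_avx;
    }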