@@ -1,5 +1,5 @@
 ;******************************************************************************
-;* FFT transform with SSE/3DNow optimizations
+;* FFT transform with SSE/AVX optimizations
 ;* Copyright (c) 2008 Loren Merritt
 ;* Copyright (c) 2011 Vitor Sessak
 ;*
@@ -92,29 +92,6 @@ cextern cos_ %+ i

 SECTION .text

-%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
-    mova     %1, %3
-    mova     %2, %1
-    pfadd    %1, %4
-    pfsub    %2, %4
-%endmacro
-
-%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
-    mova     %5, %3
-    pfsub    %3, %4
-    pfadd    %5, %4 ; {t6,t5}
-    pxor     %3, [ps_m1p1] ; {t8,t7}
-    mova     %6, %1
-    movd [r0+12], %3
-    punpckhdq %3, [r0+8]
-    pfadd    %1, %5 ; {r0,i0}
-    pfsub    %6, %5 ; {r2,i2}
-    mova     %4, %2
-    pfadd    %2, %3 ; {r1,i1}
-    pfsub    %4, %3 ; {r3,i3}
-    SWAP     %3, %6
-%endmacro
-
 ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
 ;     %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
 ;     %3, %4, %5 tmp
@@ -199,7 +176,7 @@ SECTION .text
     vextractf128  %4 %+ H(%5), %3, 0
     vextractf128  %4(%5 + 1), %2, 1
     vextractf128  %4 %+ H(%5 + 1), %3, 1
-%elif cpuflag(sse) || cpuflag(3dnow)
+%elif cpuflag(sse)
     mova     %3, %2
     unpcklps %2, %1
     unpckhps %3, %1
@@ -310,12 +287,6 @@ IF%1 mova Z(1), m5
 %endif
 %endmacro

-%macro PUNPCK 3
-    mova      %3, %1
-    punpckldq %1, %2
-    punpckhdq %3, %2
-%endmacro
-
 %define Z(x) [r0+mmsize*x]
 %define Z2(x) [r0+mmsize*x]
 %define ZH(x) [r0+mmsize*x+mmsize/2]
@@ -462,68 +433,6 @@ fft16_sse:
     ret

-%macro FFT48_3DNOW 0
-align 16
-fft4 %+ SUFFIX:
-    T2_3DNOW m0, m1, Z(0), Z(1)
-    mova     m2,  Z(2)
-    mova     m3,  Z(3)
-    T4_3DNOW m0, m1, m2, m3, m4, m5
-    PUNPCK   m0, m1, m4
-    PUNPCK   m2, m3, m5
-    mova   Z(0), m0
-    mova   Z(1), m4
-    mova   Z(2), m2
-    mova   Z(3), m5
-    ret
-
-align 16
-fft8 %+ SUFFIX:
-    T2_3DNOW m0, m1, Z(0), Z(1)
-    mova     m2,  Z(2)
-    mova     m3,  Z(3)
-    T4_3DNOW m0, m1, m2, m3, m4, m5
-    mova   Z(0), m0
-    mova   Z(2), m2
-    T2_3DNOW m4, m5,  Z(4),  Z(5)
-    T2_3DNOW m6, m7, Z2(6), Z2(7)
-    PSWAPD   m0, m5
-    PSWAPD   m2, m7
-    pxor     m0, [ps_m1p1]
-    pxor     m2, [ps_m1p1]
-    pfsub    m5, m0
-    pfadd    m7, m2
-    pfmul    m5, [ps_root2]
-    pfmul    m7, [ps_root2]
-    T4_3DNOW m1, m3, m5, m7, m0, m2
-    mova   Z(5), m5
-    mova  Z2(7), m7
-    mova     m0,  Z(0)
-    mova     m2,  Z(2)
-    T4_3DNOW m0, m2, m4, m6, m5, m7
-    PUNPCK   m0, m1, m5
-    PUNPCK   m2, m3, m7
-    mova   Z(0), m0
-    mova   Z(1), m5
-    mova   Z(2), m2
-    mova   Z(3), m7
-    PUNPCK   m4,  Z(5), m5
-    PUNPCK   m6, Z2(7), m7
-    mova   Z(4), m4
-    mova   Z(5), m5
-    mova  Z2(6), m6
-    mova  Z2(7), m7
-    ret
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnowext
-FFT48_3DNOW
-
-INIT_MMX 3dnow
-FFT48_3DNOW
-%endif

 %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
 %define Z2(x) [zcq + o3q + mmsize*(x&1)]
 %define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
@@ -575,7 +484,7 @@ INIT_XMM sse
 DECL_PASS pass_sse, PASS_BIG 1
 DECL_PASS pass_interleave_sse, PASS_BIG 0

-%macro FFT_CALC_FUNC 0
+INIT_XMM sse
 cglobal fft_calc, 2,5,8
     mov     r3d, [r0 + FFTContext.nbits]
     PUSH    r1
@@ -592,36 +501,16 @@ cglobal fft_calc, 2,5,8
     shl     r2, cl
     sub     r4, r2
 .loop:
-%if mmsize == 8
-    PSWAPD  m0, [r4 + r2 + 4]
-    mova [r4 + r2 + 4], m0
-%else
     movaps   xmm0, [r4 + r2]
     movaps   xmm1, xmm0
     unpcklps xmm0, [r4 + r2 + 16]
     unpckhps xmm1, [r4 + r2 + 16]
     movaps   [r4 + r2], xmm0
     movaps   [r4 + r2 + 16], xmm1
-%endif
     add     r2, mmsize*2
     jl      .loop
 .end:
-%if cpuflag(3dnow)
-    femms
-    RET
-%else
     REP_RET
-%endif
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-FFT_CALC_FUNC
-INIT_MMX 3dnowext
-FFT_CALC_FUNC
-%endif
-INIT_XMM sse
-FFT_CALC_FUNC

 cglobal fft_permute, 2,7,1
     mov     r4, [r0 + FFTContext.revtab]
@@ -656,7 +545,7 @@ cglobal fft_permute, 2,7,1
     jl      .loopcopy
     REP_RET

-%macro IMDCT_CALC_FUNC 0
+INIT_XMM sse
 cglobal imdct_calc, 3,5,3
     mov     r3d, [r0 + FFTContext.mdctsize]
     mov     r4,  [r0 + FFTContext.imdcthalf]
@@ -684,52 +573,17 @@ cglobal imdct_calc, 3,5,3
     neg     r2
     mova    m2, [ps_neg]
 .loop:
-%if mmsize == 8
-    PSWAPD  m0, [r1 + r3]
-    PSWAPD  m1, [r0 + r2]
-    pxor    m0, m2
-%else
     mova    m0, [r1 + r3]
     mova    m1, [r0 + r2]
     shufps  m0, m0, 0x1b
     shufps  m1, m1, 0x1b
     xorps   m0, m2
-%endif
     mova [r0 + r3], m1
     mova [r1 + r2], m0
     sub     r3, mmsize
     add     r2, mmsize
     jl      .loop
-%if cpuflag(3dnow)
-    femms
-    RET
-%else
     REP_RET
-%endif
-%endmacro
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-IMDCT_CALC_FUNC
-INIT_MMX 3dnowext
-IMDCT_CALC_FUNC
-%endif
-
-INIT_XMM sse
-IMDCT_CALC_FUNC
-
-%if ARCH_X86_32
-INIT_MMX 3dnow
-%define mulps pfmul
-%define addps pfadd
-%define subps pfsub
-%define unpcklps punpckldq
-%define unpckhps punpckhdq
-DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
-DECL_PASS pass_interleave_3dnow, PASS_BIG 0
-%define pass_3dnowext pass_3dnow
-%define pass_interleave_3dnowext pass_interleave_3dnow
-%endif

 %ifdef PIC
 %define SECTION_REL - $$
@@ -785,14 +639,6 @@ DECL_FFT 6, _interleave
 INIT_XMM sse
 DECL_FFT 5
 DECL_FFT 5, _interleave
-%if ARCH_X86_32
-INIT_MMX 3dnow
-DECL_FFT 4
-DECL_FFT 4, _interleave
-INIT_MMX 3dnowext
-DECL_FFT 4
-DECL_FFT 4, _interleave
-%endif

 INIT_XMM sse
 %undef mulps
@@ -802,37 +648,6 @@ INIT_XMM sse
 %undef unpckhps

 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
-%if mmsize == 8 ;j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
-    PSWAPD     m0, [%3+%2*4]
-    movq       m2, [%3+%1*4-8]
-    movq       m3, m0
-    punpckldq  m0, m2
-    punpckhdq  m2, m3
-    movd       m1, [%4+%1*2-4] ; tcos[j]
-    movd       m3, [%4+%2*2]   ; tcos[n4-j-1]
-    punpckldq  m1, [%5+%1*2-4] ; tsin[j]
-    punpckldq  m3, [%5+%2*2]   ; tsin[n4-j-1]
-    mova       m4, m0
-    PSWAPD     m5, m1
-    pfmul      m0, m1
-    pfmul      m4, m5
-    mova       m6, m2
-    PSWAPD     m5, m3
-    pfmul      m2, m3
-    pfmul      m6, m5
-%if cpuflag(3dnowext)
-    pfpnacc    m0, m4
-    pfpnacc    m2, m6
-%else
-    SBUTTERFLY dq, 0, 4, 1
-    SBUTTERFLY dq, 2, 6, 3
-    pxor       m4, m7
-    pxor       m6, m7
-    pfadd      m0, m4
-    pfadd      m2, m6
-%endif
-%else
     movaps  xmm0, [%3+%2*4]
     movaps  xmm1, [%3+%1*4-0x10]
     movaps  xmm2, xmm0
@@ -853,29 +668,15 @@ INIT_XMM sse
     movaps   xmm0, xmm1
     unpcklps xmm1, xmm2
     unpckhps xmm0, xmm2
-%endif
 %endmacro

 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
-%if cpuflag(sse)
     mulps      m6, %3, [%5+%1]
     mulps      m7, %2, [%5+%1]
     mulps      %2, %2, [%6+%1]
     mulps      %3, %3, [%6+%1]
     subps      %2, %2, m6
     addps      %3, %3, m7
-%elif cpuflag(3dnow)
-    mova       m6, [%1+%2*2]
-    mova       %3, [%1+%2*2+8]
-    mova       %4, m6
-    mova       m7, %3
-    pfmul      m6, [%5+%2]
-    pfmul      %3, [%6+%2]
-    pfmul      %4, [%6+%2]
-    pfmul      m7, [%5+%2]
-    pfsub      %3, m6
-    pfadd      %4, m7
-%endif
 %endmacro

 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
@@ -909,7 +710,7 @@ INIT_XMM sse
     sub     %2, 0x20
     add     %1, 0x20
     jl      .post
-%elif cpuflag(sse)
+%else
     movaps  xmm1, [%3+%1*2]
     movaps  xmm0, [%3+%1*2+0x10]
     CMUL    %1, xmm0, xmm1, %3, %4, %5
@@ -931,24 +732,6 @@ INIT_XMM sse
     sub     %2, 0x10
     add     %1, 0x10
     jl      .post
-%elif cpuflag(3dnow)
-    CMUL  %3, %1, m0, m1, %4, %5
-    CMUL  %3, %2, m2, m3, %4, %5
-    movd  [%3+%1*2+ 0], m0
-    movd  [%3+%2*2+12], m1
-    movd  [%3+%2*2+ 0], m2
-    movd  [%3+%1*2+12], m3
-    psrlq  m0, 32
-    psrlq  m1, 32
-    psrlq  m2, 32
-    psrlq  m3, 32
-    movd  [%3+%1*2+ 8], m0
-    movd  [%3+%2*2+ 4], m1
-    movd  [%3+%2*2+ 8], m2
-    movd  [%3+%1*2+ 4], m3
-    sub   %2, 8
-    add   %1, 8
-    jl    .post
 %endif
 %endmacro

@@ -981,39 +764,21 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
     push    rrevtab
 %endif

-%if mmsize == 8
-    sub     r3, 2
-%else
     sub     r3, 4
-%endif
-%if ARCH_X86_64 || mmsize == 8
+%if ARCH_X86_64
     xor     r4, r4
     sub     r4, r3
 %endif
-%if notcpuflag(3dnowext) && mmsize == 8
-    movd    m7, [ps_neg]
-%endif
 .pre:
 %if ARCH_X86_64 == 0
 ;unspill
-%if mmsize != 8
     xor     r4, r4
     sub     r4, r3
-%endif
     mov     rtcos, [esp+8]
     mov     rtsin, [esp+4]
 %endif
     PREROTATER r4, r3, r2, rtcos, rtsin
-%if mmsize == 8
-    mov    r6, [esp]                ; rrevtab = ptr+n8
-    movzx  r5,  word [rrevtab+r4-2] ; rrevtab[j]
-    movzx  r6,  word [rrevtab+r3]   ; rrevtab[n4-j-1]
-    mova   [r1+r5*8], m0
-    mova   [r1+r6*8], m2
-    add    r4, 2
-    sub    r3, 2
-%else
 %if ARCH_X86_64
     movzx  r5,  word [rrevtab+r4-4]
     movzx  r6,  word [rrevtab+r4-2]
@@ -1036,7 +801,6 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
     movhps [r1+r4*8], xmm1
 %endif
     sub     r3, 4
-%endif
     jns     .pre

     mov     r5, r0
@@ -1061,23 +825,12 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
     POSROTATESHUF r0, r1, r6, rtcos, rtsin
 %if ARCH_X86_64 == 0
     add esp, 12
 %endif
-%if mmsize == 8
-    femms
-%endif
     RET
 %endmacro

 DECL_IMDCT

-%if ARCH_X86_32
-INIT_MMX 3dnow
-DECL_IMDCT
-
-INIT_MMX 3dnowext
-DECL_IMDCT
-%endif
-
 INIT_YMM avx

 %if HAVE_AVX_EXTERNAL