|
|
|
@ -191,6 +191,23 @@ SECTION .text |
|
|
|
|
addps %2, %2, %5 ; {i0,i1,i2,i3} |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%macro INTERL 5 |
|
|
|
|
%if cpuflag(avx) |
|
|
|
|
vunpckhps %3, %2, %1 |
|
|
|
|
vunpcklps %2, %2, %1 |
|
|
|
|
vextractf128 %4(%5), %2, 0 |
|
|
|
|
vextractf128 %4 %+ H(%5), %3, 0 |
|
|
|
|
vextractf128 %4(%5 + 1), %2, 1 |
|
|
|
|
vextractf128 %4 %+ H(%5 + 1), %3, 1 |
|
|
|
|
%elif cpuflag(sse) |
|
|
|
|
mova %3, %2 |
|
|
|
|
unpcklps %2, %1 |
|
|
|
|
unpckhps %3, %1 |
|
|
|
|
mova %4(%5), %2 |
|
|
|
|
mova %4(%5+1), %3 |
|
|
|
|
%endif |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
; scheduled for cpu-bound sizes |
|
|
|
|
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim |
|
|
|
|
IF%1 mova m4, Z(4) |
|
|
|
@ -541,17 +558,6 @@ DEFINE_ARGS zc, w, n, o1, o3 |
|
|
|
|
INIT_YMM avx |
|
|
|
|
|
|
|
|
|
%if HAVE_AVX_EXTERNAL |
|
|
|
|
%macro INTERL_AVX 5 |
|
|
|
|
vunpckhps %3, %2, %1 |
|
|
|
|
vunpcklps %2, %2, %1 |
|
|
|
|
vextractf128 %4(%5), %2, 0 |
|
|
|
|
vextractf128 %4 %+ H(%5), %3, 0 |
|
|
|
|
vextractf128 %4(%5 + 1), %2, 1 |
|
|
|
|
vextractf128 %4 %+ H(%5 + 1), %3, 1 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%define INTERL INTERL_AVX |
|
|
|
|
|
|
|
|
|
DECL_PASS pass_avx, PASS_BIG 1 |
|
|
|
|
DECL_PASS pass_interleave_avx, PASS_BIG 0 |
|
|
|
|
|
|
|
|
@ -566,16 +572,6 @@ cglobal fft_calc, 2,5,8 |
|
|
|
|
|
|
|
|
|
INIT_XMM sse |
|
|
|
|
|
|
|
|
|
%macro INTERL_SSE 5 |
|
|
|
|
mova %3, %2 |
|
|
|
|
unpcklps %2, %1 |
|
|
|
|
unpckhps %3, %1 |
|
|
|
|
mova %4(%5), %2 |
|
|
|
|
mova %4(%5+1), %3 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%define INTERL INTERL_SSE |
|
|
|
|
|
|
|
|
|
DECL_PASS pass_sse, PASS_BIG 1 |
|
|
|
|
DECL_PASS pass_interleave_sse, PASS_BIG 0 |
|
|
|
|
|
|
|
|
@ -861,16 +857,30 @@ INIT_XMM sse |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 |
|
|
|
|
%if cpuflag(sse) |
|
|
|
|
mulps m6, %3, [%5+%1] |
|
|
|
|
mulps m7, %2, [%5+%1] |
|
|
|
|
mulps %2, %2, [%6+%1] |
|
|
|
|
mulps %3, %3, [%6+%1] |
|
|
|
|
subps %2, %2, m6 |
|
|
|
|
addps %3, %3, m7 |
|
|
|
|
%elif cpuflag(3dnow) |
|
|
|
|
mova m6, [%1+%2*2] |
|
|
|
|
mova %3, [%1+%2*2+8] |
|
|
|
|
mova %4, m6 |
|
|
|
|
mova m7, %3 |
|
|
|
|
pfmul m6, [%5+%2] |
|
|
|
|
pfmul %3, [%6+%2] |
|
|
|
|
pfmul %4, [%6+%2] |
|
|
|
|
pfmul m7, [%5+%2] |
|
|
|
|
pfsub %3, m6 |
|
|
|
|
pfadd %4, m7 |
|
|
|
|
%endif |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8 |
|
|
|
|
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 |
|
|
|
|
.post: |
|
|
|
|
%if cpuflag(avx) |
|
|
|
|
vmovaps ymm1, [%3+%1*2] |
|
|
|
|
vmovaps ymm0, [%3+%1*2+0x20] |
|
|
|
|
vmovaps ymm3, [%3+%2*2] |
|
|
|
@ -899,10 +909,7 @@ INIT_XMM sse |
|
|
|
|
sub %2, 0x20 |
|
|
|
|
add %1, 0x20 |
|
|
|
|
jl .post |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 |
|
|
|
|
.post: |
|
|
|
|
%elif cpuflag(sse) |
|
|
|
|
movaps xmm1, [%3+%1*2] |
|
|
|
|
movaps xmm0, [%3+%1*2+0x10] |
|
|
|
|
CMUL %1, xmm0, xmm1, %3, %4, %5 |
|
|
|
@ -924,25 +931,9 @@ INIT_XMM sse |
|
|
|
|
sub %2, 0x10 |
|
|
|
|
add %1, 0x10 |
|
|
|
|
jl .post |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%macro CMUL_3DNOW 6 |
|
|
|
|
mova m6, [%1+%2*2] |
|
|
|
|
mova %3, [%1+%2*2+8] |
|
|
|
|
mova %4, m6 |
|
|
|
|
mova m7, %3 |
|
|
|
|
pfmul m6, [%5+%2] |
|
|
|
|
pfmul %3, [%6+%2] |
|
|
|
|
pfmul %4, [%6+%2] |
|
|
|
|
pfmul m7, [%5+%2] |
|
|
|
|
pfsub %3, m6 |
|
|
|
|
pfadd %4, m7 |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8 |
|
|
|
|
.post: |
|
|
|
|
CMUL_3DNOW %3, %1, m0, m1, %4, %5 |
|
|
|
|
CMUL_3DNOW %3, %2, m2, m3, %4, %5 |
|
|
|
|
%elif cpuflag(3dnow) |
|
|
|
|
CMUL %3, %1, m0, m1, %4, %5 |
|
|
|
|
CMUL %3, %2, m2, m3, %4, %5 |
|
|
|
|
movd [%3+%1*2+ 0], m0 |
|
|
|
|
movd [%3+%2*2+12], m1 |
|
|
|
|
movd [%3+%2*2+ 0], m2 |
|
|
|
@ -958,9 +949,10 @@ INIT_XMM sse |
|
|
|
|
sub %2, 8 |
|
|
|
|
add %1, 8 |
|
|
|
|
jl .post |
|
|
|
|
%endif |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
%macro DECL_IMDCT 1 |
|
|
|
|
%macro DECL_IMDCT 0 |
|
|
|
|
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input |
|
|
|
|
%if ARCH_X86_64 |
|
|
|
|
%define rrevtab r7 |
|
|
|
@ -1066,7 +1058,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i |
|
|
|
|
neg r0 |
|
|
|
|
mov r1, -mmsize |
|
|
|
|
sub r1, r0 |
|
|
|
|
%1 r0, r1, r6, rtcos, rtsin |
|
|
|
|
POSROTATESHUF r0, r1, r6, rtcos, rtsin |
|
|
|
|
%if ARCH_X86_64 == 0 |
|
|
|
|
add esp, 12 |
|
|
|
|
%endif |
|
|
|
@ -1076,18 +1068,18 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i |
|
|
|
|
RET |
|
|
|
|
%endmacro |
|
|
|
|
|
|
|
|
|
DECL_IMDCT POSROTATESHUF |
|
|
|
|
DECL_IMDCT |
|
|
|
|
|
|
|
|
|
%if ARCH_X86_32 |
|
|
|
|
INIT_MMX 3dnow |
|
|
|
|
DECL_IMDCT POSROTATESHUF_3DNOW |
|
|
|
|
DECL_IMDCT |
|
|
|
|
|
|
|
|
|
INIT_MMX 3dnowext |
|
|
|
|
DECL_IMDCT POSROTATESHUF_3DNOW |
|
|
|
|
DECL_IMDCT |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
|
INIT_YMM avx |
|
|
|
|
|
|
|
|
|
%if HAVE_AVX_EXTERNAL |
|
|
|
|
DECL_IMDCT POSROTATESHUF_AVX |
|
|
|
|
DECL_IMDCT |
|
|
|
|
%endif |
|
|
|
|