|
|
|
@ -214,20 +214,20 @@ SECTION .text |
|
|
|
|
addsubps %3, %3, %2 ; z1234, z5678 |
|
|
|
|
addsubps %1, %1, %4 ; s3142, s7586 |
|
|
|
|
|
|
|
|
|
mulps %3, %3, [s8_mult_odd] ; z * s8_mult_odd |
|
|
|
|
vpermilps %1, [s8_perm_even] ; s1234, s5687 ! |
|
|
|
|
mulps %3, [s8_mult_odd] ; z * s8_mult_odd |
|
|
|
|
vpermilps %1, %1, [s8_perm_even] ; s1234, s5687 ! |
|
|
|
|
|
|
|
|
|
shufps %2, %3, %3, q2332 ; junk, z7887 |
|
|
|
|
xorps %4, %1, [mask_mmmmpppm] ; e1234, e5687 ! |
|
|
|
|
|
|
|
|
|
vpermilps %3, %3, [s8_perm_odd2] ; z2314, z6556 |
|
|
|
|
vperm2f128 %1, %4, 0x03 ; e5687, s1234 |
|
|
|
|
vperm2f128 %1, %1, %4, 0x03 ; e5687, s1234 |
|
|
|
|
|
|
|
|
|
addsubps %2, %2, %3 ; junk, t5678 |
|
|
|
|
subps %1, %1, %4 ; w1234, w5678 even |
|
|
|
|
|
|
|
|
|
vperm2f128 %2, %2, 0x11 ; t5678, t5678 |
|
|
|
|
vperm2f128 %3, %3, 0x00 ; z2314, z2314 |
|
|
|
|
vperm2f128 %2, %2, %2, 0x11 ; t5678, t5678 |
|
|
|
|
vperm2f128 %3, %3, %3, 0x00 ; z2314, z2314 |
|
|
|
|
|
|
|
|
|
xorps %2, %2, [mask_ppmpmmpm] ; t * ppmpmmpm |
|
|
|
|
addps %2, %3, %2 ; u1234, u5678 odd |
|
|
|
@ -279,14 +279,14 @@ SECTION .text |
|
|
|
|
xorps %4, %6, mask ; s[8..15]*mpmppmpm |
|
|
|
|
xorps %3, %5, mask ; s[0...7]*mpmppmpm |
|
|
|
|
|
|
|
|
|
vperm2f128 %4, %4, 0x01 ; s[12..15, 8..11] |
|
|
|
|
vperm2f128 %3, %3, 0x01 ; s[4..7, 0..3] |
|
|
|
|
vperm2f128 %4, %4, %4, 0x01 ; s[12..15, 8..11] |
|
|
|
|
vperm2f128 %3, %3, %3, 0x01 ; s[4..7, 0..3] |
|
|
|
|
|
|
|
|
|
addps %6, %6, %4 ; y56, u56, y34, u34 |
|
|
|
|
addps %5, %5, %3 ; w56, x56, w34, x34 |
|
|
|
|
|
|
|
|
|
vpermilps %6, perm ; y56, u56, y43, u43 |
|
|
|
|
vpermilps %5, perm ; w56, x56, w43, x43 |
|
|
|
|
vpermilps %6, %6, perm ; y56, u56, y43, u43 |
|
|
|
|
vpermilps %5, %5, perm ; w56, x56, w43, x43 |
|
|
|
|
|
|
|
|
|
subps %4, %2, %6 ; odd part 2 |
|
|
|
|
addps %3, %2, %6 ; odd part 1 |
|
|
|
@ -453,7 +453,7 @@ SECTION .text |
|
|
|
|
fmsubaddps %5, %5, %8, %9 ; j[0..8] even |
|
|
|
|
%else |
|
|
|
|
mulps %5, %5, %8 ; m2,3[23]reim * cos0246 |
|
|
|
|
xorps %9, %9, [mask_pmpmpmpm]; +-m2,3[23]imre * wim7531 |
|
|
|
|
xorps %9, %9, [mask_pmpmpmpm] ; +-m2,3[23]imre * wim7531 |
|
|
|
|
addps %5, %5, %9 ; j[0..8] |
|
|
|
|
%endif |
|
|
|
|
|
|
|
|
@ -486,7 +486,7 @@ SECTION .text |
|
|
|
|
movaps [outq + 14*mmsize], tx2_o0 |
|
|
|
|
|
|
|
|
|
movaps tw_e, [ff_cos_64_float + mmsize] |
|
|
|
|
vperm2f128 tw_o, [ff_cos_64_float + 64 - 4*7 - mmsize], 0x23 |
|
|
|
|
vperm2f128 tw_o, tw_o, [ff_cos_64_float + 64 - 4*7 - mmsize], 0x23 |
|
|
|
|
|
|
|
|
|
movaps m0, [outq + 1*mmsize] |
|
|
|
|
movaps m1, [outq + 3*mmsize] |
|
|
|
@ -513,7 +513,7 @@ SECTION .text |
|
|
|
|
; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6 |
|
|
|
|
%macro SPLIT_RADIX_LOAD_COMBINE_4 8 |
|
|
|
|
movaps m8, [rtabq + (%5)*mmsize + %7] |
|
|
|
|
vperm2f128 m9, [itabq - (%5)*mmsize + %8], 0x23 |
|
|
|
|
vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23 |
|
|
|
|
|
|
|
|
|
movaps m0, [outq + (0 + %4)*mmsize + %6] |
|
|
|
|
movaps m2, [outq + (2 + %4)*mmsize + %6] |
|
|
|
@ -570,7 +570,7 @@ SECTION .text |
|
|
|
|
; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6 |
|
|
|
|
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6 |
|
|
|
|
movaps m8, [rtabq + (0 + %2)*mmsize] |
|
|
|
|
vperm2f128 m9, [itabq - (0 + %2)*mmsize], 0x23 |
|
|
|
|
vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23 |
|
|
|
|
|
|
|
|
|
movaps m0, [outq + (0 + 0 + %1)*mmsize + %6] |
|
|
|
|
movaps m2, [outq + (2 + 0 + %1)*mmsize + %6] |
|
|
|
@ -612,7 +612,7 @@ SECTION .text |
|
|
|
|
vperm2f128 m13, m13, m5, 0x13 |
|
|
|
|
|
|
|
|
|
movaps m8, [rtabq + (1 + %2)*mmsize] |
|
|
|
|
vperm2f128 m9, [itabq - (1 + %2)*mmsize], 0x23 |
|
|
|
|
vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23 |
|
|
|
|
|
|
|
|
|
movaps m0, [outq + (0 + 1 + %1)*mmsize + %6] |
|
|
|
|
movaps m2, [outq + (2 + 1 + %1)*mmsize + %6] |
|
|
|
@ -801,7 +801,7 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp |
|
|
|
|
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15 |
|
|
|
|
|
|
|
|
|
movaps m8, [ff_cos_32_float] |
|
|
|
|
vperm2f128 m9, [ff_cos_32_float + 4*8 - 4*7], 0x23 |
|
|
|
|
vperm2f128 m9, m9, [ff_cos_32_float + 4*8 - 4*7], 0x23 |
|
|
|
|
|
|
|
|
|
FFT16 m0, m1, m2, m3, m10, m11, m12, m13 |
|
|
|
|
|
|
|
|
@ -859,7 +859,7 @@ ALIGN 16 |
|
|
|
|
sub outq, (%1*4) + (%1*2) + (%1/2) |
|
|
|
|
|
|
|
|
|
lea rtabq, [ff_cos_ %+ %1 %+ _float] |
|
|
|
|
lea itabq, [rtabq + %1 - 4*7] |
|
|
|
|
lea itabq, [ff_cos_ %+ %1 %+ _float + %1 - 4*7] |
|
|
|
|
|
|
|
|
|
%if %0 > 1 |
|
|
|
|
cmp tgtq, %1 |
|
|
|
@ -904,7 +904,7 @@ ALIGN 16 |
|
|
|
|
LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15 |
|
|
|
|
|
|
|
|
|
movaps m8, [ff_cos_32_float] |
|
|
|
|
vperm2f128 m9, [ff_cos_32_float + 32 - 4*7], 0x23 |
|
|
|
|
vperm2f128 m9, m9, [ff_cos_32_float + 32 - 4*7], 0x23 |
|
|
|
|
|
|
|
|
|
FFT16 m0, m1, m2, m3, m10, m11, m12, m13 |
|
|
|
|
|
|
|
|
@ -962,7 +962,7 @@ ALIGN 16 |
|
|
|
|
FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o |
|
|
|
|
|
|
|
|
|
movaps tw_e, [ff_cos_64_float] |
|
|
|
|
vperm2f128 tw_o, [ff_cos_64_float + 64 - 4*7], 0x23 |
|
|
|
|
vperm2f128 tw_o, tw_o, [ff_cos_64_float + 64 - 4*7], 0x23 |
|
|
|
|
|
|
|
|
|
add lutq, (mmsize/2)*8 |
|
|
|
|
cmp tgtq, 64 |
|
|
|
@ -990,7 +990,7 @@ ALIGN 16 |
|
|
|
|
sub outq, 24*mmsize |
|
|
|
|
|
|
|
|
|
lea rtabq, [ff_cos_128_float] |
|
|
|
|
lea itabq, [rtabq + 128 - 4*7] |
|
|
|
|
lea itabq, [ff_cos_128_float + 128 - 4*7] |
|
|
|
|
|
|
|
|
|
cmp tgtq, 128 |
|
|
|
|
je .deinterleave |
|
|
|
@ -1017,7 +1017,7 @@ ALIGN 16 |
|
|
|
|
sub outq, 48*mmsize |
|
|
|
|
|
|
|
|
|
lea rtabq, [ff_cos_256_float] |
|
|
|
|
lea itabq, [rtabq + 256 - 4*7] |
|
|
|
|
lea itabq, [ff_cos_256_float + 256 - 4*7] |
|
|
|
|
|
|
|
|
|
cmp tgtq, 256 |
|
|
|
|
je .deinterleave |
|
|
|
@ -1045,7 +1045,7 @@ ALIGN 16 |
|
|
|
|
sub outq, 96*mmsize |
|
|
|
|
|
|
|
|
|
lea rtabq, [ff_cos_512_float] |
|
|
|
|
lea itabq, [rtabq + 512 - 4*7] |
|
|
|
|
lea itabq, [ff_cos_512_float + 512 - 4*7] |
|
|
|
|
|
|
|
|
|
cmp tgtq, 512 |
|
|
|
|
je .deinterleave |
|
|
|
@ -1080,7 +1080,7 @@ ALIGN 16 |
|
|
|
|
sub outq, 192*mmsize |
|
|
|
|
|
|
|
|
|
lea rtabq, [ff_cos_1024_float] |
|
|
|
|
lea itabq, [rtabq + 1024 - 4*7] |
|
|
|
|
lea itabq, [ff_cos_1024_float + 1024 - 4*7] |
|
|
|
|
|
|
|
|
|
cmp tgtq, 1024 |
|
|
|
|
je .deinterleave |
|
|
|
@ -1161,7 +1161,7 @@ FFT_SPLIT_RADIX_DEF 131072 |
|
|
|
|
vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1 |
|
|
|
|
|
|
|
|
|
movaps tw_e, [ff_cos_64_float + mmsize] |
|
|
|
|
vperm2f128 tw_o, [ff_cos_64_float + 64 - 4*7 - mmsize], 0x23 |
|
|
|
|
vperm2f128 tw_o, tw_o, [ff_cos_64_float + 64 - 4*7 - mmsize], 0x23 |
|
|
|
|
|
|
|
|
|
movaps m0, [outq + 1*mmsize] |
|
|
|
|
movaps m1, [outq + 3*mmsize] |
|
|
|
|