@@ -21,8 +21,39 @@
 #include "libavutil/arm/asm.S"
 
-@ TODO: * FFTs wider than 16
-@       * dispatch code
+@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
+@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
+@ all single-precision VFP registers may be corrupted on exit. The a2
+@ register may not be clobbered in these functions, as it holds the
+@ stored original FPSCR.
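@ Note: as a rough decode of the 0x03030000 FPSCR value used by the wrappers
@ below, assuming the standard VFP FPSCR layout: FZ (bit 24) and DN (bit 25)
@ are set for RunFast mode, LEN in bits [18:16] is 0b011 for a vector length
@ of 4, STRIDE in bits [21:20] is 0b00 for stride 1, and all exception trap
@ enable bits are clear.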
+
+function ff_fft_calc_vfp, export=1
+        ldr     ip, [a1, #0]    @ nbits
+        mov     a1, a2
+A       ldr     pc, [pc, ip, lsl #2]
+A       .word   0
+A       .word   0
+A       .word   0
+T       movrel  a2, (fft_tab_vfp - 8)
+T       ldr     pc, [a2, ip, lsl #2]
+T endfunc
+
+T const fft_tab_vfp
+        .word   fft4_vfp
+        .word   fft8_vfp
+        .word   X(ff_fft16_vfp)     @ this one alone is exported
+        .word   fft32_vfp
+        .word   fft64_vfp
+        .word   fft128_vfp
+        .word   fft256_vfp
+        .word   fft512_vfp
+        .word   fft1024_vfp
+        .word   fft2048_vfp
+        .word   fft4096_vfp
+        .word   fft8192_vfp
+        .word   fft16384_vfp
+        .word   fft32768_vfp
+        .word   fft65536_vfp
+A endfunc
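@ Note: the table is indexed directly by nbits, so index 2 selects fft4_vfp
@ (a 4-point FFT has nbits == 2); the three padding words after the ARM-mode
@ "ldr pc" and the -8 bias applied to fft_tab_vfp in the Thumb path both
@ arrange for index 2 to land on the first table entry.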
 
 function fft4_vfp
         vldr    d0, [a1, #0*2*4]    @ s0,s1   = z[0]
@@ -131,18 +162,22 @@ endfunc
         vstr    d9, [a1, #3*2*4]
 .endm
 
+function .Lfft8_internal_vfp
+        macro_fft8_head
+        macro_fft8_tail
+        bx      lr
+endfunc
+
 function fft8_vfp
         ldr     a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
         fmrx    a2, FPSCR
         fmxr    FPSCR, a3
         vpush   {s16-s31}
-        macro_fft8_head
-        macro_fft8_tail
+        mov     ip, lr
+        bl      .Lfft8_internal_vfp
         vpop    {s16-s31}
         fmxr    FPSCR, a2
-        bx      lr
+        bx      ip
 endfunc
 
         .align  3
@@ -153,12 +188,7 @@ cos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
 cos3pi8:    @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
         .float  0.3826834261417388916015625
 
-function ff_fft16_vfp, export=1
-        ldr     a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
-        fmrx    a2, FPSCR
-        fmxr    FPSCR, a3
-        vpush   {s16-s31}
+function .Lfft16_internal_vfp
         macro_fft8_head
         @ FFT4(z+8)
         vldr    d10, [a1, #8*2*4]
@@ -292,7 +322,213 @@ function ff_fft16_vfp, export=1
         vstr    d8, [a1, #0*2*4]
         vstr    d9, [a1, #4*2*4]
 
+        bx      lr
+endfunc
+
+function ff_fft16_vfp, export=1
+        ldr     a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
+        fmrx    a2, FPSCR
+        fmxr    FPSCR, a3
+        vpush   {s16-s31}
+        mov     ip, lr
+        bl      .Lfft16_internal_vfp
         vpop    {s16-s31}
         fmxr    FPSCR, a2
-        bx      lr
+        bx      ip
 endfunc
+
+.macro pass n, z0, z1, z2, z3
+        add     v6, v5, #4*2*\n
+@ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
+@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
+@ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
+@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
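@ Note: roughly, and following the C reference in libavcodec/fft_template.c,
@ each TRANSFORM step multiplies z[o2] and z[o3] by the twiddle pair
@ (wre, wim), with opposite signs on wim for the two products, giving the
@ t1/t2 and t5/t6 temporaries named in the comments below, then butterflies
@ those against z[0] and z[o1] so that all four elements are updated in
@ place; TRANSFORM_ZERO is the wre == 1, wim == 0 special case.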
+        vldr    d8, [\z2, #8*(o2+1)]    @ s16,s17
+        vldmdb  v6!, {s2}
+        vldr    d9, [\z3, #8*(o3+1)]    @ s18,s19
+        vldmia  v5!, {s0,s1}            @ s0 is unused
+        vldr    s7, [\z2, #8*o2]        @ t1
+        vmul.f  s20, s16, s2            @ vector * scalar
+        vldr    s0, [\z3, #8*o3]        @ t5
+        vldr    s6, [\z2, #8*o2+4]      @ t2
+        vldr    s3, [\z3, #8*o3+4]      @ t6
+        vmul.f  s16, s16, s1            @ vector * scalar
+        ldr     a4, =\n-1
+1:      add     \z0, \z0, #8*2
+        .if     \n*4*2 >= 512
+        add     \z1, \z1, #8*2
+        .endif
+        .if     \n*4*2 >= 256
+        add     \z2, \z2, #8*2
+        .endif
+        .if     \n*4*2 >= 512
+        add     \z3, \z3, #8*2
+        .endif
+@ up to 2 stalls (VFP vector issuing / waiting for s0)
+@ depending upon whether this is the first iteration and
+@ how many add instructions are inserted above
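@ Note: a plausible reading of the stall comments in this macro, assuming
@ VFP9/VFP11-style short-vector execution, is that a length-4 vector
@ operation issues over four cycles, so an immediately following VFP
@ instruction stalls; hence the loads, stores and integer work interleaved
@ between the vector arithmetic.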
+        vadd.f  s4, s0, s7              @ t5
+        vadd.f  s5, s6, s3              @ t6
+        vsub.f  s6, s6, s3              @ t4
+        vsub.f  s7, s0, s7              @ t3
+        vldr    d6, [\z0, #8*0-8*2]     @ s12,s13
+        vadd.f  s0, s16, s21            @ t1
+        vldr    d7, [\z1, #8*o1-8*2]    @ s14,s15
+        vsub.f  s1, s18, s23            @ t5
+        vadd.f  s8, s4, s12             @ vector + vector
+@ stall (VFP vector issuing)
+@ stall (VFP vector issuing)
+@ stall (VFP vector issuing)
+        vsub.f  s4, s12, s4
+        vsub.f  s5, s13, s5
+        vsub.f  s6, s14, s6
+        vsub.f  s7, s15, s7
+        vsub.f  s2, s17, s20            @ t2
+        vadd.f  s3, s19, s22            @ t6
+        vstr    d4, [\z0, #8*0-8*2]     @ s8,s9
+        vstr    d5, [\z1, #8*o1-8*2]    @ s10,s11
+@ stall (waiting for s5)
+        vstr    d2, [\z2, #8*o2-8*2]    @ s4,s5
+        vadd.f  s4, s1, s0              @ t5
+        vstr    d3, [\z3, #8*o3-8*2]    @ s6,s7
+        vsub.f  s7, s1, s0              @ t3
+        vadd.f  s5, s2, s3              @ t6
+        vsub.f  s6, s2, s3              @ t4
+        vldr    d6, [\z0, #8*1-8*2]     @ s12,s13
+        vldr    d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
+        vldr    d4, [\z2, #8*o2]        @ s8,s9
+        vldmdb  v6!, {s2,s3}
+        vldr    d5, [\z3, #8*o3]        @ s10,s11
+        vadd.f  s20, s4, s12            @ vector + vector
+        vldmia  v5!, {s0,s1}
+        vldr    d8, [\z2, #8*(o2+1)]    @ s16,s17
+@ stall (VFP vector issuing)
+        vsub.f  s4, s12, s4
+        vsub.f  s5, s13, s5
+        vsub.f  s6, s14, s6
+        vsub.f  s7, s15, s7
+        vmul.f  s12, s8, s3             @ vector * scalar
+        vstr    d10, [\z0, #8*1-8*2]    @ s20,s21
+        vldr    d9, [\z3, #8*(o3+1)]    @ s18,s19
+        vstr    d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
+        vmul.f  s8, s8, s0              @ vector * scalar
+        vstr    d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
+@ stall (waiting for s7)
+        vstr    d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
+        vmul.f  s20, s16, s2            @ vector * scalar
+@ stall (VFP vector issuing)
+@ stall (VFP vector issuing)
+@ stall (VFP vector issuing)
+        vadd.f  s7, s8, s13             @ t1
+        vsub.f  s6, s9, s12             @ t2
+        vsub.f  s0, s10, s15            @ t5
+        vadd.f  s3, s11, s14            @ t6
+        vmul.f  s16, s16, s1            @ vector * scalar
+        subs    a4, a4, #1
+        bne     1b
+@ What remains is identical to the first two indentations of
+@ the above, but without the increment of z
+        vadd.f  s4, s0, s7              @ t5
+        vadd.f  s5, s6, s3              @ t6
+        vsub.f  s6, s6, s3              @ t4
+        vsub.f  s7, s0, s7              @ t3
+        vldr    d6, [\z0, #8*0]         @ s12,s13
+        vadd.f  s0, s16, s21            @ t1
+        vldr    d7, [\z1, #8*o1]        @ s14,s15
+        vsub.f  s1, s18, s23            @ t5
+        vadd.f  s8, s4, s12             @ vector + vector
+        vsub.f  s4, s12, s4
+        vsub.f  s5, s13, s5
+        vsub.f  s6, s14, s6
+        vsub.f  s7, s15, s7
+        vsub.f  s2, s17, s20            @ t2
+        vadd.f  s3, s19, s22            @ t6
+        vstr    d4, [\z0, #8*0]         @ s8,s9
+        vstr    d5, [\z1, #8*o1]        @ s10,s11
+        vstr    d2, [\z2, #8*o2]        @ s4,s5
+        vadd.f  s4, s1, s0              @ t5
+        vstr    d3, [\z3, #8*o3]        @ s6,s7
+        vsub.f  s7, s1, s0              @ t3
+        vadd.f  s5, s2, s3              @ t6
+        vsub.f  s6, s2, s3              @ t4
+        vldr    d6, [\z0, #8*1]         @ s12,s13
+        vldr    d7, [\z1, #8*(o1+1)]    @ s14,s15
+        vadd.f  s20, s4, s12            @ vector + vector
+        vsub.f  s4, s12, s4
+        vsub.f  s5, s13, s5
+        vsub.f  s6, s14, s6
+        vsub.f  s7, s15, s7
+        vstr    d10, [\z0, #8*1]        @ s20,s21
+        vstr    d11, [\z1, #8*(o1+1)]   @ s22,s23
+        vstr    d2, [\z2, #8*(o2+1)]    @ s4,s5
+        vstr    d3, [\z3, #8*(o3+1)]    @ s6,s7
+.endm
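@ Note, summarising the macro above: \n counts the pairs of 4-point
@ butterflies processed (the first pair is peeled off before the "1:" loop
@ and a4 counts the remaining \n-1); \z0-\z3 address the four output groups,
@ with the o1-o3 offsets .set by def_fft below; v5 is read upwards (vldmia)
@ for the wre twiddles and v6, initialised to v5 + 4*2*\n bytes, is read
@ downwards (vldmdb) for the wim twiddles.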
+
+.macro def_fft n, n2, n4
+function .Lfft\n\()_internal_vfp
+        .if     \n >= 512
+        push    {v1-v6,lr}
+        .elseif \n >= 256
+        push    {v1-v2,v5-v6,lr}
+        .else
+        push    {v1,v5-v6,lr}
+        .endif
+        mov     v1, a1
+        bl      .Lfft\n2\()_internal_vfp
+        add     a1, v1, #8*(\n/4)*2
+        bl      .Lfft\n4\()_internal_vfp
+        movrelx v5, X(ff_cos_\n), a1
+        add     a1, v1, #8*(\n/4)*3
+        bl      .Lfft\n4\()_internal_vfp
+        .if     \n >= 512
+        .set    o1, 0*(\n/4/2)
+        .set    o2, 0*(\n/4/2)
+        .set    o3, 0*(\n/4/2)
+        add     v2, v1, #8*2*(\n/4/2)
+        add     v3, v1, #8*4*(\n/4/2)
+        add     v4, v1, #8*6*(\n/4/2)
+        pass    (\n/4/2), v1, v2, v3, v4
+        pop     {v1-v6,pc}
+        .elseif \n >= 256
+        .set    o1, 2*(\n/4/2)
+        .set    o2, 0*(\n/4/2)
+        .set    o3, 2*(\n/4/2)
+        add     v2, v1, #8*4*(\n/4/2)
+        pass    (\n/4/2), v1, v1, v2, v2
+        pop     {v1-v2,v5-v6,pc}
+        .else
+        .set    o1, 2*(\n/4/2)
+        .set    o2, 4*(\n/4/2)
+        .set    o3, 6*(\n/4/2)
+        pass    (\n/4/2), v1, v1, v1, v1
+        pop     {v1,v5-v6,pc}
+        .endif
+endfunc
+
+function fft\n\()_vfp
+        ldr     a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
+        fmrx    a2, FPSCR
+        fmxr    FPSCR, a3
+        vpush   {s16-s31}
+        mov     ip, lr
+        bl      .Lfft\n\()_internal_vfp
+        vpop    {s16-s31}
+        fmxr    FPSCR, a2
+        bx      ip
+endfunc
+        .ltorg
+.endm
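@ Note, summarising def_fft: each invocation below emits .Lfft<n>_internal_vfp,
@ which runs an n/2-point transform on the first half of z and two n/4-point
@ transforms on the third and fourth quarters, then one pass over n/8
@ butterfly pairs using the X(ff_cos_<n>) twiddle table, plus a public
@ fft<n>_vfp wrapper that switches into RunFast mode around the internal
@ routine.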
+
+        def_fft 32,    16,    8
+        def_fft 64,    32,    16
+        def_fft 128,   64,    32
+        def_fft 256,   128,   64
+        def_fft 512,   256,   128
+        def_fft 1024,  512,   256
+        def_fft 2048,  1024,  512
+        def_fft 4096,  2048,  1024
+        def_fft 8192,  4096,  2048
+        def_fft 16384, 8192,  4096
+        def_fft 32768, 16384, 8192
+        def_fft 65536, 32768, 16384