@ -218,3 +218,276 @@ endfunc
.unreq POST1
.unreq POST2
.unreq POST3
IN       .req   a1
SBACT    .req   a2
OLDFPSCR .req   a3
IMDCT    .req   a4
WINDOW   .req   v1
OUT      .req   v2
BUF      .req   v3
SCALEINT .req   v4 @ only used in softfp case
COUNT    .req   v5
SCALE    .req   s0

/* Stack layout differs in softfp and hardfp cases:
 *
 * hardfp
 *      fp -> 6 arg words saved by caller
 *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *            s0 on entry
 *      sp -> 3 arg words for callee
 *
 * softfp
 *      fp -> 7 arg words saved by caller
 *            a4,v1-v5,fp,lr on entry
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *      sp -> 4 arg words for callee
 */

/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
 *                                 SynthFilterContext *synth, FFTContext *imdct,
 *                                 float (*synth_buf_ptr)[512],
 *                                 int *synth_buf_offset, float (*synth_buf2)[32],
 *                                 const float (*window)[512], float *samples_out,
 *                                 float (*raXin)[32], float scale);
 *
 * Phase 1 transposes samples_in[32][8] into an 8x32 stack buffer (BUF),
 * negating rows 0 and 3 of every group of 4 input rows and zero-filling
 * columns >= sb_act.  Phase 2 calls ff_synth_filter_float_vfp once per
 * 32-sample row of BUF, advancing OUT by 32 floats each time.
 */
function ff_dca_qmf_32_subbands_vfp, export=1
VFP     push    {a3-a4,v1-v3,v5,fp,lr}
NOVFP   push    {a4,v1-v5,fp,lr}
        add     fp, sp, #8*4
        vpush   {s16-s23}
        @ The buffer pointed at by raXin isn't big enough for us to do a
        @ complete matrix transposition as we want to, so allocate an
        @ alternative buffer from the stack. Align to 4 words for speed.
        sub     BUF, sp, #8*32*4
        bic     BUF, BUF, #15
        mov     sp, BUF
        ldr     lr, =0x03330000         @ RunFast mode, short vectors of length 4, stride 2
        fmrx    OLDFPSCR, FPSCR
        fmxr    FPSCR, lr
        @ COUNT is used to count down 2 things at once:
        @ bits 0-4 are the number of word pairs remaining in the output row
        @ bits 5-31 are the number of words to copy (with possible negation)
        @ from the source matrix before we start zeroing the remainder
        mov     COUNT, #(-4 << 5) + 16
        adds    COUNT, COUNT, SBACT, lsl #5
        bmi     2f
1:
        @ Transpose 4 input rows at a time.  Even s-registers take rows
        @ 0/2 of the group, odd ones rows 1/3; each vneg.f under the
        @ short-vector FPSCR setting above covers 4 registers at stride 2.
        @ d4-d11 alias s8-s23 pairwise, so each vstr writes one adjacent
        @ pair of transposed words.
        vldr    s8, [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8                  @ negate row 0, columns 0-3
        vldr    s9, [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16                @ negate row 0, columns 4-7
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        vldr    s9, [IN, #(3*8+0)*4]
        vldr    s11, [IN, #(3*8+1)*4]
        vldr    s13, [IN, #(3*8+2)*4]
        vldr    s15, [IN, #(3*8+3)*4]
        vldr    s17, [IN, #(3*8+4)*4]
        vldr    s19, [IN, #(3*8+5)*4]
        vldr    s21, [IN, #(3*8+6)*4]
        vldr    s23, [IN, #(3*8+7)*4]
        vneg.f  s9, s9                  @ negate row 3, columns 0-3
        vldr    s8, [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vneg.f  s17, s17                @ negate row 3, columns 4-7
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vstr    d4, [BUF, #(0*32+2)*4]
        vstr    d5, [BUF, #(1*32+2)*4]
        vstr    d6, [BUF, #(2*32+2)*4]
        vstr    d7, [BUF, #(3*32+2)*4]
        vstr    d8, [BUF, #(4*32+2)*4]
        vstr    d9, [BUF, #(5*32+2)*4]
        vstr    d10, [BUF, #(6*32+2)*4]
        vstr    d11, [BUF, #(7*32+2)*4]
        add     IN, IN, #4*8*4
        add     BUF, BUF, #4*4
        subs    COUNT, COUNT, #(4 << 5) + 2
        bpl     1b
2:      @ Now deal with trailing < 4 samples
        adds    COUNT, COUNT, #3 << 5
        bmi     4f                      @ sb_act was a multiple of 4
        bics    lr, COUNT, #0x1F
        bne     3f
        @ sb_act was n*4+1
        vldr    s8, [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9, zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vneg.f  s16, s16
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
        b       4f
3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
        vldr    s8, [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9, [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #(2 << 5) + 1
        bics    lr, COUNT, #0x1F
        bne     4f
        @ sb_act was n*4+3
        vldr    s8, [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vldr    s9, zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
4:      @ Now fill the remainder with 0
        vldr    s8, zero
        vldr    s9, zero
        ands    COUNT, COUNT, #0x1F
        beq     6f
5:      vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d4, [BUF, #(1*32+0)*4]
        vstr    d4, [BUF, #(2*32+0)*4]
        vstr    d4, [BUF, #(3*32+0)*4]
        vstr    d4, [BUF, #(4*32+0)*4]
        vstr    d4, [BUF, #(5*32+0)*4]
        vstr    d4, [BUF, #(6*32+0)*4]
        vstr    d4, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        subs    COUNT, COUNT, #1
        bne     5b
6:
        @ Restore caller's FPSCR before making any calls, then run the
        @ synthesis filter over each of the 8 transposed rows.
        fmxr    FPSCR, OLDFPSCR
        ldr     WINDOW, [fp, #3*4]      @ 4th stack arg in both ABI variants
        ldr     OUT, [fp, #4*4]         @ 5th stack arg in both ABI variants
        sub     BUF, BUF, #32*4         @ rewind to start of last-filled row area
NOVFP   ldr     SCALEINT, [fp, #6*4]    @ softfp: scale passed as 7th stack arg
        mov     COUNT, #8
VFP     vpush   {SCALE}                 @ preserve s0 (clobbered by the callee)
VFP     sub     sp, sp, #3*4            @ outgoing stack args: window, out, in
NOVFP   sub     sp, sp, #4*4            @ ... plus scale in the softfp case
7:
VFP     ldr     a1, [fp, #-7*4]         @ imdct
NOVFP   ldr     a1, [fp, #-8*4]
        ldmia   fp, {a2-a4}             @ synth_buf_ptr, synth_buf_offset, synth_buf2
VFP     stmia   sp, {WINDOW, OUT, BUF}
NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
VFP     vldr    SCALE, [sp, #3*4]       @ reload saved scale each iteration
        bl      ff_synth_filter_float_vfp
        add     OUT, OUT, #32*4
        add     BUF, BUF, #32*4
        subs    COUNT, COUNT, #1
        bne     7b
        @ Unwind: fp - 16 words = the vpush {s16-s23} save area
A       sub     sp, fp, #(8+8)*4
T       sub     fp, fp, #(8+8)*4
T       mov     sp, fp
        vpop    {s16-s23}
VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
NOVFP   pop     {a4,v1-v5,fp,pc}
endfunc
.unreq IN
.unreq SBACT
.unreq OLDFPSCR
.unreq IMDCT
.unreq WINDOW
.unreq OUT
.unreq BUF
.unreq SCALEINT
.unreq COUNT
.unreq SCALE

@ 32-bit floating-point zero literal, loaded PC-relative by the
@ "vldr sN, zero" instructions in ff_dca_qmf_32_subbands_vfp above.
        .align  2
zero:   .word   0