@ -83,6 +83,168 @@ endfunc
 * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
 * (16 bytes alignment is best for BCM2835), little-endian.
 */
@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
@
@ Converts len int32 values from src to float and scales them, writing the
@ results to dst. One multiplier is read from mul for every block of 8 values.
@
@ In:    a1 = c   (not read; a1 is reused for len and then for the saved FPSCR)
@        a2 = dst (post-incremented as blocks are stored)
@        a3 = src (post-incremented as blocks are loaded)
@        a4 = mul (post-incremented, one float per block of 8)
@        [sp] = len on entry; must be a multiple of 8, compared as unsigned
@ Out:   nothing in registers; dst[0..len-1] written
@ Clobb: a1-a4, ip, flags, s0-s3, s8-s15. s16-s31 and FPSCR are saved and
@        restored around their use.
@
@ The FPSCR is switched to short-vector mode (length 8, stride 1) so that a
@ single vmul.f32 scales a whole block of 8 registers by the scalar multiplier
@ held in s0-s3. The conversions are issued one register at a time.
@
@ Arrays of at least 3 blocks use a software-pipelined loop that rotates
@ through three register groups (s8-s15, s16-s23, s24-s31): while one group is
@ being loaded, the previous one is converted/scaled and the one before that
@ is stored. The loop is entered at label 3, 2 or 1 depending on
@ (len / 8) % 3 so that the tail always ends with two groups left to flush.
@ Shorter arrays take the simple one-block-per-iteration loop at label 50.
@ len == 0 returns without touching src, mul or dst.
function ff_int32_to_float_fmul_array8_vfp, export=1
        push    {lr}
        ldr     a1, [sp, #4]               @ a1 = len (5th argument, above the saved lr)
        subs    lr, a1, #3*8               @ lr = len - 24, also the loop counter
        bcc     50f                        @ too short to pipeline
        @ Now need to find (len / 8) % 3. The approximation
        @ x / 24 = (x * 0xAB) >> 12
        @ is good for x < 4096, which is true for both AC3 and DCA.
        mov     a1, #0xAB
        ldr     ip, =0x03070000            @ RunFast mode, short vectors of length 8, stride 1
        mul     a1, lr, a1
        vpush   {s16-s31}                  @ these registers are restored before returning
        mov     a1, a1, lsr #12            @ a1 = lr / 24
        add     a1, a1, a1, lsl #1         @ a1 = 3 * (lr / 24)
        rsb     a1, a1, lr, lsr #3         @ a1 = (lr / 8) - 3 * (lr / 24) = (len / 8) % 3
        cmp     a1, #1
        fmrx    a1, FPSCR                  @ save caller's FPSCR; cmp flags stay live
        fmxr    FPSCR, ip
        beq     11f
        blo     10f
        @ Array is (2 + multiple of 3) x 8 floats long
        @ drop through...
        vldmia  a3!, {s16-s23}
        vldmia  a4!, {s2,s3}
        vldmia  a3!, {s24-s31}
        vcvt.f32.s32 s16, s16
        vcvt.f32.s32 s17, s17
        vcvt.f32.s32 s18, s18
        vcvt.f32.s32 s19, s19
        vcvt.f32.s32 s20, s20
        vcvt.f32.s32 s21, s21
        vcvt.f32.s32 s22, s22
        vcvt.f32.s32 s23, s23
        vmul.f32 s16, s16, s2
        @ drop through...
3:      @ load s8-s15, convert/scale s24-s31, store s16-s23
        vldmia  a3!, {s8-s15}
        vldmia  a4!, {s1}
        vcvt.f32.s32 s24, s24
        vcvt.f32.s32 s25, s25
        vcvt.f32.s32 s26, s26
        vcvt.f32.s32 s27, s27
        vcvt.f32.s32 s28, s28
        vcvt.f32.s32 s29, s29
        vcvt.f32.s32 s30, s30
        vcvt.f32.s32 s31, s31
        vmul.f32 s24, s24, s3
        vstmia  a2!, {s16-s19}
        vstmia  a2!, {s20-s23}
2:      @ load s16-s23, convert/scale s8-s15, store s24-s31
        vldmia  a3!, {s16-s23}
        vldmia  a4!, {s2}
        vcvt.f32.s32 s8, s8
        vcvt.f32.s32 s9, s9
        vcvt.f32.s32 s10, s10
        vcvt.f32.s32 s11, s11
        vcvt.f32.s32 s12, s12
        vcvt.f32.s32 s13, s13
        vcvt.f32.s32 s14, s14
        vcvt.f32.s32 s15, s15
        vmul.f32 s8, s8, s1
        vstmia  a2!, {s24-s27}
        vstmia  a2!, {s28-s31}
1:      @ load s24-s31, convert/scale s16-s23, store s8-s15
        vldmia  a3!, {s24-s31}
        vldmia  a4!, {s3}
        vcvt.f32.s32 s16, s16
        vcvt.f32.s32 s17, s17
        vcvt.f32.s32 s18, s18
        vcvt.f32.s32 s19, s19
        vcvt.f32.s32 s20, s20
        vcvt.f32.s32 s21, s21
        vcvt.f32.s32 s22, s22
        vcvt.f32.s32 s23, s23
        vmul.f32 s16, s16, s2
        vstmia  a2!, {s8-s11}
        vstmia  a2!, {s12-s15}
        subs    lr, lr, #8*3               @ three blocks consumed per trip round the loop
        bpl     3b
        @ Pipeline drain: s16-s23 is already scaled, s24-s31 is loaded but raw.
        vcvt.f32.s32 s24, s24
        vcvt.f32.s32 s25, s25
        vcvt.f32.s32 s26, s26
        vcvt.f32.s32 s27, s27
        vcvt.f32.s32 s28, s28
        vcvt.f32.s32 s29, s29
        vcvt.f32.s32 s30, s30
        vcvt.f32.s32 s31, s31
        vmul.f32 s24, s24, s3
        vstmia  a2!, {s16-s19}
        vstmia  a2!, {s20-s23}
        vstmia  a2!, {s24-s27}
        vstmia  a2!, {s28-s31}
        fmxr    FPSCR, a1                  @ restore caller's FPSCR
        vpop    {s16-s31}
        pop     {pc}

10:     @ Array is (multiple of 3) x 8 floats long
        vldmia  a3!, {s8-s15}
        vldmia  a4!, {s1,s2}
        vldmia  a3!, {s16-s23}
        vcvt.f32.s32 s8, s8
        vcvt.f32.s32 s9, s9
        vcvt.f32.s32 s10, s10
        vcvt.f32.s32 s11, s11
        vcvt.f32.s32 s12, s12
        vcvt.f32.s32 s13, s13
        vcvt.f32.s32 s14, s14
        vcvt.f32.s32 s15, s15
        vmul.f32 s8, s8, s1
        b       1b

11:     @ Array is (1 + multiple of 3) x 8 floats long
        vldmia  a3!, {s24-s31}
        vldmia  a4!, {s3}
        vldmia  a3!, {s8-s15}
        vldmia  a4!, {s1}
        vcvt.f32.s32 s24, s24
        vcvt.f32.s32 s25, s25
        vcvt.f32.s32 s26, s26
        vcvt.f32.s32 s27, s27
        vcvt.f32.s32 s28, s28
        vcvt.f32.s32 s29, s29
        vcvt.f32.s32 s30, s30
        vcvt.f32.s32 s31, s31
        vmul.f32 s24, s24, s3
        b       2b

50:     @ Fewer than 3 blocks: simple loop, a1 = len = bytes... elements remaining
        @ The loop below tests its counter only after processing a block, so an
        @ empty array must be rejected here or it would run off both buffers.
        cmp     a1, #0
        beq     52f
        ldr     lr, =0x03070000            @ RunFast mode, short vectors of length 8, stride 1
        fmrx    ip, FPSCR                  @ save caller's FPSCR
        fmxr    FPSCR, lr
51:
        vldmia  a3!, {s8-s15}
        vldmia  a4!, {s0}
        vcvt.f32.s32 s8, s8
        vcvt.f32.s32 s9, s9
        vcvt.f32.s32 s10, s10
        vcvt.f32.s32 s11, s11
        vcvt.f32.s32 s12, s12
        vcvt.f32.s32 s13, s13
        vcvt.f32.s32 s14, s14
        vcvt.f32.s32 s15, s15
        vmul.f32 s8, s8, s0
        subs    a1, a1, #8                 @ flags consumed by the bne below
        vstmia  a2!, {s8-s11}
        vstmia  a2!, {s12-s15}
        bne     51b
        fmxr    FPSCR, ip                  @ restore caller's FPSCR
52:
        pop     {pc}
endfunc
/**
 * ARM VFP optimised int32 to float conversion.
 * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
 * (16 bytes alignment is best for BCM2835), little-endian.
 * TODO: could be further optimised by unrolling and interleaving, as above
 */
@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
function f f _ i n t 3 2 _ t o _ f l o a t _ f m u l _ s c a l a r _ v f p , e x p o r t =1
VFP t m p . r e q a4