ARM: 10l: fix large FFTs

Originally committed as revision 19846 to svn://svn.ffmpeg.org/ffmpeg/trunk
release/0.6
Måns Rullgård 16 years ago
parent 89c4e176f6
commit 9ecc414195
  1. libavcodec/arm/fft_neon.S (6 lines changed)
  2. libavcodec/arm/mdct_neon.S (18 lines changed)
  3. libavcodec/fft.c (5 lines changed)

@@ -327,8 +327,10 @@ function ff_fft_permute_neon, export=1
  1:
  vld1.32 {d0-d1}, [r1,:128]!
  ldr r4, [r0], #4
- uxtah lr, r3, r4
- uxtah r4, r3, r4, ror #16
+ uxth lr, r4
+ uxth r4, r4, ror #16
+ add lr, r3, lr, lsl #3
+ add r4, r3, r4, lsl #3
  vst1.32 {d0}, [lr,:64]
  vst1.32 {d1}, [r4,:64]
  subs r12, r12, #2

@@ -52,8 +52,10 @@ function ff_imdct_half_neon, export=1
  vmul.f32 d5, d17, d3
  vsub.f32 d4, d6, d4
  vadd.f32 d5, d5, d7
- uxtah r8, r1, r6, ror #16
- uxtah r6, r1, r6
+ uxth r8, r6, ror #16
+ uxth r6, r6
+ add r8, r1, r8, lsl #3
+ add r6, r1, r6, lsl #3
  beq 1f
  vld2.32 {d16-d17},[r7,:128],r12
  vld2.32 {d0-d1}, [r2,:128]!
@@ -198,8 +200,10 @@ function ff_mdct_calc_neon, export=1
  subs lr, lr, #16
  vsub.f32 d6, d6, d7 @ -R*c-I*s
  vadd.f32 d7, d4, d5 @ -R*s+I*c
- uxtah r10, r1, r6, ror #16
- uxtah r6, r1, r6
+ uxth r10, r6, ror #16
+ uxth r6, r6
+ add r10, r1, r10, lsl #3
+ add r6, r1, r6, lsl #3
  beq 1f
  vld2.32 {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
  vld2.32 {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
@@ -245,8 +249,10 @@ function ff_mdct_calc_neon, export=1
  subs lr, lr, #16
  vsub.f32 d6, d7, d6 @ I*s-R*c
  vadd.f32 d7, d4, d5 @ R*s-I*c
- uxtah r10, r1, r6, ror #16
- uxtah r6, r1, r6
+ uxth r10, r6, ror #16
+ uxth r6, r6
+ add r10, r1, r10, lsl #3
+ add r6, r1, r6, lsl #3
  beq 1f
  vld2.32 {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
  vld2.32 {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0

@@ -64,7 +64,6 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
  float alpha, c1, s1, s2;
  int split_radix = 1;
  int av_unused has_vectors;
- int revtab_shift = 0;
  if (nbits < 2 || nbits > 16)
  goto fail;
@@ -120,7 +119,6 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
  s->imdct_calc = ff_imdct_calc_neon;
  s->imdct_half = ff_imdct_half_neon;
  s->mdct_calc = ff_mdct_calc_neon;
- revtab_shift = 3;
  #endif
  if (split_radix) {
if (split_radix) { if (split_radix) {
@@ -134,8 +132,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
  tab[m/2-i] = tab[i];
  }
  for(i=0; i<n; i++)
- s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] =
- i << revtab_shift;
+ s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i;
  s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
  } else {
  int np, nblocks, np2, l;

Loading…
Cancel
Save