arm: Add VFP-accelerated version of qmf_32_subbands

               Before           After
               Mean    StdDev   Mean    StdDev  Change
This function   1323.0  98.0      746.2  60.6   +77.3%
Overall        15400.0 336.4    14147.5 288.4    +8.9%

Signed-off-by: Martin Storsjö <martin@martin.st>
pull/27/merge
Ben Avison 12 years ago committed by Martin Storsjö
parent 800ffab48a
commit ff30d12159
  1. 10
      libavcodec/arm/dcadsp_init_arm.c
  2. 273
      libavcodec/arm/dcadsp_vfp.S

@ -26,6 +26,12 @@
/* Installed as s->lfe_fir by ff_dcadsp_init_arm() when VFP is present
 * and VFPv3 is not. */
void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
int decifactor, float scale);
/* Prototype added by this commit; installed as s->qmf_32_subbands under
 * the same VFP-without-VFPv3 condition. The implementation is the
 * assembly function of the same name in libavcodec/arm/dcadsp_vfp.S.
 * That code reads sb_act rows of 8 floats from samples_in, and takes
 * `scale` either from s0 (hardfp) or from the 7th stack word (softfp). */
void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
SynthFilterContext *synth, FFTContext *imdct,
float synth_buf_ptr[512],
int *synth_buf_offset, float synth_buf2[32],
const float window[512], float *samples_out,
float raXin[32], float scale);
/* Installed as s->lfe_fir by ff_dcadsp_init_arm() when NEON is present. */
void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
int decifactor, float scale);
@ -33,8 +39,10 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
/* NOTE(review): the next two lines look like the removed and the added
 * form of the same `if` (the hunk header reports 8 old lines and 10 new
 * ones); this rendering has dropped the -/+ markers. Confirm against the
 * real diff before treating this as literal source. */
if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
/* Both VFP routines are installed only when have_vfp() is true and
 * have_vfpv3() is false. */
s->lfe_fir = ff_dca_lfe_fir_vfp;
s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
}
/* Checked after the VFP case, so when both conditions hold the NEON
 * lfe_fir assignment wins. qmf_32_subbands is not assigned here. */
if (have_neon(cpu_flags))
s->lfe_fir = ff_dca_lfe_fir_neon;
}

@ -218,3 +218,276 @@ endfunc
.unreq POST1
.unreq POST2
.unreq POST3
@ Register aliases for ff_dca_qmf_32_subbands_vfp; they are released
@ again with .unreq after the function.
@ a1-a4 are the APCS names of r0-r3 and v1-v5 those of r4-r8.
@ 1st argument (samples_in); advanced through the source matrix.
IN .req a1
@ 2nd argument (sb_act); only read while setting up COUNT.
SBACT .req a2
@ Holds the caller's FPSCR while vector mode is enabled. The incoming
@ value of a3 (3rd argument, synth) is never read by the function.
OLDFPSCR .req a3
@ 4th argument (imdct). The function body does not use this alias; it
@ reloads the value into a1 from the slot where a4 was pushed.
IMDCT .req a4
@ window pointer, loaded from the caller's stack arguments.
WINDOW .req v1
@ samples_out pointer, advanced by 32 words per synth-filter call.
OUT .req v2
@ Write cursor into the transposed stack buffer, later the read cursor.
BUF .req v3
SCALEINT .req v4 @ only used in softfp case
@ Packed counter during transposition, then the 8-call loop counter.
COUNT .req v5
@ scale as a VFP register (hardfp case).
SCALE .req s0
/* Stack layout differs in softfp and hardfp cases:
*
* hardfp
* fp -> 6 arg words saved by caller
* a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
* s16-s23 on entry
* align 16
* buf -> 8*32*4 bytes buffer
* s0 on entry
* sp -> 3 arg words for callee
*
* softfp
* fp -> 7 arg words saved by caller
* a4,v1-v5,fp,lr on entry
* s16-s23 on entry
* align 16
* buf -> 8*32*4 bytes buffer
* sp -> 4 arg words for callee
*/
/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
* SynthFilterContext *synth, FFTContext *imdct,
* float synth_buf_ptr[512],
* int *synth_buf_offset, float synth_buf2[32],
* const float window[512], float *samples_out,
* float raXin[32], float scale);
*/
@ ff_dca_qmf_32_subbands_vfp
@
@ Phase 1 (labels 1-5): build a transposed copy of samples_in in a
@ 16-byte-aligned buffer of 8 rows x 32 words carved out of the stack:
@   buf[k][i] = samples_in[i][k]   for i < sb_act,
@   buf[k][i] = 0                  for sb_act <= i < 32,
@ with the sign flipped on source rows 0 and 3 of every group of four.
@ Phase 2 (label 7): call ff_synth_filter_float_vfp once per buffer row
@ (8 calls), advancing the output pointer by 32 words each time.
@
@ Lines prefixed VFP belong to the hardfp variant (scale arrives in s0);
@ lines prefixed NOVFP belong to the softfp variant (scale is the 7th
@ word of the caller's stack arguments).
function ff_dca_qmf_32_subbands_vfp, export=1
@ Both variants push 8 words, so fp = sp + 8*4 ends up pointing at the
@ caller's stack arguments.
VFP push {a3-a4,v1-v3,v5,fp,lr}
NOVFP push {a4,v1-v5,fp,lr}
add fp, sp, #8*4
@ s16-s23 are used as transposition scratch below and are restored by
@ the vpop in the epilogue.
vpush {s16-s23}
@ The buffer pointed at by raXin isn't big enough for us to do a
@ complete matrix transposition as we want to, so allocate an
@ alternative buffer from the stack. Align to 4 words for speed.
sub BUF, sp, #8*32*4
bic BUF, BUF, #15
mov sp, BUF
@ lr is free as scratch here; its entry value was pushed above.
ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
@ Keep the caller's FPSCR so it can be put back at label 6.
fmrx OLDFPSCR, FPSCR
fmxr FPSCR, lr
@ COUNT is used to count down 2 things at once:
@ bits 0-4 are the number of word pairs remaining in the output row
@ bits 5-31 are the number of words to copy (with possible negation)
@ from the source matrix before we start zeroing the remainder
@ Start value: low bits = 16 pairs (one 32-word output row), upper
@ bits = sb_act - 4, so the sign tells whether a full group of four
@ source rows is still available.
mov COUNT, #(-4 << 5) + 16
adds COUNT, COUNT, SBACT, lsl #5
@ Fewer than 4 source rows in total: skip the 4-row loop.
bmi 2f
@ Main loop: transposes 4 source rows (4 x 8 words) per iteration.
1:
@ Source row 0 goes into the even registers s8..s22.
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
@ With the vector length 4 / stride 2 set in FPSCR above, this one
@ instruction negates s8, s10, s12 and s14.
vneg.f s8, s8
@ Source row 1 goes into the odd registers and is not negated.
vldr s9, [IN, #(1*8+0)*4]
vldr s11, [IN, #(1*8+1)*4]
vldr s13, [IN, #(1*8+2)*4]
vldr s15, [IN, #(1*8+3)*4]
@ Same for the second half of row 0: s16, s18, s20, s22.
vneg.f s16, s16
vldr s17, [IN, #(1*8+4)*4]
vldr s19, [IN, #(1*8+5)*4]
vldr s21, [IN, #(1*8+6)*4]
vldr s23, [IN, #(1*8+7)*4]
@ Each d register holds element k of rows 0 and 1 side by side; store
@ it as two adjacent words of buffer row k (k = 0..7).
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
@ Second pair: source row 3 goes into the odd registers and is negated.
vldr s9, [IN, #(3*8+0)*4]
vldr s11, [IN, #(3*8+1)*4]
vldr s13, [IN, #(3*8+2)*4]
vldr s15, [IN, #(3*8+3)*4]
vldr s17, [IN, #(3*8+4)*4]
vldr s19, [IN, #(3*8+5)*4]
vldr s21, [IN, #(3*8+6)*4]
vldr s23, [IN, #(3*8+7)*4]
vneg.f s9, s9
@ Source row 2 goes into the even registers and is not negated.
vldr s8, [IN, #(2*8+0)*4]
vldr s10, [IN, #(2*8+1)*4]
vldr s12, [IN, #(2*8+2)*4]
vldr s14, [IN, #(2*8+3)*4]
vneg.f s17, s17
vldr s16, [IN, #(2*8+4)*4]
vldr s18, [IN, #(2*8+5)*4]
vldr s20, [IN, #(2*8+6)*4]
vldr s22, [IN, #(2*8+7)*4]
@ Rows 2 and 3 land two words further along each buffer row.
vstr d4, [BUF, #(0*32+2)*4]
vstr d5, [BUF, #(1*32+2)*4]
vstr d6, [BUF, #(2*32+2)*4]
vstr d7, [BUF, #(3*32+2)*4]
vstr d8, [BUF, #(4*32+2)*4]
vstr d9, [BUF, #(5*32+2)*4]
vstr d10, [BUF, #(6*32+2)*4]
vstr d11, [BUF, #(7*32+2)*4]
@ Advance by 4 source rows and 4 output columns; COUNT loses 4 source
@ rows (upper bits) and 2 word pairs (lower bits).
add IN, IN, #4*8*4
add BUF, BUF, #4*4
subs COUNT, COUNT, #(4 << 5) + 2
bpl 1b
2: @ Now deal with trailing < 4 samples
@ After this add the upper bits are (remaining source rows - 1):
@ -1, 0, 1 or 2.
adds COUNT, COUNT, #3 << 5
bmi 4f @ sb_act was a multiple of 4
@ lr = COUNT with the pair counter masked off; zero means exactly one
@ source row is left.
bics lr, COUNT, #0x1F
bne 3f
@ sb_act was n*4+1
@ Last source row (negated, as row 0 of a group) paired with zeros.
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
vneg.f s8, s8
vldr s9, zero
vldr s11, zero
vldr s13, zero
vldr s15, zero
vneg.f s16, s16
vldr s17, zero
vldr s19, zero
vldr s21, zero
vldr s23, zero
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
@ One word pair written; only the pair counter matters from here on.
add BUF, BUF, #2*4
sub COUNT, COUNT, #1
b 4f
3: @ sb_act was n*4+2 or n*4+3, so do the first 2
@ Same as the first half of the main loop: row 0 negated, row 1 as is.
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
vneg.f s8, s8
vldr s9, [IN, #(1*8+0)*4]
vldr s11, [IN, #(1*8+1)*4]
vldr s13, [IN, #(1*8+2)*4]
vldr s15, [IN, #(1*8+3)*4]
vneg.f s16, s16
vldr s17, [IN, #(1*8+4)*4]
vldr s19, [IN, #(1*8+5)*4]
vldr s21, [IN, #(1*8+6)*4]
vldr s23, [IN, #(1*8+7)*4]
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
@ Two source rows and one pair consumed. The upper bits are now -1
@ (nothing left, n*4+2) or 0 (one more row, n*4+3).
sub COUNT, COUNT, #(2 << 5) + 1
bics lr, COUNT, #0x1F
bne 4f
@ sb_act was n*4+3
@ Third row of the group (row 2, not negated) paired with zeros. IN was
@ not advanced, hence the row-2 offsets.
vldr s8, [IN, #(2*8+0)*4]
vldr s10, [IN, #(2*8+1)*4]
vldr s12, [IN, #(2*8+2)*4]
vldr s14, [IN, #(2*8+3)*4]
vldr s16, [IN, #(2*8+4)*4]
vldr s18, [IN, #(2*8+5)*4]
vldr s20, [IN, #(2*8+6)*4]
vldr s22, [IN, #(2*8+7)*4]
vldr s9, zero
vldr s11, zero
vldr s13, zero
vldr s15, zero
vldr s17, zero
vldr s19, zero
vldr s21, zero
vldr s23, zero
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
sub COUNT, COUNT, #1
4: @ Now fill the remainder with 0
@ d4 = (s8, s9) becomes a pair of zero words.
vldr s8, zero
vldr s9, zero
@ Keep only the pair counter; nothing to do if the rows are full.
ands COUNT, COUNT, #0x1F
beq 6f
@ Write one zero pair to each of the 8 buffer rows per iteration.
5: vstr d4, [BUF, #(0*32+0)*4]
vstr d4, [BUF, #(1*32+0)*4]
vstr d4, [BUF, #(2*32+0)*4]
vstr d4, [BUF, #(3*32+0)*4]
vstr d4, [BUF, #(4*32+0)*4]
vstr d4, [BUF, #(5*32+0)*4]
vstr d4, [BUF, #(6*32+0)*4]
vstr d4, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
subs COUNT, COUNT, #1
bne 5b
6:
@ Transposition done: restore the caller's FPSCR before calling out.
fmxr FPSCR, OLDFPSCR
@ fp points at the caller's stack arguments: word 3 is window, word 4
@ is samples_out and (softfp only) word 6 is scale.
ldr WINDOW, [fp, #3*4]
ldr OUT, [fp, #4*4]
@ BUF has advanced by one full row (16 pairs = 32 words) while being
@ filled; rewind it to the start of the buffer.
sub BUF, BUF, #32*4
NOVFP ldr SCALEINT, [fp, #6*4]
@ One synth-filter call per buffer row.
mov COUNT, #8
@ hardfp: keep scale in a stack slot so it can be reloaded into s0
@ before every call, then reserve 3 outgoing argument words (16 bytes
@ in total below the aligned buffer). softfp: reserve 4 outgoing
@ argument words, the 4th being scale.
VFP vpush {SCALE}
VFP sub sp, sp, #3*4
NOVFP sub sp, sp, #4*4
7:
@ a1 = imdct, read back from the slot where a4 was pushed on entry.
VFP ldr a1, [fp, #-7*4] @ imdct
NOVFP ldr a1, [fp, #-8*4]
@ a2-a4 = the first three caller stack arguments (synth_buf_ptr,
@ synth_buf_offset and synth_buf2 according to the prototype).
ldmia fp, {a2-a4}
@ Outgoing stack arguments: window, current output pointer, current
@ buffer row (and scale for softfp).
VFP stmia sp, {WINDOW, OUT, BUF}
NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
@ Reload s0 from the slot written by the vpush above, which sits just
@ above the 3 argument words.
VFP vldr SCALE, [sp, #3*4]
@ NOTE(review): the callee is referenced by its bare name; confirm
@ whether targets that prefix C symbols need a wrapper here.
bl ff_synth_filter_float_vfp
@ Move on to the next 32-word output block and the next buffer row.
add OUT, OUT, #32*4
add BUF, BUF, #32*4
subs COUNT, COUNT, #1
bne 7b
@ Drop the buffer and outgoing arguments in one step: sp = fp minus the
@ 8 pushed core registers minus the 8 words of s16-s23.
@ The A/T prefixes presumably select the ARM-mode and Thumb-mode forms
@ (the macros are defined outside this view).
A sub sp, fp, #(8+8)*4
T sub fp, fp, #(8+8)*4
T mov sp, fp
vpop {s16-s23}
@ Mirror of the entry push, with the saved lr popped straight into pc
@ to return.
VFP pop {a3-a4,v1-v3,v5,fp,pc}
NOVFP pop {a4,v1-v5,fp,pc}
endfunc
@ Release the register aliases defined for ff_dca_qmf_32_subbands_vfp.
.unreq IN
.unreq SBACT
.unreq OLDFPSCR
.unreq IMDCT
.unreq WINDOW
.unreq OUT
.unreq BUF
.unreq SCALEINT
.unreq COUNT
.unreq SCALE
@ Word of zero bits (0.0 as a single-precision float), aligned to
@ 2^2 = 4 bytes. The function loads it with "vldr sN, zero" to
@ zero-fill the unused columns of the transposed buffer.
.align 2
zero: .word 0

Loading…
Cancel
Save