arm: Add VFP-accelerated version of int32_to_float_fmul_scalar

Before           After
               Mean    StdDev   Mean    StdDev  Change
This function   1175.0   4.4      366.2  18.3   +220.8%
Overall        19285.5 292.0    18420.5 489.1     +4.7%

Signed-off-by: Martin Storsjö <martin@martin.st>
pull/27/merge
Ben Avison 12 years ago committed by Martin Storsjö
parent 41ef1d360b
commit ce9ed10ac2
  1. 10
      libavcodec/arm/fmtconvert_init_arm.c
  2. 38
      libavcodec/arm/fmtconvert_vfp.S

@ -28,6 +28,9 @@
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src, void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
float mul, int len); float mul, int len);
void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src,
float mul, int len);
void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
@ -38,6 +41,13 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx
int cpu_flags = av_get_cpu_flags(); int cpu_flags = av_get_cpu_flags();
if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) { if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) {
if (!have_vfpv3(cpu_flags)) {
// This function doesn't use anything armv6 specific in itself,
// but ff_float_to_int16_vfp which is in the same assembly source
// file does, thus the whole file requires armv6 to be built.
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
}
c->float_to_int16 = ff_float_to_int16_vfp; c->float_to_int16 = ff_float_to_int16_vfp;
} }

@ -1,5 +1,6 @@
/* /*
* Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
* Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
* *
* This file is part of Libav. * This file is part of Libav.
* *
@ -76,3 +77,40 @@ function ff_float_to_int16_vfp, export=1
vpop {d8-d11} vpop {d8-d11}
pop {r4-r8,pc} pop {r4-r8,pc}
endfunc endfunc
/**
* ARM VFP optimised int32 to float conversion.
* Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
* (16 bytes alignment is best for BCM2835), little-endian.
*/
@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
function ff_int32_to_float_fmul_scalar_vfp, export=1
VFP tmp .req a4
VFP len .req a3
NOVFP tmp .req a3
NOVFP len .req a4
NOVFP vmov s0, a3
ldr tmp, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
fmrx ip, FPSCR
fmxr FPSCR, tmp
1:
vldmia a2!, {s8-s15}
vcvt.f32.s32 s8, s8
vcvt.f32.s32 s9, s9
vcvt.f32.s32 s10, s10
vcvt.f32.s32 s11, s11
vcvt.f32.s32 s12, s12
vcvt.f32.s32 s13, s13
vcvt.f32.s32 s14, s14
vcvt.f32.s32 s15, s15
vmul.f32 s8, s8, s0
subs len, len, #8
vstmia a1!, {s8-s11}
vstmia a1!, {s12-s15}
bne 1b
fmxr FPSCR, ip
bx lr
endfunc
.unreq tmp
.unreq len

Loading…
Cancel
Save