From be822d77b6f8363df0a49c568662187d655711e2 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Thu, 12 Jan 2012 23:44:20 +0000 Subject: [PATCH] aacsbr: ARM NEON optimised sbrdsp functions Overall speedup of HE-AAC decoding 2.3x on Cortex-A8, 1.2x on A9. Signed-off-by: Mans Rullgard --- libavcodec/arm/Makefile | 4 + libavcodec/arm/sbrdsp_init_arm.c | 70 ++++++ libavcodec/arm/sbrdsp_neon.S | 411 +++++++++++++++++++++++++++++++ libavcodec/sbrdsp.c | 4 + libavcodec/sbrdsp.h | 1 + 5 files changed, 490 insertions(+) create mode 100644 libavcodec/arm/sbrdsp_init_arm.c create mode 100644 libavcodec/arm/sbrdsp_neon.S diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index fc1711395b..e7fa7e511a 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -1,6 +1,8 @@ OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \ arm/ac3dsp_arm.o +OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_init_arm.o + OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \ ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o @@ -60,6 +62,8 @@ NEON-OBJS-$(CONFIG_H264PRED) += arm/h264pred_neon.o \ NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o +NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_neon.o + NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \ arm/synth_filter_neon.o \ diff --git a/libavcodec/arm/sbrdsp_init_arm.c b/libavcodec/arm/sbrdsp_init_arm.c new file mode 100644 index 0000000000..2ab0df829d --- /dev/null +++ b/libavcodec/arm/sbrdsp_init_arm.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2012 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavcodec/sbrdsp.h" + +void ff_sbr_sum64x5_neon(float *z); +float ff_sbr_sum_square_neon(float (*x)[2], int n); +void ff_sbr_neg_odd_64_neon(float *x); +void ff_sbr_qmf_pre_shuffle_neon(float *z); +void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z); +void ff_sbr_qmf_deint_neg_neon(float *v, const float *src); +void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1); +void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2], + const float *g_filt, int m_max, int ixh); +void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2], + const float alpha0[2], const float alpha1[2], + float bw, int start, int end); +void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]); + +void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); +void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m, + const float *q_filt, int noise, + int kx, int m_max); + +av_cold void ff_sbrdsp_init_arm(SBRDSPContext *s) /* Point the SBRDSPContext hooks at the NEON routines declared above; the only guard is HAVE_NEON, no run-time CPU test is made here. */ +{ + if (HAVE_NEON) { + s->sum64x5 = ff_sbr_sum64x5_neon; + s->sum_square = ff_sbr_sum_square_neon; + s->neg_odd_64 = ff_sbr_neg_odd_64_neon; + 
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon; + s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon; + s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon; + s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon; + s->hf_g_filt = ff_sbr_hf_g_filt_neon; + s->hf_gen = ff_sbr_hf_gen_neon; + s->autocorrelate = ff_sbr_autocorrelate_neon; + s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon; + s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon; + s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon; + s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon; + } +} diff --git a/libavcodec/arm/sbrdsp_neon.S b/libavcodec/arm/sbrdsp_neon.S new file mode 100644 index 0000000000..835c32caee --- /dev/null +++ b/libavcodec/arm/sbrdsp_neon.S @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2012 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +function ff_sbr_sum64x5_neon, export=1 /* z[i] += z[i+64] + z[i+128] + z[i+192] + z[i+256] for i in 0..63, four floats per pass */ + push {lr} + add r1, r0, # 64*4 + add r2, r0, #128*4 + add r3, r0, #192*4 + add lr, r0, #256*4 + mov r12, #64 +1: + vld1.32 {q0}, [r0,:128] + vld1.32 {q1}, [r1,:128]! + vadd.f32 q0, q0, q1 + vld1.32 {q2}, [r2,:128]! + vadd.f32 q0, q0, q2 + vld1.32 {q3}, [r3,:128]! + vadd.f32 q0, q0, q3 + vld1.32 {q8}, [lr,:128]! + vadd.f32 q0, q0, q8 + vst1.32 {q0}, [r0,:128]! 
+ subs r12, #4 + bgt 1b + pop {pc} +endfunc + +function ff_sbr_sum_square_neon, export=1 /* sum of squares of the floats at x; each pass squares four floats and takes 2 off n */ + vmov.f32 q0, #0.0 +1: + vld1.32 {q1}, [r0,:128]! + vmla.f32 q0, q1, q1 + subs r1, r1, #2 + bgt 1b + vadd.f32 d0, d0, d1 + vpadd.f32 d0, d0, d0 +NOVFP vmov.32 r0, d0[0] + bx lr +endfunc + +function ff_sbr_neg_odd_64_neon, export=1 /* flips the sign bit of every odd-indexed float of x[0..63] in place (vld2 splits even/odd, q8 is the sign mask) */ + mov r1, r0 + vmov.i32 q8, #1<<31 + vld2.32 {q0,q1}, [r0,:128]! + veor q1, q1, q8 + vld2.32 {q2,q3}, [r0,:128]! + .rept 3 + vst2.32 {q0,q1}, [r1,:128]! + veor q3, q3, q8 + vld2.32 {q0,q1}, [r0,:128]! + vst2.32 {q2,q3}, [r1,:128]! + veor q1, q1, q8 + vld2.32 {q2,q3}, [r0,:128]! + .endr + veor q3, q3, q8 + vst2.32 {q0,q1}, [r1,:128]! + vst2.32 {q2,q3}, [r1,:128]! + bx lr +endfunc + +function ff_sbr_qmf_pre_shuffle_neon, export=1 + add r1, r0, #60*4 + add r2, r0, #64*4 + vld1.32 {d0}, [r0,:64]! + vst1.32 {d0}, [r2,:64]! + mov r3, #-16 + mov r12, #24 + vmov.i32 q8, #1<<31 + vld1.32 {q0}, [r1,:128], r3 + vld1.32 {d2}, [r0,:64]! +1: + vld1.32 {d3,d4}, [r0,:128]! + vrev64.32 q0, q0 + vld1.32 {q9}, [r1,:128], r3 + veor q0, q0, q8 + vld1.32 {d5,d6}, [r0,:128]! + vswp d0, d1 + vrev64.32 q9, q9 + vst2.32 {q0,q1}, [r2,:64]! + vmov q10, q2 + veor q9, q9, q8 + vmov d2, d6 + vswp d18, d19 + vld1.32 {q0}, [r1,:128], r3 + vst2.32 {q9,q10}, [r2,:64]! + subs r12, r12, #8 + bgt 1b + vld1.32 {d3,d4}, [r0,:128]! + vrev64.32 q0, q0 + vld1.32 {q9}, [r1,:128], r3 + veor q0, q0, q8 + vld1.32 {d5}, [r0,:64]! + vswp d0, d1 + vrev64.32 q9, q9 + vst2.32 {q0,q1}, [r2,:64]! + vswp d4, d5 + veor q1, q9, q8 + vst2.32 {d3,d5}, [r2,:64]! + vst2.32 {d2[0],d4[0]}, [r2,:64]! + bx lr +endfunc + +function ff_sbr_qmf_post_shuffle_neon, export=1 + add r2, r1, #60*4 + mov r3, #-16 + mov r12, #32 + vmov.i32 q8, #1<<31 + vld1.32 {q0}, [r2,:128], r3 + vld1.32 {q1}, [r1,:128]! +1: + pld [r2, #-32] + vrev64.32 q0, q0 + vswp d2, d3 + veor q0, q0, q8 + vld1.32 {q2}, [r2,:128], r3 + vld1.32 {q3}, [r1,:128]! + vst2.32 {d1,d3}, [r0,:128]! + vst2.32 {d0,d2}, [r0,:128]! 
+ pld [r2, #-32] + vrev64.32 q2, q2 + vswp d6, d7 + veor q2, q2, q8 + vld1.32 {q0}, [r2,:128], r3 + vld1.32 {q1}, [r1,:128]! + vst2.32 {d5,d7}, [r0,:128]! + vst2.32 {d4,d6}, [r0,:128]! + subs r12, r12, #8 + bgt 1b + bx lr +endfunc + +function ff_sbr_qmf_deint_neg_neon, export=1 /* v[i] = src[63 - 2*i] and v[63 - i] = -src[62 - 2*i] for i in 0..31; src is walked downwards from src + 60 */ + add r1, r1, #60*4 + add r2, r0, #62*4 + mov r3, #-16 + mov r12, #32 + vmov.i32 d2, #1<<31 +1: + vld2.32 {d0,d1}, [r1,:128], r3 + veor d0, d0, d2 + vrev64.32 d1, d1 + vst1.32 {d0}, [r2,:64] + vst1.32 {d1}, [r0,:64]! + sub r2, r2, #8 + subs r12, r12, #2 + bgt 1b + bx lr +endfunc + +function ff_sbr_qmf_deint_bfly_neon, export=1 /* v[i] = src0[i] - src1[63 - i] and v[127 - i] = src0[i] + src1[63 - i] for i in 0..63 */ + push {lr} + add r2, r2, #60*4 + add r3, r0, #124*4 + mov r12, #64 + mov lr, #-16 +1: + vld1.32 {q0}, [r1,:128]! + vld1.32 {q1}, [r2,:128], lr + vrev64.32 q2, q0 + vrev64.32 q3, q1 + vadd.f32 d3, d4, d3 + vadd.f32 d2, d5, d2 + vsub.f32 d0, d0, d7 + vsub.f32 d1, d1, d6 + vst1.32 {q1}, [r3,:128], lr + vst1.32 {q0}, [r0,:128]! + subs r12, r12, #4 + bgt 1b + pop {pc} +endfunc + +function ff_sbr_hf_g_filt_neon, export=1 /* Y[m] = X_high[m][ixh] * g_filt[m], m counted by m_max; ixh is read from the stack and rows of X_high are 40*2 floats apart */ + ldr r12, [sp] + add r1, r1, r12, lsl #3 + mov r12, #40*2*4 + sub r3, r3, #1 + vld2.32 {d2[],d3[]},[r2,:64]! + vld1.32 {d0}, [r1,:64], r12 +1: + vld1.32 {d1}, [r1,:64], r12 + vmul.f32 q3, q0, q1 + vld2.32 {d2[],d3[]},[r2,:64]! + vld1.32 {d0}, [r1,:64], r12 + vst1.32 {q3}, [r0,:64]! + subs r3, r3, #2 + bgt 1b + it lt + bxlt lr + vmul.f32 d0, d0, d2 + vst1.32 {d0}, [r0,:64]! + bx lr +endfunc + +function ff_sbr_hf_gen_neon, export=1 +NOVFP vld1.32 {d1[]}, [sp,:32] +VFP vdup.32 d1, d0[0] + vmul.f32 d0, d1, d1 + vld1.32 {d3}, [r2,:64] + vld1.32 {d2}, [r3,:64] + vmul.f32 q0, q0, q1 + ldrd r2, r3, [sp, #4*!HAVE_VFP_ARGS] + vtrn.32 d0, d1 + vneg.f32 d18, d1 + vtrn.32 d18, d1 + add r0, r0, r2, lsl #3 + add r1, r1, r2, lsl #3 + sub r1, r1, #2*8 + sub r3, r3, r2 + vld1.32 {q1}, [r1,:128]! +1: + vld1.32 {q3}, [r1,:128]! 
+ vrev64.32 q2, q1 + vmov q8, q3 + vrev64.32 d20, d3 + vrev64.32 d21, d6 + vmla.f32 q3, q1, d0[0] + vmla.f32 d6, d4, d18 + vmla.f32 d7, d20, d18 + vmla.f32 d6, d3, d0[1] + vmla.f32 d7, d16, d0[1] + vmla.f32 d6, d5, d1 + vmla.f32 d7, d21, d1 + vmov q1, q8 + vst1.32 {q3}, [r0,:128]! + subs r3, r3, #2 + bgt 1b + bx lr +endfunc + +function ff_sbr_autocorrelate_neon, export=1 + vld1.32 {q0}, [r0,:128]! + vmov.f32 q1, #0.0 + vmov.f32 q3, #0.0 + vmov.f32 d20, #0.0 + vmul.f32 d21, d1, d1 + vmov q8, q0 + vmov q11, q0 + mov r12, #36 +1: + vld1.32 {q2}, [r0,:128]! + vrev64.32 q12, q2 + vmla.f32 q10, q2, q2 + vmla.f32 d2, d1, d4 + vmla.f32 d3, d1, d24 + vmla.f32 d6, d0, d4 + vmla.f32 d7, d0, d24 + vmla.f32 d2, d4, d5 + vmla.f32 d3, d4, d25 + vmla.f32 d6, d1, d5 + vmla.f32 d7, d1, d25 + vmov q0, q2 + subs r12, r12, #2 + bgt 1b + vld1.32 {q2}, [r0,:128]! + vrev64.32 q12, q2 + vmla.f32 d2, d1, d4 + vmla.f32 d3, d1, d24 + vmla.f32 d6, d0, d4 + vmla.f32 d7, d0, d24 + vadd.f32 d20, d20, d21 + vrev64.32 d18, d17 + vmla.f32 d6, d1, d5 + vmla.f32 d7, d1, d25 + vmov q0, q1 + vmla.f32 d0, d16, d17 + vmla.f32 d1, d16, d18 + vmla.f32 d2, d4, d5 + vmla.f32 d3, d4, d25 + vneg.f32 s15, s15 + vmov d21, d20 + vpadd.f32 d0, d0, d2 + vpadd.f32 d7, d6, d7 + vtrn.32 d1, d3 + vsub.f32 d6, d1, d3 + vmla.f32 d20, d22, d22 + vmla.f32 d21, d4, d4 + vtrn.32 d0, d6 + vpadd.f32 d20, d20, d21 + vst1.32 {q3}, [r1,:128]! + vst1.32 {d20[1]}, [r1,:32] + add r1, r1, #2*4 + vst1.32 {d0}, [r1,:64] + add r1, r1, #4*4 + vst1.32 {d20[0]}, [r1,:32] + bx lr +endfunc + +function ff_sbr_hf_apply_noise_0_neon, export=1 + vmov.i32 d3, #0 /* d3 is the mask XORed into s_m before it is added to the real part: zero here, so s_m is added unchanged */ +.Lhf_apply_noise_0: + push {r4,lr} + ldr r12, [sp, #12] + movrel r4, X(ff_sbr_noise_table) + add r3, r3, #1 + bfc r3, #9, #23 + sub r12, r12, #1 /* r12 = m_max - 1; the pair loop below runs once before r12 is tested, so it always handles at least two entries */ +1: + add lr, r4, r3, lsl #3 + vld2.32 {q0}, [r0,:64] + vld2.32 {q3}, [lr,:64] + vld1.32 {d2}, [r1,:64]! + vld1.32 {d18}, [r2,:64]! 
+ vceq.f32 d16, d2, #0 + veor d2, d2, d3 + vmov q2, q0 + vmla.f32 d0, d6, d18 + vmla.f32 d1, d7, d18 + vadd.f32 d4, d4, d2 + add r3, r3, #2 + bfc r3, #9, #23 + vbif d0, d4, d16 + vbif d1, d5, d16 + vst2.32 {q0}, [r0,:64]! + subs r12, r12, #2 + bgt 1b + blt 2f + add lr, r4, r3, lsl #3 + vld1.32 {d0}, [r0,:64] + vld1.32 {d6}, [lr,:64] + vld1.32 {d2[]}, [r1,:32]! + vld1.32 {d18[]}, [r2,:32]! /* q_filt goes in d18, as in the pair loop, so that d3 still holds the sign mask for the veor below */ + vceq.f32 d4, d2, #0 + veor d2, d2, d3 + vmov d1, d0 + vmla.f32 d0, d6, d18 + vadd.f32 s2, s2, s4 + vbif d0, d1, d4 + vst1.32 {d0}, [r0,:64]! +2: + pop {r4,pc} +endfunc + +function ff_sbr_hf_apply_noise_1_neon, export=1 + ldr r12, [sp] + push {r4,lr} + lsl r12, r12, #31 + eor lr, r12, #1<<31 + vmov d3, r12, lr /* d3 lane 0 = bit 0 of kx moved to the sign position, lane 1 = its complement: the mask alternates between even and odd entries */ +.Lhf_apply_noise_1: + ldr r12, [sp, #12] + movrel r4, X(ff_sbr_noise_table) + add r3, r3, #1 + bfc r3, #9, #23 + sub r12, r12, #1 +1: + add lr, r4, r3, lsl #3 + vld2.32 {q0}, [r0,:64] + vld2.32 {q3}, [lr,:64] + vld1.32 {d2}, [r1,:64]! + vld1.32 {d18}, [r2,:64]! + vceq.f32 d16, d2, #0 + veor d2, d2, d3 + vmov q2, q0 + vmla.f32 d0, d6, d18 + vmla.f32 d1, d7, d18 + vadd.f32 d5, d5, d2 + add r3, r3, #2 + bfc r3, #9, #23 + vbif d0, d4, d16 + vbif d1, d5, d16 + vst2.32 {q0}, [r0,:64]! + subs r12, r12, #2 + bgt 1b + blt 2f + add lr, r4, r3, lsl #3 + vld1.32 {d0}, [r0,:64] + vld1.32 {d6}, [lr,:64] + vld1.32 {d2[]}, [r1,:32]! + vld1.32 {d18[]}, [r2,:32]! + vceq.f32 d4, d2, #0 + veor d2, d2, d3 + vmov d1, d0 + vmla.f32 d0, d6, d18 + vadd.f32 s3, s3, s5 /* NOTE(review): s5 is lane 1 of d2, i.e. s_m with the odd-entry mask, yet this tail entry sits at an even index where the pair loop uses lane 0 - confirm whether s4 was intended */ + vbif d0, d1, d4 + vst1.32 {d0}, [r0,:64]! 
+2: + pop {r4,pc} +endfunc + +function ff_sbr_hf_apply_noise_2_neon, export=1 /* same code path as the _0 variant, entered with the sign bit set in both lanes of d3 so that s_m is negated */ + vmov.i32 d3, #1<<31 + b .Lhf_apply_noise_0 +endfunc + +function ff_sbr_hf_apply_noise_3_neon, export=1 + ldr r12, [sp] + push {r4,lr} + lsl r12, r12, #31 + eor lr, r12, #1<<31 + vmov d3, lr, r12 + b .Lhf_apply_noise_1 +endfunc diff --git a/libavcodec/sbrdsp.c b/libavcodec/sbrdsp.c index 7be962ed77..2711e71338 100644 --- a/libavcodec/sbrdsp.c +++ b/libavcodec/sbrdsp.c @@ -20,6 +20,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "config.h" #include "libavutil/attributes.h" #include "sbrdsp.h" @@ -234,4 +235,7 @@ av_cold void ff_sbrdsp_init(SBRDSPContext *s) s->hf_apply_noise[1] = sbr_hf_apply_noise_1; s->hf_apply_noise[2] = sbr_hf_apply_noise_2; s->hf_apply_noise[3] = sbr_hf_apply_noise_3; + + if (ARCH_ARM) + ff_sbrdsp_init_arm(s); } diff --git a/libavcodec/sbrdsp.h b/libavcodec/sbrdsp.h index 2f6cf1e4c5..88285b07ec 100644 --- a/libavcodec/sbrdsp.h +++ b/libavcodec/sbrdsp.h @@ -43,5 +43,6 @@ typedef struct SBRDSPContext { extern const float ff_sbr_noise_table[][2]; void ff_sbrdsp_init(SBRDSPContext *s); +void ff_sbrdsp_init_arm(SBRDSPContext *s); #endif