/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ NOTE(review): the line prefixes VFP / NOVFP / A / T and the function /
@ endfunc macros used below come from asm.S, which is not visible here.
@ Judging by the stack-layout comment further down, VFP/NOVFP presumably
@ select the hardfp/softfp argument-passing variants and A/T the ARM/Thumb
@ instruction-set variants -- confirm against asm.S.

@ ---------------------------------------------------------------------------
@ Register aliases for ff_dca_lfe_fir_vfp
@ ---------------------------------------------------------------------------
POUT        .req    a1
PIN         .req    a2
PCOEF       .req    a3
DECIFACTOR  .req    a4
OLDFPSCR    .req    a4      @ shares a4 with DECIFACTOR: decifactor must be
                            @ tested before the FPSCR is read into a4
COUNTER     .req    ip

SCALE32     .req    s28     @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
SCALE64     .req    s0      @ spare register in scalar bank when decifactor=64 / JMAX=4
IN0         .req    s4
IN1         .req    s5
IN2         .req    s6
IN3         .req    s7
IN4         .req    s0      @ IN4-IN7 overlay s0-s3, so s0 (scale) has to be
IN5         .req    s1      @ copied out before they are loaded
IN6         .req    s2
IN7         .req    s3
COEF0       .req    s8      @ coefficient elements
COEF1       .req    s9
COEF2       .req    s10
COEF3       .req    s11
COEF4       .req    s12
COEF5       .req    s13
COEF6       .req    s14
COEF7       .req    s15
ACCUM0      .req    s16     @ double-buffered multiply-accumulate results
ACCUM4      .req    s20
POST0       .req    s24     @ do long-latency post-multiply in this vector in parallel
POST1       .req    s25
POST2       .req    s26
POST3       .req    s27


@ ---------------------------------------------------------------------------
@ inner_loop decifactor, dir, tail, head
@
@ Emits one software-pipelined step producing 4 output samples.
@
@   decifactor  32 or 64; selects SCALE32/SCALE64 and, for 32, the extra
@               taps IN4-IN7 (8 taps instead of 4)
@   dir         "up":   coefficient j of output row r is read from
@                       PCOEF + (r*JMAX + j)*4
@               other:  the same group is read in reverse, starting at
@                       PCOEF + 4*JMAX*4 - 4 and stepping by -4
@   tail        if non-empty, finish the previous step: POST = (ACCUM0 +
@               ACCUM4) * SCALE, then store 4 words at POUT (with writeback)
@   head        if non-empty, start the next step: load coefficients and
@               accumulate products with IN0.. into ACCUM0 (even taps) and
@               ACCUM4 (odd taps)
@
@ The head and tail instruction streams are interleaved with each other.
@ The symbol JMAX must have been .set by the caller. Uses the assembler
@ symbols X and Y as scratch.
@ ---------------------------------------------------------------------------
.macro inner_loop  decifactor, dir, tail, head
 .ifc "\dir","up"
  .set X, 0
  .set Y, 4
 .else
  .set X, 4*JMAX*4 - 4
  .set Y, -4
 .endif
 .ifnc "\head",""
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
 .endif
 .ifnc "\tail",""
        vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation
 .endif
 .ifnc "\head",""
        vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
 .endif
 .ifnc "\tail",""
        vmul.f  POST0, POST0, SCALE\decifactor  @ vector operation (SCALE may be scalar)
 .endif
 .ifnc "\head",""
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
        @ The ACCUM4 multiply is emitted in one of two positions: right here
        @ when there is no tail, or two loads later when there is one
        @ (presumably to keep it clear of the tail's POST0 multiply).
   .ifc "\tail",""
        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
   .endif
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
   .ifnc "\tail",""
        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
   .endif
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
 .endif
 .ifnc "\tail",""
        vstmia  POUT!, {POST0-POST3}
 .endif
 .ifnc "\head",""
        vmla.f  ACCUM0, COEF0, IN2      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
        vmla.f  ACCUM4, COEF4, IN3      @ vector = vector * scalar
  .if \decifactor == 32
        @ decifactor 32 only: four more taps (IN4-IN7)
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
        vmla.f  ACCUM0, COEF0, IN4      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
        vmla.f  ACCUM4, COEF4, IN5      @ vector = vector * scalar
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
        vmla.f  ACCUM0, COEF0, IN6      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
        vmla.f  ACCUM4, COEF4, IN7      @ vector = vector * scalar
  .endif
 .endif
.endm

@ ---------------------------------------------------------------------------
@ dca_lfe_fir decifactor
@
@ Body of ff_dca_lfe_fir_vfp for one decimation factor (32 or 64).
@ Expects on entry: FPSCR already switched to short-vector mode, the old
@ FPSCR value in OLDFPSCR, the scale factor in s0 and IN0-IN3 loaded.
@ Writes 2*decifactor floats at POUT: decifactor from an upward pass through
@ the coefficient groups, then decifactor from a downward pass.
@ Saves and restores the S registers it uses from s16 upwards, restores the
@ FPSCR and returns to the caller (bx lr).
@ ---------------------------------------------------------------------------
.macro dca_lfe_fir  decifactor
 .if \decifactor == 32
  .set JMAX, 8
        vpush   {s16-s31}
        vmov    SCALE32, s0             @ duplicate scalar across vector
        @ s0-s3 are free to be reused as IN4-IN7 now that scale is in SCALE32
        vldr    IN4, [PIN, #-4*4]
        vldr    IN5, [PIN, #-5*4]
        vldr    IN6, [PIN, #-6*4]
        vldr    IN7, [PIN, #-7*4]
 .else
  .set JMAX, 4
        vpush   {s16-s27}
 .endif

        @ Upward pass: one head-only step, decifactor/4 - 1 combined
        @ tail+head steps, one tail-only step => decifactor/4 stores of 4.
        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, up,, head
1:      add     PCOEF, PCOEF, #4*JMAX*4 @ next group of 4 rows * JMAX coefs
        subs    COUNTER, COUNTER, #1    @ flags consumed by the bne below;
                                        @ nothing emitted by inner_loop sets flags
        inner_loop  \decifactor, up, tail, head
        bne     1b
        inner_loop  \decifactor, up, tail

        @ Downward pass: PCOEF still points at the last group used above,
        @ so this starts there and walks back, reading each group reversed.
        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, down,, head
1:      sub     PCOEF, PCOEF, #4*JMAX*4
        subs    COUNTER, COUNTER, #1
        inner_loop  \decifactor, down, tail, head
        bne     1b
        inner_loop  \decifactor, down, tail

 .if \decifactor == 32
        vpop    {s16-s31}
 .else
        vpop    {s16-s27}
 .endif
        fmxr    FPSCR, OLDFPSCR         @ leave short-vector mode
        bx      lr
.endm


/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
 *                         int decifactor, float scale)
 */
@ Dispatches on decifactor: 32 takes the 8-tap variant, any other value
@ takes the variant assembled for 64 (4 taps). Input samples are read at
@ in[0], in[-1], in[-2], ... (negative offsets from PIN).
function ff_dca_lfe_fir_vfp, export=1
        teq     DECIFACTOR, #32         @ must precede fmrx: OLDFPSCR is also a4
        fmrx    OLDFPSCR, FPSCR
        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, ip
NOVFP   vldr    s0, [sp]                @ in this variant scale is fetched from the stack
        vldr    IN0, [PIN, #-0*4]
        vldr    IN1, [PIN, #-1*4]
        vldr    IN2, [PIN, #-2*4]
        vldr    IN3, [PIN, #-3*4]
        beq     32f                     @ flags still from the teq above
64:     dca_lfe_fir  64
 .ltorg                                 @ dump the literal pool (for ldr ip, =...)
                                        @ between the two expanded bodies
32:     dca_lfe_fir  32
endfunc

        .unreq  POUT
        .unreq  PIN
        .unreq  PCOEF
        .unreq  DECIFACTOR
        .unreq  OLDFPSCR
        .unreq  COUNTER

        .unreq  SCALE32
        .unreq  SCALE64
        .unreq  IN0
        .unreq  IN1
        .unreq  IN2
        .unreq  IN3
        .unreq  IN4
        .unreq  IN5
        .unreq  IN6
        .unreq  IN7
        .unreq  COEF0
        .unreq  COEF1
        .unreq  COEF2
        .unreq  COEF3
        .unreq  COEF4
        .unreq  COEF5
        .unreq  COEF6
        .unreq  COEF7
        .unreq  ACCUM0
        .unreq  ACCUM4
        .unreq  POST0
        .unreq  POST1
        .unreq  POST2
        .unreq  POST3


@ ---------------------------------------------------------------------------
@ Register aliases for ff_dca_qmf_32_subbands_vfp
@ ---------------------------------------------------------------------------
IN      .req    a1
SBACT   .req    a2
OLDFPSCR .req   a3
IMDCT   .req    a4
WINDOW  .req    v1
OUT     .req    v2
BUF     .req    v3
SCALEINT .req   v4  @ only used in softfp case
COUNT   .req    v5

SCALE   .req    s0

/* Stack layout differs in softfp and hardfp cases:
 *
 * hardfp
 *      fp -> 6 arg words saved by caller
 *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *            s0 on entry
 *      sp -> 3 arg words for callee
 *
 * softfp
 *      fp -> 7 arg words saved by caller
 *            a4,v1-v5,fp,lr on entry
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *      sp -> 4 arg words for callee
 */

/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
 *                                 SynthFilterContext *synth, FFTContext *imdct,
 *                                 float (*synth_buf_ptr)[512],
 *                                 int *synth_buf_offset, float (*synth_buf2)[32],
 *                                 const float (*window)[512], float *samples_out,
 *                                 float (*raXin)[32], float scale);
 */
@ Two phases:
@  1. Transpose samples_in[sb_act][8] into an 8 x 32 word buffer on the
@     stack, negating rows 0 and 3 of every group of four input rows and
@     zero-filling columns from sb_act up to 32.
@  2. Call ff_synth_filter_float_vfp 8 times, once per 32-word buffer row,
@     advancing the output pointer by 32 floats each time.
function ff_dca_qmf_32_subbands_vfp, export=1
VFP     push    {a3-a4,v1-v3,v5,fp,lr}
NOVFP   push    {a4,v1-v5,fp,lr}
        add     fp, sp, #8*4            @ fp -> first stack argument (8 words were pushed)
        vpush   {s16-s23}
        @ The buffer pointed at by raXin isn't big enough for us to do a
        @ complete matrix transposition as we want to, so allocate an
        @ alternative buffer from the stack. Align to 4 words for speed.
        sub     BUF, sp, #8*32*4
        bic     BUF, BUF, #15
        mov     sp, BUF
        ldr     lr, =0x03330000         @ RunFast mode, short vectors of length 4, stride 2
        fmrx    OLDFPSCR, FPSCR
        fmxr    FPSCR, lr
        @ COUNT is used to count down 2 things at once:
        @ bits 0-4 are the number of word pairs remaining in the output row
        @ bits 5-31 are the number of words to copy (with possible negation)
        @ from the source matrix before we start zeroing the remainder
        mov     COUNT, #(-4 << 5) + 16
        adds    COUNT, COUNT, SBACT, lsl #5     @ upper field = sb_act - 4
        bmi     2f                      @ fewer than 4 rows: skip the main loop
1:
        @ Main loop: 4 input rows per iteration. Row samples go to alternate
        @ S registers (even = first row of a pair, odd = second), so that each
        @ D register d4-d11 (s8/s9 .. s22/s23) holds one transposed word pair.
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8                  @ short vector, stride 2: s8,s10,s12,s14 (row 0, samples 0-3)
        vldr    s9,  [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16                @ s16,s18,s20,s22 (row 0, samples 4-7)
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        @ Rows 2 and 3: row 3 (odd registers) is loaded first and negated
        vldr    s9,  [IN, #(3*8+0)*4]
        vldr    s11, [IN, #(3*8+1)*4]
        vldr    s13, [IN, #(3*8+2)*4]
        vldr    s15, [IN, #(3*8+3)*4]
        vldr    s17, [IN, #(3*8+4)*4]
        vldr    s19, [IN, #(3*8+5)*4]
        vldr    s21, [IN, #(3*8+6)*4]
        vldr    s23, [IN, #(3*8+7)*4]
        vneg.f  s9, s9                  @ s9,s11,s13,s15 (row 3, samples 0-3)
        vldr    s8,  [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vneg.f  s17, s17                @ s17,s19,s21,s23 (row 3, samples 4-7)
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vstr    d4,  [BUF, #(0*32+2)*4]
        vstr    d5,  [BUF, #(1*32+2)*4]
        vstr    d6,  [BUF, #(2*32+2)*4]
        vstr    d7,  [BUF, #(3*32+2)*4]
        vstr    d8,  [BUF, #(4*32+2)*4]
        vstr    d9,  [BUF, #(5*32+2)*4]
        vstr    d10, [BUF, #(6*32+2)*4]
        vstr    d11, [BUF, #(7*32+2)*4]
        add     IN, IN, #4*8*4          @ 4 input rows of 8 floats consumed
        add     BUF, BUF, #4*4          @ 4 output columns written
        subs    COUNT, COUNT, #(4 << 5) + 2     @ 4 fewer rows, 2 fewer word pairs
        bpl     1b
2:      @ Now deal with trailing < 4 samples
        adds    COUNT, COUNT, #3 << 5   @ upper field = (rows left) - 1
        bmi     4f  @ sb_act was a multiple of 4
        bics    lr, COUNT, #0x1F        @ lr is only a scratch destination here
        bne     3f
        @ sb_act was n*4+1
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9,  zero               @ second word of each pair is padding
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vneg.f  s16, s16
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1        @ one word pair written
        b       4f
3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9,  [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #(2 << 5) + 1     @ 2 rows and 1 word pair done
        bics    lr, COUNT, #0x1F
        bne     4f
        @ sb_act was n*4+3
        @ (IN was not advanced above, so the third row is still at row offset 2;
        @ it is stored without negation)
        vldr    s8,  [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vldr    s9,  zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
4:      @ Now fill the remainder with 0
        vldr    s8, zero
        vldr    s9, zero
        ands    COUNT, COUNT, #0x1F     @ keep only the word-pair counter
        beq     6f
5:      vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d4, [BUF, #(1*32+0)*4]
        vstr    d4, [BUF, #(2*32+0)*4]
        vstr    d4, [BUF, #(3*32+0)*4]
        vstr    d4, [BUF, #(4*32+0)*4]
        vstr    d4, [BUF, #(5*32+0)*4]
        vstr    d4, [BUF, #(6*32+0)*4]
        vstr    d4, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        subs    COUNT, COUNT, #1
        bne     5b
6:
        @ Transposition finished: restore the FPSCR and set up for the
        @ synthesis filter calls.
        fmxr    FPSCR, OLDFPSCR
        ldr     WINDOW, [fp, #3*4]      @ 4th stack argument word (window)
        ldr     OUT, [fp, #4*4]         @ 5th stack argument word (samples_out)
        sub     BUF, BUF, #32*4         @ rewind the 16 word pairs advanced above
NOVFP   ldr     SCALEINT, [fp, #6*4]    @ 7th stack argument word (scale)
        mov     COUNT, #8               @ one call per buffer row
VFP     vpush   {SCALE}                 @ keep s0 on the stack; reloaded before every call
VFP     sub     sp, sp, #3*4
NOVFP   sub     sp, sp, #4*4
7:
        @ Register arguments: a1 = the a4 value saved on entry (imdct);
        @ a2-a4 = the first three stack argument words of this function.
VFP     ldr     a1, [fp, #-7*4]     @ imdct
NOVFP   ldr     a1, [fp, #-8*4]
        ldmia   fp, {a2-a4}
VFP     stmia   sp, {WINDOW, OUT, BUF}
NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
VFP     vldr    SCALE, [sp, #3*4]       @ the copy pushed by vpush {SCALE} above
        bl      ff_synth_filter_float_vfp
        add     OUT, OUT, #32*4
        add     BUF, BUF, #32*4
        subs    COUNT, COUNT, #1
        bne     7b

        @ Unwind: sp = fp - (8 core registers + 8 S registers) words, i.e.
        @ the address of the saved s16-s23; this also discards the aligned buffer.
A       sub     sp, fp, #(8+8)*4
T       sub     fp, fp, #(8+8)*4
T       mov     sp, fp
        vpop    {s16-s23}
VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
NOVFP   pop     {a4,v1-v5,fp,pc}
endfunc

        .unreq  IN
        .unreq  SBACT
        .unreq  OLDFPSCR
        .unreq  IMDCT
        .unreq  WINDOW
        .unreq  OUT
        .unreq  BUF
        .unreq  SCALEINT
        .unreq  COUNT

        .unreq  SCALE

        .align 2
        @ Zero word loaded PC-relative by the "vldr sN, zero" instructions in
        @ ff_dca_qmf_32_subbands_vfp (all-zero bits read as +0.0 single precision)
zero:   .word   0