/* Source: mirror of https://github.com/FFmpeg/FFmpeg.git */
/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * Register aliases used by inner_loop, dca_lfe_fir and ff_dca_lfe_fir_vfp.
 *
 * Core registers follow the argument order of the C prototype
 * (out, in, coefs, decifactor).  Two aliases deliberately share storage:
 *   - DECIFACTOR / OLDFPSCR both name a4: decifactor is only tested once on
 *     entry, after which a4 holds the saved FPSCR until the function returns.
 *   - SCALE64 / IN4 both name s0: IN4..IN7 are only loaded on the
 *     decifactor == 32 path, where the scale is first copied to SCALE32.
 */
POUT          .req    a1   @ output pointer, advanced by post-increment stores
PIN           .req    a2   @ input pointer, read at word offsets 0 down to -7
PCOEF         .req    a3   @ coefficient pointer, stepped one group at a time
DECIFACTOR    .req    a4   @ decimation factor argument (compared against 32 on entry)
OLDFPSCR      .req    a4   @ caller FPSCR, restored just before returning
COUNTER       .req    ip   @ loop counter (ip is also scratch for the new FPSCR value)

SCALE32       .req    s28  @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
SCALE64       .req    s0   @ spare register in scalar bank when decifactor=64 / JMAX=4
IN0           .req    s4   @ input samples, used as the scalar operand of the multiplies
IN1           .req    s5
IN2           .req    s6
IN3           .req    s7
IN4           .req    s0   @ IN4..IN7 are only loaded when decifactor=32
IN5           .req    s1
IN6           .req    s2
IN7           .req    s3
COEF0         .req    s8   @ coefficient elements
COEF1         .req    s9
COEF2         .req    s10
COEF3         .req    s11
COEF4         .req    s12
COEF5         .req    s13
COEF6         .req    s14
COEF7         .req    s15
ACCUM0        .req    s16  @ double-buffered multiply-accumulate results
ACCUM4        .req    s20
POST0         .req    s24  @ do long-latency post-multiply in this vector in parallel
POST1         .req    s25
POST2         .req    s26
POST3         .req    s27


/*
 * inner_loop decifactor, dir, tail, head
 *
 * Emits one step of the FIR for a group of four output samples.  The caller
 * must have set the assembler symbol JMAX and put the VFP into short-vector
 * mode (length 4), so that every vmul/vmla/vadd below operates on four
 * consecutive registers at once.
 *
 *   decifactor  32 or 64.  Selects SCALE32 or SCALE64 for the post-multiply;
 *               taps 4..7 are only emitted when it equals 32.
 *   dir         "up": coefficients are read forwards from PCOEF (X = 0,
 *               Y = +4).  Any other value: they are read backwards, starting
 *               at the last word of the 4*JMAX-word group (X = 4*JMAX*4 - 4,
 *               Y = -4).
 *   tail        non-empty: finish the previous group, i.e.
 *               POST = (ACCUM0 + ACCUM4) * SCALE, then store POST0-POST3
 *               through POUT with post-increment.
 *   head        non-empty: start the current group.  For output lane k
 *               (0..3) and tap j the coefficient is loaded from
 *               PCOEF + X + (k*JMAX + j) * Y.  Even taps accumulate into the
 *               ACCUM0 vector and odd taps into the ACCUM4 vector.
 *
 * When both tail and head are given, their instructions are interleaved
 * (presumably to hide VFP latency; the ordering is deliberate and must not
 * be changed).  No instruction here modifies the integer condition flags,
 * which the caller relies on between its subs and bne.
 */
.macro inner_loop  decifactor, dir, tail, head
 .ifc "\dir","up"
  .set X, 0
  .set Y, 4
 .else
  .set X, 4*JMAX*4 - 4
  .set Y, -4
 .endif
 .ifnc "\head",""
        @ tap 0 coefficients for the four output lanes
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
 .endif
 .ifnc "\tail",""
        @ combine both accumulators of the previous group before ACCUM0 is reused
        vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation
 .endif
 .ifnc "\head",""
        vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
 .endif
 .ifnc "\tail",""
        vmul.f  POST0, POST0, SCALE\decifactor  @ vector operation (SCALE may be scalar)
 .endif
 .ifnc "\head",""
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
        @ Exactly one of the two identical ACCUM4 multiplies below is emitted:
        @ the early one without a tail, the later one when a tail is interleaved.
   .ifc "\tail",""
        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
   .endif
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
   .ifnc "\tail",""
        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
   .endif
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
 .endif
 .ifnc "\tail",""
        @ write the four finished outputs of the previous group
        vstmia  POUT!, {POST0-POST3}
 .endif
 .ifnc "\head",""
        vmla.f  ACCUM0, COEF0, IN2      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
        vmla.f  ACCUM4, COEF4, IN3      @ vector = vector * scalar
  .if \decifactor == 32
        @ taps 4..7 exist only for decifactor 32 (JMAX = 8)
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
        vmla.f  ACCUM0, COEF0, IN4      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
        vmla.f  ACCUM4, COEF4, IN5      @ vector = vector * scalar
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
        vmla.f  ACCUM0, COEF0, IN6      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
        vmla.f  ACCUM4, COEF4, IN7      @ vector = vector * scalar
  .endif
 .endif
.endm

/*
 * dca_lfe_fir decifactor
 *
 * Emits the complete filter body, including the function return, for one
 * decimation factor (32 or 64).
 *
 * On entry the caller has already saved FPSCR in OLDFPSCR, enabled
 * short-vector mode, placed the scale factor in s0 and loaded IN0..IN3.
 *
 * Structure: two passes, each producing decifactor output floats
 * (decifactor/4 groups of four) through POUT.
 *   - "up" pass:   coefficients read forwards, PCOEF advanced by one group
 *                  (4*JMAX words) per iteration.
 *   - "down" pass: starts at the group where the up pass stopped (PCOEF is
 *                  not reset) and walks back, reading each group backwards.
 * Each pass is arranged as head / (tail+head) x COUNTER / tail, so that the
 * finishing of one group overlaps the start of the next.
 *
 * JMAX is 8 for decifactor 32 and 4 otherwise.
 */
.macro dca_lfe_fir  decifactor
 .if \decifactor == 32
  .set JMAX, 8
        vpush   {s16-s31}               @ ACCUM, POST and SCALE32 registers must be preserved
        vmov    SCALE32, s0             @ duplicate scalar across vector
        @ s0-s3 are free now that the scale has been copied: load the extra taps
        vldr    IN4, [PIN, #-4*4]
        vldr    IN5, [PIN, #-5*4]
        vldr    IN6, [PIN, #-6*4]
        vldr    IN7, [PIN, #-7*4]
 .else
  .set JMAX, 4
        vpush   {s16-s27}               @ SCALE32 (s28-s31) is unused on this path
 .endif

        @ ---- up pass ----
        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, up,, head
1:      add     PCOEF, PCOEF, #4*JMAX*4
        subs    COUNTER, COUNTER, #1    @ flags survive inner_loop (VFP ops only)
        inner_loop  \decifactor, up, tail, head
        bne     1b
        inner_loop  \decifactor, up, tail

        @ ---- down pass ----
        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, down,, head
1:      sub     PCOEF, PCOEF, #4*JMAX*4
        subs    COUNTER, COUNTER, #1    @ flags survive inner_loop (VFP ops only)
        inner_loop  \decifactor, down, tail, head
        bne     1b
        inner_loop  \decifactor, down, tail

 .if \decifactor == 32
        vpop    {s16-s31}
 .else
        vpop    {s16-s27}
 .endif
        fmxr    FPSCR, OLDFPSCR         @ restore the FPSCR saved on entry
        bx      lr
.endm


/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
 *                         int decifactor, float scale)
 *
 * Entry point: saves FPSCR, switches the VFP to short-vector mode, loads the
 * first four input samples and dispatches to the decifactor-specific body.
 * A decifactor of 32 selects the "32" body; any other value falls through to
 * the "64" body.  Each body restores FPSCR and returns by itself.
 *
 * The function / endfunc / NOVFP macros come from libavutil/arm/asm.S, which
 * is not visible here.  NOVFP presumably emits its instruction only when
 * float arguments are not passed in VFP registers, in which case scale is
 * read from the stack into s0 - confirm against asm.S.
 */
function ff_dca_lfe_fir_vfp, export=1
        teq     DECIFACTOR, #32         @ result is consumed by the beq below
        fmrx    OLDFPSCR, FPSCR         @ overwrites a4, decifactor is no longer needed
        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, ip
NOVFP   vldr    s0, [sp]
        vldr    IN0, [PIN, #-0*4]
        vldr    IN1, [PIN, #-1*4]
        vldr    IN2, [PIN, #-2*4]
        vldr    IN3, [PIN, #-3*4]
        beq     32f
64:     dca_lfe_fir  64
 .ltorg                                 @ literal pool for the ldr above, placed after the bx lr of the 64 body
32:     dca_lfe_fir  32
endfunc

@ Release every register alias defined at the top of this file so the
@ names do not leak into anything assembled after this point.
        .unreq  POUT
        .unreq  PIN
        .unreq  PCOEF
        .unreq  DECIFACTOR
        .unreq  OLDFPSCR
        .unreq  COUNTER

        .unreq  SCALE32
        .unreq  SCALE64
        .unreq  IN0
        .unreq  IN1
        .unreq  IN2
        .unreq  IN3
        .unreq  IN4
        .unreq  IN5
        .unreq  IN6
        .unreq  IN7
        .unreq  COEF0
        .unreq  COEF1
        .unreq  COEF2
        .unreq  COEF3
        .unreq  COEF4
        .unreq  COEF5
        .unreq  COEF6
        .unreq  COEF7
        .unreq  ACCUM0
        .unreq  ACCUM4
        .unreq  POST0
        .unreq  POST1
        .unreq  POST2
        .unreq  POST3