mirror of https://github.com/FFmpeg/FFmpeg.git
               Before            After
               Mean     StdDev   Mean     StdDev   Change
This function   2653.0    28.5    1108.8    51.4   +139.3%
Overall        17049.5   408.2   15973.0   223.2     +6.7%

Signed-off-by: Martin Storsjö <martin@martin.st>

pull/27/merge
parent
d6e4f5fef0
commit
b63bb251ea
4 changed files with 217 additions and 1 deletions
@ -0,0 +1,206 @@ |
|||||||
|
/* |
||||||
|
* Copyright (c) 2013 RISC OS Open Ltd |
||||||
|
* Author: Ben Avison <bavison@riscosopen.org>
|
||||||
|
* |
||||||
|
* This file is part of Libav. |
||||||
|
* |
||||||
|
* Libav is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public |
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version. |
||||||
|
* |
||||||
|
* Libav is distributed in the hope that it will be useful, |
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||||
|
* Lesser General Public License for more details. |
||||||
|
* |
||||||
|
* You should have received a copy of the GNU Lesser General Public |
||||||
|
* License along with Libav; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||||
|
*/ |
||||||
|
|
||||||
|
#include "libavutil/arm/asm.S" |
||||||
|
|
||||||
|
/*
 * Symbolic register names used by ff_imdct_half_vfp and its helper macros.
 *
 * a1-a3 carry the three arguments of the C prototype (context, output,
 * input). a2 is shared by ORIGOUT and J0: the output pointer is copied to
 * OUT before J0 is first written, and copied back before the FFT call.
 */

/* Incoming arguments */
CONTEXT  .req    a1
ORIGOUT  .req    a2
IN       .req    a3

/* Values kept in callee-saved registers */
OUT      .req    v1                             @ copy of ORIGOUT
REVTAB   .req    v2                             @ loaded from the context
TCOS     .req    v3                             @ loaded from the context
TSIN     .req    v4                             @ loaded from the context
OLDFPSCR .req    v5                             @ FPSCR value on entry

/* Scratch: store addresses derived from REVTAB entries */
J0       .req    a2
J1       .req    a4
J2       .req    ip
J3       .req    lr
||||||
|
|
||||||
|
/*
 * prerotation_innerloop
 *
 * One unrolled step of the pass that runs before the FFT call.
 *
 * Assembler symbols read:    k, n4
 * Assembler symbols written: k (advanced by 2), plus the local offsets below
 * Registers read:            IN, OUT, REVTAB, TCOS, TSIN
 * Registers written:         J0-J3, s0-s15, s16-s23 (d8-d11)
 *
 * The four arithmetic instructions are VFP short-vector operations; the
 * enclosing function programs FPSCR for vectors of length 4 before
 * expanding this macro. Each step therefore produces four results:
 *
 *     s8..s11  = s0..s3 * s16..s19  -  s4..s7 * s20..s23
 *     s12..s15 = s0..s3 * s20..s23  +  s4..s7 * s16..s19
 *
 * where s16..s19 come from TCOS and s20..s23 from TSIN, two entries from
 * the low end of the table and two from the high end. Each result pair
 * (s8+i, s12+i) is stored as two consecutive floats at OUT + 8 * index,
 * the index being one halfword of a REVTAB word.
 *
 * The instruction order interleaves loads, integer address arithmetic and
 * vector operations on purpose; do not reorder it.
 */
.macro  prerotation_innerloop
        .set    tw_lo,  k                       @ table index, low end
        .set    tw_hi,  n4 - k - 2              @ table index, high end
        .set    src_lo, tw_lo * 2               @ float index into IN
        .set    src_hi, tw_hi * 2
        vldr    d8,  [TCOS, #tw_lo*4]           @ s16,s17
        vldr    d9,  [TCOS, #tw_hi*4]           @ s18,s19
        vldr    s0,  [IN, #src_hi*4 + 12]
        vldr    s1,  [IN, #src_hi*4 + 4]
        vldr    s2,  [IN, #src_lo*4 + 12]
        vldr    s3,  [IN, #src_lo*4 + 4]
        vmul.f  s8,  s0, s16                    @ vector operation
        vldr    d10, [TSIN, #tw_lo*4]           @ s20,s21
        vldr    d11, [TSIN, #tw_hi*4]           @ s22,s23
        vldr    s4,  [IN, #src_lo*4]
        vldr    s5,  [IN, #src_lo*4 + 8]
        vldr    s6,  [IN, #src_hi*4]
        vldr    s7,  [IN, #src_hi*4 + 8]
        ldr     J0,  [REVTAB, #tw_lo*2]         @ one word = two halfword entries
        vmul.f  s12, s0, s20                    @ vector operation
        ldr     J2,  [REVTAB, #tw_hi*2]
        mov     J1,  J0, lsr #16                @ upper halfword entry
        and     J0,  J0, #255                   @ halfword value will be < n4
        vmls.f  s8,  s4, s20                    @ vector operation
        mov     J3,  J2, lsr #16
        and     J2,  J2, #255                   @ halfword value will be < n4
        add     J0,  OUT, J0, lsl #3            @ 8 bytes per output pair
        vmla.f  s12, s4, s16                    @ vector operation
        add     J1,  OUT, J1, lsl #3
        add     J2,  OUT, J2, lsl #3
        add     J3,  OUT, J3, lsl #3
        vstr    s8,  [J0]
        vstr    s9,  [J1]
        vstr    s10, [J2]
        vstr    s11, [J3]
        vstr    s12, [J0, #4]
        vstr    s13, [J1, #4]
        vstr    s14, [J2, #4]
        vstr    s15, [J3, #4]
        .set    k, k + 2
.endm
||||||
|
|
||||||
|
/*
 * postrotation_innerloop tail, head
 *
 * One software-pipelined step of the pass that runs after the FFT call.
 * Either argument may be left blank to drop that half:
 *
 *   head - loads four TSIN and four TCOS entries plus eight floats from
 *          OUT for the step selected by k, starts the two products
 *              s8..s11  = s4..s7 * s16..s19
 *              s12..s15 = s0..s3 * s16..s19
 *          and then advances k by 2.
 *   tail - finishes the step started by the previous head
 *              s8..s11  -= s0..s3 * TCOS values
 *              s12..s15 += s4..s7 * TCOS values
 *          and stores the eight results back to OUT (s8..s11 to the even
 *          float slots, s12..s15 to the odd slots of the opposite end).
 *
 * Assembler symbols read: k, n8.  Registers read: OUT, TCOS, TSIN.
 * Registers written: s0-s19, plus d10/d11 or d12/d13.
 *
 * The TCOS values live in s20-s23 or s24-s27, alternating with bit 1 of k,
 * so that the head can load the next set while the tail still needs the
 * previous one. As in the prerotation, the arithmetic instructions are
 * short-vector operations of length 4 (FPSCR is set up by the caller).
 *
 * Tail and head instructions are interleaved and share s0-s15; the order
 * below must be kept exactly.
 */
.macro  postrotation_innerloop tail, head
        .set    tw_lo_next, n8 - k - 2          @ table indices for the head
        .set    tw_hi_next, n8 + k
        .set    io_lo_next, tw_lo_next * 2      @ float indices into OUT
        .set    io_hi_next, tw_hi_next * 2
        .set    tw_lo_prev, n8 - (k - 2) - 2    @ same, for the step being finished
        .set    tw_hi_prev, n8 + (k - 2)
        .set    io_lo_prev, tw_lo_prev * 2
        .set    io_hi_prev, tw_hi_prev * 2
 .if (k & 2) != 0
        TCOS_D0_HEAD .req d12                   @ s24,s25
        TCOS_D1_HEAD .req d13                   @ s26,s27
        TCOS_S0_TAIL .req s20
 .else
        TCOS_D0_HEAD .req d10                   @ s20,s21
        TCOS_D1_HEAD .req d11                   @ s22,s23
        TCOS_S0_TAIL .req s24
 .endif
 .ifnc "\tail",""
        vmls.f  s8, s0, TCOS_S0_TAIL            @ vector operation
 .endif
 .ifnc "\head",""
        vldr    d8, [TSIN, #tw_lo_next*4]       @ s16,s17
        vldr    d9, [TSIN, #tw_hi_next*4]       @ s18,s19
        vldr    TCOS_D0_HEAD, [TCOS, #tw_lo_next*4]
 .endif
 .ifnc "\tail",""
        vmla.f  s12, s4, TCOS_S0_TAIL           @ vector operation
 .endif
 .ifnc "\head",""
        vldr    s0, [OUT, #io_lo_next*4]
        vldr    s1, [OUT, #io_lo_next*4 + 8]
        vldr    s2, [OUT, #io_hi_next*4]
        vldr    s3, [OUT, #io_hi_next*4 + 8]
        vldr    s4, [OUT, #io_lo_next*4 + 4]
        vldr    s5, [OUT, #io_lo_next*4 + 12]
        vldr    s6, [OUT, #io_hi_next*4 + 4]
        vldr    s7, [OUT, #io_hi_next*4 + 12]
 .endif
 .ifnc "\tail",""
        vstr    s8,  [OUT, #io_lo_prev*4]
        vstr    s9,  [OUT, #io_lo_prev*4 + 8]
        vstr    s10, [OUT, #io_hi_prev*4]
        vstr    s11, [OUT, #io_hi_prev*4 + 8]
 .endif
 .ifnc "\head",""
        vmul.f  s8, s4, s16                     @ vector operation
 .endif
 .ifnc "\tail",""
        vstr    s12, [OUT, #io_hi_prev*4 + 12]
        vstr    s13, [OUT, #io_hi_prev*4 + 4]
        vstr    s14, [OUT, #io_lo_prev*4 + 12]
        vstr    s15, [OUT, #io_lo_prev*4 + 4]
 .endif
 .ifnc "\head",""
        vmul.f  s12, s0, s16                    @ vector operation
        vldr    TCOS_D1_HEAD, [TCOS, #tw_hi_next*4]
 .endif
        .unreq  TCOS_D0_HEAD
        .unreq  TCOS_D1_HEAD
        .unreq  TCOS_S0_TAIL
 .ifnc "\head",""
        .set    k, k + 2
 .endif
.endm
||||||
|
|
||||||
|
|
||||||
|
/*
 * void ff_imdct_half_vfp(FFTContext *s,
 *                        FFTSample *output,
 *                        const FFTSample *input)
 *
 * VFP implementation for the single transform size with mdct_bits == 6;
 * any other size is handed to ff_imdct_half_c with the arguments untouched.
 *
 * Sequence:
 *   1. save v1-v5, lr and s16-s27, remember the incoming FPSCR;
 *   2. switch FPSCR to short-vector mode and expand the prerotation
 *      n8/2 times, writing into output;
 *   3. restore FPSCR and call the function pointer stored in the context
 *      as fft_calc(s, output);
 *   4. switch FPSCR to short-vector mode again and expand the
 *      software-pipelined postrotation (head only, n8/2 - 1 tail+head
 *      steps, tail only);
 *   5. restore FPSCR and the saved registers, return.
 *
 * The context is accessed through hard-coded word offsets; they have to
 * match the layout of the C structure, which is not visible in this file.
 *
 * NOTE(review): ff_imdct_half_c is referenced by its bare name. Confirm
 * whether targets that decorate C symbol names need it wrapped in the
 * symbol-name macro from asm.S.
 */

        @ Byte offsets of the context fields used below
        .set    CTX_REVTAB,    2*4
        .set    CTX_MDCT_BITS, 5*4
        .set    CTX_TCOS,      6*4
        .set    CTX_TSIN,      7*4
        .set    CTX_FFT_CALC,  9*4

        @ Transform size handled here and the fractions of it the macros use
        .set    nbits, 6
        .set    n,  1<<nbits
        .set    n2, n/2
        .set    n4, n/4
        .set    n8, n/8

function ff_imdct_half_vfp, export=1
        ldr     ip, [CONTEXT, #CTX_MDCT_BITS]   @ mdct_bits
        teq     ip, #nbits
        it      ne
        bne     ff_imdct_half_c                 @ only case currently accelerated is the one used by DCA

        push    {v1-v5,lr}
        vpush   {s16-s27}
        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000                 @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        mov     OUT, ORIGOUT                    @ a2 is about to be reused as J0
        ldr     REVTAB, [CONTEXT, #CTX_REVTAB]
        ldr     TCOS, [CONTEXT, #CTX_TCOS]
        ldr     TSIN, [CONTEXT, #CTX_TSIN]

        .set    k, 0
        .rept   n8/2
        prerotation_innerloop
        .endr

        fmxr    FPSCR, OLDFPSCR                 @ caller FPSCR while in the FFT
        mov     ORIGOUT, OUT                    @ second argument: output
        ldr     ip, [CONTEXT, #CTX_FFT_CALC]
        blx     ip                              @ s->fft_calc(s, output)

        ldr     lr, =0x03030000                 @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr

        .set    k, 0
        postrotation_innerloop , head
        .rept   n8/2 - 1
        postrotation_innerloop tail, head
        .endr
        postrotation_innerloop tail

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s27}
        pop     {v1-v5,pc}
endfunc
||||||
|
|
||||||
|
/* Release every register alias defined at the top of this file. */

        @ argument registers
        .unreq  CONTEXT
        .unreq  ORIGOUT
        .unreq  IN

        @ callee-saved registers
        .unreq  OUT
        .unreq  REVTAB
        .unreq  TCOS
        .unreq  TSIN
        .unreq  OLDFPSCR

        @ scratch address registers
        .unreq  J0
        .unreq  J1
        .unreq  J2
        .unreq  J3
|
Loading…
Reference in new issue