mirror of https://github.com/FFmpeg/FFmpeg.git
Performance improvements:

quant_bands:
    with:    681 decicycles in quant_bands, 8388453 runs, 155 skips
    without: 1190 decicycles in quant_bands, 8388386 runs, 222 skips
    Around 42% faster for the function.

Twoloop coder:
    abs_pow34: with/without: 7.82s/8.17s - around 4% faster for the entire encoder
    Both:      with/without: 7.15s/8.17s - around 12% faster for the entire encoder

Fast coder:
    abs_pow34: with/without: 3.40s/3.77s - around 10% faster for the entire encoder
    Both:      with/without: 3.02s/3.77s - around 20% faster for the entire encoder

Signed-off-by: Rostislav Pehlivanov <atomnuker@gmail.com>
Tested-by: Michael Niedermayer <michael@niedermayer.cc>
Reviewed-by: James Almer <jamrial@gmail.com>

pull/238/head
parent
3b02f6dd7b
commit
d2ae5f77c6
13 changed files with 170 additions and 26 deletions
@ -0,0 +1,86 @@ |
|||||||
|
;****************************************************************************** |
||||||
|
;* SIMD optimized AAC encoder DSP functions |
||||||
|
;* |
||||||
|
;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com> |
||||||
|
;* |
||||||
|
;* This file is part of FFmpeg. |
||||||
|
;* |
||||||
|
;* FFmpeg is free software; you can redistribute it and/or |
||||||
|
;* modify it under the terms of the GNU Lesser General Public |
||||||
|
;* License as published by the Free Software Foundation; either |
||||||
|
;* version 2.1 of the License, or (at your option) any later version. |
||||||
|
;* |
||||||
|
;* FFmpeg is distributed in the hope that it will be useful, |
||||||
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||||
|
;* Lesser General Public License for more details. |
||||||
|
;* |
||||||
|
;* You should have received a copy of the GNU Lesser General Public |
||||||
|
;* License along with FFmpeg; if not, write to the Free Software |
||||||
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||||
|
;****************************************************************************** |
||||||
|
|
||||||
|
%include "libavutil/x86/x86util.asm" |
||||||
|
|
||||||
|
SECTION_RODATA |
||||||
|
|
||||||
|
float_abs_mask: times 4 dd 0x7fffffff |
||||||
|
|
||||||
|
SECTION .text |
||||||
|
|
||||||
|
;*******************************************************************
;void ff_abs_pow34(float *out, const float *in, const int size);
;
; For every element: out[i] = sqrt(|in[i]| * sqrt(|in[i]|)), i.e. |in[i]|^(3/4).
;
; One iteration handles mmsize bytes (4 floats). The loop is a do/while, so
; it always runs at least once. The aligned accesses (mova store, memory
; operand of andps) need 16-byte aligned in/out buffers.
; NOTE(review): callers presumably pass a positive size that is a multiple
; of 4 - confirm against the C call sites.
;*******************************************************************
INIT_XMM sse
cglobal abs_pow34, 3, 3, 3, out, in, size
    mova   m2, [float_abs_mask]      ; m2 = 0x7fffffff x4, clears the sign bit
    ; size is a C int: use the 32-bit register so the upper half of sizeq is
    ; zeroed on x86-64 instead of trusting whatever the caller left there.
    shl    sized, 2                  ; element count -> byte count
    add    inq, sizeq                ; point both buffers at their end ...
    add    outq, sizeq
    neg    sizeq                     ; ... and index upwards from -bytes to 0
.loop:
    andps  m0, m2, [inq+sizeq]       ; m0 = |x|
    sqrtps m1, m0                    ; m1 = |x|^(1/2)
    mulps  m0, m1                    ; m0 = |x|^(3/2)
    sqrtps m0, m0                    ; m0 = |x|^(3/4)
    mova   [outq+sizeq], m0
    add    sizeq, mmsize
    jl    .loop                      ; continue while the offset is still negative
    RET
||||||
|
|
||||||
|
;*******************************************************************
;void ff_aac_quantize_bands(int *out, const float *in, const float *scaled,
;                           int size, int is_signed, int maxval, const float Q34,
;                           const float rounding)
;
; For every element:
;     q      = min(scaled[i] * Q34 + rounding, (float)maxval)
;     q     |= sign bit of in[i]        (only when bit 0 of is_signed is set)
;     out[i] = (int)q                   (truncating conversion)
;
; One iteration handles mmsize bytes (4 elements); the loop body always runs
; at least once.
;*******************************************************************
INIT_XMM sse2
cglobal aac_quantize_bands, 5, 5, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
    ; m4 = per-lane sign mask: 0x80000000 if is_signed bit 0 is set, else 0
    shl       is_signedd, 31
    movd      m4, is_signedd
    shufps    m4, m4, 0

    ; Turn the element count into a byte count, move every pointer to the end
    ; of its buffer and walk a negative offset up towards zero.
    shl       sized, 2
    add       scaledq, sizeq
    add       inq, sizeq
    add       outq, sizeq
    neg       sizeq

    ; Scalars: on UNIX64 Q34/rounding already sit in m0/m1 and maxval in a
    ; register; everywhere else they are fetched from their memory slots.
%if UNIX64
    cvtsi2ss  m3, maxvald
%else
    movss     m0, Q34m
    movss     m1, roundingm
    cvtsi2ss  m3, dword maxvalm
%endif
    shufps    m0, m0, 0              ; m0 = Q34 in all lanes
    shufps    m1, m1, 0              ; m1 = rounding in all lanes
    shufps    m3, m3, 0              ; m3 = (float)maxval in all lanes

.loop:
    mulps     m2, m0, [scaledq+sizeq] ; scaled * Q34
    addps     m2, m1                  ; + rounding
    minps     m2, m3                  ; clamp to maxval
    andps     m5, m4, [inq+sizeq]     ; isolate the sign of in[] (or 0)
    orps      m2, m5                  ; apply it to the magnitude
    cvttps2dq m2, m2                  ; truncate to int32
    mova      [outq+sizeq], m2
    add       sizeq, mmsize
    jl       .loop                    ; continue while the offset is still negative
    RET
@ -0,0 +1,43 @@ |
|||||||
|
/*
|
||||||
|
* AAC encoder assembly optimizations |
||||||
|
* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com> |
||||||
|
* |
||||||
|
* This file is part of FFmpeg. |
||||||
|
* |
||||||
|
* FFmpeg is free software; you can redistribute it and/or |
||||||
|
* modify it under the terms of the GNU Lesser General Public |
||||||
|
* License as published by the Free Software Foundation; either |
||||||
|
* version 2.1 of the License, or (at your option) any later version. |
||||||
|
* |
||||||
|
* FFmpeg is distributed in the hope that it will be useful, |
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||||
|
* Lesser General Public License for more details. |
||||||
|
* |
||||||
|
* You should have received a copy of the GNU Lesser General Public |
||||||
|
* License along with FFmpeg; if not, write to the Free Software |
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||||
|
*/ |
||||||
|
|
||||||
|
#include "config.h" |
||||||
|
|
||||||
|
#include "libavutil/float_dsp.h" |
||||||
|
#include "libavutil/x86/cpu.h" |
||||||
|
#include "libavcodec/aacenc.h" |
||||||
|
|
||||||
|
void ff_abs_pow34_sse(float *out, const float *in, const int size); |
||||||
|
|
||||||
|
void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled, |
||||||
|
int size, int is_signed, int maxval, const float Q34, |
||||||
|
const float rounding); |
||||||
|
|
||||||
|
/**
 * Install the x86 SIMD implementations of the AAC encoder DSP hooks.
 *
 * Each function pointer in the context is overridden only when the
 * corresponding EXTERNAL_* check on the runtime CPU flags succeeds;
 * otherwise the pointer is left as it was.
 *
 * @param s encoder context whose abs_pow34 / quant_bands pointers are set
 */
av_cold void ff_aac_dsp_init_x86(AACEncContext *s)
{
    const int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(cpu_flags)) {
        s->abs_pow34 = ff_abs_pow34_sse;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        s->quant_bands = ff_aac_quantize_bands_sse2;
    }
}
Loading…
Reference in new issue