mirror of https://github.com/FFmpeg/FFmpeg.git
From 1.8 to 2.4 times faster. Runtime is reduced by 2 to 39%. The speed-up generally increases with compression_level. This lpc encoder is not used with levels < 3 so it provides no speed-up in these cases. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>pull/80/merge
parent
7cafdfe9c7
commit
0081a14e7d
5 changed files with 94 additions and 1 deletions
@ -0,0 +1,85 @@ |
||||
;****************************************************************************** |
||||
;* FLAC DSP functions |
||||
;* |
||||
;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com> |
||||
;* |
||||
;* This file is part of FFmpeg. |
||||
;* |
||||
;* FFmpeg is free software; you can redistribute it and/or modify |
||||
;* it under the terms of the GNU General Public License as published by |
||||
;* the Free Software Foundation; either version 2 of the License, or |
||||
;* (at your option) any later version. |
||||
;* |
||||
;* FFmpeg is distributed in the hope that it will be useful, |
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
;* GNU General Public License for more details. |
||||
;* |
||||
;* You should have received a copy of the GNU General Public License along |
||||
;* with FFmpeg; if not, write to the Free Software Foundation, Inc., |
||||
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
||||
;****************************************************************************** |
||||
|
||||
%include "libavutil/x86/x86util.asm" |
||||
|
||||
SECTION_TEXT |
||||
|
||||
INIT_XMM sse4 |
||||
%if ARCH_X86_64 |
||||
cglobal flac_enc_lpc_16, 5, 7, 4, 0, res, smp, len, order, coefs |
||||
DECLARE_REG_TMP 5, 6 |
||||
%define length r2d |
||||
|
||||
movsxd orderq, orderd |
||||
%else |
||||
cglobal flac_enc_lpc_16, 5, 6, 4, 0, res, smp, len, order, coefs |
||||
DECLARE_REG_TMP 2, 5 |
||||
%define length r2mp |
||||
%endif |
||||
|
||||
; Here we assume that the maximum order value is 32. This means that we only |
||||
; need to copy a maximum of 32 samples. Therefore we let the preprocessor |
||||
; unroll this loop and copy all 32. |
||||
%assign iter 0 |
||||
%rep 32/(mmsize/4) |
||||
movu m0, [smpq+iter] |
||||
movu [resq+iter], m0 |
||||
%assign iter iter+mmsize |
||||
%endrep |
||||
|
||||
lea resq, [resq+orderq*4] |
||||
lea smpq, [smpq+orderq*4] |
||||
lea coefsq, [coefsq+orderq*4] |
||||
sub length, orderd |
||||
movd m3, r5m |
||||
neg orderq |
||||
|
||||
%define posj t0q |
||||
%define negj t1q |
||||
|
||||
.looplen: |
||||
pxor m0, m0 |
||||
mov posj, orderq |
||||
xor negj, negj |
||||
|
||||
.looporder: |
||||
movd m2, [coefsq+posj*4] ; c = coefs[j] |
||||
SPLATD m2 |
||||
movu m1, [smpq+negj*4-4] ; s = smp[i-j-1] |
||||
pmulld m1, m2 |
||||
paddd m0, m1 ; p += c * s |
||||
|
||||
dec negj |
||||
inc posj |
||||
jnz .looporder |
||||
|
||||
psrad m0, m3 ; p >>= shift |
||||
movu m1, [smpq] |
||||
psubd m1, m0 ; smp[i] - p |
||||
movu [resq], m1 ; res[i] = smp[i] - (p >> shift) |
||||
|
||||
add resq, mmsize |
||||
add smpq, mmsize |
||||
sub length, mmsize/4 |
||||
jg .looplen |
||||
RET |
Loading…
Reference in new issue