mirror of https://github.com/FFmpeg/FFmpeg.git
263 lines
5.0 KiB
263 lines
5.0 KiB
;****************************************************************************** |
|
;* Copyright (c) Lynne |
|
;* |
|
;* This file is part of FFmpeg. |
|
;* |
|
;* FFmpeg is free software; you can redistribute it and/or |
|
;* modify it under the terms of the GNU Lesser General Public |
|
;* License as published by the Free Software Foundation; either |
|
;* version 2.1 of the License, or (at your option) any later version. |
|
;* |
|
;* FFmpeg is distributed in the hope that it will be useful, |
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
;* Lesser General Public License for more details. |
|
;* |
|
;* You should have received a copy of the GNU Lesser General Public |
|
;* License along with FFmpeg; if not, write to the Free Software |
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
;****************************************************************************** |
|
|
|
%include "libavutil/x86/x86util.asm" |
|
|
|
SECTION_RODATA 32 |
|
|
|
one_tab: times 4 dq 1.0 |
|
seq_tab_avx2: dq 3.0, 2.0, 1.0, 0.0 |
|
sub_tab: dq -1.0, -2.0, -3.0, -4.0 |
|
add_tab_avx2: times 4 dq 4.0 |
|
dec_tab_avx2: times 4 dq -4.0 |
|
add_tab_sse2: times 2 dq 2.0 |
|
dec_tab_sse2: times 2 dq -2.0 |
|
dec_tab_scalar: times 2 dq -1.0 |
|
seq_tab_sse2: dq 1.0, 0.0 |
|
|
|
SECTION .text |
|
|
|
%macro APPLY_WELCH_FN 0 |
|
cglobal lpc_apply_welch_window, 3, 5, 8, data, len, out, off1, off2 |
|
cmp lenq, 0 |
|
je .end_e |
|
cmp lenq, 2 |
|
je .two |
|
cmp lenq, 1 |
|
je .one |
|
|
|
movapd m6, [one_tab] |
|
|
|
movd xm1, lend |
|
cvtdq2pd xm1, xm1 ; len |
|
%if cpuflag(avx2) |
|
vbroadcastsd m1, xm1 |
|
%else |
|
shufpd m1, m1, 00b |
|
%endif |
|
|
|
addpd m0, m6, m6 ; 2.0 |
|
subpd m1, m6 ; len - 1 |
|
divpd m0, m1 ; 2.0 / (len - 1) |
|
|
|
mov off1q, lenq |
|
and off1q, 1 |
|
je .even |
|
|
|
movapd m5, m0 |
|
addpd m0, [sub_tab] |
|
|
|
lea off2q, [lenq*4 - mmsize/2] |
|
sub lenq, mmsize/4 ; avoid overwriting |
|
xor off1q, off1q |
|
|
|
cmp lenq, mmsize/4 |
|
jl .scalar_o |
|
|
|
%if cpuflag(avx2) |
|
movapd m7, [dec_tab_avx2] |
|
%else |
|
movapd m7, [dec_tab_sse2] |
|
%endif |
|
|
|
.loop_o: |
|
movapd m1, m6 |
|
%if cpuflag(avx2) |
|
fnmaddpd m1, m0, m0, m1 |
|
vpermpd m2, m1, q0123 |
|
%else |
|
mulpd m2, m0, m0 |
|
subpd m1, m2 |
|
shufpd m2, m1, m1, 01b |
|
%endif |
|
|
|
cvtdq2pd m3, [dataq + off1q] |
|
cvtdq2pd m4, [dataq + off2q] |
|
|
|
mulpd m1, m3 |
|
mulpd m2, m4 |
|
|
|
movupd [outq + off1q*2], m1 |
|
movupd [outq + off2q*2], m2 |
|
|
|
addpd m0, m7 |
|
add off1q, mmsize/2 |
|
sub off2q, mmsize/2 |
|
sub lenq, mmsize/4 |
|
jg .loop_o |
|
|
|
add lend, (mmsize/4 - 1) |
|
cmp lend, 0 |
|
je .end_o |
|
sub lenq, (mmsize/4 - 1) |
|
|
|
.scalar_o: |
|
movapd xm7, [dec_tab_scalar] |
|
|
|
; Set offsets |
|
add off2q, (mmsize/4) + 4*cpuflag(avx2) |
|
add lenq, mmsize/4 - 2 |
|
|
|
.loop_o_scalar: |
|
movapd xm1, xm6 |
|
%if cpuflag(avx2) |
|
fnmaddpd xm1, xm0, xm0, xm1 |
|
%else |
|
mulpd xm2, xm0, xm0 |
|
subpd xm1, xm2 |
|
%endif |
|
|
|
cvtdq2pd xm3, [dataq + off1q] |
|
cvtdq2pd xm4, [dataq + off2q] |
|
|
|
mulpd xm3, xm1 |
|
mulpd xm4, xm1 |
|
|
|
movlpd [outq + off1q*2], xm3 |
|
movlpd [outq + off2q*2], xm4 |
|
|
|
addpd xm0, xm7 |
|
|
|
add off1q, 4 |
|
sub off2q, 4 |
|
|
|
sub lenq, 2 |
|
jg .loop_o_scalar |
|
|
|
.end_o: |
|
xorpd xm3, xm3 |
|
movlpd [outq + off1q*2], xm3 |
|
RET |
|
|
|
.even: |
|
%if cpuflag(avx2) |
|
addpd m0, [seq_tab_avx2] |
|
%else |
|
addpd m0, [seq_tab_sse2] |
|
%endif |
|
|
|
mov off1d, lend |
|
shr off1d, 1 |
|
movd xm1, off1d |
|
cvtdq2pd xm1, xm1 ; len/2 |
|
%if cpuflag(avx2) |
|
vbroadcastsd m1, xm1 |
|
%else |
|
shufpd m1, m1, 00b |
|
%endif |
|
subpd m0, m1 |
|
|
|
%if cpuflag(avx2) |
|
movapd m7, [add_tab_avx2] |
|
%else |
|
movapd m7, [add_tab_sse2] |
|
%endif |
|
|
|
lea off2q, [lenq*2] |
|
lea off1q, [lenq*2 - mmsize/2] |
|
sub lenq, mmsize/4 |
|
|
|
cmp lenq, mmsize/4 |
|
jl .scalar_e |
|
|
|
.loop_e: |
|
movapd m1, m6 |
|
%if cpuflag(avx2) |
|
fnmaddpd m1, m0, m0, m1 |
|
%else |
|
mulpd m2, m0, m0 |
|
subpd m1, m2 |
|
%endif |
|
%if cpuflag(avx2) |
|
vpermpd m2, m1, q0123 |
|
%else |
|
shufpd m2, m1, m1, 01b |
|
%endif |
|
|
|
cvtdq2pd m3, [dataq + off1q] |
|
cvtdq2pd m4, [dataq + off2q] |
|
|
|
mulpd m1, m3 |
|
mulpd m2, m4 |
|
|
|
movupd [outq + off1q*2], m1 |
|
movupd [outq + off2q*2], m2 |
|
|
|
addpd m0, m7 |
|
add off2q, mmsize/2 |
|
sub off1q, mmsize/2 |
|
sub lenq, mmsize/4 |
|
jge .loop_e |
|
|
|
.scalar_e: |
|
subpd xm0, xm7 |
|
movapd xm7, [dec_tab_scalar] |
|
subpd xm0, xm7 |
|
|
|
add off1q, (mmsize/2) |
|
sub off2q, (mmsize/2) - 8*cpuflag(avx2) |
|
add lenq, 6 + 4*cpuflag(avx2) |
|
|
|
addpd xm0, [sub_tab] |
|
|
|
.loop_e_scalar: |
|
movapd xm1, xm6 |
|
%if cpuflag(avx2) |
|
fnmaddpd xm1, xm0, xm0, xm1 |
|
%else |
|
mulpd xm2, xm0, xm0 |
|
subpd xm1, xm2 |
|
%endif |
|
|
|
cvtdq2pd xm3, [dataq + off1q] |
|
cvtdq2pd xm4, [dataq + off2q] |
|
|
|
mulpd xm3, xm1 |
|
shufpd xm1, xm1, 00b |
|
mulpd xm4, xm1 |
|
|
|
movlpd [outq + off1q*2], xm3 |
|
movhpd [outq + off2q*2 + 8], xm4 |
|
|
|
subpd xm0, xm7 |
|
|
|
add off2q, 4 |
|
sub off1q, 4 |
|
sub lenq, 2 |
|
jg .loop_e_scalar |
|
RET |
|
|
|
.two: |
|
xorpd xm0, xm0 |
|
movhpd [outq + 8], xm0 |
|
.one: |
|
xorpd xm0, xm0 |
|
movhpd [outq], xm0 |
|
.end_e: |
|
RET |
|
%endmacro |
|
|
|
INIT_XMM sse2 |
|
APPLY_WELCH_FN |
|
|
|
%if HAVE_AVX2_EXTERNAL |
|
INIT_YMM avx2 |
|
APPLY_WELCH_FN |
|
%endif
|
|
|