mirror of https://github.com/FFmpeg/FFmpeg.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
133 lines
3.5 KiB
133 lines
3.5 KiB
;****************************************************************************** |
|
;* ALAC DSP SIMD optimizations |
|
;* |
|
;* Copyright (C) 2015 James Almer |
|
;* |
|
;* This file is part of FFmpeg. |
|
;* |
|
;* FFmpeg is free software; you can redistribute it and/or |
|
;* modify it under the terms of the GNU Lesser General Public |
|
;* License as published by the Free Software Foundation; either |
|
;* version 2.1 of the License, or (at your option) any later version. |
|
;* |
|
;* FFmpeg is distributed in the hope that it will be useful, |
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
;* Lesser General Public License for more details. |
|
;* |
|
;* You should have received a copy of the GNU Lesser General Public |
|
;* License along with FFmpeg; if not, write to the Free Software |
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
;****************************************************************************** |
|
|
|
%include "libavutil/x86/x86util.asm" |
|
|
|
SECTION .text |
|
|
|
;------------------------------------------------------------------------------
; alac_decorrelate_stereo(buf0, len, shift, weight)
;
; buf0 points at a table of two channel pointers: [buf0] is channel 0 and
; [buf0 + gprsize] is channel 1. Samples are handled as packed 32-bit integers
; (len is scaled by 4 to get a byte count). For every element i:
;
;     a      = ch0[i] - ((ch1[i] * weight) >> shift)   ; psrad: arithmetic shift
;     ch1[i] = a
;     ch0[i] = ch1[i] (old value) + a
;
; Only the first two arguments are requested in registers; shift and weight are
; read through their shiftm / weightm aliases.
;
; The loop consumes 2*mmsize bytes per channel per iteration and is tested at
; the bottom, so the body always executes at least once and there is no tail
; handling.
; NOTE(review): this presumably relies on the caller supplying padded buffers
; that are suitably aligned for mova - confirm against the caller.
;------------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64
cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
%else
; Only three GPRs are requested on x86-32, so the channel-1 pointer lives in r2.
cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
%define  buf1q  r2q
%endif
    ; Vector constants: m6 = shift count, m7 = weight replicated to all lanes.
    movd        m6, shiftm
    movd        m7, weightm
    SPLATD      m7

    ; Fetch both channel pointers; channel 1 first, because buf0q is
    ; overwritten by the second load.
    mov      buf1q, [buf0q + gprsize]
    mov      buf0q, [buf0q]

    ; Turn len into a byte count, point both channels at their end and walk a
    ; negative offset up towards zero.
    shl       lend, 2
    add      buf0q, lenq
    add      buf1q, lenq
    neg       lenq

align 16
.loop:
    ; m2/m3 = channel-1 samples, m4/m5 = (channel-1 * weight) >> shift
    mova        m2, [buf1q + lenq]
    mova        m3, [buf1q + lenq + mmsize]
    pmulld      m4, m2, m7
    pmulld      m5, m3, m7
    psrad       m4, m6
    psrad       m5, m6

    ; m0/m1 = channel-0 samples minus the scaled channel-1 samples
    mova        m0, [buf0q + lenq]
    mova        m1, [buf0q + lenq + mmsize]
    psubd       m0, m4
    psubd       m1, m5

    ; m2/m3 = old channel-1 samples plus the difference computed above
    paddd       m2, m0
    paddd       m3, m1

    ; The difference goes to channel 1, the sum to channel 0.
    mova  [buf1q + lenq], m0
    mova  [buf1q + lenq + mmsize], m1
    mova  [buf0q + lenq], m2
    mova  [buf0q + lenq + mmsize], m3

    ; Advance by two vectors; keep looping while the offset is still negative.
    add       lenq, mmsize*2
    jl .loop
    RET
|
|
|
;------------------------------------------------------------------------------
; alac_append_extra_bits_stereo
;
; r0 (buf0) and r1 (exbuf0) each point at a table of two channel pointers
; ([ptr] and [ptr + gprsize]). The shift count is taken from the third argument
; (r2m) and the element count from the fifth (lenm). For both channels and every
; 32-bit element i:
;
;     buf[ch][i] = (buf[ch][i] << exbits) | exbuf[ch][i]
;
; The names buf1 / exbuf1 in the argument list only label r2 / r3 as scratch
; registers for the second channel's pointers; the incoming values in those
; positions are not buffers.
;
; The loop consumes 2*mmsize bytes per buffer per iteration and is tested at the
; bottom, so the body always executes at least once and there is no tail
; handling.
; NOTE(review): this presumably relies on the caller supplying padded buffers
; that are suitably aligned for mova - confirm against the caller.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
    ; Read the shift count before r2 is recycled as buf1q below.
    movd           m4, r2m ; exbits

    ; len -> byte count
    movifnidn    lend, lenm
    shl          lend, 2

    ; Resolve the per-channel pointers; the second entry of each table is read
    ; first because the table pointer itself is overwritten afterwards.
    mov         buf1q, [buf0q + gprsize]
    mov         buf0q, [buf0q]
    mov       exbuf1q, [exbuf0q + gprsize]
    mov       exbuf0q, [exbuf0q]

    ; Point every buffer at its end and index with a negative offset.
    add         buf0q, lenq
    add       exbuf0q, lenq
    add         buf1q, lenq
    add       exbuf1q, lenq
    neg          lenq

align 16
.loop:
    ; Channel 0: shift the samples left and merge in the extra bits.
    mova           m0, [buf0q + lenq]
    mova           m1, [buf0q + lenq + mmsize]
    pslld          m0, m4
    pslld          m1, m4
    por            m0, [exbuf0q + lenq]
    por            m1, [exbuf0q + lenq + mmsize]

    ; Channel 1: same operation.
    mova           m2, [buf1q + lenq]
    mova           m3, [buf1q + lenq + mmsize]
    pslld          m2, m4
    pslld          m3, m4
    por            m2, [exbuf1q + lenq]
    por            m3, [exbuf1q + lenq + mmsize]

    ; Write back only after every load of this iteration has been issued.
    mova  [buf0q + lenq], m0
    mova  [buf0q + lenq + mmsize], m1
    mova  [buf1q + lenq], m2
    mova  [buf1q + lenq + mmsize], m3

    ; Advance by two vectors; keep looping while the offset is still negative.
    add          lenq, mmsize*2
    jl .loop
    REP_RET
|
|
|
;------------------------------------------------------------------------------
; alac_append_extra_bits_mono
;
; Single-channel variant: only the first entry of the pointer tables passed in
; r0 (buf) and r1 (exbuf) is used. The shift count comes from the third argument
; and the element count from the fifth (r4m). For every 32-bit element i:
;
;     buf[0][i] = (buf[0][i] << exbits) | exbuf[0][i]
;
; No INIT_* appears here, so this is assembled with whatever instruction-set
; setup is already in effect at this point in the file.
;
; The loop consumes 2*mmsize bytes per buffer per iteration and is tested at the
; bottom, so the body always executes at least once and there is no tail
; handling.
; NOTE(review): this presumably relies on the caller supplying padded buffers
; that are suitably aligned for mova - confirm against the caller.
;------------------------------------------------------------------------------
%if ARCH_X86_64
cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
%else
; Only three GPRs are requested on x86-32: len takes r2, and the shift count is
; read via r2m instead of being held in a register.
cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
%define exbitsm r2m
%endif
    ; m2 = shift count
    movd          m2, exbitsm

    ; len -> byte count
    movifnidn   lend, r4m
    shl         lend, 2

    ; Dereference the pointer tables, then point each buffer at its end and
    ; index with a negative offset.
    mov         bufq, [bufq]
    mov       exbufq, [exbufq]
    add         bufq, lenq
    add       exbufq, lenq
    neg         lenq

align 16
.loop:
    ; First vector: shift left, then merge in the extra bits.
    mova          m0, [bufq + lenq]
    pslld         m0, m2
    por           m0, [exbufq + lenq]

    ; Second vector: same operation.
    mova          m1, [bufq + lenq + mmsize]
    pslld         m1, m2
    por           m1, [exbufq + lenq + mmsize]

    ; Write back only after every load of this iteration has been issued.
    mova  [bufq + lenq], m0
    mova  [bufq + lenq + mmsize], m1

    ; Advance by two vectors; keep looping while the offset is still negative.
    add         lenq, mmsize*2
    jl .loop
    REP_RET
|
|
|