on x86_64:
            time      PSNR
plain       3.303     inf
SSE         1.649     107.087535
SSE3        1.632     107.087535
AVX         1.409     106.986771
FMA3        1.265     107.108437

on x86_32 (PSNR compared to x86_64 plain):
            time      PSNR
plain       7.225     103.951979
SSE         1.827     105.859282
SSE3        1.819     105.859282
AVX         1.533     105.997661
FMA3        1.384     105.885377

FMA4 test is not available

Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>
pull/180/merge
parent
49b0246635
commit
1e69ac9246
5 changed files with 281 additions and 0 deletions
@ -0,0 +1,206 @@ |
|||||||
|
;***************************************************************************** |
||||||
|
;* x86-optimized functions for showcqt filter |
||||||
|
;* |
||||||
|
;* Copyright (C) 2016 Muhammad Faiz <mfcc64@gmail.com> |
||||||
|
;* |
||||||
|
;* This file is part of FFmpeg. |
||||||
|
;* |
||||||
|
;* FFmpeg is free software; you can redistribute it and/or |
||||||
|
;* modify it under the terms of the GNU Lesser General Public |
||||||
|
;* License as published by the Free Software Foundation; either |
||||||
|
;* version 2.1 of the License, or (at your option) any later version. |
||||||
|
;* |
||||||
|
;* FFmpeg is distributed in the hope that it will be useful, |
||||||
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||||
|
;* Lesser General Public License for more details. |
||||||
|
;* |
||||||
|
;* You should have received a copy of the GNU Lesser General Public |
||||||
|
;* License along with FFmpeg; if not, write to the Free Software |
||||||
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||||
|
;****************************************************************************** |
||||||
|
|
||||||
|
%include "libavutil/x86/x86util.asm" |
||||||
|
|
||||||
|
; "pointer" reserves storage for one native pointer:
; a dword on 32-bit builds, a qword on 64-bit builds.
%if ARCH_X86_64 == 0
    %define pointer resd
%else
    %define pointer resq
%endif

; Field offsets of one Coeffs entry as addressed through coeffsq below.
struc Coeffs
    .val:   pointer 1   ; base address of the float coefficients (read with movaps)
    .start: resd 1      ; added to the running index x to form the src index
    .len:   resd 1      ; upper bound of the running index x
    .sizeof:            ; size of one entry == stride to the next entry
endstruc
||||||
|
|
||||||
|
; EMULATE_HADDPS dst, src, tmp
;
; Horizontal pairwise add, per 128-bit lane:
;     dst = { dst[0]+dst[1], dst[2]+dst[3], src[0]+src[1], src[2]+src[3] }
;
; With SSE3 this is a single haddps and tmp is left untouched.
; Without SSE3 the even and odd elements are gathered with two shuffles and
; added; tmp is clobbered.  The fallback overwrites dst before src is read
; for the second shuffle, so when dst and src are the same register only the
; two low result elements match haddps.
%macro EMULATE_HADDPS 3 ; dst, src, tmp
%if cpuflag(sse3) == 0
    movaps  %3, %1              ; tmp = copy of dst
    shufps  %1, %2, q2020       ; dst = { dst[0], dst[2], src[0], src[2] }
    shufps  %3, %2, q3131       ; tmp = { dst[1], dst[3], src[1], src[3] }
    addps   %1, %3
%else
    haddps  %1, %2
%endif
%endmacro ; EMULATE_HADDPS
||||||
|
|
||||||
|
; EMULATE_FMADDPS dst, src1, src2, src3, tmp
;
; dst = src1 * src2 + src3
;
; Uses fmaddps when either FMA3 or FMA4 is enabled (tmp is not written).
; Otherwise falls back to a separate multiply into tmp followed by an add,
; so tmp is clobbered.
%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
%if (cpuflag(fma3) == 0) && (cpuflag(fma4) == 0)
    mulps   %5, %2, %3          ; tmp = src1 * src2
    addps   %1, %4, %5          ; dst = src3 + tmp
%else
    fmaddps %1, %2, %3, %4
%endif
%endmacro ; EMULATE_FMADDPS
||||||
|
|
||||||
|
; CQT_CALC a_re, a_im, b_re, b_im, m_re, m_im, tmp, coeffval, coeffsq_offset
;
; One inner-loop step for the Coeffs entry located at coeffsq + %9.
;   %1..%4  accumulator register numbers (a_re, a_im, b_re, b_im)
;   %5..%7  scratch register numbers (m_re, m_im, tmp), clobbered
;   %8      register number holding mmsize/4 coefficients
;   %9      byte offset of the Coeffs entry relative to coeffsq
;
; Reads xd, srcq, coeffsq and fft_lend; clobbers id/iq and the flags.
; "a" accumulates coeff * src[i ...] read upwards from i = x + start,
; "b" accumulates coeff * src[fft_len - i ...] read downwards.
; src elements are 8 bytes wide (pairs of floats: re, im).
%macro CQT_CALC 9
    ; i = x + coeffs.start
    mov     id, xd
    add     id, [coeffsq + Coeffs.start + %9]

    ; forward block: mmsize/4 consecutive elements starting at src[i],
    ; de-interleaved into real and imaginary vectors.  With ymm registers
    ; shufps works per 128-bit lane, so elements come out in the order
    ; 0,1,4,5,2,3,6,7.
    movaps  m%5, [srcq + 8 * iq]
    movaps  m%7, [srcq + 8 * iq + mmsize]
    shufps  m%6, m%5, m%7, q3131            ; m_im = odd floats
    shufps  m%5, m%5, m%7, q2020            ; m_re = even floats
    EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5 ; a_re += m_re * coeff
    EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6 ; a_im += m_im * coeff

    ; i = fft_len - i
    sub     id, fft_lend
    neg     id

    ; mirrored block: the mmsize/4 elements ending at src[i], loaded
    ; unaligned and shuffled into descending index order
    movups  m%5, [srcq + 8 * iq - mmsize + 8]
    movups  m%7, [srcq + 8 * iq - 2*mmsize + 8]
%if mmsize == 32
    ; swap the 128-bit halves so that the highest indices come first
    vperm2f128 m%5, m%5, m%5, 1
    vperm2f128 m%7, m%7, m%7, 1
%endif
    shufps  m%6, m%5, m%7, q1313            ; m_im, reversed within the lane
    shufps  m%5, m%5, m%7, q0202            ; m_re, reversed within the lane
    EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5 ; b_re += m_re * coeff
    EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6 ; b_im += m_im * coeff
%endmacro ; CQT_CALC
||||||
|
|
||||||
|
; CQT_SEPARATE a_re, a_im, b_re, b_im, tmp, tmp2
;
; Combines the four accumulators and reduces them horizontally.  On exit
; xmm%1 holds, from low to high element, the sums over all lanes of
;     a_re + b_re,  a_im - b_im,  b_im + a_im,  b_re - a_re
; Registers %2, %3, %5 and %6 are clobbered.
%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
    ; differences/sums that need the unmodified inputs go first
    subps   m%6, m%3, m%1           ; tmp2 = b_re - a_re
    addps   m%5, m%4, m%2           ; tmp  = b_im + a_im
    subps   m%2, m%4                ; a_im = a_im - b_im
    addps   m%1, m%3                ; a_re = a_re + b_re

    ; b_re (%3) is no longer needed and serves as shuffle scratch
    EMULATE_HADDPS m%1, m%2, m%3
    EMULATE_HADDPS m%5, m%6, m%3
    EMULATE_HADDPS m%1, m%5, m%2
%if mmsize == 32
    ; fold the upper 128-bit half onto the lower one
    vextractf128 xmm%2, m%1, 1
    addps   xmm%1, xmm%2
%endif
%endmacro ; CQT_SEPARATE
||||||
|
|
||||||
|
; DECLARE_CQT_CALC
;
; Emits ff_showcqt_cqt_calc_<cpu>(dst, src, coeffs, len, fft_len) for the
; instruction set selected by the preceding INIT_XMM / INIT_YMM.
;
;   dst      output; 8 bytes (two floats, each a sum of two squares) are
;            written per Coeffs entry
;   src      input array of 8-byte (re, im) float pairs
;   coeffs   array of Coeffs entries
;   len      number of Coeffs entries to process
;   fft_len  used to form the mirrored index fft_len - (x + start)
;
; x86_64: two entries are handled per outer iteration (accumulators m0-m3
; for entry k, m8-m11 for entry k+1) and the loop ends only when len
; reaches exactly zero after "sub lend, 2", so len must be a non-zero
; multiple of 2.  dst is written with movaps and must be 16-byte aligned.
; x86_32: one entry per outer iteration, len must be non-zero.
; In both variants the coefficient arrays and the forward src reads use
; movaps and therefore have to be mmsize-aligned.
; NOTE(review): these preconditions are presumably guaranteed by the C
; caller (cqt_align / permute_coeffs) - confirm against avf_showcqt.c.
%macro DECLARE_CQT_CALC 0
; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
%if ARCH_X86_64
cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
    align   16
    .loop_k:
        ; zero all eight accumulators and compute
        ; coeffs_len = min(coeffs[0].len, coeffs[1].len) (unsigned)
        mov     xd, [coeffsq + Coeffs.len]
        xorps   m0, m0
        movaps  m1, m0
        movaps  m2, m0
        mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
        movaps  m3, m0
        movaps  m8, m0
        cmp     coeffs_lend, xd
        movaps  m9, m0
        movaps  m10, m0
        movaps  m11, m0
        cmova   coeffs_lend, xd
        ; Load both coefficient pointers before the branch below.  The
        ; tail loops .loop_a / .loop_b are also reached when the common
        ; length is zero but one of the two entries is non-empty; they
        ; must not run with pointers left over from a previous iteration.
        mov     coeffs_valq, [coeffsq + Coeffs.val]
        mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
        xor     xd, xd
        test    coeffs_lend, coeffs_lend
        jz      .check_loop_b
    align   16
    .loop_ab:
        ; both entries still have coefficients left at index x
        movaps  m7, [coeffs_valq + 4 * xq]
        CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
        movaps  m7, [coeffs_val2q + 4 * xq]
        CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
        add     xd, mmsize/4
        cmp     xd, coeffs_lend
        jb      .loop_ab
    .check_loop_b:
        ; finish whichever entry is longer: second entry first ...
        cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
        jae     .check_loop_a
    align   16
    .loop_b:
        movaps  m7, [coeffs_val2q + 4 * xq]
        CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
        add     xd, mmsize/4
        cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
        jb      .loop_b
    .loop_end:
        ; reduce both accumulator sets, square, add pairs and store the
        ; four resulting floats (two per entry)
        CQT_SEPARATE 0, 1, 2, 3, 4, 5
        CQT_SEPARATE 8, 9, 10, 11, 4, 5
        mulps   xmm0, xmm0
        mulps   xmm8, xmm8
        EMULATE_HADDPS xmm0, xmm8, xmm1
        movaps  [dstq], xmm0
        sub     lend, 2
        ; lea leaves the flags of "sub" intact for the jnz below
        lea     dstq, [dstq + 16]
        lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
        jnz     .loop_k
        REP_RET
    align   16
    .check_loop_a:
        ; ... otherwise the first entry
        cmp     xd, [coeffsq + Coeffs.len]
        jae     .loop_end
    align   16
    .loop_a:
        movaps  m7, [coeffs_valq + 4 * xq]
        CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
        add     xd, mmsize/4
        cmp     xd, [coeffsq + Coeffs.len]
        jb      .loop_a
        jmp     .loop_end
%else
cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
; fft_len is not loaded into a register here; it is read from its
; argument slot each time
%define fft_lend r4m
    align   16
    .loop_k:
        mov     xd, [coeffsq + Coeffs.len]
        xorps   m0, m0
        movaps  m1, m0
        movaps  m2, m0
        movaps  m3, m0
        ; an empty entry stores the zeroed accumulator directly
        test    xd, xd
        jz      .store
        mov     coeffs_valq, [coeffsq + Coeffs.val]
        xor     xd, xd
    align   16
    .loop_x:
        movaps  m7, [coeffs_valq + 4 * xq]
        CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
        add     xd, mmsize/4
        cmp     xd, [coeffsq + Coeffs.len]
        jb      .loop_x
        CQT_SEPARATE 0, 1, 2, 3, 4, 5
        mulps   xmm0, xmm0
        ; dst == src: only the two low elements are used afterwards
        EMULATE_HADDPS xmm0, xmm0, xmm1
    .store:
        movlps  [dstq], xmm0
        sub     lend, 1
        ; lea leaves the flags of "sub" intact for the jnz below
        lea     dstq, [dstq + 8]
        lea     coeffsq, [coeffsq + Coeffs.sizeof]
        jnz     .loop_k
        REP_RET
%endif ; ARCH_X86_64
%endmacro ; DECLARE_CQT_CALC
||||||
|
|
||||||
|
INIT_XMM sse |
||||||
|
DECLARE_CQT_CALC |
||||||
|
INIT_XMM sse3 |
||||||
|
DECLARE_CQT_CALC |
||||||
|
INIT_YMM avx |
||||||
|
DECLARE_CQT_CALC |
||||||
|
INIT_YMM fma3 |
||||||
|
DECLARE_CQT_CALC |
||||||
|
INIT_XMM fma4 |
||||||
|
DECLARE_CQT_CALC |
@ -0,0 +1,63 @@ |
|||||||
|
/*
|
||||||
|
* Copyright (c) 2016 Muhammad Faiz <mfcc64@gmail.com> |
||||||
|
* |
||||||
|
* This file is part of FFmpeg. |
||||||
|
* |
||||||
|
* FFmpeg is free software; you can redistribute it and/or |
||||||
|
* modify it under the terms of the GNU Lesser General Public |
||||||
|
* License as published by the Free Software Foundation; either |
||||||
|
* version 2.1 of the License, or (at your option) any later version. |
||||||
|
* |
||||||
|
* FFmpeg is distributed in the hope that it will be useful, |
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||||
|
* Lesser General Public License for more details. |
||||||
|
* |
||||||
|
* You should have received a copy of the GNU Lesser General Public |
||||||
|
* License along with FFmpeg; if not, write to the Free Software |
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||||
|
*/ |
||||||
|
|
||||||
|
#include "libavutil/attributes.h" |
||||||
|
#include "libavutil/cpu.h" |
||||||
|
#include "libavutil/x86/cpu.h" |
||||||
|
#include "libavfilter/avf_showcqt.h" |
||||||
|
|
||||||
|
#define DECLARE_CQT_CALC(type) \ |
||||||
|
void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \
|
||||||
|
const Coeffs *coeffs, int len, int fft_len) |
||||||
|
|
||||||
|
DECLARE_CQT_CALC(sse); |
||||||
|
DECLARE_CQT_CALC(sse3); |
||||||
|
DECLARE_CQT_CALC(avx); |
||||||
|
DECLARE_CQT_CALC(fma3); |
||||||
|
DECLARE_CQT_CALC(fma4); |
||||||
|
|
||||||
|
#define permute_coeffs_0 NULL |
||||||
|
|
||||||
|
static void permute_coeffs_01452367(float *v, int len) |
||||||
|
{ |
||||||
|
int k; |
||||||
|
for (k = 0; k < len; k += 8) { |
||||||
|
FFSWAP(float, v[k+2], v[k+4]); |
||||||
|
FFSWAP(float, v[k+3], v[k+5]); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/**
 * Install the x86 assembly cqt_calc implementation matching the CPU flags
 * reported by av_get_cpu_flags().
 *
 * The candidates are tested in sequence and each match overwrites the
 * previous one, so the last matching entry in the list below wins.  When
 * nothing matches, s is left unchanged.
 *
 * @param s context whose cqt_calc, cqt_align and permute_coeffs fields
 *          are set
 */
av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
{
    int cpu = av_get_cpu_flags();

/* fn:        suffix of ff_showcqt_cqt_calc_<fn>
 * FLAG:      suffix of the EXTERNAL_<FLAG>() CPU-flag check
 * alignment: value stored in cqt_align
 * order:     suffix of permute_coeffs_<order> (0 selects NULL) */
#define SELECT_CQT_CALC(fn, FLAG, alignment, order)                       \
    do {                                                                  \
        if (EXTERNAL_##FLAG(cpu)) {                                       \
            s->cqt_calc       = ff_showcqt_cqt_calc_##fn;                 \
            s->cqt_align      = alignment;                                \
            s->permute_coeffs = permute_coeffs_##order;                   \
        }                                                                 \
    } while (0)

    SELECT_CQT_CALC(sse,  SSE,       4, 0);
    SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0);
    SELECT_CQT_CALC(fma4, FMA4,      4, 0); // using xmm
    SELECT_CQT_CALC(avx,  AVX_FAST,  8, 01452367);
    SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367);
}
Loading…
Reference in new issue