on x86_64:
          time     PSNR
plain     3.303    inf
SSE       1.649    107.087535
SSE3      1.632    107.087535
AVX       1.409    106.986771
FMA3      1.265    107.108437

on x86_32 (PSNR compared to x86_64 plain):
          time     PSNR
plain     7.225    103.951979
SSE       1.827    105.859282
SSE3      1.819    105.859282
AVX       1.533    105.997661
FMA3      1.384    105.885377

FMA4 test is not available

Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Muhammad Faiz <mfcc64@gmail.com>
parent 49b0246635
commit 1e69ac9246
5 changed files with 281 additions and 0 deletions
@@ -0,0 +1,206 @@
;******************************************************************************
;* x86-optimized functions for showcqt filter
;*
;* Copyright (C) 2016 Muhammad Faiz <mfcc64@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc Coeffs
    .val:    pointer 1
    .start:  resd 1
    .len:    resd 1
    .sizeof:
endstruc

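; Horizontal add, matching SSE3 haddps: pairwise sums of dst go to the lower
; half and pairwise sums of src to the upper half (per 128-bit lane for ymm).
; Without SSE3 it is emulated with two shufps (even/odd lanes) and an addps.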
%macro EMULATE_HADDPS 3 ; dst, src, tmp
%if cpuflag(sse3)
    haddps  %1, %2
%else
    movaps  %3, %1
    shufps  %1, %2, q2020
    shufps  %3, %2, q3131
    addps   %1, %3
%endif
%endmacro ; EMULATE_HADDPS

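; Fused multiply-add: dst = src1 * src2 + src3.  On targets without FMA3/FMA4
; (SSE, SSE3, AVX) it falls back to a separate mulps and addps, clobbering tmp.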
%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
%if cpuflag(fma3) || cpuflag(fma4)
    fmaddps %1, %2, %3, %4
%else
    mulps   %5, %2, %3
    addps   %1, %4, %5
%endif
%endmacro ; EMULATE_FMADDPS

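; Multiply one block of FFT bins by the real-valued coefficients in coeffval
; and accumulate: a_re/a_im sum the bins starting at coeffs->start + x
; (loaded forward), b_re/b_im sum the mirrored bins starting at
; fft_len - coeffs->start - x (loaded in reverse order).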
%macro CQT_CALC 9
; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
    mov     id, xd
    add     id, [coeffsq + Coeffs.start + %9]
    movaps  m%5, [srcq + 8 * iq]
    movaps  m%7, [srcq + 8 * iq + mmsize]
    shufps  m%6, m%5, m%7, q3131
    shufps  m%5, m%5, m%7, q2020
    sub     id, fft_lend
    EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6
    neg     id
    EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5
    movups  m%5, [srcq + 8 * iq - mmsize + 8]
    movups  m%7, [srcq + 8 * iq - 2*mmsize + 8]
    %if mmsize == 32
    vperm2f128 m%5, m%5, m%5, 1
    vperm2f128 m%7, m%7, m%7, 1
    %endif
    shufps  m%6, m%5, m%7, q1313
    shufps  m%5, m%5, m%7, q0202
    EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6
    EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5
%endmacro ; CQT_CALC

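; Horizontally reduce the accumulators and separate the two channels packed
; into the FFT input (left in the real part, right in the imaginary part):
; a_re ends up holding [a_re+b_re, a_im-b_im, a_im+b_im, b_re-a_re], i.e.
; [left_re, left_im, right_re, right_im], in its low xmm lanes.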
%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
    addps   m%5, m%4, m%2
    subps   m%6, m%3, m%1
    addps   m%1, m%3
    subps   m%2, m%4
    EMULATE_HADDPS m%5, m%6, m%3
    EMULATE_HADDPS m%1, m%2, m%3
    EMULATE_HADDPS m%1, m%5, m%2
    %if mmsize == 32
    vextractf128 xmm%2, m%1, 1
    addps   xmm%1, xmm%2
    %endif
%endmacro ; CQT_SEPARATE

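; The x86_64 version uses 12 vector registers and computes two output bins per
; iteration (accumulators m0-m3 and m8-m11, one Coeffs entry each); the x86_32
; version, limited to 8 registers, computes one bin at a time.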
%macro DECLARE_CQT_CALC 0
; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
%if ARCH_X86_64
cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
align 16
.loop_k:
    mov     xd, [coeffsq + Coeffs.len]
    xorps   m0, m0
    movaps  m1, m0
    movaps  m2, m0
    mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
    movaps  m3, m0
    movaps  m8, m0
    cmp     coeffs_lend, xd
    movaps  m9, m0
    movaps  m10, m0
    movaps  m11, m0
    cmova   coeffs_lend, xd
    xor     xd, xd
    test    coeffs_lend, coeffs_lend
    jz      .check_loop_b
    mov     coeffs_valq, [coeffsq + Coeffs.val]
    mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
align 16
.loop_ab:
    movaps  m7, [coeffs_valq + 4 * xq]
    CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
    movaps  m7, [coeffs_val2q + 4 * xq]
    CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
    add     xd, mmsize/4
    cmp     xd, coeffs_lend
    jb      .loop_ab
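; Tail handling: .loop_ab covered the first min(coeffs[0].len, coeffs[1].len)
; coefficients; whichever of the two runs is longer is finished on its own in
; .loop_b or .loop_a.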
.check_loop_b:
    cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
    jae     .check_loop_a
align 16
.loop_b:
    movaps  m7, [coeffs_val2q + 4 * xq]
    CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
    add     xd, mmsize/4
    cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
    jb      .loop_b
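; Separate the channels, square them and add the pairs: xmm0 ends up holding
; the left and right channel power of both output bins, stored as two
; FFTComplex values with a single 16-byte write.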
.loop_end:
    CQT_SEPARATE 0, 1, 2, 3, 4, 5
    CQT_SEPARATE 8, 9, 10, 11, 4, 5
    mulps   xmm0, xmm0
    mulps   xmm8, xmm8
    EMULATE_HADDPS xmm0, xmm8, xmm1
    movaps  [dstq], xmm0
    sub     lend, 2
    lea     dstq, [dstq + 16]
    lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
    jnz     .loop_k
    REP_RET
align 16
.check_loop_a:
    cmp     xd, [coeffsq + Coeffs.len]
    jae     .loop_end
align 16
.loop_a:
    movaps  m7, [coeffs_valq + 4 * xq]
    CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
    add     xd, mmsize/4
    cmp     xd, [coeffsq + Coeffs.len]
    jb      .loop_a
    jmp     .loop_end
%else
cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
%define fft_lend r4m
align 16
.loop_k:
    mov     xd, [coeffsq + Coeffs.len]
    xorps   m0, m0
    movaps  m1, m0
    movaps  m2, m0
    movaps  m3, m0
    test    xd, xd
    jz      .store
    mov     coeffs_valq, [coeffsq + Coeffs.val]
    xor     xd, xd
align 16
.loop_x:
    movaps  m7, [coeffs_valq + 4 * xq]
    CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
    add     xd, mmsize/4
    cmp     xd, [coeffsq + Coeffs.len]
    jb      .loop_x
    CQT_SEPARATE 0, 1, 2, 3, 4, 5
    mulps   xmm0, xmm0
    EMULATE_HADDPS xmm0, xmm0, xmm1
.store:
    movlps  [dstq], xmm0
    sub     lend, 1
    lea     dstq, [dstq + 8]
    lea     coeffsq, [coeffsq + Coeffs.sizeof]
    jnz     .loop_k
    REP_RET
%endif ; ARCH_X86_64
%endmacro ; DECLARE_CQT_CALC

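; Instantiate the function once per instruction set; the AVX and FMA3 versions
; use ymm registers, the others xmm.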
INIT_XMM sse
DECLARE_CQT_CALC
INIT_XMM sse3
DECLARE_CQT_CALC
INIT_YMM avx
DECLARE_CQT_CALC
INIT_YMM fma3
DECLARE_CQT_CALC
INIT_XMM fma4
DECLARE_CQT_CALC
@@ -0,0 +1,63 @@
/*
 * Copyright (c) 2016 Muhammad Faiz <mfcc64@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavfilter/avf_showcqt.h"

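/* prototypes of the per-instruction-set assembly kernels */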
#define DECLARE_CQT_CALC(type) \
void ff_showcqt_cqt_calc_##type(FFTComplex *dst, const FFTComplex *src, \
                                const Coeffs *coeffs, int len, int fft_len)

DECLARE_CQT_CALC(sse);
DECLARE_CQT_CALC(sse3);
DECLARE_CQT_CALC(avx);
DECLARE_CQT_CALC(fma3);
DECLARE_CQT_CALC(fma4);

#define permute_coeffs_0 NULL

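/* with ymm registers the shufps deinterleave in CQT_CALC works per 128-bit
 * lane, so the values of 8 consecutive bins come out in the order
 * 0 1 4 5 2 3 6 7; pre-permute the coefficients to match that order */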
static void permute_coeffs_01452367(float *v, int len)
{
    int k;
    for (k = 0; k < len; k += 8) {
        FFSWAP(float, v[k+2], v[k+4]);
        FFSWAP(float, v[k+3], v[k+5]);
    }
}

av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
{
    int cpuflags = av_get_cpu_flags();

#define SELECT_CQT_CALC(type, TYPE, align, perm) \
if (EXTERNAL_##TYPE(cpuflags)) { \
    s->cqt_calc = ff_showcqt_cqt_calc_##type; \
    s->cqt_align = align; \
    s->permute_coeffs = permute_coeffs_##perm; \
}

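    /* later matches override earlier ones, so the fastest available extension
     * wins; the align argument matches the kernel's per-iteration step
     * (mmsize/4: 4 floats for the xmm kernels, 8 for the ymm kernels) */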
    SELECT_CQT_CALC(sse, SSE, 4, 0);
    SELECT_CQT_CALC(sse3, SSE3_FAST, 4, 0);
    SELECT_CQT_CALC(fma4, FMA4, 4, 0); // using xmm
    SELECT_CQT_CALC(avx, AVX_FAST, 8, 01452367);
    SELECT_CQT_CALC(fma3, FMA3_FAST, 8, 01452367);
}