x86/lpc: implement a new Welch windowing function

Old one was written with the assumption only even inputs would be given.
This very messy replacement supports even and odd inputs, and supports
AVX2 for extra speed. The buffers given are usually quite big (4k samples),
so the speedup is worth it.
The new SSE version is still faster than the old inline asm version by 33%.

Also checkasm is provided to make sure this monstrosity works.

This fixes some FATE tests.
pull/388/head
Lynne 2 years ago
parent cc367a9b8a
commit 3ade6a8644
No known key found for this signature in database
GPG Key ID: A2FEA5F03F034464
  1. 3
      libavcodec/x86/Makefile
  2. 241
      libavcodec/x86/lpc.asm
  3. 72
      libavcodec/x86/lpc_init.c
  4. 1
      tests/checkasm/Makefile
  5. 3
      tests/checkasm/checkasm.c
  6. 1
      tests/checkasm/checkasm.h
  7. 80
      tests/checkasm/lpc.c

@ -23,7 +23,7 @@ OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o
OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_LPC) += x86/lpc_init.o
OBJS-$(CONFIG_MDCT15) += x86/mdct15_init.o
OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
@ -127,6 +127,7 @@ X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o
X86ASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o
X86ASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o
X86ASM-OBJS-$(CONFIG_LPC) += x86/lpc.o
X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o
X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o

@ -0,0 +1,241 @@
;******************************************************************************
;* Copyright (c) Lynne
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
one_tab: times 4 dq 1.0
seq_tab_avx2: dq 3.0, 2.0, 1.0, 0.0
sub_tab: dq -1.0, -2.0, -3.0, -4.0
add_tab_avx2: times 4 dq 4.0
dec_tab_avx2: times 4 dq -4.0
add_tab_sse2: times 2 dq 2.0
dec_tab_sse2: times 2 dq -2.0
dec_tab_scalar: times 2 dq -1.0
seq_tab_sse2: dq 1.0, 0.0
SECTION .text
%macro APPLY_WELCH_FN 0
cglobal lpc_apply_welch_window, 3, 5, 8, data, len, out, off1, off2
cmp lenq, 0
je .end
cmp lenq, 1
je .one
movapd m6, [one_tab]
movd xm1, lend
cvtdq2pd xm1, xm1 ; len
%if cpuflag(avx2)
vbroadcastsd m1, xm1
%else
shufpd m1, m1, 00b
%endif
addpd m0, m6, m6 ; 2.0
subpd m1, m6 ; len - 1
divpd m0, m1 ; 2.0 / (len - 1)
mov off1q, lenq
and off1q, 1
je .even
movapd m5, m0
addpd m0, [sub_tab]
lea off2q, [lenq*4 - mmsize/2]
sub lenq, mmsize/4 ; avoid overwriting
xor off1q, off1q
cmp lenq, mmsize/4
jl .scalar_o
%if cpuflag(avx2)
movapd m7, [dec_tab_avx2]
%else
movapd m7, [dec_tab_sse2]
%endif
.loop_o:
movapd m1, m6
mulpd m2, m0, m0
subpd m1, m2
%if cpuflag(avx2)
vpermpd m2, m1, q0123
%else
shufpd m2, m1, m1, 01b
%endif
cvtdq2pd m3, [dataq + off1q]
cvtdq2pd m4, [dataq + off2q]
mulpd m1, m3
mulpd m2, m4
movupd [outq + off1q*2], m1
movupd [outq + off2q*2], m2
addpd m0, m7
add off1q, mmsize/2
sub off2q, mmsize/2
sub lenq, mmsize/4
jg .loop_o
add lend, (mmsize/4 - 1)
cmp lend, 0
je .end
sub lenq, (mmsize/4 - 1)
.scalar_o:
movapd xm7, [dec_tab_scalar]
subpd xm0, xm7
; Set offsets
add off2q, (mmsize/4) + 4*cpuflag(avx2)
add lenq, mmsize/4 - 2
.loop_o_scalar:
movapd xm1, xm6
mulpd xm2, xm0, xm0
subpd xm1, xm2
cvtdq2pd xm3, [dataq + off1q - (mmsize/4) + 4*cpuflag(avx2)]
cvtdq2pd xm4, [dataq + off2q - (mmsize/4) + 4*cpuflag(avx2)]
mulpd xm3, xm1
mulpd xm4, xm1
movhpd [outq + off1q*2], xm3
movhpd [outq + off2q*2], xm4
addpd xm0, xm7
add off1q, 4
sub off2q, 4
sub lenq, 2
jg .loop_o_scalar
RET
.even:
%if cpuflag(avx2)
addpd m0, [seq_tab_avx2]
%else
addpd m0, [seq_tab_sse2]
%endif
mov off1d, lend
shr off1d, 1
movd xm1, off1d
cvtdq2pd xm1, xm1 ; len/2
%if cpuflag(avx2)
vbroadcastsd m1, xm1
%else
shufpd m1, m1, 00b
%endif
subpd m0, m1
%if cpuflag(avx2)
movapd m7, [add_tab_avx2]
%else
movapd m7, [add_tab_sse2]
%endif
lea off2q, [lenq*2]
lea off1q, [lenq*2 - mmsize/2]
sub lenq, mmsize/4
cmp lenq, mmsize/4
jl .scalar_e
.loop_e:
movapd m1, m6
mulpd m2, m0, m0
subpd m1, m2
%if cpuflag(avx2)
vpermpd m2, m1, q0123
%else
shufpd m2, m1, m1, 01b
%endif
cvtdq2pd m3, [dataq + off1q]
cvtdq2pd m4, [dataq + off2q]
mulpd m1, m3
mulpd m2, m4
movupd [outq + off1q*2], m1
movupd [outq + off2q*2], m2
addpd m0, m7
add off2q, mmsize/2
sub off1q, mmsize/2
sub lenq, mmsize/4
jge .loop_e
.scalar_e:
subpd m0, m7
movapd m7, [dec_tab_scalar]
subpd m0, m7
subpd m0, m7
subpd m0, m7
add off1q, (mmsize/2)
sub off2q, (mmsize/2) - 4 - 8*cpuflag(avx2)
addpd xm0, [sub_tab]
.loop_e_scalar:
movapd xm1, xm6
mulpd xm2, xm0, xm0
subpd xm1, xm2
cvtdq2pd m3, [dataq + off1q - 4]
cvtdq2pd m4, [dataq + off2q - 4]
mulpd m3, m1
mulpd m4, m1
movhpd [outq + off1q*2], xm3
movhpd [outq + off2q*2], xm4
subpd xm0, xm7
add off2q, 4
sub off1q, 4
jge .loop_e_scalar
RET
.one:
xorpd xm0, xm0
movhpd [outq], xm0
.end:
RET
%endmacro
INIT_XMM sse2
APPLY_WELCH_FN
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
APPLY_WELCH_FN
%endif

@ -20,65 +20,19 @@
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lpc.h"
void ff_lpc_apply_welch_window_sse2(const int32_t *data, int len,
double *w_data);
void ff_lpc_apply_welch_window_avx2(const int32_t *data, int len,
double *w_data);
DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 };
#if HAVE_SSE2_INLINE
static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
double *w_data)
{
double c = 2.0 / (len-1.0);
int n2 = len>>1;
x86_reg i = -n2*sizeof(int32_t);
x86_reg j = n2*sizeof(int32_t);
__asm__ volatile(
"movsd %4, %%xmm7 \n\t"
"movapd "MANGLE(pd_1)", %%xmm6 \n\t"
"movapd "MANGLE(pd_2)", %%xmm5 \n\t"
"movlhps %%xmm7, %%xmm7 \n\t"
"subpd %%xmm5, %%xmm7 \n\t"
"addsd %%xmm6, %%xmm7 \n\t"
"test $1, %5 \n\t"
"jz 2f \n\t"
#define WELCH(MOVPD, offset)\
"1: \n\t"\
"movapd %%xmm7, %%xmm1 \n\t"\
"mulpd %%xmm1, %%xmm1 \n\t"\
"movapd %%xmm6, %%xmm0 \n\t"\
"subpd %%xmm1, %%xmm0 \n\t"\
"pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
"cvtpi2pd (%3,%0), %%xmm2 \n\t"\
"cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
"mulpd %%xmm0, %%xmm2 \n\t"\
"mulpd %%xmm1, %%xmm3 \n\t"\
"movapd %%xmm2, (%2,%0,2) \n\t"\
MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
"subpd %%xmm5, %%xmm7 \n\t"\
"sub $8, %1 \n\t"\
"add $8, %0 \n\t"\
"jl 1b \n\t"\
WELCH("movupd", -1)
"jmp 3f \n\t"
"2: \n\t"
WELCH("movapd", -2)
"3: \n\t"
:"+&r"(i), "+&r"(j)
:"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm5", "%xmm6", "%xmm7")
);
#undef WELCH
}
static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
double *autoc)
{
@ -151,12 +105,16 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
av_cold void ff_lpc_init_x86(LPCContext *c)
{
#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_SSE2_SLOW(cpu_flags)) {
c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
}
#endif /* HAVE_SSE2_INLINE */
#if HAVE_SSE2_INLINE
if (INLINE_SSE2_SLOW(cpu_flags))
c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
#endif
if (EXTERNAL_SSE2(cpu_flags))
c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
if (EXTERNAL_AVX2(cpu_flags))
c->lpc_apply_welch_window = ff_lpc_apply_welch_window_avx2;
}

@ -11,6 +11,7 @@ AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o
AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o
AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o
AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o
AVCODECOBJS-$(CONFIG_LPC) += lpc.o
AVCODECOBJS-$(CONFIG_ME_CMP) += motion.o
AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o
AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o

@ -135,6 +135,9 @@ static const struct {
#if CONFIG_LLVIDENCDSP
{ "llviddspenc", checkasm_check_llviddspenc },
#endif
#if CONFIG_LPC
{ "lpc", checkasm_check_lpc },
#endif
#if CONFIG_ME_CMP
{ "motion", checkasm_check_motion },
#endif

@ -68,6 +68,7 @@ void checkasm_check_idctdsp(void);
void checkasm_check_jpeg2000dsp(void);
void checkasm_check_llviddsp(void);
void checkasm_check_llviddspenc(void);
void checkasm_check_lpc(void);
void checkasm_check_motion(void);
void checkasm_check_nlmeans(void);
void checkasm_check_opusdsp(void);

@ -0,0 +1,80 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "libavutil/mem_internal.h"
#include "libavcodec/lpc.h"
#include "checkasm.h"
#define randomize_int32(buf, len) \
do { \
for (int i = 0; i < len; i++) { \
int32_t f = rnd() >> 8; \
buf[i] = f; \
} \
} while (0)
#define EPS 0.005
static void test_window(int len)
{
LOCAL_ALIGNED(16, int32_t, src, [5000]);
LOCAL_ALIGNED(16, double, dst0, [5000]);
LOCAL_ALIGNED(16, double, dst1, [5000]);
declare_func(void, int32_t *in, int len, double *out);
randomize_int32(src, len);
call_ref(src, len, dst0);
call_new(src, len, dst1);
if (!double_near_abs_eps_array(dst0, dst1, EPS, len))
fail();
bench_new(src, len, dst1);
}
void checkasm_check_lpc(void)
{
LPCContext ctx;
ff_lpc_init(&ctx, 32, 16, FF_LPC_TYPE_DEFAULT);
if (check_func(ctx.lpc_apply_welch_window, "apply_welch_window_even")) {
for (int i = 0; i < 64; i += 2)
test_window(i);
}
report("apply_welch_window_even");
if (check_func(ctx.lpc_apply_welch_window, "apply_welch_window_odd")) {
for (int i = 1; i < 64; i += 2)
test_window(i);
}
report("apply_welch_window_odd");
if (check_func(ctx.lpc_apply_welch_window, "apply_welch_window_4096"))
test_window(4096);
report("apply_welch_window_4096");
if (check_func(ctx.lpc_apply_welch_window, "apply_welch_window_4097"))
test_window(4097);
report("apply_welch_window_4097");
ff_lpc_end(&ctx);
}
Loading…
Cancel
Save