dcadec: simplify decoding of VQ high frequencies

The vector dequantization loop contains a test that prevents an effective
SIMD implementation. By moving that test out of the loop, the loop can be DSPized.

Therefore, modify the current DSP implementation accordingly. In particular,
it no longer has to handle empty (zero-iteration) loops.

The decode_hf implementations have the following timings:

For x86 Arrandale:
        C  SSE SSE2 SSE4
win32: 260 162  119  104
win64: 242 N/A   89   72

The ARM NEON optimizations follow in a later patch as external asm. The
now-unused check for the y modifier in ARM inline asm is removed from
configure.
pull/60/head
Christophe Gisquet 11 years ago committed by Janne Grunau
parent 7686afd049
commit 4cb6964244
  1. 2
      configure
  2. 23
      libavcodec/arm/dca.h
  3. 30
      libavcodec/dcadec.c
  4. 20
      libavcodec/dcadsp.c
  5. 8
      libavcodec/dcadsp.h
  6. 53
      libavcodec/x86/dcadsp.asm
  7. 18
      libavcodec/x86/dcadsp_init.c

2
configure vendored

@ -1330,7 +1330,6 @@ HAVE_LIST="
altivec_h altivec_h
arpa_inet_h arpa_inet_h
asm_mod_q asm_mod_q
asm_mod_y
atomic_cas_ptr atomic_cas_ptr
atomics_native atomics_native
attribute_may_alias attribute_may_alias
@ -3669,7 +3668,6 @@ EOF
$ARCH_EXT_LIST_ARM $ARCH_EXT_LIST_ARM
check_inline_asm asm_mod_q '"add r0, %Q0, %R0" :: "r"((long long)0)' check_inline_asm asm_mod_q '"add r0, %Q0, %R0" :: "r"((long long)0)'
check_inline_asm asm_mod_y '"vmul.i32 d0, d0, %y0" :: "x"(0)'
[ $target_os != win32 ] && enabled_all armv6t2 shared !pic && enable_weak_pic [ $target_os != win32 ] && enabled_all armv6t2 shared !pic && enable_weak_pic

@ -81,27 +81,4 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
#endif #endif
#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
#define int8x8_fmul_int32 int8x8_fmul_int32
static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
float *dst, const int8_t *src, int scale)
{
__asm__ ("vcvt.f32.s32 %2, %2, #4 \n"
"vld1.8 {d0}, [%1,:64] \n"
"vmovl.s8 q0, d0 \n"
"vmovl.s16 q1, d1 \n"
"vmovl.s16 q0, d0 \n"
"vcvt.f32.s32 q0, q0 \n"
"vcvt.f32.s32 q1, q1 \n"
"vmul.f32 q0, q0, %y2 \n"
"vmul.f32 q1, q1, %y2 \n"
"vst1.32 {q0-q1}, [%m0,:128] \n"
: "=Um"(*(float (*)[8])dst)
: "r"(src), "x"(scale)
: "d0", "d1", "d2", "d3");
}
#endif
#endif /* AVCODEC_ARM_DCA_H */ #endif /* AVCODEC_ARM_DCA_H */

@ -50,14 +50,10 @@
#if ARCH_ARM #if ARCH_ARM
# include "arm/dca.h" # include "arm/dca.h"
#endif #endif
#if ARCH_X86
# include "x86/dca.h"
#endif
//#define TRACE //#define TRACE
#define DCA_PRIM_CHANNELS_MAX (7) #define DCA_PRIM_CHANNELS_MAX (7)
#define DCA_SUBBANDS (32)
#define DCA_ABITS_MAX (32) /* Should be 28 */ #define DCA_ABITS_MAX (32) /* Should be 28 */
#define DCA_SUBSUBFRAMES_MAX (4) #define DCA_SUBSUBFRAMES_MAX (4)
#define DCA_SUBFRAMES_MAX (16) #define DCA_SUBFRAMES_MAX (16)
@ -340,7 +336,7 @@ typedef struct {
int prediction_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< prediction VQ coefs int prediction_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< prediction VQ coefs
int bitalloc[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< bit allocation index int bitalloc[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< bit allocation index
int transition_mode[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< transition mode (transients) int transition_mode[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< transition mode (transients)
int scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2]; ///< scale factors (2 if transient) int32_t scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];///< scale factors (2 if transient)
int joint_huff[DCA_PRIM_CHANNELS_MAX]; ///< joint subband scale factors codebook int joint_huff[DCA_PRIM_CHANNELS_MAX]; ///< joint subband scale factors codebook
int joint_scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< joint subband scale factors int joint_scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< joint subband scale factors
float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2]; ///< stereo downmix coefficients float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2]; ///< stereo downmix coefficients
@ -353,7 +349,7 @@ typedef struct {
uint8_t core_downmix_amode; ///< audio channel arrangement of embedded downmix uint8_t core_downmix_amode; ///< audio channel arrangement of embedded downmix
uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4]; ///< embedded downmix coefficients (9-bit codes) uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4]; ///< embedded downmix coefficients (9-bit codes)
int high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< VQ encoded high frequency subbands int32_t high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< VQ encoded high frequency subbands
float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)]; ///< Low frequency effect data float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)]; ///< Low frequency effect data
int lfe_scale_factor; int lfe_scale_factor;
@ -1088,14 +1084,6 @@ static int decode_blockcodes(int code1, int code2, int levels, int32_t *values)
static const uint8_t abits_sizes[7] = { 7, 10, 12, 13, 15, 17, 19 }; static const uint8_t abits_sizes[7] = { 7, 10, 12, 13, 15, 17, 19 };
static const uint8_t abits_levels[7] = { 3, 5, 7, 9, 13, 17, 25 }; static const uint8_t abits_levels[7] = { 3, 5, 7, 9, 13, 17, 25 };
#ifndef int8x8_fmul_int32
static inline void int8x8_fmul_int32(DCADSPContext *dsp, float *dst,
const int8_t *src, int scale)
{
dsp->int8x8_fmul_int32(dst, src, scale);
}
#endif
static int dca_subsubframe(DCAContext *s, int base_channel, int block_index) static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
{ {
int k, l; int k, l;
@ -1220,20 +1208,16 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
/* /*
* Decode VQ encoded high frequencies * Decode VQ encoded high frequencies
*/ */
for (l = s->vq_start_subband[k]; l < s->subband_activity[k]; l++) { if (s->subband_activity[k] > s->vq_start_subband[k]) {
/* 1 vector -> 32 samples but we only need the 8 samples
* for this subsubframe. */
int hfvq = s->high_freq_vq[k][l];
if (!s->debug_flag & 0x01) { if (!s->debug_flag & 0x01) {
av_log(s->avctx, AV_LOG_DEBUG, av_log(s->avctx, AV_LOG_DEBUG,
"Stream with high frequencies VQ coding\n"); "Stream with high frequencies VQ coding\n");
s->debug_flag |= 0x01; s->debug_flag |= 0x01;
} }
s->dcadsp.decode_hf(subband_samples[k], s->high_freq_vq[k],
int8x8_fmul_int32(&s->dcadsp, subband_samples[k][l], high_freq_vq, subsubframe * 8,
&high_freq_vq[hfvq][subsubframe * 8], s->scale_factor[k], s->vq_start_subband[k],
s->scale_factor[k][l][0]); s->subband_activity[k]);
} }
} }

@ -24,12 +24,22 @@
#include "libavutil/intreadwrite.h" #include "libavutil/intreadwrite.h"
#include "dcadsp.h" #include "dcadsp.h"
static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale) static void decode_hf_c(float dst[DCA_SUBBANDS][8],
const int32_t vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int32_t scale[DCA_SUBBANDS][2],
intptr_t start, intptr_t end)
{ {
float fscale = scale / 16.0; int i, l;
int i;
for (l = start; l < end; l++) {
/* 1 vector -> 32 samples but we only need the 8 samples
* for this subsubframe. */
const int8_t *ptr = &hf_vq[vq_num[l]][vq_offset];
float fscale = scale[l][0] * (1 / 16.0);
for (i = 0; i < 8; i++) for (i = 0; i < 8; i++)
dst[i] = src[i] * fscale; dst[l][i] = ptr[i] * fscale;
}
} }
static inline void static inline void
@ -96,7 +106,7 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
s->lfe_fir[0] = dca_lfe_fir0_c; s->lfe_fir[0] = dca_lfe_fir0_c;
s->lfe_fir[1] = dca_lfe_fir1_c; s->lfe_fir[1] = dca_lfe_fir1_c;
s->qmf_32_subbands = dca_qmf_32_subbands; s->qmf_32_subbands = dca_qmf_32_subbands;
s->int8x8_fmul_int32 = int8x8_fmul_int32_c; s->decode_hf = decode_hf_c;
if (ARCH_ARM) ff_dcadsp_init_arm(s); if (ARCH_ARM) ff_dcadsp_init_arm(s);
if (ARCH_X86) ff_dcadsp_init_x86(s); if (ARCH_X86) ff_dcadsp_init_x86(s);
} }

@ -22,6 +22,8 @@
#include "avfft.h" #include "avfft.h"
#include "synth_filter.h" #include "synth_filter.h"
#define DCA_SUBBANDS 32
typedef struct DCADSPContext { typedef struct DCADSPContext {
void (*lfe_fir[2])(float *out, const float *in, const float *coefs); void (*lfe_fir[2])(float *out, const float *in, const float *coefs);
void (*qmf_32_subbands)(float samples_in[32][8], int sb_act, void (*qmf_32_subbands)(float samples_in[32][8], int sb_act,
@ -30,7 +32,11 @@ typedef struct DCADSPContext {
int *synth_buf_offset, float synth_buf2[32], int *synth_buf_offset, float synth_buf2[32],
const float window[512], float *samples_out, const float window[512], float *samples_out,
float raXin[32], float scale); float raXin[32], float scale);
void (*int8x8_fmul_int32)(float *dst, const int8_t *src, int scale); void (*decode_hf)(float dst[DCA_SUBBANDS][8],
const int32_t vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int32_t scale[DCA_SUBBANDS][2],
intptr_t start, intptr_t end);
} DCADSPContext; } DCADSPContext;
void ff_dcadsp_init(DCADSPContext *s); void ff_dcadsp_init(DCADSPContext *s);

@ -26,18 +26,35 @@ pf_inv16: times 4 dd 0x3D800000 ; 1/16
SECTION_TEXT SECTION_TEXT
; void int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale) ; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
%macro INT8X8_FMUL_INT32 0 ; const int8_t hf_vq[1024][32], intptr_t vq_offset,
cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale ; int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
cvtsi2ss m0, scalem
%macro DECODE_HF 0
cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
lea srcq, [srcq + offsetq]
shl startq, 2
mov offsetd, endm
%define DICT offsetq
shl offsetq, 2
mov endm, offsetq
.loop:
%if ARCH_X86_64
mov offsetd, [scaleq + 2 * startq]
cvtsi2ss m0, offsetd
%else
cvtsi2ss m0, [scaleq + 2 * startq]
%endif
mov offsetd, [numq + startq]
mulss m0, [pf_inv16] mulss m0, [pf_inv16]
shl DICT, 5
shufps m0, m0, 0 shufps m0, m0, 0
%if cpuflag(sse2) %if cpuflag(sse2)
%if cpuflag(sse4) %if cpuflag(sse4)
pmovsxbd m1, [srcq+0] pmovsxbd m1, [srcq + DICT + 0]
pmovsxbd m2, [srcq+4] pmovsxbd m2, [srcq + DICT + 4]
%else %else
movq m1, [srcq] movq m1, [srcq + DICT]
punpcklbw m1, m1 punpcklbw m1, m1
mova m2, m1 mova m2, m1
punpcklwd m1, m1 punpcklwd m1, m1
@ -48,8 +65,8 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
cvtdq2ps m1, m1 cvtdq2ps m1, m1
cvtdq2ps m2, m2 cvtdq2ps m2, m2
%else %else
movd mm0, [srcq+0] movd mm0, [srcq + DICT + 0]
movd mm1, [srcq+4] movd mm1, [srcq + DICT + 4]
punpcklbw mm0, mm0 punpcklbw mm0, mm0
punpcklbw mm1, mm1 punpcklbw mm1, mm1
movq mm2, mm0 movq mm2, mm0
@ -67,27 +84,33 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
cvtpi2ps m3, mm2 cvtpi2ps m3, mm2
cvtpi2ps m4, mm3 cvtpi2ps m4, mm3
shufps m0, m0, 0 shufps m0, m0, 0
emms
shufps m1, m3, q1010 shufps m1, m3, q1010
shufps m2, m4, q1010 shufps m2, m4, q1010
%endif %endif
mulps m1, m0 mulps m1, m0
mulps m2, m0 mulps m2, m0
mova [dstq+ 0], m1 mova [dstq + 8 * startq + 0], m1
mova [dstq+16], m2 mova [dstq + 8 * startq + 16], m2
add startq, 4
cmp startq, endm
jl .loop
.end:
%if notcpuflag(sse2)
emms
%endif
REP_RET REP_RET
%endmacro %endmacro
%if ARCH_X86_32 %if ARCH_X86_32
INIT_XMM sse INIT_XMM sse
INT8X8_FMUL_INT32 DECODE_HF
%endif %endif
INIT_XMM sse2 INIT_XMM sse2
INT8X8_FMUL_INT32 DECODE_HF
INIT_XMM sse4 INIT_XMM sse4
INT8X8_FMUL_INT32 DECODE_HF
; %1=v0/v1 %2=in1 %3=in2 ; %1=v0/v1 %2=in1 %3=in2
%macro FIR_LOOP 2-3 %macro FIR_LOOP 2-3

@ -23,9 +23,15 @@
#include "libavutil/x86/cpu.h" #include "libavutil/x86/cpu.h"
#include "libavcodec/dcadsp.h" #include "libavcodec/dcadsp.h"
void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale); void ff_decode_hf_sse(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale); const int8_t hf_vq[1024][32], intptr_t vq_offset,
void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale); int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
void ff_decode_hf_sse2(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs); void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs); void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
@ -35,18 +41,18 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
if (EXTERNAL_SSE(cpu_flags)) { if (EXTERNAL_SSE(cpu_flags)) {
#if ARCH_X86_32 #if ARCH_X86_32
s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse; s->decode_hf = ff_decode_hf_sse;
#endif #endif
s->lfe_fir[0] = ff_dca_lfe_fir0_sse; s->lfe_fir[0] = ff_dca_lfe_fir0_sse;
s->lfe_fir[1] = ff_dca_lfe_fir1_sse; s->lfe_fir[1] = ff_dca_lfe_fir1_sse;
} }
if (EXTERNAL_SSE2(cpu_flags)) { if (EXTERNAL_SSE2(cpu_flags)) {
s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse2; s->decode_hf = ff_decode_hf_sse2;
} }
if (EXTERNAL_SSE4(cpu_flags)) { if (EXTERNAL_SSE4(cpu_flags)) {
s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse4; s->decode_hf = ff_decode_hf_sse4;
} }
} }

Loading…
Cancel
Save