fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm

pull/2/head
Justin Ruggles 13 years ago
parent 185142a5ea
commit 4e8e262476
  1. 8
      libavcodec/x86/dsputil_yasm.asm
  2. 46
      libavcodec/x86/fmtconvert.asm
  3. 59
      libavcodec/x86/fmtconvert_mmx.c
  4. 12
      libavutil/x86/x86util.asm

@ -1055,14 +1055,6 @@ emu_edge mmx
; int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
%macro SPLATD_MMX 1
punpckldq %1, %1
%endmacro
%macro SPLATD_SSE2 1
pshufd %1, %1, 0
%endmacro
%macro VECTOR_CLIP_INT32 4
cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
%ifidn %1, sse2

@ -24,6 +24,52 @@
SECTION_TEXT
;---------------------------------------------------------------------------------
; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
;---------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_SCALAR 2
%ifdef ARCH_X86_64
cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
%else
cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
movss m0, mulm
%endif
SPLATD m0
shl lenq, 2
add srcq, lenq
add dstq, lenq
neg lenq
.loop:
%ifidn %1, sse2
cvtdq2ps m1, [srcq+lenq ]
cvtdq2ps m2, [srcq+lenq+16]
%else
cvtpi2ps m1, [srcq+lenq ]
cvtpi2ps m3, [srcq+lenq+ 8]
cvtpi2ps m2, [srcq+lenq+16]
cvtpi2ps m4, [srcq+lenq+24]
movlhps m1, m3
movlhps m2, m4
%endif
mulps m1, m0
mulps m2, m0
mova [dstq+lenq ], m1
mova [dstq+lenq+16], m2
add lenq, 32
jl .loop
REP_RET
%endmacro
INIT_XMM
%define SPLATD SPLATD_SSE
%define movdqa movaps
INT32_TO_FLOAT_FMUL_SCALAR sse, 5
%undef movdqa
%define SPLATD SPLATD_SSE2
INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
%undef SPLATD
;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------

@ -26,52 +26,11 @@
#include "libavutil/x86_cpu.h"
#include "libavcodec/fmtconvert.h"
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
{
x86_reg i = -4*len;
__asm__ volatile(
"movss %3, %%xmm4 \n"
"shufps $0, %%xmm4, %%xmm4 \n"
"1: \n"
"cvtpi2ps (%2,%0), %%xmm0 \n"
"cvtpi2ps 8(%2,%0), %%xmm1 \n"
"cvtpi2ps 16(%2,%0), %%xmm2 \n"
"cvtpi2ps 24(%2,%0), %%xmm3 \n"
"movlhps %%xmm1, %%xmm0 \n"
"movlhps %%xmm3, %%xmm2 \n"
"mulps %%xmm4, %%xmm0 \n"
"mulps %%xmm4, %%xmm2 \n"
"movaps %%xmm0, (%1,%0) \n"
"movaps %%xmm2, 16(%1,%0) \n"
"add $32, %0 \n"
"jl 1b \n"
:"+r"(i)
:"r"(dst+len), "r"(src+len), "m"(mul)
);
}
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
{
x86_reg i = -4*len;
__asm__ volatile(
"movss %3, %%xmm4 \n"
"shufps $0, %%xmm4, %%xmm4 \n"
"1: \n"
"cvtdq2ps (%2,%0), %%xmm0 \n"
"cvtdq2ps 16(%2,%0), %%xmm1 \n"
"mulps %%xmm4, %%xmm0 \n"
"mulps %%xmm4, %%xmm1 \n"
"movaps %%xmm0, (%1,%0) \n"
"movaps %%xmm1, 16(%1,%0) \n"
"add $32, %0 \n"
"jl 1b \n"
:"+r"(i)
:"r"(dst+len), "r"(src+len), "m"(mul)
);
}
#if HAVE_YASM
void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len);
void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
@ -204,8 +163,8 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_YASM
if (mm_flags & AV_CPU_FLAG_MMX) {
c->float_interleave = float_interleave_mmx;
if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) {
@ -219,21 +178,17 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
}
}
#endif
if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
#if HAVE_YASM
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
c->float_to_int16 = ff_float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse;
c->float_interleave = float_interleave_sse;
#endif
}
if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) {
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
#if HAVE_YASM
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
c->float_to_int16 = ff_float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
#endif
}
}
#endif
}

@ -536,6 +536,18 @@
%endif
%endmacro
%macro SPLATD_MMX 1
punpckldq %1, %1
%endmacro
%macro SPLATD_SSE 1
shufps %1, %1, 0
%endmacro
%macro SPLATD_SSE2 1
pshufd %1, %1, 0
%endmacro
%macro CLIPW 3 ;(dst, min, max)
pmaxsw %1, %2
pminsw %1, %3

Loading…
Cancel
Save