@@ -26,11 +26,11 @@ SECTION_TEXT
 ;---------------------------------------------------------------------------------
 ; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
 ;---------------------------------------------------------------------------------
-%macro INT32_TO_FLOAT_FMUL_SCALAR 2
+%macro INT32_TO_FLOAT_FMUL_SCALAR 1
 %if UNIX64
-cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
+cglobal int32_to_float_fmul_scalar, 3,3,%1, dst, src, len
 %else
-cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
+cglobal int32_to_float_fmul_scalar, 4,4,%1, dst, src, mul, len
 %endif
 %if WIN64
     SWAP 0, 2
@@ -43,7 +43,7 @@ cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
     add dstq, lenq
     neg lenq
 .loop:
-%ifidn %1, sse2
+%if cpuflag(sse2)
     cvtdq2ps m1, [srcq+lenq]
     cvtdq2ps m2, [srcq+lenq+16]
 %else
@@ -63,27 +63,26 @@ cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
     REP_RET
 %endmacro
-INIT_XMM
+INIT_XMM sse
 %define SPLATD SPLATD_SSE
-%define movdqa movaps
-INT32_TO_FLOAT_FMUL_SCALAR sse, 5
-%undef movdqa
+INT32_TO_FLOAT_FMUL_SCALAR 5
+INIT_XMM sse2
 %define SPLATD SPLATD_SSE2
-INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
+INT32_TO_FLOAT_FMUL_SCALAR 3
 %undef SPLATD
 ;------------------------------------------------------------------------------
 ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
 ;------------------------------------------------------------------------------
-%macro FLOAT_TO_INT16 2
-cglobal float_to_int16_%1, 3,3,%2, dst, src, len
+%macro FLOAT_TO_INT16 1
+cglobal float_to_int16, 3,3,%1, dst, src, len
     add lenq, lenq
     lea srcq, [srcq+2*lenq]
     add dstq, lenq
     neg lenq
 .loop:
-%ifidn %1, sse2
+%if cpuflag(sse2)
     cvtps2dq m0, [srcq+2*lenq]
     cvtps2dq m1, [srcq+2*lenq+16]
     packssdw m0, m1
@@ -100,31 +99,32 @@ cglobal float_to_int16_%1, 3,3,%2, dst, src, len
 %endif
     add lenq, 16
     js .loop
-%ifnidn %1, sse2
+%if mmsize == 8
     emms
 %endif
     REP_RET
 %endmacro
-INIT_XMM
-FLOAT_TO_INT16 sse2, 2
-INIT_MMX
-FLOAT_TO_INT16 sse, 0
+INIT_XMM sse2
+FLOAT_TO_INT16 2
+INIT_MMX sse
+FLOAT_TO_INT16 0
 %define cvtps2pi pf2id
-FLOAT_TO_INT16 3dnow, 0
+INIT_MMX 3dnow
+FLOAT_TO_INT16 0
 %undef cvtps2pi
 ;------------------------------------------------------------------------------
 ; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
 ;------------------------------------------------------------------------------
-%macro FLOAT_TO_INT16_STEP 2
-cglobal float_to_int16_step_%1, 4,7,%2, dst, src, len, step, step3, v1, v2
+%macro FLOAT_TO_INT16_STEP 1
+cglobal float_to_int16_step, 4,7,%1, dst, src, len, step, step3, v1, v2
     add lenq, lenq
     lea srcq, [srcq+2*lenq]
     lea step3q, [stepq*3]
     neg lenq
 .loop:
-%ifidn %1, sse2
+%if cpuflag(sse2)
     cvtps2dq m0, [srcq+2*lenq]
     cvtps2dq m1, [srcq+2*lenq+16]
     packssdw m0, m1
@@ -179,25 +179,26 @@ cglobal float_to_int16_step_%1, 4,7,%2, dst, src, len, step, step3, v1, v2
 %endif
     add lenq, 16
     js .loop
-%ifnidn %1, sse2
+%if mmsize == 8
     emms
 %endif
     REP_RET
 %endmacro
-INIT_XMM
-FLOAT_TO_INT16_STEP sse2, 2
-INIT_MMX
-FLOAT_TO_INT16_STEP sse, 0
+INIT_XMM sse2
+FLOAT_TO_INT16_STEP 2
+INIT_MMX sse
+FLOAT_TO_INT16_STEP 0
 %define cvtps2pi pf2id
-FLOAT_TO_INT16_STEP 3dnow, 0
+INIT_MMX 3dnow
+FLOAT_TO_INT16_STEP 0
 %undef cvtps2pi
 ;-------------------------------------------------------------------------------
 ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
 ;-------------------------------------------------------------------------------
-%macro FLOAT_TO_INT16_INTERLEAVE2 1
-cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
+%macro FLOAT_TO_INT16_INTERLEAVE2 0
+cglobal float_to_int16_interleave2, 3,4,2, dst, src0, src1, len
     lea lenq, [4*r2q]
     mov src1q, [src0q+gprsize]
     mov src0q, [src0q]
@@ -206,7 +207,7 @@ cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
     add src1q, lenq
     neg lenq
 .loop:
-%ifidn %1, sse2
+%if cpuflag(sse2)
     cvtps2dq m0, [src0q+lenq]
     cvtps2dq m1, [src1q+lenq]
     packssdw m0, m1
@@ -228,21 +229,20 @@ cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
 %endif
     add lenq, 16
     js .loop
-%ifnidn %1, sse2
+%if mmsize == 8
     emms
 %endif
     REP_RET
 %endmacro
-INIT_MMX
+INIT_MMX 3dnow
 %define cvtps2pi pf2id
-FLOAT_TO_INT16_INTERLEAVE2 3dnow
+FLOAT_TO_INT16_INTERLEAVE2
 %undef cvtps2pi
-%define movdqa movaps
-FLOAT_TO_INT16_INTERLEAVE2 sse
-%undef movdqa
-INIT_XMM
-FLOAT_TO_INT16_INTERLEAVE2 sse2
+INIT_MMX sse
+FLOAT_TO_INT16_INTERLEAVE2
+INIT_XMM sse2
+FLOAT_TO_INT16_INTERLEAVE2
 %macro PSWAPD_SSE 2
@@ -254,9 +254,9 @@ FLOAT_TO_INT16_INTERLEAVE2 sse2
     punpckldq %1, %2
 %endmacro
-%macro FLOAT_TO_INT16_INTERLEAVE6 1
+%macro FLOAT_TO_INT16_INTERLEAVE6 0
 ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
-cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4, src5, len
+cglobal float_to_int16_interleave6, 2,8,0, dst, src, src1, src2, src3, src4, src5, len
 %if ARCH_X86_64
     mov lend, r2d
 %else
@@ -302,21 +302,24 @@ cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4,
     RET
 %endmacro ; FLOAT_TO_INT16_INTERLEAVE6
+INIT_MMX sse
 %define pswapd PSWAPD_SSE
-FLOAT_TO_INT16_INTERLEAVE6 sse
+FLOAT_TO_INT16_INTERLEAVE6
+INIT_MMX 3dnow
 %define cvtps2pi pf2id
 %define pswapd PSWAPD_3DNOW
-FLOAT_TO_INT16_INTERLEAVE6 3dnow
+FLOAT_TO_INT16_INTERLEAVE6
 %undef pswapd
-FLOAT_TO_INT16_INTERLEAVE6 3dnowext
+INIT_MMX 3dnowext
+FLOAT_TO_INT16_INTERLEAVE6
 %undef cvtps2pi
 ;-----------------------------------------------------------------------------
 ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
 ;-----------------------------------------------------------------------------
-%macro FLOAT_INTERLEAVE6 2
-cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, len
+%macro FLOAT_INTERLEAVE6 1
+cglobal float_interleave6, 2,8,%1, dst, src, src1, src2, src3, src4, src5, len
 %if ARCH_X86_64
     mov lend, r2d
 %else
@@ -334,7 +337,7 @@ cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, le
     sub src4q, srcq
     sub src5q, srcq
 .loop:
-%ifidn %1, sse
+%if cpuflag(sse)
     movaps m0, [srcq]
     movaps m1, [srcq+src1q]
    movaps m2, [srcq+src2q]
@@ -383,62 +386,60 @@ cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, le
     add dstq, mmsize*6
     sub lend, mmsize/4
     jg .loop
-%ifidn %1, mmx
+%if mmsize == 8
     emms
 %endif
     REP_RET
 %endmacro
-INIT_MMX
-FLOAT_INTERLEAVE6 mmx, 0
-INIT_XMM
-FLOAT_INTERLEAVE6 sse, 7
+INIT_MMX mmx
+FLOAT_INTERLEAVE6 0
+INIT_XMM sse
+FLOAT_INTERLEAVE6 7
 ;-----------------------------------------------------------------------------
 ; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
 ;-----------------------------------------------------------------------------
-%macro FLOAT_INTERLEAVE2 2
-cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
+%macro FLOAT_INTERLEAVE2 1
+cglobal float_interleave2, 3,4,%1, dst, src, len, src1
     mov src1q, [srcq+gprsize]
     mov srcq, [srcq]
     sub src1q, srcq
 .loop:
-    MOVPS m0, [srcq]
-    MOVPS m1, [srcq+src1q]
-    MOVPS m3, [srcq+mmsize]
-    MOVPS m4, [srcq+src1q+mmsize]
+    mova m0, [srcq]
+    mova m1, [srcq+src1q]
+    mova m3, [srcq+mmsize]
+    mova m4, [srcq+src1q+mmsize]
-    MOVPS m2, m0
+    mova m2, m0
     PUNPCKLDQ m0, m1
     PUNPCKHDQ m2, m1
-    MOVPS m1, m3
+    mova m1, m3
     PUNPCKLDQ m3, m4
     PUNPCKHDQ m1, m4
-    MOVPS [dstq], m0
-    MOVPS [dstq+1*mmsize], m2
-    MOVPS [dstq+2*mmsize], m3
-    MOVPS [dstq+3*mmsize], m1
+    mova [dstq], m0
+    mova [dstq+1*mmsize], m2
+    mova [dstq+2*mmsize], m3
+    mova [dstq+3*mmsize], m1
     add srcq, mmsize*2
     add dstq, mmsize*4
     sub lend, mmsize/2
     jg .loop
-%ifidn %1, mmx
+%if mmsize == 8
     emms
 %endif
     REP_RET
 %endmacro
-INIT_MMX
-%define MOVPS movq
+INIT_MMX mmx
 %define PUNPCKLDQ punpckldq
 %define PUNPCKHDQ punpckhdq
-FLOAT_INTERLEAVE2 mmx, 0
-INIT_XMM
-%define MOVPS movaps
+FLOAT_INTERLEAVE2 0
+INIT_XMM sse
 %define PUNPCKLDQ unpcklps
 %define PUNPCKHDQ unpckhps
-FLOAT_INTERLEAVE2 sse, 5
+FLOAT_INTERLEAVE2 5