@@ -44,35 +44,17 @@ cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
     add       dstq, lenq
     neg       lenq
 .loop:
-%if cpuflag(sse2)
     cvtdq2ps  m1, [srcq+lenq]
     cvtdq2ps  m2, [srcq+lenq+16]
-%else
-    cvtpi2ps  m1, [srcq+lenq]
-    cvtpi2ps  m3, [srcq+lenq+8]
-    cvtpi2ps  m2, [srcq+lenq+16]
-    cvtpi2ps  m4, [srcq+lenq+24]
-    movlhps   m1, m3
-    movlhps   m2, m4
-%endif
     mulps     m1, m0
     mulps     m2, m0
     mova      [dstq+lenq], m1
     mova      [dstq+lenq+16], m2
     add       lenq, 32
     jl .loop
-%if notcpuflag(sse2)
-    ;; cvtpi2ps switches to MMX even if the source is a memory location
-    ;; possible an error in documentation since every tested CPU disagrees with
-    ;; that. Use emms anyway since the vast majority of machines will use the
-    ;; SSE2 variant
-    emms
-%endif
     RET
 %endmacro
-INIT_XMM sse
-INT32_TO_FLOAT_FMUL_SCALAR 5
 INIT_XMM sse2
 INT32_TO_FLOAT_FMUL_SCALAR 3
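
For context, int32_to_float_fmul_scalar converts packed 32-bit integer samples to floats and scales them by a single constant, handling 8 samples (two 16-byte vectors) per loop iteration. A minimal C sketch of what the routine computes, assuming len is a multiple of 8 as the 32-byte loop step requires (illustrative only, not the project's actual C fallback):

    #include <stdint.h>

    /* Convert int32 samples to float and scale by one factor.
     * len is assumed to be a multiple of 8, matching the SIMD loop step. */
    static void int32_to_float_fmul_scalar(float *dst, const int32_t *src,
                                           float mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }
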
@@ -89,17 +71,8 @@ cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
 .loop:
     movss     m0, [mulq]
     SPLATD    m0
-%if cpuflag(sse2)
     cvtdq2ps  m1, [srcq+lenq]
     cvtdq2ps  m2, [srcq+lenq+16]
-%else
-    cvtpi2ps  m1, [srcq+lenq]
-    cvtpi2ps  m3, [srcq+lenq+8]
-    cvtpi2ps  m2, [srcq+lenq+16]
-    cvtpi2ps  m4, [srcq+lenq+24]
-    movlhps   m1, m3
-    movlhps   m2, m4
-%endif
     mulps     m1, m0
     mulps     m2, m0
     mova      [dstq+lenq], m1
@@ -107,18 +80,9 @@ cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
     add       mulq, 4
     add       lenq, 32
     jl .loop
-%if notcpuflag(sse2)
-    ;; cvtpi2ps switches to MMX even if the source is a memory location
-    ;; possible an error in documentation since every tested CPU disagrees with
-    ;; that. Use emms anyway since the vast majority of machines will use the
-    ;; SSE2 variant
-    emms
-%endif
     RET
 %endmacro
-INIT_XMM sse
-INT32_TO_FLOAT_FMUL_ARRAY8
 INIT_XMM sse2
 INT32_TO_FLOAT_FMUL_ARRAY8
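
The array8 variant differs only in where the scale factor comes from: movss plus SPLATD broadcast a new factor from mulq before each block of 8 samples, and mulq advances by 4 bytes per iteration, so the factor mul[i/8] applies to samples i..i+7. Roughly, in C (a sketch of the semantics; the asm entry point also takes a leading context argument c, omitted here):

    #include <stdint.h>

    /* Scale each group of 8 samples by its own factor mul[i/8].
     * len is assumed to be a multiple of 8. */
    static void int32_to_float_fmul_array8(float *dst, const int32_t *src,
                                           const float *mul, int len)
    {
        for (int i = 0; i < len; i += 8)
            for (int j = 0; j < 8; j++)
                dst[i + j] = src[i + j] * mul[i / 8];
    }
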