diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index 8f8e6dddf5..ff608f5f5a 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -443,19 +443,19 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset INIT_YMM fma3 cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset xor offsetq, offsetq - xorps m0, m0 + xorps m0, m0, m0 shl sized, 2 mov lenq, sizeq cmp lenq, 32 jl .l16 cmp lenq, 64 jl .l32 - xorps m1, m1 + xorps m1, m1, m1 cmp lenq, 128 jl .l64 and lenq, ~127 - xorps m2, m2 - xorps m3, m3 + xorps m2, m2, m2 + xorps m3, m3, m3 .loop128: movups m4, [v1q+offsetq] movups m5, [v1q+offsetq + 32] @@ -468,13 +468,13 @@ cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset add offsetq, 128 cmp offsetq, lenq jl .loop128 - addps m0, m2 - addps m1, m3 + addps m0, m0, m2 + addps m1, m1, m3 mov lenq, sizeq and lenq, 127 cmp lenq, 64 jge .l64 - addps m0, m1 + addps m0, m0, m1 cmp lenq, 32 jge .l32 vextractf128 xmm2, m0, 1 @@ -502,7 +502,7 @@ cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset add offsetq, 64 cmp offsetq, lenq jl .loop64 - addps m0, m1 + addps m0, m0, m1 mov lenq, sizeq and lenq, 63 cmp lenq, 32