|
|
|
@ -443,19 +443,19 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset |
|
|
|
|
INIT_YMM fma3 |
|
|
|
|
cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset |
|
|
|
|
xor offsetq, offsetq |
|
|
|
|
xorps m0, m0 |
|
|
|
|
xorps m0, m0, m0 |
|
|
|
|
shl sized, 2 |
|
|
|
|
mov lenq, sizeq |
|
|
|
|
cmp lenq, 32 |
|
|
|
|
jl .l16 |
|
|
|
|
cmp lenq, 64 |
|
|
|
|
jl .l32 |
|
|
|
|
xorps m1, m1 |
|
|
|
|
xorps m1, m1, m1 |
|
|
|
|
cmp lenq, 128 |
|
|
|
|
jl .l64 |
|
|
|
|
and lenq, ~127 |
|
|
|
|
xorps m2, m2 |
|
|
|
|
xorps m3, m3 |
|
|
|
|
xorps m2, m2, m2 |
|
|
|
|
xorps m3, m3, m3 |
|
|
|
|
.loop128: |
|
|
|
|
movups m4, [v1q+offsetq] |
|
|
|
|
movups m5, [v1q+offsetq + 32] |
|
|
|
@ -468,13 +468,13 @@ cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset |
|
|
|
|
add offsetq, 128 |
|
|
|
|
cmp offsetq, lenq |
|
|
|
|
jl .loop128 |
|
|
|
|
addps m0, m2 |
|
|
|
|
addps m1, m3 |
|
|
|
|
addps m0, m0, m2 |
|
|
|
|
addps m1, m1, m3 |
|
|
|
|
mov lenq, sizeq |
|
|
|
|
and lenq, 127 |
|
|
|
|
cmp lenq, 64 |
|
|
|
|
jge .l64 |
|
|
|
|
addps m0, m1 |
|
|
|
|
addps m0, m0, m1 |
|
|
|
|
cmp lenq, 32 |
|
|
|
|
jge .l32 |
|
|
|
|
vextractf128 xmm2, m0, 1 |
|
|
|
@ -502,7 +502,7 @@ cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset |
|
|
|
|
add offsetq, 64 |
|
|
|
|
cmp offsetq, lenq |
|
|
|
|
jl .loop64 |
|
|
|
|
addps m0, m1 |
|
|
|
|
addps m0, m0, m1 |
|
|
|
|
mov lenq, sizeq |
|
|
|
|
and lenq, 63 |
|
|
|
|
cmp lenq, 32 |
|
|
|
|