From 0db4fb18356d380ee9064c006c4b61b2236e0ddd Mon Sep 17 00:00:00 2001
From: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com>
Date: Thu, 25 Jul 2019 14:21:32 -0400
Subject: [PATCH] Merge pull request #15136 from ChipKerchner:dotProd_unroll

* Unroll multiply and add instructions in dotProd_32f - 35% faster.

* Eliminate unnecessary v_reduce_sum instructions.
---
 modules/core/src/matmul.simd.hpp | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp
index ef54bb037c..bb6b6c55d5 100644
--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@@ -2511,6 +2511,27 @@ double dotProd_32f(const float* src1, const float* src2, int len)
 
         int j = 0;
         int cWidth = v_float32::nlanes;
+
+#if CV_ENABLE_UNROLLED
+        v_float32 v_sum1 = vx_setzero_f32();
+        v_float32 v_sum2 = vx_setzero_f32();
+        v_float32 v_sum3 = vx_setzero_f32();
+
+        for (; j <= blockSize - (cWidth * 4); j += (cWidth * 4))
+        {
+            v_sum = v_muladd(vx_load(src1 + j),
+                             vx_load(src2 + j), v_sum);
+            v_sum1 = v_muladd(vx_load(src1 + j + cWidth),
+                              vx_load(src2 + j + cWidth), v_sum1);
+            v_sum2 = v_muladd(vx_load(src1 + j + (cWidth * 2)),
+                              vx_load(src2 + j + (cWidth * 2)), v_sum2);
+            v_sum3 = v_muladd(vx_load(src1 + j + (cWidth * 3)),
+                              vx_load(src2 + j + (cWidth * 3)), v_sum3);
+        }
+
+        v_sum += v_sum1 + v_sum2 + v_sum3;
+#endif
+
         for (; j <= blockSize - cWidth; j += cWidth)
             v_sum = v_muladd(vx_load(src1 + j), vx_load(src2 + j), v_sum);
 
@@ -2532,4 +2553,4 @@ double dotProd_64f(const double* src1, const double* src2, int len)
 #endif
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END
-} // namespace
\ No newline at end of file
+} // namespace