From ec91a3d59d2b3d6d2cc5974a29c7976592fecdff Mon Sep 17 00:00:00 2001
From: "Paul E. Murphy"
Date: Wed, 28 Aug 2019 12:09:07 -0500
Subject: [PATCH] core: vectorize countNonZero64f

Improves performance a bit. 2.2x on P9 and 2 - 3x on coffee lake x86-64.
---
 modules/core/src/count_non_zero.simd.hpp | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/modules/core/src/count_non_zero.simd.hpp b/modules/core/src/count_non_zero.simd.hpp
index 4c01c08850..6994564127 100644
--- a/modules/core/src/count_non_zero.simd.hpp
+++ b/modules/core/src/count_non_zero.simd.hpp
@@ -179,7 +179,25 @@ static int countNonZero32f( const float* src, int len )
 
 static int countNonZero64f( const double* src, int len )
 {
-    return countNonZero_(src, len);
+    int nz = 0, i = 0;
+#if CV_SIMD_64F
+    v_int64 sum1 = vx_setzero_s64();
+    v_int64 sum2 = vx_setzero_s64();
+    v_float64 zero = vx_setzero_f64();
+    int step = v_float64::nlanes * 2;
+    int len0 = len & -step;
+
+    for(i = 0; i < len0; i += step )
+    {
+        sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero);
+        sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero);
+    }
+
+    // N.B. each lane equal to zero adds -1 (0xF...F), so i + sum is the count of non-zero elements so far
+    nz = i + (int)v_reduce_sum(sum1 + sum2);
+    v_cleanup();
+#endif
+    return nz + countNonZero_(src + i, len - i);
 }
 
 CountNonZeroFunc getCountNonZeroTab(int depth)