core: vectorize countNonZero64f

Improves performance: about 2.2x on P9 and 2-3x on Coffee Lake
x86-64.
pull/15685/head
Paul E. Murphy 5 years ago
parent dd4f591d54
commit ec91a3d59d
  1. 20
      modules/core/src/count_non_zero.simd.hpp

@ -179,7 +179,25 @@ static int countNonZero32f( const float* src, int len )
// Counts the elements of `src[0 .. len)` that are not equal to zero.
//
// With CV_SIMD_64F enabled, the bulk of the array is processed two vector
// registers at a time; whatever is left over (or the whole array when SIMD is
// unavailable) is delegated to countNonZero_, which is defined elsewhere in
// this file.
//
// src - pointer to the first of `len` doubles.
// len - number of elements to examine.
// Returns the number of non-zero elements.
static int countNonZero64f( const double* src, int len )
{
    // `i` is the number of elements already consumed by the vector loop;
    // it stays 0 when the SIMD path is compiled out.
    int nz = 0, i = 0;
#if CV_SIMD_64F
    // Two independent accumulators, one per vector loaded in each iteration.
    v_int64 sum1 = vx_setzero_s64();
    v_int64 sum2 = vx_setzero_s64();
    v_float64 zero = vx_setzero_f64();
    // Elements handled per iteration: two full vectors.
    int step = v_float64::nlanes * 2;
    // Round len down to a multiple of step (assumes step is a power of two).
    int len0 = len & -step;
    for( i = 0; i < len0; i += step )
    {
        sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero);
        sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero);
    }
    // N.B. a lane that compares equal to zero yields an all-ones mask, i.e.
    // -1 as a signed 64-bit integer, so the reduced sum is minus the number
    // of zero elements seen. Adding it to the number of elements processed
    // (i) gives the count of non-zero elements.
    nz = i + (int)v_reduce_sum(sum1 + sum2);
    v_cleanup();
#endif
    // Scalar pass over the tail not covered by the vector loop.
    return nz + countNonZero_(src + i, len - i);
}
CountNonZeroFunc getCountNonZeroTab(int depth)

Loading…
Cancel
Save