diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 699b1667b4..810ec56ed3 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -188,8 +188,16 @@ enum CpuFeatures { # if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500) # ifdef _MSC_VER # include +# if defined(_M_X64) +# define CV_POPCNT_U64 _mm_popcnt_u64 +# endif +# define CV_POPCNT_U32 _mm_popcnt_u32 # else # include +# if defined(__x86_64__) +# define CV_POPCNT_U64 __builtin_popcountll +# endif +# define CV_POPCNT_U32 __builtin_popcount # endif # define CV_POPCNT 1 # endif diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 93ca397817..e15c97d528 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -149,7 +149,7 @@ Element-wise binary and unary operations. Most of these operations return only one value. -- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum +- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select ### Other math @@ -574,6 +574,49 @@ Scheme: For 32-bit integer and 32-bit floating point types. */ OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max) +static const unsigned char popCountTable[] = +{ + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; +/** @brief Count the 1 bits in the vector and return 4 values + +Scheme: +@code +{A1 A2 A3 ...} => popcount(A1) +@endcode +Any types but result will be in v_uint32x4*/ +template inline v_uint32x4 v_popcount(const v_reg<_Tp, n>& a) +{ + v_uint8x16 b; + b = v_reinterpret_as_u8(a); + for( int i = 0; i < v_uint8x16::nlanes; i++ ) + { + b.s[i] = popCountTable[b.s[i]]; + } + v_uint32x4 c; + for( int i = 0; i < v_uint32x4::nlanes; i++ ) + { + c.s[i] = b.s[i*4] + b.s[i*4+1] + b.s[i*4+2] + b.s[i*4+3]; + } + return c; +} + + //! @cond IGNORED template inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index b000733a5f..2bcff2bc15 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -813,6 +813,22 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32) OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32) OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32) +#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \ +inline v_uint32x4 v_popcount(const _Tpvec& a) \ +{ \ + uint8x16_t t = vcntq_u8(cast(a.val)); \ + uint16x8_t t0 = vpaddlq_u8(t); /* 16 -> 8 */ \ + uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \ + return v_uint32x4(t1); \ +} + +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16) +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32) +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8) +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16) +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32) + inline int v_signmask(const v_uint8x16& a) { int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100)); diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index fc81dac35d..9ff10c9b47 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1121,6 +1121,28 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) +#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \ +inline v_uint32x4 v_popcount(const _Tpvec& a) \ +{ \ + __m128i m1 = _mm_set1_epi32(0x55555555); \ + __m128i m2 = _mm_set1_epi32(0x33333333); \ + __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \ + __m128i p = a.val; \ + p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \ + p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \ + p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \ + p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \ + p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \ + return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \ +} + +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16) +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8) +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4) +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16) +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8) +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4) + #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ inline int v_signmask(const _Tpvec& a) \ { \ diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index e1fddb220d..3336152a44 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -44,6 +44,7 @@ #include "precomp.hpp" #include #include +#include "opencv2/core/hal/intrin.hpp" #include "opencl_kernels_core.hpp" @@ -4238,22 +4239,8 @@ int normHamming(const uchar* a, int n) { int i = 0; int result = 0; -#if CV_NEON - { - uint32x4_t bits = vmovq_n_u32(0); - for (; i <= n - 16; i += 16) { - uint8x16_t A_vec = vld1q_u8 (a + i); - uint8x16_t bitsSet = vcntq_u8 (A_vec); - uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); - uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); - bits = vaddq_u32(bits, bitSet4); - } - uint64x2_t bitSet2 = vpaddlq_u32 (bits); - result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0); - result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2); - } -#elif CV_AVX2 - if (USE_AVX2) +#if CV_AVX2 + if(USE_AVX2) { __m256i _r0 = _mm256_setzero_si256(); __m256i _0 = _mm256_setzero_si256(); @@ -4274,12 +4261,40 @@ int normHamming(const uchar* a, int n) _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2)); result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0); } +#elif CV_POPCNT + if(checkHardwareSupport(CV_CPU_POPCNT)) + { +# if defined CV_POPCNT_U64 + for(; i <= n - 8; i += 8) + { + result += (int)CV_POPCNT_U64(*(uint64*)(a + i)); + } +# endif + for(; i <= n - 4; i += 4) + { + result += CV_POPCNT_U32(*(uint*)(a + i)); + } + } +#elif CV_SIMD128 + if(hasSIMD128()) + { + v_uint32x4 t = v_setzero_u32(); + for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) + { + t += v_popcount(v_load(a + i)); + } + result = v_reduce_sum(t); + } #endif - for( ; i <= n - 4; i += 4 ) - result += popCountTable[a[i]] + popCountTable[a[i+1]] + - popCountTable[a[i+2]] + popCountTable[a[i+3]]; - for( ; i < n; i++ ) + for(; i <= n - 4; i += 4) + { + result += popCountTable[a[i]] + popCountTable[a[i+1]] + + popCountTable[a[i+2]] + popCountTable[a[i+3]]; + } + for(; i < n; i++) + { result += popCountTable[a[i]]; + } return result; } @@ -4287,24 +4302,8 @@ int normHamming(const uchar* a, const uchar* b, int n) { int i = 0; int result = 0; -#if CV_NEON - { - uint32x4_t bits = vmovq_n_u32(0); - for (; i <= n - 16; i += 16) { - uint8x16_t A_vec = vld1q_u8 (a + i); - uint8x16_t B_vec = vld1q_u8 (b + i); - uint8x16_t AxorB = veorq_u8 (A_vec, B_vec); - uint8x16_t bitsSet = vcntq_u8 (AxorB); - uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); - uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); - bits = vaddq_u32(bits, bitSet4); - } - uint64x2_t bitSet2 = vpaddlq_u32 (bits); - result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0); - result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2); - } -#elif CV_AVX2 - if (USE_AVX2) +#if CV_AVX2 + if(USE_AVX2) { __m256i _r0 = _mm256_setzero_si256(); __m256i _0 = _mm256_setzero_si256(); @@ -4328,12 +4327,40 @@ int normHamming(const uchar* a, const uchar* b, int n) _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2)); result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0); } +#elif CV_POPCNT + if(checkHardwareSupport(CV_CPU_POPCNT)) + { +# if defined CV_POPCNT_U64 + for(; i <= n - 8; i += 8) + { + result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i)); + } +# endif + for(; i <= n - 4; i += 4) + { + result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i)); + } + } +#elif CV_SIMD128 + if(hasSIMD128()) + { + v_uint32x4 t = v_setzero_u32(); + for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) + { + t += v_popcount(v_load(a + i) ^ v_load(b + i)); + } + result = v_reduce_sum(t); + } #endif - for( ; i <= n - 4; i += 4 ) - result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] + - popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; - for( ; i < n; i++ ) + for(; i <= n - 4; i += 4) + { + result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] + + popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; + } + for(; i < n; i++) + { result += popCountTable[a[i] ^ b[i]]; + } return result; } diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index 66b2083a37..0ec24ef1fb 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -404,6 +404,18 @@ template struct TheTest return *this; } + TheTest & test_popcount() + { + static unsigned popcountTable[] = {0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33}; + Data dataA; + R a = dataA; + + unsigned resB = (unsigned)v_reduce_sum(v_popcount(a)); + EXPECT_EQ(popcountTable[R::nlanes], resB); + + return *this; + } + TheTest & test_absdiff() { typedef typename V_RegTrait128::u_reg Ru; @@ -798,6 +810,7 @@ TEST(hal_intrin, uint8x16) { .test_min_max() .test_absdiff() .test_mask() + .test_popcount() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>() .test_unpack() @@ -819,6 +832,7 @@ TEST(hal_intrin, int8x16) { .test_absdiff() .test_abs() .test_mask() + .test_popcount() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() .test_unpack() .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() @@ -844,6 +858,7 @@ TEST(hal_intrin, uint16x8) { .test_absdiff() .test_reduce() .test_mask() + .test_popcount() .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>() .test_unpack() @@ -870,6 +885,7 @@ TEST(hal_intrin, int16x8) { .test_abs() .test_reduce() .test_mask() + .test_popcount() .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() .test_unpack() .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() @@ -894,6 +910,7 @@ TEST(hal_intrin, uint32x4) { .test_absdiff() .test_reduce() .test_mask() + .test_popcount() .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() .test_unpack() .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() @@ -910,6 +927,7 @@ TEST(hal_intrin, int32x4) { .test_mul() .test_abs() .test_cmp() + .test_popcount() .test_shift<1>().test_shift<8>() .test_logic() .test_min_max()