From 96ab78dc4f2a1e4d2e497a8ac61597fcf494b91e Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov
Date: Mon, 1 Apr 2019 19:27:50 +0300
Subject: [PATCH 1/3] Reworked v_popcount implementation to provide number of
 bits in a single lane

---
 .../include/opencv2/core/hal/intrin_avx.hpp   | 133 +++++++++++------
 .../include/opencv2/core/hal/intrin_neon.hpp  |  70 ++++++---
 .../include/opencv2/core/hal/intrin_sse.hpp   | 138 ++++++++++++------
 .../include/opencv2/core/hal/intrin_vsx.hpp   |  60 +++++++-
 modules/core/src/stat.simd.hpp                |  24 ++-
 5 files changed, 297 insertions(+), 128 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index e98524b9de..60827f462c 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1015,6 +1015,34 @@ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd
 ////////// Reduce and mask /////////
 
 /** Reduce **/
+inline unsigned v_reduce_sum(const v_uint8x32& a)
+{
+    __m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
+    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
+}
+inline int v_reduce_sum(const v_int8x32& a)
+{
+    __m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
+    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
+}
+#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
+    inline sctype v_reduce_##func(const _Tpvec& a) \
+    { \
+        __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
+        val = intrin(val, _mm_srli_si128(val,8)); \
+        val = intrin(val, _mm_srli_si128(val,4)); \
+        val = intrin(val, _mm_srli_si128(val,2)); \
+        val = intrin(val, _mm_srli_si128(val,1)); \
+        return (sctype)_mm_cvtsi128_si32(val); \
+    }
+
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, min, _mm_min_epi8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, max, _mm_max_epi8)
+
 #define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
     inline sctype v_reduce_##func(const _Tpvec& a) \
     { \
@@ -1062,31 +1090,6 @@ OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
 
-inline ushort v_reduce_sum(const v_uint16x16& a)
-{
-    __m128i a0 = _v256_extract_low(a.val);
-    __m128i a1 = _v256_extract_high(a.val);
-
-    __m128i s0 = _mm_adds_epu16(a0, a1);
-    s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
-    s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
-    s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2));
-
-    return (ushort)_mm_cvtsi128_si32(s0);
-}
-
-inline short v_reduce_sum(const v_int16x16& a)
-{
-    __m256i s0 = _mm256_hadds_epi16(a.val, a.val);
-    s0 = _mm256_hadds_epi16(s0, s0);
-    s0 = _mm256_hadds_epi16(s0, s0);
-
-    __m128i s1 = _v256_extract_high(s0);
-    s1 = _mm_adds_epi16(_v256_extract_low(s0), s1);
-
-    return (short)_mm_cvtsi128_si32(s1);
-}
-
 inline int v_reduce_sum(const v_int32x8& a)
 {
     __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
@@ -1101,6 +1104,11 @@ inline int v_reduce_sum(const v_int32x8& a)
 inline unsigned v_reduce_sum(const v_uint32x8& a)
 { return v_reduce_sum(v_reinterpret_as_s32(a)); }
 
+inline int v_reduce_sum(const v_int16x16& a)
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+inline unsigned v_reduce_sum(const v_uint16x16& a)
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+
 inline float v_reduce_sum(const v_float32x8& a)
 {
     __m256 s0 = _mm256_hadd_ps(a.val, a.val);
@@ -1112,6 +1120,18 @@ inline float v_reduce_sum(const v_float32x8& a)
     return _mm_cvtss_f32(s1);
 }
 
+inline uint64 v_reduce_sum(const v_uint64x4& a)
+{
+    uint64 CV_DECL_ALIGNED(32) idx[2];
+    _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
+    return idx[0] + idx[1];
+}
+inline int64 v_reduce_sum(const v_int64x4& a)
+{
+    int64 CV_DECL_ALIGNED(32) idx[2];
+    _mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
+    return idx[0] + idx[1];
+}
 inline double v_reduce_sum(const v_float64x4& a)
 {
     __m256d s0 = _mm256_hadd_pd(a.val, a.val);
@@ -1166,26 +1186,49 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
 }
 
 /** Popcount **/
-#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
-    inline v_uint32x8 v_popcount(const _Tpvec& a) \
-    { \
-        const v_uint32x8 m1 = v256_setall_u32(0x55555555); \
-        const v_uint32x8 m2 = v256_setall_u32(0x33333333); \
-        const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \
-        v_uint32x8 p = v_reinterpret_as_u32(a); \
-        p = ((p >> 1) & m1) + (p & m1); \
-        p = ((p >> 2) & m2) + (p & m2); \
-        p = ((p >> 4) & m4) + (p & m4); \
-        p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \
-        return p; \
-    }
-
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32)
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32)
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16)
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16)
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8)
-OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8)
+inline v_uint8x32 v_popcount(const v_uint8x32& a)
+{
+    __m256i m1 = _mm256_set1_epi32(0x55555555);
+    __m256i m2 = _mm256_set1_epi32(0x33333333);
+    __m256i m4 = _mm256_set1_epi32(0x0f0f0f0f);
+    __m256i p = a.val;
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 1), m1), _mm256_and_si256(p, m1));
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 2), m2), _mm256_and_si256(p, m2));
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 4), m4), _mm256_and_si256(p, m4));
+    return v_uint8x32(p);
+}
+inline v_uint16x16 v_popcount(const v_uint16x16& a)
+{
+    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p);
+    return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
+}
+inline v_uint32x8 v_popcount(const v_uint32x8& a)
+{
+    v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p);
+    p += v_rotate_right<2>(p);
+    return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
+}
+inline v_uint64x4 v_popcount(const v_uint64x4& a)
+{
+    __m256i m1 = _mm256_set1_epi32(0x55555555);
+    __m256i m2 = _mm256_set1_epi32(0x33333333);
+    __m256i m4 = _mm256_set1_epi32(0x0f0f0f0f);
+    __m256i p = a.val;
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 1), m1), _mm256_and_si256(p, m1));
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 2), m2), _mm256_and_si256(p, m2));
+    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 4), m4), _mm256_and_si256(p, m4));
+    return v_uint64x4(_mm256_sad_epu8(p, _mm256_setzero_si256()));
+}
+inline v_uint8x32 v_popcount(const v_int8x32& a)
+{ return v_popcount(v_reinterpret_as_u8(a)); }
+inline v_uint16x16 v_popcount(const v_int16x16& a)
+{ return v_popcount(v_reinterpret_as_u16(a)); }
+inline v_uint32x8 v_popcount(const v_int32x8& a)
+{ return v_popcount(v_reinterpret_as_u32(a)); }
+inline v_uint64x4 v_popcount(const v_int64x4& a)
+{ return v_popcount(v_reinterpret_as_u64(a)); }
 
 /** Mask **/
 inline int v_signmask(const v_int8x32& a)
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index c6da1b42d9..468872a677 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -910,6 +910,31 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
 #endif
 
+inline unsigned v_reduce_sum(const v_uint8x16& a)
+{
+    uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline int v_reduce_sum(const v_int8x16& a)
+{
+    int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
+    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
+    return vget_lane_s32(vpadd_s32(t1, t1), 0);
+}
+inline unsigned v_reduce_sum(const v_uint16x8& a)
+{
+    uint32x4_t t0 = vpaddlq_u16(a.val);
+    uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+    return vget_lane_u32(vpadd_u32(t1, t1), 0);
+}
+inline int v_reduce_sum(const v_int16x8& a)
+{
+    int32x4_t t0 = vpaddlq_s16(a.val);
+    int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
+    return vget_lane_s32(vpadd_s32(t1, t1), 0);
+}
+
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -918,12 +943,10 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
 }
 
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
-OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, max, max, u16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, min, min, u16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, max, max, s16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, min, min, s16)
 
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
@@ -942,6 +965,10 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
 
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{ return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0); }
+inline int64 v_reduce_sum(const v_int64x2& a)
+{ return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0); }
 #if CV_SIMD128_64F
 inline double v_reduce_sum(const v_float64x2& a)
 {
@@ -1007,21 +1034,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
     return vget_lane_f32(vpadd_f32(t1, t1), 0);
 }
 
-#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
-inline v_uint32x4 v_popcount(const _Tpvec& a) \
-{ \
-    uint8x16_t t = vcntq_u8(cast(a.val)); \
-    uint16x8_t t0 = vpaddlq_u8(t);  /* 16 -> 8 */ \
-    uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
-    return v_uint32x4(t1); \
-}
-
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
-OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{ return v_uint8x16(vcntq_u8(a.val)); }
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
 
 inline int v_signmask(const v_uint8x16& a)
 {
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 6ab360e0b7..beeaa1663e 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -302,8 +302,8 @@ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)
 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
 { return _Tpvec(cast(a.val)); }
 
-OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
-OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
 OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
@@ -1393,6 +1393,41 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
 
+inline unsigned v_reduce_sum(const v_uint8x16& a)
+{
+    __m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
+}
+inline int v_reduce_sum(const v_int8x16& a)
+{
+    __m128i half = _mm_set1_epi8((schar)-128);
+    half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
+    return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
+}
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
+inline schar v_reduce_##func(const v_int8x16& a) \
+{ \
+    __m128i val = a.val; \
+    __m128i smask = _mm_set1_epi8((schar)-128); \
+    val = _mm_xor_si128(val, smask); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
+    return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
+} \
+inline uchar v_reduce_##func(const v_uint8x16& a) \
+{ \
+    __m128i val = a.val; \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
+    return (uchar)_mm_cvtsi128_si32(val); \
+}
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
+
 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
 inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
 { \
@@ -1412,26 +1447,8 @@ inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
     return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
 }
-#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
-inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
-{ \
-    __m128i val = a.val; \
-    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
-    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
-    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
-    return (scalartype)_mm_cvtsi128_si32(val); \
-} \
-inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
-{ \
-    __m128i val = a.val; \
-    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
-    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
-    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
-    return (unsigned scalartype)_mm_cvtsi128_si32(val); \
-}
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
-OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
 
 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
 inline scalartype v_reduce_sum(const _Tpvec& a) \
@@ -1456,6 +1473,23 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
 
+inline int v_reduce_sum(const v_int16x8& a)
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+inline unsigned v_reduce_sum(const v_uint16x8& a)
+{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
+
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{
+    uint64 CV_DECL_ALIGNED(32) idx[2];
+    v_store_aligned(idx, a);
+    return idx[0] + idx[1];
+}
+inline int64 v_reduce_sum(const v_int64x2& a)
+{
+    int64 CV_DECL_ALIGNED(32) idx[2];
+    v_store_aligned(idx, a);
+    return idx[0] + idx[1];
+}
 inline double v_reduce_sum(const v_float64x2& a)
 {
     double CV_DECL_ALIGNED(32) idx[2];
@@ -1520,27 +1554,49 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
     return v_reduce_sum(v_absdiff(a, b));
 }
 
-#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
-inline v_uint32x4 v_popcount(const _Tpvec& a) \
-{ \
-    __m128i m1 = _mm_set1_epi32(0x55555555); \
-    __m128i m2 = _mm_set1_epi32(0x33333333); \
-    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
-    __m128i p = a.val; \
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
-    p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
-    p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
-    return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
-}
-
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
-OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{
+    __m128i m1 = _mm_set1_epi32(0x55555555);
+    __m128i m2 = _mm_set1_epi32(0x33333333);
+    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
+    __m128i p = a.val;
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
+    return v_uint8x16(p);
+}
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{
+    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p);
+    return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
+}
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{
+    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p);
+    p += v_rotate_right<2>(p);
+    return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
+}
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{
+    __m128i m1 = _mm_set1_epi32(0x55555555);
+    __m128i m2 = _mm_set1_epi32(0x33333333);
+    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
+    __m128i p = a.val;
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
+    return v_uint64x2(_mm_sad_epu8(p, _mm_setzero_si128()));
+}
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{ return v_popcount(v_reinterpret_as_u8(a)); }
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{ return v_popcount(v_reinterpret_as_u16(a)); }
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{ return v_popcount(v_reinterpret_as_u32(a)); }
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{ return v_popcount(v_reinterpret_as_u64(a)); }
 
 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
 inline int v_signmask(const _Tpvec& a) \
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index 390977b55e..80e6fefd53 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -692,15 +692,27 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
 ////////// Reduce and mask /////////
 
 /** Reduce **/
-inline short v_reduce_sum(const v_int16x8& a)
+inline uint v_reduce_sum(const v_uint8x16& a)
+{
+    const vec_uint4 zero4 = vec_uint4_z;
+    vec_uint4 sum4 = vec_sum4s(a.val, zero4);
+    return (uint)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
+}
+inline int v_reduce_sum(const v_int8x16& a)
+{
+    const vec_int4 zero4 = vec_int4_z;
+    vec_int4 sum4 = vec_sum4s(a.val, zero4);
+    return (int)vec_extract(vec_sums(sum4, zero4), 3);
+}
+inline int v_reduce_sum(const v_int16x8& a)
 {
     const vec_int4 zero = vec_int4_z;
-    return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
+    return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
 }
-inline ushort v_reduce_sum(const v_uint16x8& a)
+inline uint v_reduce_sum(const v_uint16x8& a)
 {
     const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
-    return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3));
+    return saturate_cast<uint>(vec_extract(vec_sums(v4, vec_int4_z), 3));
 }
 
 #define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
@@ -719,6 +731,14 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
 
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{
+    return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
+}
+inline int64 v_reduce_sum(const v_int64x2& a)
+{
+    return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
+}
 inline double v_reduce_sum(const v_float64x2& a)
 {
     return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
@@ -736,6 +756,19 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
 
+#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
+inline scalartype v_reduce_##suffix(const _Tpvec& a) \
+{ \
+    _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
+    rs = func(rs, vec_sld(rs, rs, 4)); \
+    rs = func(rs, vec_sld(rs, rs, 2)); \
+    return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0); \
+}
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char8, schar, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char8, schar, min, vec_min)
+
 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                  const v_float32x4& c, const v_float32x4& d)
 {
@@ -792,9 +825,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
 }
 
 /** Popcount **/
-template<typename _Tpvec>
-inline v_uint32x4 v_popcount(const _Tpvec& a)
-{ return v_uint32x4(vec_popcntu(vec_uint4_c(a.val))); }
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{ return v_uint8x16(vec_popcntu(a.val)); }
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{ return v_uint8x16(vec_popcntu(a.val)); }
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{ return v_uint16x8(vec_popcntu(a.val)); }
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{ return v_uint16x8(vec_popcntu(a.val)); }
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{ return v_uint32x4(vec_popcntu(a.val)); }
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{ return v_uint32x4(vec_popcntu(a.val)); }
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{ return v_uint64x2(vec_popcntu(a.val)); }
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{ return v_uint64x2(vec_popcntu(a.val)); }
 
 /** Mask **/
 inline int v_signmask(const v_uint8x16& a)
diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp
index b75100d3f4..1df25d2813 100644
--- a/modules/core/src/stat.simd.hpp
+++ b/modules/core/src/stat.simd.hpp
@@ -70,16 +70,14 @@ int normHamming(const uchar* a, int n)
 }
 #endif // CV_POPCNT
 
-#if CV_SIMD128
+#if CV_SIMD
     {
-        v_uint32x4 t = v_setzero_u32();
-        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
-        {
-            t += v_popcount(v_load(a + i));
-        }
+        v_uint64 t = vx_setzero_u64();
+        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
         result += v_reduce_sum(t);
     }
-#endif // CV_SIMD128
+#endif // CV_SIMD
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
     {
@@ -141,16 +139,14 @@ int normHamming(const uchar* a, const uchar* b, int n)
 }
 #endif // CV_POPCNT
 
-#if CV_SIMD128
+#if CV_SIMD
     {
-        v_uint32x4 t = v_setzero_u32();
-        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
-        {
-            t += v_popcount(v_load(a + i) ^ v_load(b + i));
-        }
+        v_uint64 t = vx_setzero_u64();
+        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
        result += v_reduce_sum(t);
     }
-#endif // CV_SIMD128
+#endif // CV_SIMD
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
     {

From 1220dd487711cf5ece217cac3c0fb0209c5e939a Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov
Date: Wed, 3 Apr 2019 11:45:38 +0300
Subject: [PATCH 2/3] Updated v_popcount description, reference implementation
 and test.

---
 .../include/opencv2/core/hal/intrin_cpp.hpp   | 25 +++++++------------
 modules/core/src/stat.simd.hpp                |  4 +--
 modules/core/test/test_intrin_utils.hpp       | 20 +++++++++------
 3 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 757c67b314..97756af5fa 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -603,27 +603,20 @@ static const unsigned char popCountTable[] =
     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
 };
-/** @brief Count the 1 bits in the vector and return 4 values
+/** @brief Count the 1 bits in the vector lanes and return the result as the corresponding unsigned type
 
 Scheme:
 @code
-{A1 A2 A3 ...} => popcount(A1)
+{A1 A2 A3 ...} => {popcount(A1), popcount(A2), popcount(A3), ...}
 @endcode
-Any types but result will be in v_uint32x4*/
-template<typename _Tp, int n> inline v_uint32x4 v_popcount(const v_reg<_Tp, n>& a)
+For all integer types. */
+template<typename _Tp, int n>
+inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
 {
-    v_uint8x16 b;
-    b = v_reinterpret_as_u8(a);
-    for( int i = 0; i < v_uint8x16::nlanes; i++ )
-    {
-        b.s[i] = popCountTable[b.s[i]];
-    }
-    v_uint32x4 c;
-    for( int i = 0; i < v_uint32x4::nlanes; i++ )
-    {
-        c.s[i] = b.s[i*4] + b.s[i*4+1] + b.s[i*4+2] + b.s[i*4+3];
-    }
-    return c;
+    v_reg<typename V_TypeTraits<_Tp>::abs_type, n> b = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
+    for( int i = 0; i < (int)(n*sizeof(_Tp)); i++ )
+        b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]];
+    return b;
 }
diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp
index 1df25d2813..0da3f79380 100644
--- a/modules/core/src/stat.simd.hpp
+++ b/modules/core/src/stat.simd.hpp
@@ -75,7 +75,7 @@ int normHamming(const uchar* a, int n)
         v_uint64 t = vx_setzero_u64();
         for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
             t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
-        result += v_reduce_sum(t);
+        result += (int)v_reduce_sum(t);
     }
 #endif // CV_SIMD
 #if CV_ENABLE_UNROLLED
@@ -144,7 +144,7 @@ int normHamming(const uchar* a, const uchar* b, int n)
         v_uint64 t = vx_setzero_u64();
         for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
             t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
-        result += v_reduce_sum(t);
+        result += (int)v_reduce_sum(t);
     }
 #endif // CV_SIMD
 #if CV_ENABLE_UNROLLED
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 3b85d68dea..6ead0ecc60 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -686,18 +686,24 @@ template<typename R> struct TheTest
 
     TheTest & test_popcount()
     {
+        typedef typename V_RegTraits<R>::u_reg Ru;
        static unsigned popcountTable[] = {
-            0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33,
-            35, 37, 40, 42, 45, 48, 52, 54, 57, 60, 64, 67, 71, 75, 80, 81,
-            83, 85, 88, 90, 93, 96, 100, 102, 105, 108, 112, 115, 119, 123,
-            128, 130, 133, 136, 140, 143, 147, 151, 156, 159, 163, 167, 172,
-            176, 181, 186, 192, 193
+            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, //0x00-0x0f
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x10-0x1f
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x20-0x2f
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x30-0x3f
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x40-0x4f
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x50-0x5f
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x60-0x6f
+            3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, //0x70-0x7f
+            1 //0x80
        };
        Data<R> dataA;
        R a = dataA;
 
-        unsigned resB = (unsigned)v_reduce_sum(v_popcount(a));
-        EXPECT_EQ(popcountTable[R::nlanes], resB);
+        Data<Ru> resB = v_popcount(a);
+        for (int i = 0; i < Ru::nlanes; ++i)
+            EXPECT_EQ(popcountTable[i + 1], resB[i]);
 
        return *this;
    }

From 7a55f2af3bff2ea521555f7d5ffc4594cf92cfe2 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov
Date: Tue, 14 May 2019 18:48:36 +0300
Subject: [PATCH 3/3] Updated AVX2 implementation of v_popcount for u8.
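
The SWAR bit-counting sequence for u8 lanes is replaced with a nibble
lookup: _mm256_shuffle_epi8 performs 32 parallel 4-bit table lookups, so
the popcount of every byte is table[x & 0x0F] + table[x >> 4]. The u64
variant now reuses the u8 one through _mm256_sad_epu8, and normHamming
relies on the universal intrinsics instead of a hand-written AVX2 loop.

A scalar sketch of the lookup idea (illustrative only, not part of this
patch; the helper name is made up for the example):

    // Count the bits of one byte with two 4-bit table lookups.
    static const unsigned char popcnt4[16] =
        { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
    inline unsigned char popcount8(unsigned char x)
    {
        return (unsigned char)(popcnt4[x & 0x0F] + popcnt4[x >> 4]);
    }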
---
 .../include/opencv2/core/hal/intrin_avx.hpp   | 22 ++----
 .../include/opencv2/core/hal/intrin_sse.hpp   |  9 +--
 .../include/opencv2/core/hal/intrin_vsx.hpp   |  4 +-
 modules/core/src/stat.simd.hpp                | 79 ++++++-------------
 4 files changed, 32 insertions(+), 82 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 60827f462c..91e4483444 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1188,14 +1188,11 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
 /** Popcount **/
 inline v_uint8x32 v_popcount(const v_uint8x32& a)
 {
-    __m256i m1 = _mm256_set1_epi32(0x55555555);
-    __m256i m2 = _mm256_set1_epi32(0x33333333);
-    __m256i m4 = _mm256_set1_epi32(0x0f0f0f0f);
-    __m256i p = a.val;
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 1), m1), _mm256_and_si256(p, m1));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 2), m2), _mm256_and_si256(p, m2));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 4), m4), _mm256_and_si256(p, m4));
-    return v_uint8x32(p);
+    __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+                                             0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+    __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
+    return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(a.val, _popcnt_mask)),
+                                      _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
 }
 inline v_uint16x16 v_popcount(const v_uint16x16& a)
 {
@@ -1212,14 +1209,7 @@ inline v_uint32x8 v_popcount(const v_uint32x8& a)
 }
 inline v_uint64x4 v_popcount(const v_uint64x4& a)
 {
-    __m256i m1 = _mm256_set1_epi32(0x55555555);
-    __m256i m2 = _mm256_set1_epi32(0x33333333);
-    __m256i m4 = _mm256_set1_epi32(0x0f0f0f0f);
-    __m256i p = a.val;
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 1), m1), _mm256_and_si256(p, m1));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 2), m2), _mm256_and_si256(p, m2));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 4), m4), _mm256_and_si256(p, m4));
-    return v_uint64x4(_mm256_sad_epu8(p, _mm256_setzero_si256()));
+    return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
 }
 inline v_uint8x32 v_popcount(const v_int8x32& a)
 { return v_popcount(v_reinterpret_as_u8(a)); }
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index beeaa1663e..7b7e97c561 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1580,14 +1580,7 @@ inline v_uint32x4 v_popcount(const v_uint32x4& a)
 }
 inline v_uint64x2 v_popcount(const v_uint64x2& a)
 {
-    __m128i m1 = _mm_set1_epi32(0x55555555);
-    __m128i m2 = _mm_set1_epi32(0x33333333);
-    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
-    __m128i p = a.val;
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
-    return v_uint64x2(_mm_sad_epu8(p, _mm_setzero_si128()));
+    return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
 }
 inline v_uint8x16 v_popcount(const v_int8x16& a)
 { return v_popcount(v_reinterpret_as_u8(a)); }
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index 80e6fefd53..1a118ae270 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -766,8 +766,8 @@ inline scalartype v_reduce_##suffix(const _Tpvec& a) \
 }
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, max, vec_max)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, min, vec_min)
-OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char8, schar, max, vec_max)
-OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char8, schar, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, min, vec_min)
 
 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                  const v_float32x4& c, const v_float32x4& d)
diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp
index 0da3f79380..34b784e12e 100644
--- a/modules/core/src/stat.simd.hpp
+++ b/modules/core/src/stat.simd.hpp
@@ -32,28 +32,15 @@ int normHamming(const uchar* a, int n)
     int i = 0;
     int result = 0;
 
-#if CV_AVX2
+#if CV_SIMD && CV_SIMD_WIDTH > 16
     {
-        __m256i _r0 = _mm256_setzero_si256();
-        __m256i _0 = _mm256_setzero_si256();
-        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
-        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
-
-        for(; i <= n - 32; i+= 32)
-        {
-            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
-            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_a0, _popcnt_mask));
-            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
-                             _mm256_and_si256(_mm256_srli_epi16(_a0, 4), _popcnt_mask));
-
-            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
-        }
-        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
-        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
+        v_uint64 t = vx_setzero_u64();
+        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
+        result = (int)v_reduce_sum(t);
     }
-#endif // CV_AVX2
+#endif
 
 #if CV_POPCNT
     {
@@ -68,16 +55,14 @@ int normHamming(const uchar* a, int n)
         result += CV_POPCNT_U32(*(uint*)(a + i));
     }
 }
-#endif // CV_POPCNT
-
-#if CV_SIMD
+#elif CV_SIMD
     {
-        v_uint64 t = vx_setzero_u64();
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
+        v_uint64x2 t = v_setzero_u64();
+        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(v_load(a + i)));
         result += (int)v_reduce_sum(t);
     }
-#endif // CV_SIMD
+#endif
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
     {
@@ -98,31 +83,15 @@ int normHamming(const uchar* a, const uchar* b, int n)
     int i = 0;
     int result = 0;
 
-#if CV_AVX2
+#if CV_SIMD && CV_SIMD_WIDTH > 16
     {
-        __m256i _r0 = _mm256_setzero_si256();
-        __m256i _0 = _mm256_setzero_si256();
-        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
-        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
-
-        for(; i <= n - 32; i+= 32)
-        {
-            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
-            __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));
-
-            __m256i _xor = _mm256_xor_si256(_a0, _b0);
-            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
-            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
-                             _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));
-
-            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
-        }
-        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
-        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
+        v_uint64 t = vx_setzero_u64();
+        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
+        result += (int)v_reduce_sum(t);
    }
-#endif // CV_AVX2
+#endif
 
 #if CV_POPCNT
     {
@@ -137,16 +106,14 @@ int normHamming(const uchar* a, const uchar* b, int n)
         result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
     }
 }
-#endif // CV_POPCNT
-
-#if CV_SIMD
+#elif CV_SIMD
     {
-        v_uint64 t = vx_setzero_u64();
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
+        v_uint64x2 t = v_setzero_u64();
+        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(v_load(a + i) ^ v_load(b + i)));
         result += (int)v_reduce_sum(t);
     }
-#endif // CV_SIMD
+#endif
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
     {