From 7a55f2af3bff2ea521555f7d5ffc4594cf92cfe2 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov
Date: Tue, 14 May 2019 18:48:36 +0300
Subject: [PATCH] Updated AVX2 implementation of v_popcount for u8.

---
 .../include/opencv2/core/hal/intrin_avx.hpp | 22 ++----
 .../include/opencv2/core/hal/intrin_sse.hpp |  9 +--
 .../include/opencv2/core/hal/intrin_vsx.hpp |  4 +-
 modules/core/src/stat.simd.hpp              | 79 ++++++-------------
 4 files changed, 32 insertions(+), 82 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index 60827f462c..91e4483444 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1188,14 +1188,11 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
 /** Popcount **/
 inline v_uint8x32 v_popcount(const v_uint8x32& a)
 {
-    __m256i m1 = _mm256_set1_epi32(0x55555555);
-    __m256i m2 = _mm256_set1_epi32(0x33333333);
-    __m256i m4 = _mm256_set1_epi32(0x0f0f0f0f);
-    __m256i p = a.val;
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 1), m1), _mm256_and_si256(p, m1));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 2), m2), _mm256_and_si256(p, m2));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 4), m4), _mm256_and_si256(p, m4));
-    return v_uint8x32(p);
+    __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+                                             0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+    __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
+    return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(a.val, _popcnt_mask)),
+                                      _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
 }
 inline v_uint16x16 v_popcount(const v_uint16x16& a)
 {
@@ -1212,14 +1209,7 @@ inline v_uint32x8 v_popcount(const v_uint32x8& a)
 }
 inline v_uint64x4 v_popcount(const v_uint64x4& a)
 {
-    __m256i m1 = _mm256_set1_epi32(0x55555555);
-    __m256i m2 = _mm256_set1_epi32(0x33333333);
-    __m256i m4 = _mm256_set1_epi32(0x0f0f0f0f);
-    __m256i p = a.val;
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 1), m1), _mm256_and_si256(p, m1));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 2), m2), _mm256_and_si256(p, m2));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 4), m4), _mm256_and_si256(p, m4));
-    return v_uint64x4(_mm256_sad_epu8(p, _mm256_setzero_si256()));
+    return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
 }
 inline v_uint8x32 v_popcount(const v_int8x32& a)
 { return v_popcount(v_reinterpret_as_u8(a)); }
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index beeaa1663e..7b7e97c561 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1580,14 +1580,7 @@ inline v_uint32x4 v_popcount(const v_uint32x4& a)
 }
 inline v_uint64x2 v_popcount(const v_uint64x2& a)
 {
-    __m128i m1 = _mm_set1_epi32(0x55555555);
-    __m128i m2 = _mm_set1_epi32(0x33333333);
-    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
-    __m128i p = a.val;
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
-    return v_uint64x2(_mm_sad_epu8(p, _mm_setzero_si128()));
+    return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
 }
 inline v_uint8x16 v_popcount(const v_int8x16& a)
 { return v_popcount(v_reinterpret_as_u8(a)); }
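The rewrite replaces the SWAR popcount (shift, mask, add at 32-bit granularity) with the PSHUFB nibble-lookup technique: each byte is split into its low and high 4-bit halves, each half indexes a 16-entry table of bit counts, and the two lookups are added per byte; the 64-bit overloads then collapse the byte counts with a single SAD against zero. A minimal standalone SSSE3 sketch of the same two steps, outside OpenCV's wrapper types (helper names are hypothetical):

```cpp
#include <tmmintrin.h>  // SSE2 + SSSE3 (_mm_shuffle_epi8)

// Per-byte popcount via a 16-entry nibble LUT; the patched AVX2
// v_popcount(v_uint8x32) does the same on 32 bytes at a time.
static inline __m128i popcount_epi8_sketch(__m128i v)
{
    const __m128i table = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
                                        1, 2, 2, 3, 2, 3, 3, 4);
    const __m128i mask  = _mm_set1_epi8(0x0F);
    // There is no 8-bit shift, so shift 16-bit lanes and mask off the
    // bits that leak in from the neighbouring byte.
    __m128i lo = _mm_and_si128(v, mask);
    __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 4), mask);
    return _mm_add_epi8(_mm_shuffle_epi8(table, lo),
                        _mm_shuffle_epi8(table, hi));
}

// 64-bit popcount on top of the byte counts: _mm_sad_epu8 against zero
// sums the eight byte counts in each 64-bit half, which is what the new
// v_popcount(v_uint64x2) / v_popcount(v_uint64x4) bodies do.
static inline __m128i popcount_epi64_sketch(__m128i v)
{
    return _mm_sad_epu8(popcount_epi8_sketch(v), _mm_setzero_si128());
}
```

This is also why the u64 overloads shrink to one line: they now reuse the u8 overload instead of repeating the bit-twiddling per type.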
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index 80e6fefd53..1a118ae270 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -766,8 +766,8 @@ inline scalartype v_reduce_##suffix(const _Tpvec& a)
 }
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint8x16, vec_uchar16, uchar, max, vec_max)
 OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint8x16, vec_uchar16, uchar, min, vec_min)
-OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int8x16, vec_char8, schar, max, vec_max)
-OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int8x16, vec_char8, schar, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int8x16, vec_char16, schar, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int8x16, vec_char16, schar, min, vec_min)
 
 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                  const v_float32x4& c, const v_float32x4& d)
diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp
index 0da3f79380..34b784e12e 100644
--- a/modules/core/src/stat.simd.hpp
+++ b/modules/core/src/stat.simd.hpp
@@ -32,28 +32,15 @@ int normHamming(const uchar* a, int n)
     int i = 0;
     int result = 0;
 
-#if CV_AVX2
-    {
-        __m256i _r0 = _mm256_setzero_si256();
-        __m256i _0 = _mm256_setzero_si256();
-        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
-        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
-
-        for(; i <= n - 32; i+= 32)
-        {
-            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
-            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_a0, _popcnt_mask));
-            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
-                                                 _mm256_and_si256(_mm256_srli_epi16(_a0, 4), _popcnt_mask));
-
-            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
-        }
-        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
-        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
+#if CV_SIMD && CV_SIMD_WIDTH > 16
+    {
+        v_uint64 t = vx_setzero_u64();
+        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
+        result = (int)v_reduce_sum(t);
     }
-#endif // CV_AVX2
+#endif
 
 #if CV_POPCNT
     {
@@ -68,16 +55,14 @@ int normHamming(const uchar* a, int n)
             result += CV_POPCNT_U32(*(uint*)(a + i));
         }
     }
-#endif // CV_POPCNT
-
-#if CV_SIMD
+#elif CV_SIMD
     {
-        v_uint64 t = vx_setzero_u64();
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
+        v_uint64x2 t = v_setzero_u64();
+        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(v_load(a + i)));
         result += (int)v_reduce_sum(t);
     }
-#endif // CV_SIMD
+#endif
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
     {
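Both normHamming overloads trade the hand-written AVX2 block for the same loop expressed in universal intrinsics: v_popcount the bytes, accumulate into 64-bit lanes inside the loop, and reduce to a scalar once at the end. The widest available registers run first (the CV_SIMD_WIDTH > 16 block), and the 128-bit path becomes an #elif alternative to scalar POPCNT rather than a second pass over the same data. A self-contained SSSE3 sketch of that loop shape, for illustration only (hamming_ssse3 is a hypothetical name, not an OpenCV function):

```cpp
#include <tmmintrin.h>  // SSE2 + SSSE3
#include <stdint.h>

// Same shape as the patched normHamming: per-byte LUT popcount, folded
// into two 64-bit partial sums with _mm_sad_epu8, reduced once after
// the loop; a scalar tail covers the last 0..15 bytes.
static int hamming_ssse3(const uint8_t* a, int n)
{
    const __m128i table = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
                                        1, 2, 2, 3, 2, 3, 3, 4);
    const __m128i mask  = _mm_set1_epi8(0x0F);
    __m128i acc = _mm_setzero_si128();          // two u64 partial sums
    int i = 0;
    for (; i <= n - 16; i += 16)
    {
        __m128i v  = _mm_loadu_si128((const __m128i*)(a + i));
        __m128i lo = _mm_shuffle_epi8(table, _mm_and_si128(v, mask));
        __m128i hi = _mm_shuffle_epi8(table,
                         _mm_and_si128(_mm_srli_epi16(v, 4), mask));
        acc = _mm_add_epi64(acc, _mm_sad_epu8(_mm_add_epi8(lo, hi),
                                              _mm_setzero_si128()));
    }
    // fold the two 64-bit lanes together, then take the low 32 bits
    acc = _mm_add_epi64(acc, _mm_unpackhi_epi64(acc, acc));
    int result = _mm_cvtsi128_si32(acc);
    static const uint8_t t8[16] = { 0, 1, 1, 2, 1, 2, 2, 3,
                                    1, 2, 2, 3, 2, 3, 3, 4 };
    for (; i < n; ++i)                          // scalar tail
        result += t8[a[i] & 0x0F] + t8[a[i] >> 4];
    return result;
}
```

The payoff is portability: the accumulate-then-reduce pattern is written once against the universal intrinsics and compiles to SSE, AVX2, or NEON, instead of maintaining a separate raw-AVX2 copy of the loop.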
@@ -98,31 +83,15 @@ int normHamming(const uchar* a, const uchar* b, int n)
     int i = 0;
     int result = 0;
 
-#if CV_AVX2
-    {
-        __m256i _r0 = _mm256_setzero_si256();
-        __m256i _0 = _mm256_setzero_si256();
-        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
-        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
-
-        for(; i <= n - 32; i+= 32)
-        {
-            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
-            __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));
-
-            __m256i _xor = _mm256_xor_si256(_a0, _b0);
-            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
-            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
-                                                 _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));
-
-            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
-        }
-        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
-        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
+#if CV_SIMD && CV_SIMD_WIDTH > 16
+    {
+        v_uint64 t = vx_setzero_u64();
+        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
+        result += (int)v_reduce_sum(t);
     }
-#endif // CV_AVX2
+#endif
 
 #if CV_POPCNT
    {
@@ -137,16 +106,14 @@ int normHamming(const uchar* a, const uchar* b, int n)
             result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
         }
     }
-#endif // CV_POPCNT
-
-#if CV_SIMD
+#elif CV_SIMD
     {
-        v_uint64 t = vx_setzero_u64();
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
+        v_uint64x2 t = v_setzero_u64();
+        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(v_load(a + i) ^ v_load(b + i)));
         result += (int)v_reduce_sum(t);
     }
-#endif // CV_SIMD
+#endif
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
     {
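The two-argument overload is otherwise identical to the one-argument version: corresponding bytes are XOR-ed before the popcount, turning a population count into a Hamming distance. In the hamming_ssse3 sketch above that is a one-line change to the load (again illustrative, not OpenCV code):

```cpp
// Replace the single load in the 16-byte loop with an XOR of the
// two inputs; everything else is unchanged.
__m128i v = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(a + i)),
                          _mm_loadu_si128((const __m128i*)(b + i)));
```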