From 820fdbff2309150eb2b5a328ddac774b865ec8a1 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima
Date: Thu, 26 Jan 2017 18:37:56 +0900
Subject: [PATCH] brush up AVX optimization in popcount

- make sure SIMD optimization works even when AVX is not available
---
 modules/core/src/stat.cpp | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 3336152a44..39e0fa7648 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -4261,7 +4261,9 @@ int normHamming(const uchar* a, int n)
         _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
         result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
     }
-#elif CV_POPCNT
+#endif // CV_AVX2
+
+#if CV_POPCNT
     if(checkHardwareSupport(CV_CPU_POPCNT))
     {
 # if defined CV_POPCNT_U64
@@ -4275,7 +4277,9 @@ int normHamming(const uchar* a, int n)
             result += CV_POPCNT_U32(*(uint*)(a + i));
         }
     }
-#elif CV_SIMD128
+#endif // CV_POPCNT
+
+#if CV_SIMD128
     if(hasSIMD128())
     {
         v_uint32x4 t = v_setzero_u32();
@@ -4283,9 +4287,10 @@ int normHamming(const uchar* a, int n)
         {
             t += v_popcount(v_load(a + i));
         }
-        result = v_reduce_sum(t);
+        result += v_reduce_sum(t);
     }
-#endif
+#endif // CV_SIMD128
+
     for(; i <= n - 4; i += 4)
     {
         result += popCountTable[a[i]] + popCountTable[a[i+1]] +
@@ -4327,7 +4332,9 @@ int normHamming(const uchar* a, const uchar* b, int n)
         _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
         result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
     }
-#elif CV_POPCNT
+#endif // CV_AVX2
+
+#if CV_POPCNT
     if(checkHardwareSupport(CV_CPU_POPCNT))
     {
 # if defined CV_POPCNT_U64
@@ -4341,7 +4348,9 @@ int normHamming(const uchar* a, const uchar* b, int n)
             result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
         }
     }
-#elif CV_SIMD128
+#endif // CV_POPCNT
+
+#if CV_SIMD128
     if(hasSIMD128())
     {
         v_uint32x4 t = v_setzero_u32();
@@ -4349,9 +4358,10 @@ int normHamming(const uchar* a, const uchar* b, int n)
         {
             t += v_popcount(v_load(a + i) ^ v_load(b + i));
         }
-        result = v_reduce_sum(t);
+        result += v_reduce_sum(t);
     }
-#endif
+#endif // CV_SIMD128
+
     for(; i <= n - 4; i += 4)
     {
         result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +