From 18d10d6b86d5e8e5591de16b484fa678711cbfa8 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Mon, 22 Apr 2019 11:19:08 +0300 Subject: [PATCH] Fixed v_reduce_sad intrinsics implementation and added tests --- .../core/include/opencv2/core/hal/intrin_avx.hpp | 8 ++++++-- .../core/include/opencv2/core/hal/intrin_sse.hpp | 7 ++++--- modules/core/test/test_intrin_utils.hpp | 16 ++++++++++++++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 58db71467d..ba16feadea 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1141,12 +1141,16 @@ inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b, inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b) { - return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(a.val, b.val)); + __m256i half = _mm256_sad_epu8(a.val, b.val); + __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half)); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); } inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) { __m256i half = _mm256_set1_epi8(0x7f); - return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half))); + half = _mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half)); + __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half)); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); } inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) { diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index a5adad04c5..36499c0117 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1486,13 +1486,14 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) { - return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(a.val, b.val)); + __m128i half = _mm_sad_epu8(a.val, b.val); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))); } inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) { __m128i half = _mm_set1_epi8(0x7f); - return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(_mm_add_epi8(a.val, half), - _mm_add_epi8(b.val, half))); + half = _mm_sad_epu8(_mm_add_epi8(a.val, half), _mm_add_epi8(b.val, half)); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))); } inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) { diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index b28929c582..3b85d68dea 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -770,6 +770,15 @@ template struct TheTest return *this; } + TheTest & test_reduce_sad() + { + Data dataA, dataB(R::nlanes/2); + R a = dataA; + R b = dataB; + EXPECT_EQ((unsigned)(R::nlanes*R::nlanes/4), v_reduce_sad(a, b)); + return *this; + } + TheTest & test_mask() { typedef typename V_RegTraits::int_reg int_reg; @@ -1320,6 +1329,7 @@ void test_hal_intrin_uint8() .test_logic() .test_min_max() .test_absdiff() + .test_reduce_sad() .test_mask() .test_popcount() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() @@ -1358,6 +1368,7 @@ void test_hal_intrin_int8() .test_absdiff() .test_absdiffs() .test_abs() + .test_reduce_sad() .test_mask() .test_popcount() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() @@ -1387,6 +1398,7 @@ void test_hal_intrin_uint16() .test_min_max() .test_absdiff() .test_reduce() + .test_reduce_sad() .test_mask() .test_popcount() .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() @@ -1418,6 +1430,7 @@ void test_hal_intrin_int16() .test_absdiffs() .test_abs() .test_reduce() + .test_reduce_sad() .test_mask() .test_popcount() .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() @@ -1446,6 +1459,7 @@ void test_hal_intrin_uint32() .test_min_max() .test_absdiff() .test_reduce() + .test_reduce_sad() .test_mask() .test_popcount() .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() @@ -1473,6 +1487,7 @@ void test_hal_intrin_int32() .test_min_max() .test_absdiff() .test_reduce() + .test_reduce_sad() .test_mask() .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() .test_unpack() @@ -1528,6 +1543,7 @@ void test_hal_intrin_float32() .test_min_max() .test_float_absdiff() .test_reduce() + .test_reduce_sad() .test_mask() .test_unpack() .test_float_math()