diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 3291921c14..55bc67fed6 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1129,9 +1129,20 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_s
 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                  const v_float32x4& c, const v_float32x4& d)
 {
+    // Horizontal sums of four vectors at once: the result is
+    // (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3).
+#if CV_SSE3
     __m128 ab = _mm_hadd_ps(a.val, b.val);
     __m128 cd = _mm_hadd_ps(c.val, d.val);
     return v_float32x4(_mm_hadd_ps(ab, cd));
+#else
+    // Without SSE3 there is no _mm_hadd_ps; interleave the inputs and add:
+    // ac = (a0+a2, c0+c2, a1+a3, c1+c3), bd = (b0+b2, d0+d2, b1+b3, d1+d3).
+    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
+    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
+    // Interleaving ac/bd once more lines up the two partial sums of each input.
+    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
+#endif
 }
 
 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp
index 7d51bd3a6f..2ef4b8c33a 100644
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@@ -741,6 +741,28 @@ template struct TheTest
         return *this;
     }
 
+    // Checks v_reduce_sum4: lane i of the result must hold the sum of the
+    // four lanes of the i-th argument.
+    TheTest & test_reduce_sum4()
+    {
+        R a(0.1f, 0.02f, 0.003f, 0.0004f);
+        R b(1, 20, 300, 4000);
+        R c(10, 2, 0.3f, 0.04f);
+        R d(1, 2, 3, 4);
+
+        R sum = v_reduce_sum4(a, b, c, d);
+
+        Data res = sum;
+        // The SSE3 and non-SSE3 implementations add the lanes in different
+        // orders, so the rounded float results may differ in the last bits;
+        // compare with a tolerance instead of requiring bit-exact equality.
+        EXPECT_FLOAT_EQ(0.1234f, res[0]);
+        EXPECT_FLOAT_EQ(4321.0f, res[1]);
+        EXPECT_FLOAT_EQ(12.34f, res[2]);
+        EXPECT_FLOAT_EQ(10.0f, res[3]);
+        return *this;
+    }
+
     TheTest & test_loadstore_fp16()
     {
 #if CV_FP16 && CV_SIMD128
@@ -986,6 +1008,7 @@ TEST(hal_intrin, float32x4) {
         .test_float_cvt64()
         .test_matmul()
         .test_transpose()
+        .test_reduce_sum4()
         ;
 }
 