Merge pull request #9024 from tomoaki0705:featureDispatchAccumulate

8 years ago · 10e6491c22
parent 4238add35b e7d5dbfec0
commit 10e6491c22
5 changed files with 3273 additions and 1707 deletions
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@ -899,6 +899,15 @@ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)

+#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
+
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64);
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64);
+
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
@ -1520,6 +1529,35 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
 }

+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
+{
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
+    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));
+
+    a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
+    b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
+    c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
+}
+
+inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
+{
+    v_uint64x2 t0, t1, t2;
+    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
+    a = v_reinterpret_as_s64(t0);
+    b = v_reinterpret_as_s64(t1);
+    c = v_reinterpret_as_s64(t2);
+}
+
+inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
+{
+    v_uint64x2 t0, t1, t2;
+    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
+    a = v_reinterpret_as_f64(t0);
+    b = v_reinterpret_as_f64(t1);
+    c = v_reinterpret_as_f64(t2);
+}
+
 // 2-channel, float only
 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
 {
@ -1717,6 +1755,27 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
    _mm_storeu_ps((ptr + 4), u1);
 }

+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
+{
+    __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
+    __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
+    __m128i t2 = _mm_unpackhi_epi64(b.val, c.val);
+
+    _mm_storeu_si128((__m128i*)ptr, t0);
+    _mm_storeu_si128((__m128i*)(ptr + 2), t1);
+    _mm_storeu_si128((__m128i*)(ptr + 4), t2);
+}
+
+inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
+{
+    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
+}
+
+inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
+}
+
 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
 inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
@ -1,2 +1,3 @@
 set(the_description "Image Processing")
+ocv_add_dispatched_file(accum SSE2 AVX NEON)
 ocv_define_module(imgproc opencv_core WRAP java python)
--- a/modules/imgproc/src/accum.cpp
+++ b/modules/imgproc/src/accum.cpp
--- a/modules/imgproc/src/accum.dispatch.cpp
+++ b/modules/imgproc/src/accum.dispatch.cpp
@ -0,0 +1,20 @@
+ // This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+
+#include "accum.simd.hpp"
+#include "accum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
+
+namespace cv {
+
+DEF_ACC_INT_FUNCS(8u32f, uchar, float)
+DEF_ACC_INT_FUNCS(8u64f, uchar, double)
+DEF_ACC_INT_FUNCS(16u32f, ushort, float)
+DEF_ACC_INT_FUNCS(16u64f, ushort, double)
+DEF_ACC_FLT_FUNCS(32f, float, float)
+DEF_ACC_FLT_FUNCS(32f64f, float, double)
+DEF_ACC_FLT_FUNCS(64f, double, double)
+
+} //cv::hal
--- a/modules/imgproc/src/accum.simd.hpp
+++ b/modules/imgproc/src/accum.simd.hpp