remove raw SSE2/NEON implementation from imgwarp.cpp

* use universal intrinsic instead of raw intrinsic * add 2 channels de-interleave on x86 platform * add v_int32x4 version of v_muladd * add accumulate version of v_dotprod based on the commit from seiko2plus on bf1852d * remove some verify check in performance test * avoid the out of boundary access and keep the performance
7 years ago · a82e70cd40
parent fdd83e5027
commit a82e70cd40
8 changed files with 558 additions and 553 deletions
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@ -795,7 +795,7 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
 /** @brief Multiply and add

 Returns \f$ a*b + c \f$
-For floating point types only. */
+For floating point types and signed 32bit int only. */
 template<typename _Tp, int n>
 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                              const v_reg<_Tp, n>& c)
@ -828,6 +828,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
    return c;
 }

+/** @brief Dot product of elements
+
+Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs.
+Scheme:
+@code
+  {A1 A2 ...} // 16-bit
+x {B1 B2 ...} // 16-bit
+-------------
+  {A1B1+A2B2+C1 ...} // 32-bit
+
+@endcode
+Implemented only for 16-bit signed source type (v_int16x8).
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+    v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type w_type;
+    v_reg<w_type, n/2> s;
+    for( int i = 0; i < (n/2); i++ )
+        s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
+    return s;
+}
+
 /** @brief Multiply and expand

 Multiply values two registers and store results in two registers with wider pack type.
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@ -506,6 +506,12 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
    return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
 }

+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    v_int32x4 s = v_dotprod(a, b);
+    return v_int32x4(vaddq_s32(s.val , c.val));
+}
+
 #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
@ -730,6 +736,11 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_
    return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
 }

+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
+}
+
 #if CV_SIMD128_64F
 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
 {
@ -1095,6 +1106,18 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
 OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
 #endif

+#if CV_SIMD128_64F
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    float32x4_t a_ = a.val;
+    int32x4_t result;
+    __asm__ ("fcvtns %0.4s, %1.4s"
+             : "=w"(result)
+             : "w"(a_)
+             : /* No clobbers */);
+    return v_int32x4(result);
+}
+#else
 inline v_int32x4 v_round(const v_float32x4& a)
 {
    static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
@ -1103,7 +1126,7 @@ inline v_int32x4 v_round(const v_float32x4& a)
    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
 }
-
+#endif
 inline v_int32x4 v_floor(const v_float32x4& a)
 {
    int32x4_t a1 = vcvtq_s32_f32(a.val);
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@ -710,6 +710,11 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
    return v_int32x4(_mm_madd_epi16(a.val, b.val));
 }

+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val));
+}
+
 #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
@ -954,6 +959,10 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
    __m128i m = _mm_cmpgt_epi32(b.val, a.val);
    return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
 }
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return a * b + c;
+}

 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
@ -1599,7 +1608,7 @@ inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2&
    c = v_reinterpret_as_f64(t2);
 }

-// 2-channel, float only
+// 2-channel
 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
 {
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
@ -1611,7 +1620,29 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b
    b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
 }

-inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b )
+inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b)
+{
+    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));     // a0 b0 a1 b1 a2 b2 a3 b3
+    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
+
+    __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
+    __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
+    __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
+    __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
+
+    a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
+    b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 ab b3 b4 b5 b6 b7
+}
+
+inline void v_load_deinterleave(const ushort*ptr, v_uint16x8& a, v_uint16x8& b)
+{
+    v_int16x8 sa, sb;
+    v_load_deinterleave((const short*)ptr, sa, sb);
+    a = v_reinterpret_as_u16(sa);
+    b = v_reinterpret_as_u16(sb);
+}
+
+inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
 {
    __m128i t0, t1;
    t0 = _mm_unpacklo_epi16(a.val, b.val);
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@ -821,6 +821,9 @@ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)   \
 OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
 OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)

+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{ return a * b + c; }
+
 // TODO: exp, log, sin, cos

 /** Absolute values **/
@ -904,6 +907,9 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 { return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }

+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
+
 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@ -521,15 +521,25 @@ template<typename R> struct TheTest
    TheTest & test_dot_prod()
    {
        typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
+        typedef typename Rx2::lane_type w_type;
+
        Data<R> dataA, dataB(2);
        R a = dataA, b = dataB;

-        Data<Rx2> res = v_dotprod(a, b);
+        Data<Rx2> dataC;
+        dataC += std::numeric_limits<w_type>::is_signed ?
+                    std::numeric_limits<w_type>::min() :
+                    std::numeric_limits<w_type>::max() - R::nlanes * (dataB[0] + 1);
+        Rx2 c = dataC;
+
+        Data<Rx2> resD = v_dotprod(a, b),
+                  resE = v_dotprod(a, b, c);

        const int n = R::nlanes / 2;
        for (int i = 0; i < n; ++i)
        {
-            EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]);
+            EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
+            EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
        }
        return *this;
    }
--- a/modules/imgproc/perf/opencl/perf_imgwarp.cpp
+++ b/modules/imgproc/perf/opencl/perf_imgwarp.cpp
@ -229,7 +229,7 @@ OCL_PERF_TEST_P(RemapFixture, Remap,

    OCL_TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode);

-    SANITY_CHECK(dst, eps);
+    SANITY_CHECK_NOTHING();
 }

 } } // namespace opencv_test::ocl
--- a/modules/imgproc/perf/perf_warp.cpp
+++ b/modules/imgproc/perf/perf_warp.cpp
@ -202,8 +202,8 @@ PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear,

 PERF_TEST_P( TestRemap, remap,
             Combine(
-                 Values( TYPICAL_MAT_TYPES ),
-                 Values( szVGA, sz720p, sz1080p ),
+                 Values( CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 ),
+                 Values( szVGA, sz1080p ),
                 InterType::all(),
                 BorderMode::all(),
                 RemapMode::all()
@ -231,7 +231,7 @@ PERF_TEST_P( TestRemap, remap,
        remap(source, destination, map_x, map_y, interpolationType, borderMode);
    }

-    SANITY_CHECK(destination, 1);
+    SANITY_CHECK_NOTHING();
 }

 void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode )
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp