Merge pull request #13650 from terfendail:shapedescr_wintr

6 years ago · 0395b2ea9c
parent 3812ae7949 a84bbc62b1
commit 0395b2ea9c
2 changed files with 154 additions and 78 deletions
--- a/modules/imgproc/perf/perf_contours.cpp
+++ b/modules/imgproc/perf/perf_contours.cpp
@ -84,4 +84,26 @@ PERF_TEST_P(TestFindContoursFF, findContours,
    SANITY_CHECK_NOTHING();
 }
 // Fixture parameterised by (point element depth, number of points).
 typedef TestBaseWithParam< tuple<MatDepth, int> > TestBoundingRect;
 // Runs boundingRect() inside TEST_CYCLE() on a pointCount x 2 matrix
 // for every combination of point depth and point count listed below.
 PERF_TEST_P(TestBoundingRect, BoundingRect,
    Combine(
        testing::Values(CV_32S, CV_32F), // points type
        Values(400, 511, 1000, 10000, 100000) // points count
    )
 )
 {
    const tuple<MatDepth, int>& params = GetParam();
    const int depth = get<0>(params);
    const int pointCount = get<1>(params);
    // One row per point, two columns (x, y), single channel of the given depth.
    Mat points(pointCount, 2, depth);
    declare.in(points, WARMUP_RNG);
    cv::Rect bounds;
    TEST_CYCLE()
    {
        bounds = boundingRect(points);
    }
    // The result is not compared against anything here.
    SANITY_CHECK_NOTHING();
 }
 } } // namespace
--- a/modules/imgproc/src/shapedescr.cpp
+++ b/modules/imgproc/src/shapedescr.cpp
@ -39,6 +39,8 @@
 //
 //M*/
 #include "precomp.hpp"
 #include "opencv2/core/hal/intrin.hpp"
 namespace cv
 {
@ -746,53 +748,105 @@ static Rect pointSetBoundingRect( const Mat& points )
    if( npoints == 0 )
        return Rect();
-    const Point* pts = points.ptr<Point>();
+#if CV_SIMD
-    Point pt = pts[0];
+    const int64_t* pts = points.ptr<int64_t>();
 #if CV_SSE4_2
    if(cv::checkHardwareSupport(CV_CPU_SSE4_2))
    {
    if( !is_float )
    {
-            __m128i minval, maxval;
+        v_int32 minval, maxval;
-            minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y
+        minval = maxval = v_reinterpret_as_s32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-
+        for( i = 1; i <= npoints - v_int32::nlanes/2; i+= v_int32::nlanes/2 )
            for( i = 1; i < npoints; i++ )
        {
-                __m128i ptXY = _mm_loadl_epi64((const __m128i*)&pts[i]);
+            v_int32 ptXY2 = v_reinterpret_as_s32(vx_load(pts + i));
-                minval = _mm_min_epi32(ptXY, minval);
+            minval = v_min(ptXY2, minval);
-                maxval = _mm_max_epi32(ptXY, maxval);
+            maxval = v_max(ptXY2, maxval);
        }
-            xmin = _mm_cvtsi128_si32(minval);
+        minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
-            ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4));
+        maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
-            xmax = _mm_cvtsi128_si32(maxval);
+        if( i <= npoints - v_int32::nlanes/4 )
-            ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4));
+        {
            v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
            minval = v_min(ptXY, minval);
            maxval = v_max(ptXY, maxval);
            i += v_int64::nlanes/2;
        }
-        else
+        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
        {
-            __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps();
+            minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
-            minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt));
+            maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
-
+        }
-            for( i = 1; i < npoints; i++ )
+        xmin = minval.get0();
        xmax = maxval.get0();
        ymin = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))).get0();
        ymax = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))).get0();
 #if CV_SIMD_WIDTH > 16
        if( i < npoints )
        {
-                ptXY = _mm_loadl_pi(ptXY, (const __m64*)&pts[i]);
+            v_int32x4 minval2, maxval2;
-
+            minval2 = maxval2 = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
-                minvalf = _mm_min_ps(minvalf, ptXY);
+            for( i++; i < npoints; i++ )
-                maxvalf = _mm_max_ps(maxvalf, ptXY);
+            {
                v_int32x4 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
                minval2 = v_min(ptXY, minval2);
                maxval2 = v_max(ptXY, maxval2);
            }
-
+            xmin = min(xmin, minval2.get0());
-            float xyminf[2], xymaxf[2];
+            xmax = max(xmax, maxval2.get0());
-            _mm_storel_pi((__m64*)xyminf, minvalf);
+            ymin = min(ymin, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2))).get0());
-            _mm_storel_pi((__m64*)xymaxf, maxvalf);
+            ymax = max(ymax, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0());
            xmin = cvFloor(xyminf[0]);
            ymin = cvFloor(xyminf[1]);
            xmax = cvFloor(xymaxf[0]);
            ymax = cvFloor(xymaxf[1]);
        }
 #endif
    }
    else
 #endif
    {
        v_float32 minval, maxval;
        minval = maxval = v_reinterpret_as_f32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
        for( i = 1; i <= npoints - v_float32::nlanes/2; i+= v_float32::nlanes/2 )
        {
            v_float32 ptXY2 = v_reinterpret_as_f32(vx_load(pts + i));
            minval = v_min(ptXY2, minval);
            maxval = v_max(ptXY2, maxval);
        }
        minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
        maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
        if( i <= npoints - v_float32::nlanes/4 )
        {
            v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
            minval = v_min(ptXY, minval);
            maxval = v_max(ptXY, maxval);
            i += v_float32::nlanes/4;
        }
        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
        {
            minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
            maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
        }
        xmin = cvFloor(minval.get0());
        xmax = cvFloor(maxval.get0());
        ymin = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))).get0());
        ymax = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))).get0());
 #if CV_SIMD_WIDTH > 16
        if( i < npoints )
        {
            v_float32x4 minval2, maxval2;
            minval2 = maxval2 = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
            for( i++; i < npoints; i++ )
            {
                v_float32x4 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(v_load_low(pts + i))));
                minval2 = v_min(ptXY, minval2);
                maxval2 = v_max(ptXY, maxval2);
            }
            xmin = min(xmin, cvFloor(minval2.get0()));
            xmax = max(xmax, cvFloor(maxval2.get0()));
            ymin = min(ymin, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))).get0()));
            ymax = max(ymax, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0()));
        }
 #endif
    }
 #else
    const Point* pts = points.ptr<Point>();
    Point pt = pts[0];
    if( !is_float )
    {
        xmin = xmax = pt.x;
@ -848,7 +902,7 @@ static Rect pointSetBoundingRect( const Mat& points )
        v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f);
        v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f);
    }
-    }
+#endif
    return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
 }