From 8af8c4d0aa4023effc5bde64e552d8562521a438 Mon Sep 17 00:00:00 2001 From: matze Date: Fri, 2 Sep 2016 20:09:13 +0200 Subject: [PATCH 1/8] Improves findContours using SSE _mm_movemask_epi8 to find next contour point. Cleaned up code a little bit --- modules/imgproc/src/contours.cpp | 230 ++++++++++++++++++++++++++----- 1 file changed, 199 insertions(+), 31 deletions(-) diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 5f08be5fe5..163b88b2a8 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -50,6 +50,21 @@ static const CvPoint icvCodeDeltas[8] = { CvPoint(1, 0), CvPoint(1, -1), CvPoint(0, -1), CvPoint(-1, -1), CvPoint(-1, 0), CvPoint(-1, 1), CvPoint(0, 1), CvPoint(1, 1) }; +inline unsigned int trailingZeros(unsigned int value) { +#if defined(_MSC_VER) + return _tzcnt_u32(value); +#elif defined(_GCC) + return __builtin_ctz(value); +#elif defined(__INTEL_COMPILER) + //return _bit_scan_reverse(value); +#else + static const int MultiplyDeBruijnBitPosition[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; + return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27]; +#endif +} + CV_IMPL void cvStartReadChainPoints( CvChain * chain, CvChainPtReader * reader ) { @@ -527,10 +542,8 @@ icvFetchContour( schar *ptr, { s = (s - 1) & 7; i1 = i0 + deltas[s]; - if( *i1 != 0 ) - break; } - while( s != s_end ); + while( *i1 == 0 && s != s_end ); if( s == s_end ) /* single pixel domain */ { @@ -631,10 +644,8 @@ icvTraceContour( schar *ptr, int step, schar *stop_ptr, int is_hole ) { s = (s - 1) & 7; i1 = i0 + deltas[s]; - if( *i1 != 0 ) - break; } - while( s != s_end ); + while( *i1 == 0 && s != s_end ); i3 = i0; @@ -644,7 +655,6 @@ icvTraceContour( schar *ptr, int step, schar *stop_ptr, int is_hole ) /* follow border */ for( ;; ) { - s_end = s; for( ;; ) { @@ -702,10 +712,8 @@ icvFetchContourEx( 
schar* ptr, { s = (s - 1) & 7; i1 = i0 + deltas[s]; - if( *i1 != 0 ) - break; } - while( s != s_end ); + while( *i1 == 0 && s != s_end ); if( s == s_end ) /* single pixel domain */ { @@ -817,10 +825,8 @@ icvTraceContour_32s( int *ptr, int step, int *stop_ptr, int is_hole ) { s = (s - 1) & 7; i1 = i0 + deltas[s]; - if( (*i1 & value_mask) == ccomp_val ) - break; } - while( s != s_end ); + while( (*i1 & value_mask) != ccomp_val && s != s_end ); i3 = i0; @@ -892,10 +898,8 @@ icvFetchContourEx_32s( int* ptr, { s = (s - 1) & 7; i1 = i0 + deltas[s]; - if( (*i1 & value_mask) == ccomp_val ) - break; } - while( s != s_end ); + while( (*i1 & value_mask) != ccomp_val && s != s_end ); if( s == s_end ) /* single pixel domain */ { @@ -990,6 +994,11 @@ cvFindNextContour( CvContourScanner scanner ) { if( !scanner ) CV_Error( CV_StsNullPtr, "" ); + +#if CV_SSE2 + bool haveSSE2 = cv::checkHardwareSupport(CPU_SSE2); +#endif + icvEndProcessContour( scanner ); /* initialize local state */ @@ -1034,13 +1043,60 @@ cvFindNextContour( CvContourScanner scanner ) } else { +#if CV_SSE2 + if ((p = img[x]) != prev) { + goto _next_contour; + } else if (haveSSE2) { + + __m128i v_prev = _mm_set1_epi8(prev); + int v_size = width - 32; + + for (; x <= v_size; x += 32) { + __m128i v_p1 = _mm_loadu_si128((__m128i*)(img + x)); + __m128i v_p2 = _mm_loadu_si128((__m128i*)(img + x + 16)); + + __m128i v_cmp1 = _mm_cmpeq_epi8(v_p1, v_prev); + __m128i v_cmp2 = _mm_cmpeq_epi8(v_p2, v_prev); + + unsigned int mask1 = _mm_movemask_epi8(v_cmp1); + unsigned int mask2 = _mm_movemask_epi8(v_cmp2); + + mask1 ^= 0x0000ffff; + mask2 ^= 0x0000ffff; + + if (mask1) { + p = img[(x += trailingZeros(mask1))]; + goto _next_contour; + } + + if (mask2) { + p = img[(x += trailingZeros(mask2 << 16))]; + goto _next_contour; + } + } + + if(x <= width - 16) { + __m128i v_p = _mm_loadu_si128((__m128i*)(img + x)); + + unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v_p, v_prev)) ^ 0x0000ffff; + + if (mask) { + p = img[(x += 
trailingZeros(mask))]; + goto _next_contour; + } + x += 16; + } + } +#endif for( ; x < width && (p = img[x]) == prev; x++ ) ; } if( x >= width ) break; - +#if CV_SSE2 + _next_contour: +#endif { _CvContourInfo *par_info = 0; _CvContourInfo *l_cinfo = 0; @@ -1275,7 +1331,7 @@ cvEndFindContours( CvContourScanner * _scanner ) #define ICV_SINGLE 0 #define ICV_CONNECTING_ABOVE 1 #define ICV_CONNECTING_BELOW -1 -#define ICV_IS_COMPONENT_POINT(val) ((val) != 0) +//#define ICV_IS_COMPONENT_POINT(val) ((val) != 0) #define CV_GET_WRITTEN_ELEM( writer ) ((writer).ptr - (writer).seq->elem_size) @@ -1287,6 +1343,108 @@ typedef struct CvLinkedRunPoint } CvLinkedRunPoint; +#if CV_SSE2 +inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j, bool haveSSE2) { +#else +inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) { +#endif +#if CV_SSE2 + if (haveSSE2) { + __m128i sseZero = _mm_setzero_si128(); + int sizeSse = img_size.width - 32; + + for (; j <= sizeSse; j += 32) { + __m128i sseP1 = _mm_loadu_si128((__m128i*)(src_data + j)); + __m128i sseP2 = _mm_loadu_si128((__m128i*)(src_data + j + 16)); + + __m128i sseCmp1 = _mm_cmpeq_epi8(sseP1, sseZero); + __m128i sseCmp2 = _mm_cmpeq_epi8(sseP2, sseZero); + + unsigned int mask1 = _mm_movemask_epi8(sseCmp1); + unsigned int mask2 = _mm_movemask_epi8(sseCmp2); + + mask1 ^= 0x0000ffff; + mask2 ^= 0x0000ffff; + + if (mask1) { + j += trailingZeros(mask1); + return j; + } + + if (mask2) { + j += trailingZeros(mask2 << 16); + return j; + } + } + + if (j <= img_size.width - 16) { + __m128i sseP = _mm_loadu_si128((__m128i*)(src_data + j)); + + unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(sseP, sseZero)) ^ 0x0000ffff; + + if (mask) { + j += trailingZeros(mask); + return j; + } + j += 16; + } + } +#endif + for (; j < img_size.width && !src_data[j]; ++j) + ; + return j; +} + +#if CV_SSE2 +inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j, bool haveSSE2) { +#else +inline int 
findEndContourPoint(uchar *src_data, CvSize img_size, int j) { +#endif +#if CV_SSE2 + if (!src_data[j]) { + return j; + } else if (haveSSE2) { + __m128i sseZero = _mm_setzero_si128(); + int sizeSse = img_size.width - 32; + + for (; j <= sizeSse; j += 32) { + __m128i sseP1 = _mm_loadu_si128((__m128i*)(src_data + j)); + __m128i sseP2 = _mm_loadu_si128((__m128i*)(src_data + j + 16)); + + __m128i sseCmp1 = _mm_cmpeq_epi8(sseP1, sseZero); + __m128i sseCmp2 = _mm_cmpeq_epi8(sseP2, sseZero); + + unsigned int mask1 = _mm_movemask_epi8(sseCmp1); + unsigned int mask2 = _mm_movemask_epi8(sseCmp2); + + if (mask1) { + j += trailingZeros(mask1); + return j; + } + + if (mask2) { + j += trailingZeros(mask2 << 16); + return j; + } + } + + if (j <= img_size.width - 16) { + __m128i sseP = _mm_loadu_si128((__m128i*)(src_data + j)); + + unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(sseP, sseZero)); + + if (mask) { + j += trailingZeros(mask); + return j; + } + j += 16; + } + } +#endif + for (; j < img_size.width && src_data[j]; ++j) + ; + return j; +} static int icvFindContoursInInterval( const CvArr* src, @@ -1339,7 +1497,9 @@ icvFindContoursInInterval( const CvArr* src, if( contourHeaderSize < (int)sizeof(CvContour)) CV_Error( CV_StsBadSize, "Contour header size must be >= sizeof(CvContour)" ); - +#if CV_SSE2 + bool haveSSE2 = cv::checkHardwareSupport(CPU_SSE2); +#endif storage00.reset(cvCreateChildMemStorage(storage)); storage01.reset(cvCreateChildMemStorage(storage)); @@ -1372,8 +1532,11 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev = upper_line; for( j = 0; j < img_size.width; ) { - for( ; j < img_size.width && !ICV_IS_COMPONENT_POINT(src_data[j]); j++ ) - ; +#if CV_SSE2 + j = findStartContourPoint(src_data, img_size, j, haveSSE2); +#else + j = findStartContourPoint(src_data, img_size, j); +#endif if( j == img_size.width ) break; @@ -1381,10 +1544,11 @@ icvFindContoursInInterval( const CvArr* src, CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev->next = 
(CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; - - for( ; j < img_size.width && ICV_IS_COMPONENT_POINT(src_data[j]); j++ ) - ; - +#if CV_SSE2 + j = findEndContourPoint(src_data, img_size, j, haveSSE2); +#else + j = findEndContourPoint(src_data, img_size, j); +#endif tmp.pt.x = j-1; CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); @@ -1407,18 +1571,22 @@ icvFindContoursInInterval( const CvArr* src, all_total = runs->total; for( j = 0; j < img_size.width; ) { - for( ; j < img_size.width && !ICV_IS_COMPONENT_POINT(src_data[j]); j++ ) - ; +#if CV_SSE2 + j = findStartContourPoint(src_data, img_size, j, haveSSE2); +#else + j = findStartContourPoint(src_data, img_size, j); +#endif if( j == img_size.width ) break; tmp.pt.x = j; CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; - - for( ; j < img_size.width && ICV_IS_COMPONENT_POINT(src_data[j]); j++ ) - ; - +#if CV_SSE2 + j = findEndContourPoint(src_data, img_size, j, haveSSE2); +#else + j = findEndContourPoint(src_data, img_size, j); +#endif tmp.pt.x = j-1; CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev = tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); From 0acd818efc3856b06e3e90515b347ef1052c477e Mon Sep 17 00:00:00 2001 From: matze Date: Fri, 2 Sep 2016 20:35:08 +0200 Subject: [PATCH 2/8] Removed #elif defined(__INTEL_COMPILER) because it is disabled at the moment --- modules/imgproc/src/contours.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 163b88b2a8..b27c1552dc 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -55,7 +55,7 @@ inline unsigned int trailingZeros(unsigned int value) { return _tzcnt_u32(value); #elif defined(_GCC) return __builtin_ctz(value); -#elif defined(__INTEL_COMPILER) +//#elif 
defined(__INTEL_COMPILER) //return _bit_scan_reverse(value); #else static const int MultiplyDeBruijnBitPosition[32] = { From 25cf33d5c83ea3b5d98ee78b9db1c933ed5e703c Mon Sep 17 00:00:00 2001 From: matze Date: Fri, 2 Sep 2016 21:09:25 +0200 Subject: [PATCH 3/8] Warning fixed by adding a cast to char --- modules/imgproc/src/contours.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index b27c1552dc..dde5421c75 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -1048,7 +1048,7 @@ cvFindNextContour( CvContourScanner scanner ) goto _next_contour; } else if (haveSSE2) { - __m128i v_prev = _mm_set1_epi8(prev); + __m128i v_prev = _mm_set1_epi8((char)prev); int v_size = width - 32; for (; x <= v_size; x += 32) { From f6451c7ae6221970055624ff801a721e75fd0fc5 Mon Sep 17 00:00:00 2001 From: matze Date: Sat, 3 Sep 2016 14:35:09 +0200 Subject: [PATCH 4/8] Variable names renamed. Macro definitions fixed. findEndContourPoint function fixed. findContoursInInterval adjusted. 
--- modules/imgproc/src/contours.cpp | 85 +++++++++++++++++--------------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index dde5421c75..f6db9b086b 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -52,11 +52,17 @@ static const CvPoint icvCodeDeltas[8] = inline unsigned int trailingZeros(unsigned int value) { #if defined(_MSC_VER) +#if (_MSC_VER < 1500) + return _BitScanForward(value); +#else return _tzcnt_u32(value); -#elif defined(_GCC) +#endif +#elif defined(__GNUC__) || defined(__GNUG__) return __builtin_ctz(value); -//#elif defined(__INTEL_COMPILER) - //return _bit_scan_reverse(value); +//#elif defined(__ICC) || defined(__INTEL_COMPILER) +// return _bit_scan_forward(value); +//#elif defined(__clang__) +// return llvm.cttz.i32(value, true); #else static const int MultiplyDeBruijnBitPosition[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, @@ -1052,8 +1058,8 @@ cvFindNextContour( CvContourScanner scanner ) int v_size = width - 32; for (; x <= v_size; x += 32) { - __m128i v_p1 = _mm_loadu_si128((__m128i*)(img + x)); - __m128i v_p2 = _mm_loadu_si128((__m128i*)(img + x + 16)); + __m128i v_p1 = _mm_loadu_si128((const __m128i*)(img + x)); + __m128i v_p2 = _mm_loadu_si128((const __m128i*)(img + x + 16)); __m128i v_cmp1 = _mm_cmpeq_epi8(v_p1, v_prev); __m128i v_cmp2 = _mm_cmpeq_epi8(v_p2, v_prev); @@ -1350,18 +1356,18 @@ inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) { #endif #if CV_SSE2 if (haveSSE2) { - __m128i sseZero = _mm_setzero_si128(); - int sizeSse = img_size.width - 32; + __m128i v_zero = _mm_setzero_si128(); + int v_size = img_size.width - 32; - for (; j <= sizeSse; j += 32) { - __m128i sseP1 = _mm_loadu_si128((__m128i*)(src_data + j)); - __m128i sseP2 = _mm_loadu_si128((__m128i*)(src_data + j + 16)); + for (; j <= v_size; j += 32) { + __m128i v_p1 = _mm_loadu_si128((const __m128i*)(src_data 
+ j)); + __m128i v_p2 = _mm_loadu_si128((const __m128i*)(src_data + j + 16)); - __m128i sseCmp1 = _mm_cmpeq_epi8(sseP1, sseZero); - __m128i sseCmp2 = _mm_cmpeq_epi8(sseP2, sseZero); + __m128i v_cmp1 = _mm_cmpeq_epi8(v_p1, v_zero); + __m128i v_cmp2 = _mm_cmpeq_epi8(v_p2, v_zero); - unsigned int mask1 = _mm_movemask_epi8(sseCmp1); - unsigned int mask2 = _mm_movemask_epi8(sseCmp2); + unsigned int mask1 = _mm_movemask_epi8(v_cmp1); + unsigned int mask2 = _mm_movemask_epi8(v_cmp2); mask1 ^= 0x0000ffff; mask2 ^= 0x0000ffff; @@ -1378,9 +1384,9 @@ inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) { } if (j <= img_size.width - 16) { - __m128i sseP = _mm_loadu_si128((__m128i*)(src_data + j)); + __m128i v_p = _mm_loadu_si128((const __m128i*)(src_data + j)); - unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(sseP, sseZero)) ^ 0x0000ffff; + unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v_p, v_zero)) ^ 0x0000ffff; if (mask) { j += trailingZeros(mask); @@ -1401,40 +1407,40 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j, bool hav inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) { #endif #if CV_SSE2 - if (!src_data[j]) { - return j; + if (j < img_size.width && !src_data[j]) { + return j - 1; } else if (haveSSE2) { - __m128i sseZero = _mm_setzero_si128(); - int sizeSse = img_size.width - 32; + __m128i v_zero = _mm_setzero_si128(); + int v_size = img_size.width - 32; - for (; j <= sizeSse; j += 32) { - __m128i sseP1 = _mm_loadu_si128((__m128i*)(src_data + j)); - __m128i sseP2 = _mm_loadu_si128((__m128i*)(src_data + j + 16)); + for (; j <= v_size; j += 32) { + __m128i v_p1 = _mm_loadu_si128((const __m128i*)(src_data + j)); + __m128i v_p2 = _mm_loadu_si128((const __m128i*)(src_data + j + 16)); - __m128i sseCmp1 = _mm_cmpeq_epi8(sseP1, sseZero); - __m128i sseCmp2 = _mm_cmpeq_epi8(sseP2, sseZero); + __m128i v_cmp1 = _mm_cmpeq_epi8(v_p1, v_zero); + __m128i v_cmp2 = _mm_cmpeq_epi8(v_p2, v_zero); - unsigned 
int mask1 = _mm_movemask_epi8(sseCmp1); - unsigned int mask2 = _mm_movemask_epi8(sseCmp2); + unsigned int mask1 = _mm_movemask_epi8(v_cmp1); + unsigned int mask2 = _mm_movemask_epi8(v_cmp2); if (mask1) { - j += trailingZeros(mask1); + j += (trailingZeros(mask1) - 1); return j; } if (mask2) { - j += trailingZeros(mask2 << 16); + j += trailingZeros(mask2 << 15); return j; } } if (j <= img_size.width - 16) { - __m128i sseP = _mm_loadu_si128((__m128i*)(src_data + j)); + __m128i v_p = _mm_loadu_si128((const __m128i*)(src_data + j)); - unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(sseP, sseZero)); + unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v_p, v_zero)); if (mask) { - j += trailingZeros(mask); + j += (trailingZeros(mask) - 1); return j; } j += 16; @@ -1443,7 +1449,8 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) { #endif for (; j < img_size.width && src_data[j]; ++j) ; - return j; + + return j - 1; } static int @@ -1545,11 +1552,11 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; #if CV_SSE2 - j = findEndContourPoint(src_data, img_size, j, haveSSE2); + j = findEndContourPoint(src_data, img_size, j+1, haveSSE2); #else - j = findEndContourPoint(src_data, img_size, j); + j = findEndContourPoint(src_data, img_size, j+1); #endif - tmp.pt.x = j-1; + tmp.pt.x = j; CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev->link = tmp_prev->next; @@ -1583,11 +1590,11 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; #if CV_SSE2 - j = findEndContourPoint(src_data, img_size, j, haveSSE2); + j = findEndContourPoint(src_data, img_size, j+1, haveSSE2); #else - j = findEndContourPoint(src_data, img_size, j); + j = findEndContourPoint(src_data, img_size, j+1); #endif - tmp.pt.x = j-1; + tmp.pt.x = j; 
CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev = tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); }//j From fd6f54c89511a19d475e657b9748efc55db2f010 Mon Sep 17 00:00:00 2001 From: matze Date: Sat, 3 Sep 2016 20:09:39 +0200 Subject: [PATCH 5/8] Bug fixed in findEndContours and icvFindContoursInInterval adjusted. --- modules/imgproc/src/contours.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index f6db9b086b..9816714f7c 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -1408,7 +1408,7 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) { #endif #if CV_SSE2 if (j < img_size.width && !src_data[j]) { - return j - 1; + return j; } else if (haveSSE2) { __m128i v_zero = _mm_setzero_si128(); int v_size = img_size.width - 32; @@ -1424,12 +1424,12 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) { unsigned int mask2 = _mm_movemask_epi8(v_cmp2); if (mask1) { - j += (trailingZeros(mask1) - 1); + j += trailingZeros(mask1); return j; } if (mask2) { - j += trailingZeros(mask2 << 15); + j += trailingZeros(mask2 << 16); return j; } } @@ -1440,7 +1440,7 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) { unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v_p, v_zero)); if (mask) { - j += (trailingZeros(mask) - 1); + j += trailingZeros(mask); return j; } j += 16; @@ -1450,7 +1450,7 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) { for (; j < img_size.width && src_data[j]; ++j) ; - return j - 1; + return j; } static int @@ -1556,7 +1556,7 @@ icvFindContoursInInterval( const CvArr* src, #else j = findEndContourPoint(src_data, img_size, j+1); #endif - tmp.pt.x = j; + tmp.pt.x = j - 1; CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev->link = tmp_prev->next; @@ -1594,7 
+1594,7 @@ icvFindContoursInInterval( const CvArr* src, #else j = findEndContourPoint(src_data, img_size, j+1); #endif - tmp.pt.x = j; + tmp.pt.x = j - 1; CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev = tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); }//j From aaa255465e3ce90c8d4c56e9ee86eec2f5b7fa50 Mon Sep 17 00:00:00 2001 From: matze Date: Sun, 11 Sep 2016 13:15:53 +0200 Subject: [PATCH 6/8] Renamed haveSSE2 to haveSIMD. Conditional compilation for function header removed --- modules/imgproc/src/contours.cpp | 53 ++++++++++---------------------- 1 file changed, 17 insertions(+), 36 deletions(-) diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 9816714f7c..391d2b9262 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -1002,7 +1002,7 @@ cvFindNextContour( CvContourScanner scanner ) CV_Error( CV_StsNullPtr, "" ); #if CV_SSE2 - bool haveSSE2 = cv::checkHardwareSupport(CPU_SSE2); + bool haveSIMD = cv::checkHardwareSupport(CPU_SSE2); #endif icvEndProcessContour( scanner ); @@ -1052,7 +1052,7 @@ cvFindNextContour( CvContourScanner scanner ) #if CV_SSE2 if ((p = img[x]) != prev) { goto _next_contour; - } else if (haveSSE2) { + } else if (haveSIMD) { __m128i v_prev = _mm_set1_epi8((char)prev); int v_size = width - 32; @@ -1337,7 +1337,6 @@ cvEndFindContours( CvContourScanner * _scanner ) #define ICV_SINGLE 0 #define ICV_CONNECTING_ABOVE 1 #define ICV_CONNECTING_BELOW -1 -//#define ICV_IS_COMPONENT_POINT(val) ((val) != 0) #define CV_GET_WRITTEN_ELEM( writer ) ((writer).ptr - (writer).seq->elem_size) @@ -1349,13 +1348,9 @@ typedef struct CvLinkedRunPoint } CvLinkedRunPoint; +inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j, bool haveSIMD) { #if CV_SSE2 -inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j, bool haveSSE2) { -#else -inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) { -#endif -#if CV_SSE2 - if (haveSSE2) 
{ + if (haveSIMD) { __m128i v_zero = _mm_setzero_si128(); int v_size = img_size.width - 32; @@ -1401,15 +1396,11 @@ inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) { return j; } -#if CV_SSE2 -inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j, bool haveSSE2) { -#else -inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) { -#endif +inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j, bool haveSIMD) { #if CV_SSE2 if (j < img_size.width && !src_data[j]) { return j; - } else if (haveSSE2) { + } else if (haveSIMD) { __m128i v_zero = _mm_setzero_si128(); int v_size = img_size.width - 32; @@ -1505,7 +1496,7 @@ icvFindContoursInInterval( const CvArr* src, if( contourHeaderSize < (int)sizeof(CvContour)) CV_Error( CV_StsBadSize, "Contour header size must be >= sizeof(CvContour)" ); #if CV_SSE2 - bool haveSSE2 = cv::checkHardwareSupport(CPU_SSE2); + bool haveSIMD = cv::checkHardwareSupport(CPU_SSE2); #endif storage00.reset(cvCreateChildMemStorage(storage)); storage01.reset(cvCreateChildMemStorage(storage)); @@ -1539,11 +1530,8 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev = upper_line; for( j = 0; j < img_size.width; ) { -#if CV_SSE2 - j = findStartContourPoint(src_data, img_size, j, haveSSE2); -#else - j = findStartContourPoint(src_data, img_size, j); -#endif + j = findStartContourPoint(src_data, img_size, j, haveSIMD); + if( j == img_size.width ) break; @@ -1551,11 +1539,9 @@ icvFindContoursInInterval( const CvArr* src, CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; -#if CV_SSE2 - j = findEndContourPoint(src_data, img_size, j+1, haveSSE2); -#else - j = findEndContourPoint(src_data, img_size, j+1); -#endif + + j = findEndContourPoint(src_data, img_size, j+1, haveSIMD); + tmp.pt.x = j - 1; CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); @@ -1578,22 
+1564,17 @@ icvFindContoursInInterval( const CvArr* src, all_total = runs->total; for( j = 0; j < img_size.width; ) { -#if CV_SSE2 - j = findStartContourPoint(src_data, img_size, j, haveSSE2); -#else - j = findStartContourPoint(src_data, img_size, j); -#endif + j = findStartContourPoint(src_data, img_size, j, haveSIMD); + if( j == img_size.width ) break; tmp.pt.x = j; CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; -#if CV_SSE2 - j = findEndContourPoint(src_data, img_size, j+1, haveSSE2); -#else - j = findEndContourPoint(src_data, img_size, j+1); -#endif + j = findEndContourPoint(src_data, img_size, j+1, haveSIMD); + tmp.pt.x = j - 1; CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev = tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); From a865876c0f5c757ec82a2291aedad3bdd83df48b Mon Sep 17 00:00:00 2001 From: matze Date: Tue, 13 Sep 2016 19:32:37 +0200 Subject: [PATCH 7/8] Added CV_UNUSED for the unused variable when building without SSE2 support. 
--- modules/imgproc/src/contours.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 391d2b9262..2bfb7d7893 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -1390,6 +1390,8 @@ inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j, bool h j += 16; } } +#else + CV_UNUSED(haveSIMD); #endif for (; j < img_size.width && !src_data[j]; ++j) ; @@ -1437,6 +1439,8 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j, bool hav j += 16; } } +#else + CV_UNUSED(haveSIMD); #endif for (; j < img_size.width && src_data[j]; ++j) ; @@ -1466,6 +1470,7 @@ icvFindContoursInInterval( const CvArr* src, int lower_total; int upper_total; int all_total; + bool haveSIMD = false; CvSeq* runs; CvLinkedRunPoint tmp; @@ -1496,7 +1501,7 @@ icvFindContoursInInterval( const CvArr* src, if( contourHeaderSize < (int)sizeof(CvContour)) CV_Error( CV_StsBadSize, "Contour header size must be >= sizeof(CvContour)" ); #if CV_SSE2 - bool haveSIMD = cv::checkHardwareSupport(CPU_SSE2); + haveSIMD = cv::checkHardwareSupport(CPU_SSE2); #endif storage00.reset(cvCreateChildMemStorage(storage)); storage01.reset(cvCreateChildMemStorage(storage)); @@ -1540,7 +1545,7 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; - j = findEndContourPoint(src_data, img_size, j+1, haveSIMD); + j = findEndContourPoint(src_data, img_size, j + 1, haveSIMD); tmp.pt.x = j - 1; CV_WRITE_SEQ_ELEM( tmp, writer ); @@ -1573,7 +1578,7 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; - j = findEndContourPoint(src_data, img_size, j+1, haveSIMD); + j = findEndContourPoint(src_data, img_size, j + 1, haveSIMD); tmp.pt.x = j - 1; CV_WRITE_SEQ_ELEM( tmp, writer ); From 
975d2e4294e4eac113be996c2e6a8abf4d951c36 Mon Sep 17 00:00:00 2001 From: matze Date: Wed, 14 Sep 2016 18:20:47 +0200 Subject: [PATCH 8/8] Enable built in counting trailing zeros function for Intel compiler and clang. --- modules/imgproc/src/contours.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 2bfb7d7893..c284927ac8 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -59,10 +59,10 @@ inline unsigned int trailingZeros(unsigned int value) { #endif #elif defined(__GNUC__) || defined(__GNUG__) return __builtin_ctz(value); -//#elif defined(__ICC) || defined(__INTEL_COMPILER) -// return _bit_scan_forward(value); -//#elif defined(__clang__) -// return llvm.cttz.i32(value, true); +#elif defined(__ICC) || defined(__INTEL_COMPILER) + return _bit_scan_forward(value); +#elif defined(__clang__) + return llvm.cttz.i32(value, true); #else static const int MultiplyDeBruijnBitPosition[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,