diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 5f08be5fe5..c284927ac8 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -50,6 +50,27 @@ static const CvPoint icvCodeDeltas[8] = { CvPoint(1, 0), CvPoint(1, -1), CvPoint(0, -1), CvPoint(-1, -1), CvPoint(-1, 0), CvPoint(-1, 1), CvPoint(0, 1), CvPoint(1, 1) }; +inline unsigned int trailingZeros(unsigned int value) { +#if defined(_MSC_VER) +#if (_MSC_VER < 1500) + return _BitScanForward(value); +#else + return _tzcnt_u32(value); +#endif +#elif defined(__GNUC__) || defined(__GNUG__) + return __builtin_ctz(value); +#elif defined(__ICC) || defined(__INTEL_COMPILER) + return _bit_scan_forward(value); +#elif defined(__clang__) + return llvm.cttz.i32(value, true); +#else + static const int MultiplyDeBruijnBitPosition[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; + return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27]; +#endif +} + CV_IMPL void cvStartReadChainPoints( CvChain * chain, CvChainPtReader * reader ) { @@ -527,10 +548,8 @@ icvFetchContour( schar *ptr, { s = (s - 1) & 7; i1 = i0 + deltas[s]; - if( *i1 != 0 ) - break; } - while( s != s_end ); + while( *i1 == 0 && s != s_end ); if( s == s_end ) /* single pixel domain */ { @@ -631,10 +650,8 @@ icvTraceContour( schar *ptr, int step, schar *stop_ptr, int is_hole ) { s = (s - 1) & 7; i1 = i0 + deltas[s]; - if( *i1 != 0 ) - break; } - while( s != s_end ); + while( *i1 == 0 && s != s_end ); i3 = i0; @@ -644,7 +661,6 @@ icvTraceContour( schar *ptr, int step, schar *stop_ptr, int is_hole ) /* follow border */ for( ;; ) { - s_end = s; for( ;; ) { @@ -702,10 +718,8 @@ icvFetchContourEx( schar* ptr, { s = (s - 1) & 7; i1 = i0 + deltas[s]; - if( *i1 != 0 ) - break; } - while( s != s_end ); + while( *i1 == 0 && s != s_end ); if( s == s_end ) /* single pixel domain */ { @@ -817,10 +831,8 @@ icvTraceContour_32s( int *ptr, int step, int *stop_ptr, int is_hole ) { s = (s - 1) & 7; i1 = i0 + deltas[s]; - if( (*i1 & value_mask) == ccomp_val ) - break; } - while( s != s_end ); + while( (*i1 & value_mask) != ccomp_val && s != s_end ); i3 = i0; @@ -892,10 +904,8 @@ icvFetchContourEx_32s( int* ptr, { s = (s - 1) & 7; i1 = i0 + deltas[s]; - if( (*i1 & value_mask) == ccomp_val ) - break; } - while( s != s_end ); + while( (*i1 & value_mask) != ccomp_val && s != s_end ); if( s == s_end ) /* single pixel domain */ { @@ -990,6 +1000,11 @@ cvFindNextContour( CvContourScanner scanner ) { if( !scanner ) CV_Error( CV_StsNullPtr, "" ); + +#if CV_SSE2 + bool haveSIMD = cv::checkHardwareSupport(CPU_SSE2); +#endif + icvEndProcessContour( scanner ); /* initialize local state */ @@ -1034,13 +1049,60 @@ cvFindNextContour( CvContourScanner scanner ) } else { +#if CV_SSE2 + if ((p = img[x]) != prev) { + goto _next_contour; + } else if (haveSIMD) { + + __m128i v_prev = _mm_set1_epi8((char)prev); + int v_size = width - 32; + + for (; x <= v_size; x += 32) { + __m128i v_p1 = _mm_loadu_si128((const __m128i*)(img + x)); + __m128i v_p2 = _mm_loadu_si128((const __m128i*)(img + x + 16)); + + __m128i v_cmp1 = _mm_cmpeq_epi8(v_p1, v_prev); + __m128i v_cmp2 = _mm_cmpeq_epi8(v_p2, v_prev); + + unsigned int mask1 = _mm_movemask_epi8(v_cmp1); + unsigned int mask2 = _mm_movemask_epi8(v_cmp2); + + mask1 ^= 0x0000ffff; + mask2 ^= 0x0000ffff; + + if (mask1) { + p = img[(x += trailingZeros(mask1))]; + goto _next_contour; + } + + if (mask2) { + p = img[(x += trailingZeros(mask2 << 16))]; + goto _next_contour; + } + } + + if(x <= width - 16) { + __m128i v_p = _mm_loadu_si128((__m128i*)(img + x)); + + unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v_p, v_prev)) ^ 0x0000ffff; + + if (mask) { + p = img[(x += trailingZeros(mask))]; + goto _next_contour; + } + x += 16; + } + } +#endif for( ; x < width && (p = img[x]) == prev; x++ ) ; } if( x >= width ) break; - +#if CV_SSE2 + _next_contour: +#endif { _CvContourInfo *par_info = 0; _CvContourInfo *l_cinfo = 0; @@ -1275,7 +1337,6 @@ cvEndFindContours( CvContourScanner * _scanner ) #define ICV_SINGLE 0 #define ICV_CONNECTING_ABOVE 1 #define ICV_CONNECTING_BELOW -1 -#define ICV_IS_COMPONENT_POINT(val) ((val) != 0) #define CV_GET_WRITTEN_ELEM( writer ) ((writer).ptr - (writer).seq->elem_size) @@ -1287,6 +1348,105 @@ typedef struct CvLinkedRunPoint } CvLinkedRunPoint; +inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j, bool haveSIMD) { +#if CV_SSE2 + if (haveSIMD) { + __m128i v_zero = _mm_setzero_si128(); + int v_size = img_size.width - 32; + + for (; j <= v_size; j += 32) { + __m128i v_p1 = _mm_loadu_si128((const __m128i*)(src_data + j)); + __m128i v_p2 = _mm_loadu_si128((const __m128i*)(src_data + j + 16)); + + __m128i v_cmp1 = _mm_cmpeq_epi8(v_p1, v_zero); + __m128i v_cmp2 = _mm_cmpeq_epi8(v_p2, v_zero); + + unsigned int mask1 = _mm_movemask_epi8(v_cmp1); + unsigned int mask2 = _mm_movemask_epi8(v_cmp2); + + mask1 ^= 0x0000ffff; + mask2 ^= 0x0000ffff; + + if (mask1) { + j += trailingZeros(mask1); + return j; + } + + if (mask2) { + j += trailingZeros(mask2 << 16); + return j; + } + } + + if (j <= img_size.width - 16) { + __m128i v_p = _mm_loadu_si128((const __m128i*)(src_data + j)); + + unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v_p, v_zero)) ^ 0x0000ffff; + + if (mask) { + j += trailingZeros(mask); + return j; + } + j += 16; + } + } +#else + CV_UNUSED(haveSIMD); +#endif + for (; j < img_size.width && !src_data[j]; ++j) + ; + return j; +} + +inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j, bool haveSIMD) { +#if CV_SSE2 + if (j < img_size.width && !src_data[j]) { + return j; + } else if (haveSIMD) { + __m128i v_zero = _mm_setzero_si128(); + int v_size = img_size.width - 32; + + for (; j <= v_size; j += 32) { + __m128i v_p1 = _mm_loadu_si128((const __m128i*)(src_data + j)); + __m128i v_p2 = _mm_loadu_si128((const __m128i*)(src_data + j + 16)); + + __m128i v_cmp1 = _mm_cmpeq_epi8(v_p1, v_zero); + __m128i v_cmp2 = _mm_cmpeq_epi8(v_p2, v_zero); + + unsigned int mask1 = _mm_movemask_epi8(v_cmp1); + unsigned int mask2 = _mm_movemask_epi8(v_cmp2); + + if (mask1) { + j += trailingZeros(mask1); + return j; + } + + if (mask2) { + j += trailingZeros(mask2 << 16); + return j; + } + } + + if (j <= img_size.width - 16) { + __m128i v_p = _mm_loadu_si128((const __m128i*)(src_data + j)); + + unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v_p, v_zero)); + + if (mask) { + j += trailingZeros(mask); + return j; + } + j += 16; + } + } +#else + CV_UNUSED(haveSIMD); +#endif + for (; j < img_size.width && src_data[j]; ++j) + ; + + return j; +} static int icvFindContoursInInterval( const CvArr* src, @@ -1310,6 +1470,7 @@ icvFindContoursInInterval( const CvArr* src, int lower_total; int upper_total; int all_total; + bool haveSIMD = false; CvSeq* runs; CvLinkedRunPoint tmp; @@ -1339,7 +1500,9 @@ icvFindContoursInInterval( const CvArr* src, if( contourHeaderSize < (int)sizeof(CvContour)) CV_Error( CV_StsBadSize, "Contour header size must be >= sizeof(CvContour)" ); - +#if CV_SSE2 + haveSIMD = cv::checkHardwareSupport(CPU_SSE2); +#endif storage00.reset(cvCreateChildMemStorage(storage)); storage01.reset(cvCreateChildMemStorage(storage)); @@ -1372,8 +1535,8 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev = upper_line; for( j = 0; j < img_size.width; ) { - for( ; j < img_size.width && !ICV_IS_COMPONENT_POINT(src_data[j]); j++ ) - ; + j = findStartContourPoint(src_data, img_size, j, haveSIMD); + if( j == img_size.width ) break; @@ -1382,10 +1545,9 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; - for( ; j < img_size.width && ICV_IS_COMPONENT_POINT(src_data[j]); j++ ) - ; + j = findEndContourPoint(src_data, img_size, j + 1, haveSIMD); - tmp.pt.x = j-1; + tmp.pt.x = j - 1; CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev->link = tmp_prev->next; @@ -1407,8 +1569,8 @@ icvFindContoursInInterval( const CvArr* src, all_total = runs->total; for( j = 0; j < img_size.width; ) { - for( ; j < img_size.width && !ICV_IS_COMPONENT_POINT(src_data[j]); j++ ) - ; + j = findStartContourPoint(src_data, img_size, j, haveSIMD); + if( j == img_size.width ) break; tmp.pt.x = j; @@ -1416,10 +1578,9 @@ icvFindContoursInInterval( const CvArr* src, tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); tmp_prev = tmp_prev->next; - for( ; j < img_size.width && ICV_IS_COMPONENT_POINT(src_data[j]); j++ ) - ; + j = findEndContourPoint(src_data, img_size, j + 1, haveSIMD); - tmp.pt.x = j-1; + tmp.pt.x = j - 1; CV_WRITE_SEQ_ELEM( tmp, writer ); tmp_prev = tmp_prev->next = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); }//j