From 9e61a28b2e79e8fc65bcc5609206b70ce7a98e1d Mon Sep 17 00:00:00 2001 From: mschoeneck Date: Mon, 5 Sep 2016 12:51:36 +0200 Subject: [PATCH] Improve canny (#7227) * Improve Canny by using _mm_movemask_epi8 to find next pixel magnitude greater than lower threshold. Added parallelized finalPass to Canny with variable gradients. Little changes in finalPass. * Some things fixed --- modules/imgproc/src/canny.cpp | 232 +++++++++++++++++++++++++++------- 1 file changed, 189 insertions(+), 43 deletions(-) diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index 6ef5dee9e8..784b5ffe24 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -525,9 +525,88 @@ public: #define CANNY_SHIFT 15 const int TG22 = (int)(0.4142135623730950488016887242097*(1 << CANNY_SHIFT) + 0.5); - int prev_flag = 0; - bool canny_push = false; - for (int j = 0; j < src.cols; j++) + int prev_flag = 0, j = 0; +#if CV_SSE2 + if (checkHardwareSupport(CPU_SSE2)) + { + __m128i v_low = _mm_set1_epi32(low), v_one = _mm_set1_epi8(1); + + for (; j <= src.cols - 16; j += 16) + { + __m128i v_m1 = _mm_loadu_si128((const __m128i*)(_mag + j)); + __m128i v_m2 = _mm_loadu_si128((const __m128i*)(_mag + j + 4)); + __m128i v_m3 = _mm_loadu_si128((const __m128i*)(_mag + j + 8)); + __m128i v_m4 = _mm_loadu_si128((const __m128i*)(_mag + j + 12)); + + _mm_storeu_si128((__m128i*)(_map + j), v_one); + + __m128i v_cmp1 = _mm_cmpgt_epi32(v_m1, v_low); + __m128i v_cmp2 = _mm_cmpgt_epi32(v_m2, v_low); + __m128i v_cmp3 = _mm_cmpgt_epi32(v_m3, v_low); + __m128i v_cmp4 = _mm_cmpgt_epi32(v_m4, v_low); + + v_cmp1 = _mm_packs_epi32(v_cmp1, v_cmp2); + v_cmp2 = _mm_packs_epi32(v_cmp3, v_cmp4); + + v_cmp1 = _mm_packs_epi16(v_cmp1, v_cmp2); + unsigned int mask = _mm_movemask_epi8(v_cmp1); + + if (mask) + { + int m, k = j; + + for (; mask; ++k, mask >>= 1) + { + if (mask & 0x00000001) + { + m = _mag[k]; + int xs = _x[k]; + int ys = _y[k]; + int x = std::abs(xs); + int y = std::abs(ys) << CANNY_SHIFT; + + int tg22x = x * TG22; + + if (y < tg22x) + { + if (m > _mag[k - 1] && m >= _mag[k + 1]) goto _canny_push_sse; + } + else + { + int tg67x = tg22x + (x << (CANNY_SHIFT + 1)); + if (y > tg67x) + { + if (m > _mag[k + magstep2] && m >= _mag[k + magstep1]) goto _canny_push_sse; + } else + { + int s = (xs ^ ys) < 0 ? -1 : 1; + if (m > _mag[k + magstep2 - s] && m > _mag[k + magstep1 + s]) goto _canny_push_sse; + } + } + } + + prev_flag = 0; + continue; + +_canny_push_sse: + // _map[k-mapstep] is short-circuited at the start because previous thread is + // responsible for initializing it. + if (m > high && !prev_flag && (i <= boundaries.start + 1 || _map[k - mapstep] != 2)) + { + CANNY_PUSH(_map + k); + prev_flag = 1; + } else + _map[k] = 0; + + } + + if (prev_flag && ((k < j+16) || (k < src.cols && _mag[k] <= high))) + prev_flag = 0; + } + } + } +#endif + for (; j < src.cols; j++) { int m = _mag[j]; @@ -542,42 +621,37 @@ public: if (y < tg22x) { - if (m > _mag[j-1] && m >= _mag[j+1]) canny_push = true; + if (m > _mag[j-1] && m >= _mag[j+1]) goto _canny_push; } else { int tg67x = tg22x + (x << (CANNY_SHIFT+1)); if (y > tg67x) { - if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) canny_push = true; + if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto _canny_push; } else { int s = (xs ^ ys) < 0 ? -1 : 1; - if (m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) canny_push = true; + if (m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto _canny_push; } } } - if (!canny_push) + + prev_flag = 0; + _map[j] = uchar(1); + continue; + +_canny_push: + // _map[j-mapstep] is short-circuited at the start because previous thread is + // responsible for initializing it. + if (!prev_flag && m > high && (i <= boundaries.start+1 || _map[j-mapstep] != 2) ) { - prev_flag = 0; - _map[j] = uchar(1); - continue; + CANNY_PUSH(_map + j); + prev_flag = 1; } else - { - // _map[j-mapstep] is short-circuited at the start because previous thread is - // responsible for initializing it. - if (!prev_flag && m > high && (i <= boundaries.start+1 || _map[j-mapstep] != 2) ) - { - CANNY_PUSH(_map + j); - prev_flag = 1; - } - else - _map[j] = 0; - - canny_push = false; - } + _map[j] = 0; } // scroll the ring buffer @@ -640,8 +714,8 @@ class finalPass : public ParallelLoopBody { public: - finalPass(const Mat & _src, Mat &_dst, uchar *_map, ptrdiff_t _mapstep) : - src(_src), dst(_dst), map(_map), mapstep(_mapstep) {} + finalPass(uchar *_map, Mat &_dst, ptrdiff_t _mapstep) : + map(_map), dst(_dst), mapstep(_mapstep) {} ~finalPass() {} @@ -664,7 +738,7 @@ public: if(haveSSE2) { const __m128i v_zero = _mm_setzero_si128(); - for(; j <= src.cols - 32; j += 32) { + for(; j <= dst.cols - 32; j += 32) { __m128i v_pmap1 = _mm_loadu_si128((const __m128i*)(pmap + j)); __m128i v_pmap2 = _mm_loadu_si128((const __m128i*)(pmap + j + 16)); @@ -688,7 +762,7 @@ public: _mm_storeu_si128((__m128i*)(pdst + j + 16), v_pmap2); } - for(; j <= src.cols - 16; j += 16) { + for(; j <= dst.cols - 16; j += 16) { __m128i v_pmap = _mm_loadu_si128((const __m128i*)(pmap + j)); __m128i v_pmaplo = _mm_unpacklo_epi8(v_pmap, v_zero); @@ -704,15 +778,14 @@ public: } } #endif - for (; j < src.cols; j++) + for (; j < dst.cols; j++) pdst[j] = (uchar)-(pmap[j] >> 1); } } private: - const Mat &src; - Mat &dst; uchar *map; + Mat &dst; ptrdiff_t mapstep; }; @@ -817,7 +890,7 @@ int high = cvFloor(high_thresh); if (!m[mapstep+1]) CANNY_PUSH_SERIAL(m + mapstep + 1); } - parallel_for_(Range(0, src.rows), finalPass(src, dst, map, mapstep), src.total()/(double)(1<<16)); + parallel_for_(Range(0, dst.rows), finalPass(map, dst, mapstep), dst.total()/(double)(1<<16)); } void Canny( InputArray _dx, InputArray _dy, OutputArray _dst, @@ -1032,12 +1105,92 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst, stack_top = stack_bottom + sz; } - int prev_flag = 0; - for (int j = 0; j < cols; j++) +#define CANNY_SHIFT 15 + const int TG22 = (int)(0.4142135623730950488016887242097*(1<>= 1) + { + if (mask & 0x00000001) + { + m = _mag[k]; + int xs = _x[k]; + int ys = _y[k]; + int x = std::abs(xs); + int y = std::abs(ys) << CANNY_SHIFT; + + int tg22x = x * TG22; + + if (y < tg22x) + { + if (m > _mag[k - 1] && m >= _mag[k + 1]) goto ocv_canny_push_sse; + } + else + { + int tg67x = tg22x + (x << (CANNY_SHIFT + 1)); + if (y > tg67x) + { + if (m > _mag[k + magstep2] && m >= _mag[k + magstep1]) goto ocv_canny_push_sse; + } else + { + int s = (xs ^ ys) < 0 ? -1 : 1; + if (m > _mag[k + magstep2 - s] && m > _mag[k + magstep1 + s]) goto ocv_canny_push_sse; + } + } + } + + prev_flag = 0; + continue; + +ocv_canny_push_sse: + // _map[k-mapstep] is short-circuited at the start because previous thread is + // responsible for initializing it. + if (!prev_flag && m > high && _map[k-mapstep] != 2) + { + CANNY_PUSH(_map + k); + prev_flag = 1; + } else + _map[k] = 0; + + } + + if (prev_flag && ((k < j+16) || (k < cols && _mag[k] <= high))) + prev_flag = 0; + } + } + } +#endif + for (; j < cols; j++) + { int m = _mag[j]; if (m > low) @@ -1112,14 +1265,7 @@ __ocv_canny_push: if (!m[mapstep+1]) CANNY_PUSH(m + mapstep + 1); } - // the final pass, form the final image - const uchar* pmap = map + mapstep + 1; - uchar* pdst = dst.ptr(); - for (int i = 0; i < rows; i++, pmap += mapstep, pdst += dst.step) - { - for (int j = 0; j < cols; j++) - pdst[j] = (uchar)-(pmap[j] >> 1); - } + parallel_for_(Range(0, dst.rows), finalPass(map, dst, mapstep), dst.total()/(double)(1<<16)); } } // namespace cv