From 4a4d94f26658568cd079fdfe39b79e9bcbc775a7 Mon Sep 17 00:00:00 2001
From: mschoeneck <matthias.schoeneck@gmx.de>
Date: Wed, 24 May 2017 15:20:25 +0200
Subject: [PATCH] Merge pull request #8694 from mschoeneck:Canny

Parallelize Canny with custom gradient (#8694)

* New Canny implementation. Restructuring code in parallelCanny class. Align mag buffer and map.

* Fix warnings.

* Missing SIMD check added.

* Replaced local trailingZeros in contours.cpp. Use alignSize in canny.cpp

* Fix warnings in alignSize and allocate just minimum extra columns.

* Fix another warning in map.create.

* Exchange for loop by do loop to avoid double check at the beginning.
Define extra SIMD CANNY_CHECK to avoid unnecessary continue.
---
 .../core/include/opencv2/core/hal/intrin.hpp  |   23 +
 modules/imgproc/src/canny.cpp                 | 1121 +++++++----------
 modules/imgproc/src/contours.cpp              |   46 +-
 3 files changed, 498 insertions(+), 692 deletions(-)
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 9dd1514ea3..d6dedc1d1a 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -433,6 +433,29 @@ template <> struct V_RegTrait128<double> {
 };
 #endif
 
+inline unsigned int trailingZeros32(unsigned int value) {
+#if defined(_MSC_VER)
+#if (_MSC_VER < 1700)
+    unsigned long index = 0;
+    _BitScanForward(&index, value);
+    return (unsigned int)index;
+#else
+    return _tzcnt_u32(value);
+#endif
+#elif defined(__GNUC__) || defined(__GNUG__)
+    return __builtin_ctz(value);
+#elif defined(__ICC) || defined(__INTEL_COMPILER)
+    return _bit_scan_forward(value);
+#elif defined(__clang__)
+    return llvm.cttz.i32(value, true);
+#else
+    static const int MultiplyDeBruijnBitPosition[32] = {
+        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
+    return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
+#endif
+}
+
 #ifndef CV_DOXYGEN
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 #endif
diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp
index fbbb804dbf..0106c9e2cd 100644
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -43,7 +43,7 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
 #include "opencv2/core/hal/intrin.hpp"
-#include <queue>
+#include <deque>
 
 #include "opencv2/core/openvx/ovx_defs.hpp"
 
@@ -51,12 +51,13 @@
 #pragma warning( disable: 4127 ) // conditional expression is constant
 #endif
 
+#if CV_SIMD128
+#define CV_MALLOC_SIMD128 16
+#endif
+
 namespace cv
 {
 
-static void CannyImpl(Mat& dx_, Mat& dy_, Mat& _dst, double low_thresh, double high_thresh, bool L2gradient);
-
-
 #ifdef HAVE_IPP
 static bool ipp_Canny(const Mat& src , const Mat& dx_, const Mat& dy_, Mat& dst, float low,  float high, bool L2gradient, int aperture_size)
 {
@@ -285,413 +286,509 @@ static bool ocl_Canny(InputArray _src, const UMat& dx_, const UMat& dy_, OutputA
 
 #endif
 
+#define CANNY_PUSH(map, stack) *map = 2, stack.push_back(map)
+
+#define CANNY_CHECK_SIMD(m, high, map, stack) \
+    if (m > high) \
+        CANNY_PUSH(map, stack); \
+    else \
+        *map = 0
+
+#define CANNY_CHECK(m, high, map, stack) \
+    if (m > high) \
+        CANNY_PUSH(map, stack); \
+    else \
+        *map = 0; \
+    continue
+
 class parallelCanny : public ParallelLoopBody
 {
-
 public:
-    parallelCanny(const Mat& _src, uchar* _map, int _low, int _high, int _aperture_size, bool _L2gradient, std::queue<uchar*> *borderPeaksParallel) :
-        src(_src), map(_map), low(_low), high(_high), aperture_size(_aperture_size), L2gradient(_L2gradient), _borderPeaksParallel(borderPeaksParallel)
+    parallelCanny(const Mat &_src, Mat &_map, std::deque<uchar*> &borderPeaksParallel,
+                  int _low, int _high, int _aperture_size, bool _L2gradient) :
+        src(_src), src2(_src), map(_map), _borderPeaksParallel(borderPeaksParallel),
+        low(_low), high(_high), aperture_size(_aperture_size), L2gradient(_L2gradient)
     {
+#if CV_SIMD128
+        haveSIMD = hasSIMD128();
+        if(haveSIMD)
+            _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_MALLOC_SIMD128 + 1), CV_MALLOC_SIMD128), CV_8UC1);
+        else
+#endif
+            _map.create(src.rows + 2, src.cols + 2,  CV_8UC1);
+        map = _map;
+        map.row(0).setTo(1);
+        map.row(src.rows + 1).setTo(1);
+        mapstep = map.cols;
+        needGradient = true;
+        cn = src.channels();
     }
 
-    ~parallelCanny()
+    parallelCanny(const Mat &_dx, const Mat &_dy, Mat &_map, std::deque<uchar*> &borderPeaksParallel,
+                  int _low, int _high, bool _L2gradient) :
+        src(_dx), src2(_dy), map(_map), _borderPeaksParallel(borderPeaksParallel),
+        low(_low), high(_high), aperture_size(0), L2gradient(_L2gradient)
     {
+#if CV_SIMD128
+        haveSIMD = hasSIMD128();
+        if(haveSIMD)
+            _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_MALLOC_SIMD128 + 1), CV_MALLOC_SIMD128), CV_8UC1);
+        else
+#endif
+            _map.create(src.rows + 2, src.cols + 2,  CV_8UC1);
+        map = _map;
+        map.row(0).setTo(1);
+        map.row(src.rows + 1).setTo(1);
+        mapstep = map.cols;
+        needGradient = false;
+        cn = src.channels();
     }
 
+    ~parallelCanny() {}
+
     parallelCanny& operator=(const parallelCanny&) { return *this; }
 
     void operator()(const Range &boundaries) const
     {
-#if CV_SIMD128
-        bool haveSIMD = hasSIMD128();
-#endif
-
-        const int type = src.type(), cn = CV_MAT_CN(type);
-
         Mat dx, dy;
-        std::queue<uchar*> borderPeaksLocal;
-
-        ptrdiff_t mapstep = src.cols + 2;
-
-        // In sobel transform we calculate ksize2 extra lines for the first and last rows of each slice
-        // because IPPDerivSobel expects only isolated ROIs, in contrast with the opencv version which
-        // uses the pixels outside of the ROI to form a border.
-        //
-        // TODO: statement above is not true anymore, so adjustments may be required
-        int ksize2 = aperture_size / 2;
-        // If Scharr filter: aperture_size is 3 and ksize2 is 1
-        if(aperture_size == -1)
-        {
-            ksize2 = 1;
-        }
-
-        if (boundaries.start == 0 && boundaries.end == src.rows)
+        AutoBuffer<short> dxMax(0), dyMax(0);
+        std::deque<uchar*> stack, borderPeaksLocal;
+        const int rowStart = max(0, boundaries.start - 1), rowEnd = min(src.rows, boundaries.end + 1);
+        int *_mag_p, *_mag_a, *_mag_n;
+        short *_dx, *_dy, *_dx_a = NULL, *_dy_a = NULL, *_dx_n = NULL, *_dy_n = NULL;
+        uchar *_pmap;
+
+        if(needGradient)
         {
-            Mat tempdx(boundaries.end - boundaries.start + 2, src.cols, CV_16SC(cn));
-            Mat tempdy(boundaries.end - boundaries.start + 2, src.cols, CV_16SC(cn));
-
-            memset(tempdx.ptr<short>(0), 0, cn * src.cols*sizeof(short));
-            memset(tempdy.ptr<short>(0), 0, cn * src.cols*sizeof(short));
-            memset(tempdx.ptr<short>(tempdx.rows - 1), 0, cn * src.cols*sizeof(short));
-            memset(tempdy.ptr<short>(tempdy.rows - 1), 0, cn * src.cols*sizeof(short));
-
-            Sobel(src, tempdx.rowRange(1, tempdx.rows - 1), CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
-            Sobel(src, tempdy.rowRange(1, tempdy.rows - 1), CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
-
-            dx = tempdx;
-            dy = tempdy;
+            Sobel(src.rowRange(rowStart, rowEnd), dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
+            Sobel(src.rowRange(rowStart, rowEnd), dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
         }
-        else if (boundaries.start == 0)
+        else
         {
-            Mat tempdx(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
-            Mat tempdy(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
-
-            memset(tempdx.ptr<short>(0), 0, cn * src.cols*sizeof(short));
-            memset(tempdy.ptr<short>(0), 0, cn * src.cols*sizeof(short));
-
-            Sobel(src.rowRange(boundaries.start, boundaries.end + 1 + ksize2), tempdx.rowRange(1, tempdx.rows),
-                    CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
-            Sobel(src.rowRange(boundaries.start, boundaries.end + 1 + ksize2), tempdy.rowRange(1, tempdy.rows),
-                    CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
-
-            dx = tempdx.rowRange(0, tempdx.rows - ksize2);
-            dy = tempdy.rowRange(0, tempdy.rows - ksize2);
+            dx = src.rowRange(rowStart, rowEnd);
+            dy = src2.rowRange(rowStart, rowEnd);
         }
-        else if (boundaries.end == src.rows)
-        {
-            Mat tempdx(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
-            Mat tempdy(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
-
-            memset(tempdx.ptr<short>(tempdx.rows - 1), 0, cn * src.cols*sizeof(short));
-            memset(tempdy.ptr<short>(tempdy.rows - 1), 0, cn * src.cols*sizeof(short));
 
-            Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end), tempdx.rowRange(0, tempdx.rows - 1),
-                    CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
-            Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end), tempdy.rowRange(0, tempdy.rows - 1),
-                    CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
-
-            dx = tempdx.rowRange(ksize2, tempdx.rows);
-            dy = tempdy.rowRange(ksize2, tempdy.rows);
-        }
-        else
+        if(cn > 1)
         {
-            Mat tempdx(boundaries.end - boundaries.start + 2 + 2*ksize2, src.cols, CV_16SC(cn));
-            Mat tempdy(boundaries.end - boundaries.start + 2 + 2*ksize2, src.cols, CV_16SC(cn));
-
-            Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end + 1 + ksize2), tempdx,
-                    CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
-            Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end + 1 + ksize2), tempdy,
-                    CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
-
-            dx = tempdx.rowRange(ksize2, tempdx.rows - ksize2);
-            dy = tempdy.rowRange(ksize2, tempdy.rows - ksize2);
+            dxMax.allocate(2 * dx.cols);
+            dyMax.allocate(2 * dy.cols);
+            _dx_a = (short*)dxMax;
+            _dx_n = _dx_a + dx.cols;
+            _dy_a = (short*)dyMax;
+            _dy_n = _dy_a + dy.cols;
         }
 
-        int maxsize = std::max(1 << 10, src.cols * (boundaries.end - boundaries.start) / 10);
-        std::vector<uchar*> stack(maxsize);
-        uchar **stack_top = &stack[0];
-        uchar **stack_bottom = &stack[0];
+        // _mag_p: previous row, _mag_a: actual row, _mag_n: next row
+#if CV_SIMD128
+        AutoBuffer<int> buffer(3 * (mapstep * cn + CV_MALLOC_SIMD128));
+        _mag_p = alignPtr((int*)buffer + 1, CV_MALLOC_SIMD128);
+        _mag_a = alignPtr(_mag_p + mapstep * cn, CV_MALLOC_SIMD128);
+        _mag_n = alignPtr(_mag_a + mapstep * cn, CV_MALLOC_SIMD128);
+#else
+        AutoBuffer<int> buffer(3 * (mapstep * cn));
+        _mag_p = (int*)buffer + 1;
+        _mag_a = _mag_p + mapstep * cn;
+        _mag_n = _mag_a + mapstep * cn;
+#endif
 
-        AutoBuffer<uchar> buffer(cn * mapstep * 3 * sizeof(int));
+        // For the first time when just 2 rows are filled and for left and right borders
+        if(rowStart == boundaries.start)
+            memset(_mag_n - 1, 0, mapstep * sizeof(int));
+        else
+            _mag_n[src.cols] = _mag_n[-1] = 0;
 
-        int* mag_buf[3];
-        mag_buf[0] = (int*)(uchar*)buffer;
-        mag_buf[1] = mag_buf[0] + mapstep*cn;
-        mag_buf[2] = mag_buf[1] + mapstep*cn;
+        _mag_a[src.cols] = _mag_a[-1] = _mag_p[src.cols] = _mag_p[-1] = 0;
 
         // calculate magnitude and angle of gradient, perform non-maxima suppression.
         // fill the map with one of the following values:
         //   0 - the pixel might belong to an edge
         //   1 - the pixel can not belong to an edge
         //   2 - the pixel does belong to an edge
-        for (int i = boundaries.start - 1; i <= boundaries.end; i++)
+        for (int i = rowStart; i <= boundaries.end; ++i)
         {
-            int* _norm = mag_buf[(i > boundaries.start) - (i == boundaries.start - 1) + 1] + 1;
-
-            short* _dx = dx.ptr<short>(i - boundaries.start + 1);
-            short* _dy = dy.ptr<short>(i - boundaries.start + 1);
+            // Scroll the ring buffer
+            std::swap(_mag_n, _mag_a);
+            std::swap(_mag_n, _mag_p);
 
-            if (!L2gradient)
+            if(i < rowEnd)
             {
-                int j = 0, width = src.cols * cn;
+                // Next row calculation
+                _dx = dx.ptr<short>(i - rowStart);
+                _dy = dy.ptr<short>(i - rowStart);
+
+                if (L2gradient)
+                {
+                    int j = 0, width = src.cols * cn;
 #if CV_SIMD128
-                if (haveSIMD)
+                    if (haveSIMD)
+                    {
+                       for ( ; j <= width - 8; j += 8)
+                        {
+                            v_int16x8 v_dx = v_load((const short*)(_dx + j));
+                            v_int16x8 v_dy = v_load((const short*)(_dy + j));
+
+                            v_int32x4 v_dxp_low, v_dxp_high;
+                            v_int32x4 v_dyp_low, v_dyp_high;
+                            v_expand(v_dx, v_dxp_low, v_dxp_high);
+                            v_expand(v_dy, v_dyp_low, v_dyp_high);
+
+                            v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
+                            v_store_aligned((int *)(_mag_n + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
+                        }
+                    }
+#endif
+                    for ( ; j < width; ++j)
+                        _mag_n[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
+                }
+                else
                 {
-                    for ( ; j <= width - 8; j += 8)
+                    int j = 0, width = src.cols * cn;
+#if CV_SIMD128
+                    if (haveSIMD)
                     {
-                        v_int16x8 v_dx = v_load((const short *)(_dx + j));
-                        v_int16x8 v_dy = v_load((const short *)(_dy + j));
+                        for(; j <= width - 8; j += 8)
+                        {
+                            v_int16x8 v_dx = v_load((const short *)(_dx + j));
+                            v_int16x8 v_dy = v_load((const short *)(_dy + j));
 
-                        v_dx = v_reinterpret_as_s16(v_abs(v_dx));
-                        v_dy = v_reinterpret_as_s16(v_abs(v_dy));
+                            v_dx = v_reinterpret_as_s16(v_abs(v_dx));
+                            v_dy = v_reinterpret_as_s16(v_abs(v_dy));
 
-                        v_int32x4 v_dx_ml;
-                        v_int32x4 v_dy_ml;
-                        v_int32x4 v_dx_mh;
-                        v_int32x4 v_dy_mh;
-                        v_expand(v_dx, v_dx_ml, v_dx_mh);
-                        v_expand(v_dy, v_dy_ml, v_dy_mh);
+                            v_int32x4 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh;
+                            v_expand(v_dx, v_dx_ml, v_dx_mh);
+                            v_expand(v_dy, v_dy_ml, v_dy_mh);
 
-                        v_store((int *)(_norm + j), v_dx_ml + v_dy_ml);
-                        v_store((int *)(_norm + j + 4), v_dx_mh + v_dy_mh);
+                            v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml);
+                            v_store_aligned((int *)(_mag_n + j + 4), v_dx_mh + v_dy_mh);
+                        }
                     }
-                }
 #endif
-                for ( ; j < width; ++j)
-                    _norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
-            }
-            else
-            {
-                int j = 0, width = src.cols * cn;
-#if CV_SIMD128
-                if (haveSIMD)
+                    for ( ; j < width; ++j)
+                        _mag_n[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
+                }
+
+                if(cn > 1)
                 {
-                   for ( ; j <= width - 8; j += 8)
-                    {
-                        v_int16x8 v_dx = v_load((const short*)(_dx + j));
-                        v_int16x8 v_dy = v_load((const short*)(_dy + j));
+                    std::swap(_dx_n, _dx_a);
+                    std::swap(_dy_n, _dy_a);
 
-                        v_int32x4 v_dxp_low, v_dxp_high;
-                        v_int32x4 v_dyp_low, v_dyp_high;
-                        v_expand(v_dx, v_dxp_low, v_dxp_high);
-                        v_expand(v_dy, v_dyp_low, v_dyp_high);
+                    for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn)
+                    {
+                        int maxIdx = jn;
+                        for(int k = 1; k < cn; ++k)
+                            if(_mag_n[jn + k] > _mag_n[maxIdx]) maxIdx = jn + k;
 
-                        v_store((int *)(_norm + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
-                        v_store((int *)(_norm + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
+                        _mag_n[j] = _mag_n[maxIdx];
+                        _dx_n[j] = _dx[maxIdx];
+                        _dy_n[j] = _dy[maxIdx];
                     }
+
+                    _mag_n[src.cols] = 0;
                 }
-#endif
-                for ( ; j < width; ++j)
-                    _norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
-            }
 
-            if (cn > 1)
+                // at the very beginning we do not have a complete ring
+                // buffer of 3 magnitude rows for non-maxima suppression
+                if (i <= boundaries.start)
+                    continue;
+            }
+            else
             {
-                for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn)
+                memset(_mag_n - 1, 0, mapstep * sizeof(int));
+
+                if(cn > 1)
                 {
-                    int maxIdx = jn;
-                    for(int k = 1; k < cn; ++k)
-                        if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
-                    _norm[j] = _norm[maxIdx];
-                    _dx[j] = _dx[maxIdx];
-                    _dy[j] = _dy[maxIdx];
+                    std::swap(_dx_n, _dx_a);
+                    std::swap(_dy_n, _dy_a);
                 }
             }
-            _norm[-1] = _norm[src.cols] = 0;
-
-            // at the very beginning we do not have a complete ring
-            // buffer of 3 magnitude rows for non-maxima suppression
-            if (i <= boundaries.start)
-                continue;
-
-            uchar* _map = map + mapstep*i + 1;
-            _map[-1] = _map[src.cols] = 1;
 
-            int* _mag = mag_buf[1] + 1; // take the central row
-            ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
-            ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];
+            // From here actual src row is (i - 1)
+            // Set left and right border to 1
+#if CV_SIMD128
+            if(haveSIMD)
+                _pmap = map.ptr<uchar>(i) + CV_MALLOC_SIMD128;
+            else
+#endif
+                _pmap = map.ptr<uchar>(i) + 1;
 
-            const short* _x = dx.ptr<short>(i - boundaries.start);
-            const short* _y = dy.ptr<short>(i - boundaries.start);
+            _pmap[src.cols] =_pmap[-1] = 1;
 
-            if ((stack_top - stack_bottom) + src.cols > maxsize)
+            if(cn == 1)
             {
-                int sz = (int)(stack_top - stack_bottom);
-                maxsize = std::max(maxsize * 3/2, sz + src.cols);
-                stack.resize(maxsize);
-                stack_bottom = &stack[0];
-                stack_top = stack_bottom + sz;
+                _dx = dx.ptr<short>(i - rowStart - 1);
+                _dy = dy.ptr<short>(i - rowStart - 1);
+            }
+            else
+            {
+                _dx = _dx_a;
+                _dy = _dy_a;
             }
 
-#define CANNY_PUSH(d)    *(d) = uchar(2), *stack_top++ = (d)
-#define CANNY_POP(d)     (d) = *--stack_top
-
-#define CANNY_SHIFT 15
-            const int TG22 = (int)(0.4142135623730950488016887242097*(1 << CANNY_SHIFT) + 0.5);
-
-            int prev_flag = 0, j = 0;
+            const int TG22 = 13573;
+            int j = 0;
 #if CV_SIMD128
             if (haveSIMD)
             {
-                v_int32x4 v_low = v_setall_s32(low);
-                v_int8x16 v_one = v_setall_s8(1);
+                const v_int32x4 v_low = v_setall_s32(low);
+                const v_int8x16 v_one = v_setall_s8(1);
 
-                for (; j <= src.cols - 16; j += 16)
+                for (; j <= src.cols - 32; j += 32)
                 {
-                    v_int32x4 v_m1 = v_load((const int*)(_mag + j));
-                    v_int32x4 v_m2 = v_load((const int*)(_mag + j + 4));
-                    v_int32x4 v_m3 = v_load((const int*)(_mag + j + 8));
-                    v_int32x4 v_m4 = v_load((const int*)(_mag + j + 12));
-
-                    v_store((signed char*)(_map + j), v_one);
+                    v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
+                    v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
+                    v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
+                    v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));
 
                     v_int32x4 v_cmp1 = v_m1 > v_low;
                     v_int32x4 v_cmp2 = v_m2 > v_low;
                     v_int32x4 v_cmp3 = v_m3 > v_low;
                     v_int32x4 v_cmp4 = v_m4 > v_low;
 
+                    v_m1 = v_load_aligned((const int*)(_mag_a + j + 16));
+                    v_m2 = v_load_aligned((const int*)(_mag_a + j + 20));
+                    v_m3 = v_load_aligned((const int*)(_mag_a + j + 24));
+                    v_m4 = v_load_aligned((const int*)(_mag_a + j + 28));
+
+                    v_store_aligned((signed char*)(_pmap + j), v_one);
+                    v_store_aligned((signed char*)(_pmap + j + 16), v_one);
+
                     v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
                     v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
 
+                    v_cmp1 = v_m1 > v_low;
+                    v_cmp2 = v_m2 > v_low;
+                    v_cmp3 = v_m3 > v_low;
+                    v_cmp4 = v_m4 > v_low;
+
                     v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
+
+                    v_cmp80 = v_pack(v_cmp1, v_cmp2);
+                    v_cmp81 = v_pack(v_cmp3, v_cmp4);
+
                     unsigned int mask = v_signmask(v_cmp);
 
+                    v_cmp = v_pack(v_cmp80, v_cmp81);
+                    mask |= v_signmask(v_cmp) << 16;
+
                     if (mask)
                     {
-                        int m, k = j;
+                        int k = j;
 
-                        for (; mask; ++k, mask >>= 1)
+                        do
                         {
-                            if (mask & 0x00000001)
-                            {
-                                m = _mag[k];
-                                int xs = _x[k];
-                                int ys = _y[k];
-                                int x = std::abs(xs);
-                                int y = std::abs(ys) << CANNY_SHIFT;
+                            int l = trailingZeros32(mask);
+                            k += l;
+                            mask >>= l;
+
+                            int m = _mag_a[k];
+                            short xs = _dx[k];
+                            short ys = _dy[k];
+                            int x = (int)std::abs(xs);
+                            int y = (int)std::abs(ys) << 15;
 
-                                int tg22x = x * TG22;
+                            int tg22x = x * TG22;
 
-                                if (y < tg22x)
+                            if (y < tg22x)
+                            {
+                                if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
                                 {
-                                    if (m > _mag[k - 1] && m >= _mag[k + 1]) goto _canny_push_sse;
+                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                 }
-                                else
+                            }
+                            else
+                            {
+                                int tg67x = tg22x + (x << 16);
+                                if (y > tg67x)
                                 {
-                                    int tg67x = tg22x + (x << (CANNY_SHIFT + 1));
-                                    if (y > tg67x)
+                                    if (m > _mag_p[k] && m >= _mag_n[k])
                                     {
-                                        if (m > _mag[k + magstep2] && m >= _mag[k + magstep1]) goto _canny_push_sse;
-                                    } else
+                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
+                                    }
+                                }
+                                else
+                                {
+                                    int s = (xs ^ ys) < 0 ? -1 : 1;
+                                    if(m > _mag_p[k - s] && m > _mag_n[k + s])
                                     {
-                                        int s = (xs ^ ys) < 0 ? -1 : 1;
-                                        if (m > _mag[k + magstep2 - s] && m > _mag[k + magstep1 + s]) goto _canny_push_sse;
+                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
                                     }
                                 }
                             }
+                            ++k;
+                        } while((mask >>= 1));
+                    }
+                }
 
-                            prev_flag = 0;
-                            continue;
+                if (j <= src.cols - 16)
+                {
+                    v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j));
+                    v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4));
+                    v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8));
+                    v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12));
 
-_canny_push_sse:
-                            // _map[k-mapstep] is short-circuited at the start because previous thread is
-                            // responsible for initializing it.
-                            if (m > high && !prev_flag  && (i <= boundaries.start + 1 || _map[k - mapstep] != 2))
-                            {
-                                CANNY_PUSH(_map + k);
-                                prev_flag = 1;
-                            } else
-                                _map[k] = 0;
+                    v_store_aligned((signed char*)(_pmap + j), v_one);
 
-                        }
+                    v_int32x4 v_cmp1 = v_m1 > v_low;
+                    v_int32x4 v_cmp2 = v_m2 > v_low;
+                    v_int32x4 v_cmp3 = v_m3 > v_low;
+                    v_int32x4 v_cmp4 = v_m4 > v_low;
+
+                    v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
+                    v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
+
+                    v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
+                    unsigned int mask = v_signmask(v_cmp);
+
+                    if (mask)
+                    {
+                        int k = j;
+
+                        do
+                        {
+                            int l = trailingZeros32(mask);
+                            k += l;
+                            mask >>= l;
+
+                            int m = _mag_a[k];
+                            short xs = _dx[k];
+                            short ys = _dy[k];
+                            int x = (int)std::abs(xs);
+                            int y = (int)std::abs(ys) << 15;
 
-                        if (prev_flag && ((k < j+16) || (k < src.cols && _mag[k] <= high)))
-                            prev_flag = 0;
+                            int tg22x = x * TG22;
+
+                            if (y < tg22x)
+                            {
+                                if (m > _mag_a[k - 1] && m >= _mag_a[k + 1])
+                                {
+                                    CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
+                                }
+                            }
+                            else
+                            {
+                                int tg67x = tg22x + (x << 16);
+                                if (y > tg67x)
+                                {
+                                    if (m > _mag_p[k] && m >= _mag_n[k])
+                                    {
+                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
+                                    }
+                                }
+                                else
+                                {
+                                    int s = (xs ^ ys) < 0 ? -1 : 1;
+                                    if(m > _mag_p[k - s] && m > _mag_n[k + s])
+                                    {
+                                        CANNY_CHECK_SIMD(m, high, (_pmap+k), stack);
+                                    }
+                                }
+                            }
+                            ++k;
+                        } while((mask >>= 1));
                     }
+                    j += 16;
                 }
             }
 #endif
             for (; j < src.cols; j++)
             {
-                int m = _mag[j];
+                int m = _mag_a[j];
 
                 if (m > low)
                 {
-                    int xs = _x[j];
-                    int ys = _y[j];
-                    int x = std::abs(xs);
-                    int y = std::abs(ys) << CANNY_SHIFT;
+                    short xs = _dx[j];
+                    short ys = _dy[j];
+                    int x = (int)std::abs(xs);
+                    int y = (int)std::abs(ys) << 15;
 
                     int tg22x = x * TG22;
 
                     if (y < tg22x)
                     {
-                        if (m > _mag[j-1] && m >= _mag[j+1]) goto _canny_push;
+                        if (m > _mag_a[j - 1] && m >= _mag_a[j + 1])
+                        {
+                            CANNY_CHECK(m, high, (_pmap+j), stack);
+                        }
                     }
                     else
                     {
-                        int tg67x = tg22x + (x << (CANNY_SHIFT+1));
+                        int tg67x = tg22x + (x << 16);
                         if (y > tg67x)
                         {
-                            if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto _canny_push;
+                            if (m > _mag_p[j] && m >= _mag_n[j])
+                            {
+                                CANNY_CHECK(m, high, (_pmap+j), stack);
+                            }
                         }
                         else
                         {
                             int s = (xs ^ ys) < 0 ? -1 : 1;
-                            if (m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto _canny_push;
+                            if(m > _mag_p[j - s] && m > _mag_n[j + s])
+                            {
+                                CANNY_CHECK(m, high, (_pmap+j), stack);
+                            }
                         }
                     }
                 }
-
-                prev_flag = 0;
-                _map[j] = uchar(1);
-                continue;
-
-_canny_push:
-                // _map[j-mapstep] is short-circuited at the start because previous thread is
-                // responsible for initializing it.
-                if (!prev_flag && m > high && (i <= boundaries.start+1 || _map[j-mapstep] != 2) )
-                {
-                    CANNY_PUSH(_map + j);
-                    prev_flag = 1;
-                }
-                else
-                    _map[j] = 0;
+                _pmap[j] = 1;
             }
-
-            // scroll the ring buffer
-            _mag = mag_buf[0];
-            mag_buf[0] = mag_buf[1];
-            mag_buf[1] = mag_buf[2];
-            mag_buf[2] = _mag;
         }
 
+        // Not for first row of first slice or last row of last slice
+        uchar *pmapLower = (rowStart == 0) ? map.data : (map.data + (boundaries.start + 2) * mapstep);
+        uint pmapDiff = (uint)(((rowEnd == src.rows) ? map.datalimit : (map.data + boundaries.end * mapstep)) - pmapLower);
+
         // now track the edges (hysteresis thresholding)
-        while (stack_top > stack_bottom)
+        while (!stack.empty())
         {
-            if ((stack_top - stack_bottom) + 8 > maxsize)
-            {
-                int sz = (int)(stack_top - stack_bottom);
-                maxsize = maxsize * 3/2;
-                stack.resize(maxsize);
-                stack_bottom = &stack[0];
-                stack_top = stack_bottom + sz;
-            }
-
-            uchar* m;
-            CANNY_POP(m);
+            uchar *m = stack.back();
+            stack.pop_back();
 
             // Stops thresholding from expanding to other slices by sending pixels in the borders of each
             // slice in a queue to be serially processed later.
-            if ( (m < map + (boundaries.start + 2) * mapstep) || (m >= map + boundaries.end * mapstep) )
+            if((unsigned)(m - pmapLower) < pmapDiff)
             {
-                borderPeaksLocal.push(m);
-                continue;
+                if (!m[-mapstep-1]) CANNY_PUSH((m-mapstep-1), stack);
+                if (!m[-mapstep])   CANNY_PUSH((m-mapstep), stack);
+                if (!m[-mapstep+1]) CANNY_PUSH((m-mapstep+1), stack);
+                if (!m[-1])         CANNY_PUSH((m-1), stack);
+                if (!m[1])          CANNY_PUSH((m+1), stack);
+                if (!m[mapstep-1])  CANNY_PUSH((m+mapstep-1), stack);
+                if (!m[mapstep])    CANNY_PUSH((m+mapstep), stack);
+                if (!m[mapstep+1])  CANNY_PUSH((m+mapstep+1), stack);
+            }
+            else
+            {
+                borderPeaksLocal.push_back(m);
+                ptrdiff_t mapstep2 = m < pmapLower ? mapstep : -mapstep;
+
+                if (!m[-1])         CANNY_PUSH((m-1), stack);
+                if (!m[1])          CANNY_PUSH((m+1), stack);
+                if (!m[mapstep2-1]) CANNY_PUSH((m+mapstep2-1), stack);
+                if (!m[mapstep2])   CANNY_PUSH((m+mapstep2), stack);
+                if (!m[mapstep2+1]) CANNY_PUSH((m+mapstep2+1), stack);
             }
-
-            if (!m[-1])         CANNY_PUSH(m - 1);
-            if (!m[1])          CANNY_PUSH(m + 1);
-            if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
-            if (!m[-mapstep])   CANNY_PUSH(m - mapstep);
-            if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
-            if (!m[mapstep-1])  CANNY_PUSH(m + mapstep - 1);
-            if (!m[mapstep])    CANNY_PUSH(m + mapstep);
-            if (!m[mapstep+1])  CANNY_PUSH(m + mapstep + 1);
         }
 
-        AutoLock lock(mutex);
-        while (!borderPeaksLocal.empty()) {
-            _borderPeaksParallel->push(borderPeaksLocal.front());
-            borderPeaksLocal.pop();
+        if(!borderPeaksLocal.empty())
+        {
+            AutoLock lock(mutex);
+            _borderPeaksParallel.insert(_borderPeaksParallel.end(), borderPeaksLocal.begin(), borderPeaksLocal.end());
         }
     }
 
 private:
-    const Mat& src;
-    uchar* map;
+    const Mat &src, &src2;
+    Mat &map;
+    std::deque<uchar*> &_borderPeaksParallel;
     int low, high, aperture_size;
-    bool L2gradient;
-    std::queue<uchar*> *_borderPeaksParallel;
+    bool L2gradient, needGradient;
+    ptrdiff_t mapstep;
+    int cn;
+#if CV_SIMD128
+    bool haveSIMD;
+#endif
     mutable Mutex mutex;
 };
 
@@ -699,8 +796,14 @@ class finalPass : public ParallelLoopBody
 {
 
 public:
-    finalPass(uchar *_map, Mat &_dst, ptrdiff_t _mapstep) :
-        map(_map), dst(_dst), mapstep(_mapstep) {}
+    finalPass(const Mat &_map, Mat &_dst) :
+        map(_map), dst(_dst)
+    {
+        dst = _dst;
+#if CV_SIMD128
+        haveSIMD = hasSIMD128();
+#endif
+    }
 
     ~finalPass() {}
 
@@ -709,28 +812,26 @@ public:
     void operator()(const Range &boundaries) const
     {
         // the final pass, form the final image
-        const uchar* pmap = map + mapstep + 1 + (ptrdiff_t)(mapstep * boundaries.start);
-        uchar* pdst = dst.ptr() + (ptrdiff_t)(dst.step * boundaries.start);
-
-#if CV_SIMD128
-        bool haveSIMD = hasSIMD128();
-#endif
-
-        for (int i = boundaries.start; i < boundaries.end; i++, pmap += mapstep, pdst += dst.step)
+        for (int i = boundaries.start; i < boundaries.end; i++)
         {
             int j = 0;
+            uchar *pdst = dst.ptr<uchar>(i);
+            uchar *pmap;
+#if CV_SIMD128
+            if(haveSIMD)
+                pmap = (uchar*)map.ptr<uchar>(i + 1) + CV_MALLOC_SIMD128;
+            else
+#endif
+                pmap = (uchar*)map.ptr<uchar>(i + 1) + 1;
 #if CV_SIMD128
             if(haveSIMD) {
                 const v_int8x16 v_zero = v_setzero_s8();
 
                 for(; j <= dst.cols - 32; j += 32) {
-                    v_uint8x16 v_pmap1 = v_load((const unsigned char*)(pmap + j));
-                    v_uint8x16 v_pmap2 = v_load((const unsigned char*)(pmap + j + 16));
+                    v_uint8x16 v_pmap1 = v_load_aligned((const unsigned char*)(pmap + j));
+                    v_uint8x16 v_pmap2 = v_load_aligned((const unsigned char*)(pmap + j + 16));
 
-                    v_uint16x8 v_pmaplo1;
-                    v_uint16x8 v_pmaphi1;
-                    v_uint16x8 v_pmaplo2;
-                    v_uint16x8 v_pmaphi2;
+                    v_uint16x8 v_pmaplo1, v_pmaphi1, v_pmaplo2, v_pmaphi2;
                     v_expand(v_pmap1, v_pmaplo1, v_pmaphi1);
                     v_expand(v_pmap2, v_pmaplo2, v_pmaphi2);
 
@@ -749,8 +850,8 @@ public:
                     v_store((pdst + j + 16), v_pmap2);
                 }
 
-                for(; j <= dst.cols - 16; j += 16) {
-                    v_uint8x16 v_pmap = v_load((const unsigned char*)(pmap + j));
+                if(j <= dst.cols - 16) {
+                    v_uint8x16 v_pmap = v_load_aligned((const unsigned char*)(pmap + j));
 
                     v_uint16x8 v_pmaplo;
                     v_uint16x8 v_pmaphi;
@@ -763,6 +864,24 @@ public:
                     v_pmap = v_reinterpret_as_u8(v_zero - v_reinterpret_as_s8(v_pmap));
 
                     v_store((pdst + j), v_pmap);
+                    j += 16;
+                }
+
+                if(j <= dst.cols - 8) {
+                    v_uint8x16 v_pmap = v_load_halves((const unsigned char*)(pmap + j), (const unsigned char*)(pmap + j));
+
+                    v_uint16x8 v_pmaplo;
+                    v_uint16x8 v_pmaphi;
+                    v_expand(v_pmap, v_pmaplo, v_pmaphi);
+
+                    v_pmaplo = v_pmaplo >> 1;
+                    v_pmaphi = v_pmaphi >> 1;
+
+                    v_pmap = v_pack(v_pmaplo, v_pmaphi);
+                    v_pmap = v_reinterpret_as_u8(v_zero - v_reinterpret_as_s8(v_pmap));
+
+                    v_store_low((pdst + j), v_pmap);
+                    j += 8;
                 }
             }
 #endif
@@ -772,9 +891,11 @@ public:
     }
 
 private:
-    uchar *map;
+    const Mat &map;
     Mat &dst;
-    ptrdiff_t mapstep;
+#if CV_SIMD128
+    bool haveSIMD;
+#endif
 };
 
 #ifdef HAVE_OPENVX
@@ -839,10 +960,10 @@ void Canny( InputArray _src, OutputArray _dst,
 {
     CV_INSTRUMENT_REGION()
 
-    const int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    CV_Assert( _src.depth() == CV_8U );
+
     const Size size = _src.size();
 
-    CV_Assert( depth == CV_8U );
     _dst.create(size, CV_8U);
 
     if (!L2gradient && (aperture_size & CV_CANNY_L2_GRADIENT) == CV_CANNY_L2_GRADIENT)
@@ -858,8 +979,8 @@ void Canny( InputArray _src, OutputArray _dst,
     if (low_thresh > high_thresh)
         std::swap(low_thresh, high_thresh);
 
-    CV_OCL_RUN(_dst.isUMat() && (cn == 1 || cn == 3),
-               ocl_Canny<false>(_src, UMat(), UMat(), _dst, (float)low_thresh, (float)high_thresh, aperture_size, L2gradient, cn, size))
+    CV_OCL_RUN(_dst.isUMat() && (_src.channels() == 1 || _src.channels() == 3),
+               ocl_Canny<false>(_src, UMat(), UMat(), _dst, (float)low_thresh, (float)high_thresh, aperture_size, L2gradient, _src.channels(), size))
 
     Mat src = _src.getMat(), dst = _dst.getMat();
 
@@ -895,60 +1016,40 @@ void Canny( InputArray _src, OutputArray _dst,
     int low = cvFloor(low_thresh);
     int high = cvFloor(high_thresh);
 
-    ptrdiff_t mapstep = src.cols + 2;
-    AutoBuffer<uchar> buffer((src.cols+2)*(src.rows+2) + cn * mapstep * 3 * sizeof(int));
-
-    int* mag_buf[3];
-    mag_buf[0] = (int*)(uchar*)buffer;
-    mag_buf[1] = mag_buf[0] + mapstep*cn;
-    mag_buf[2] = mag_buf[1] + mapstep*cn;
-    memset(mag_buf[0], 0, /* cn* */mapstep*sizeof(int));
-
-    uchar *map = (uchar*)(mag_buf[2] + mapstep*cn);
-    memset(map, 1, mapstep);
-    memset(map + mapstep*(src.rows + 1), 1, mapstep);
-
+    // If Scharr filter: aperture size is 3, ksize2 is 1
+    int ksize2 = aperture_size < 0 ? 1 : aperture_size / 2;
     // Minimum number of threads should be 1, maximum should not exceed number of CPU's, because of overhead
     int numOfThreads = std::max(1, std::min(getNumThreads(), getNumberOfCPUs()));
-
     // Make a fallback for pictures with too few rows.
     int grainSize = src.rows / numOfThreads;
-    int ksize2 = aperture_size / 2;
-    // If Scharr filter: aperture size is 3, ksize2 is 1
-    if(aperture_size == -1)
-    {
-        ksize2 = 1;
-    }
-
     int minGrainSize = 2 * (ksize2 + 1);
     if (grainSize < minGrainSize)
-    {
         numOfThreads = std::max(1, src.rows / minGrainSize);
-    }
-
-    std::queue<uchar*> borderPeaksParallel;
 
-    parallel_for_(Range(0, src.rows), parallelCanny(src, map, low, high, aperture_size, L2gradient, &borderPeaksParallel), numOfThreads);
+    Mat map;
+    std::deque<uchar*> stack;
 
-#define CANNY_PUSH_SERIAL(d)    *(d) = uchar(2), borderPeaksParallel.push(d)
+    parallel_for_(Range(0, src.rows), parallelCanny(src, map, stack, low, high, aperture_size, L2gradient), numOfThreads);
 
     // now track the edges (hysteresis thresholding)
-    uchar* m;
-    while (!borderPeaksParallel.empty())
+    ptrdiff_t mapstep = map.cols;
+
+    while (!stack.empty())
     {
-        m = borderPeaksParallel.front();
-        borderPeaksParallel.pop();
-        if (!m[-1])         CANNY_PUSH_SERIAL(m - 1);
-        if (!m[1])          CANNY_PUSH_SERIAL(m + 1);
-        if (!m[-mapstep-1]) CANNY_PUSH_SERIAL(m - mapstep - 1);
-        if (!m[-mapstep])   CANNY_PUSH_SERIAL(m - mapstep);
-        if (!m[-mapstep+1]) CANNY_PUSH_SERIAL(m - mapstep + 1);
-        if (!m[mapstep-1])  CANNY_PUSH_SERIAL(m + mapstep - 1);
-        if (!m[mapstep])    CANNY_PUSH_SERIAL(m + mapstep);
-        if (!m[mapstep+1])  CANNY_PUSH_SERIAL(m + mapstep + 1);
+        uchar* m = stack.back();
+        stack.pop_back();
+
+        if (!m[-mapstep-1]) CANNY_PUSH((m-mapstep-1), stack);
+        if (!m[-mapstep])   CANNY_PUSH((m-mapstep), stack);
+        if (!m[-mapstep+1]) CANNY_PUSH((m-mapstep+1), stack);
+        if (!m[-1])         CANNY_PUSH((m-1), stack);
+        if (!m[1])          CANNY_PUSH((m+1), stack);
+        if (!m[mapstep-1])  CANNY_PUSH((m+mapstep-1), stack);
+        if (!m[mapstep])    CANNY_PUSH((m+mapstep), stack);
+        if (!m[mapstep+1])  CANNY_PUSH((m+mapstep+1), stack);
     }
 
-    parallel_for_(Range(0, dst.rows), finalPass(map, dst, mapstep), dst.total()/(double)(1<<16));
+    parallel_for_(Range(0, src.rows), finalPass(map, dst), src.total()/(double)(1<<16));
 }
 
 void Canny( InputArray _dx, InputArray _dy, OutputArray _dst,
@@ -965,11 +1066,10 @@ void Canny( InputArray _dx, InputArray _dy, OutputArray _dst,
     if (low_thresh > high_thresh)
         std::swap(low_thresh, high_thresh);
 
-    const int cn = _dx.channels();
     const Size size = _dx.size();
 
     CV_OCL_RUN(_dst.isUMat(),
-               ocl_Canny<true>(UMat(), _dx.getUMat(), _dy.getUMat(), _dst, (float)low_thresh, (float)high_thresh, 0, L2gradient, cn, size))
+               ocl_Canny<true>(UMat(), _dx.getUMat(), _dy.getUMat(), _dst, (float)low_thresh, (float)high_thresh, 0, L2gradient, _dx.channels(), size))
 
     _dst.create(size, CV_8U);
     Mat dst = _dst.getMat();
@@ -979,20 +1079,6 @@ void Canny( InputArray _dx, InputArray _dy, OutputArray _dst,
 
     CV_IPP_RUN_FAST(ipp_Canny(Mat(), dx, dy, dst, (float)low_thresh, (float)high_thresh, L2gradient, 0))
 
-    if (cn > 1)
-    {
-        dx = dx.clone();
-        dy = dy.clone();
-    }
-    CannyImpl(dx, dy, dst, low_thresh, high_thresh, L2gradient);
-}
-
-static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
-    double low_thresh, double high_thresh, bool L2gradient)
-{
-    const int cn = dx.channels();
-    const int cols = dx.cols, rows = dx.rows;
-
     if (L2gradient)
     {
         low_thresh = std::min(32767.0, low_thresh);
@@ -1001,316 +1087,39 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
         if (low_thresh > 0) low_thresh *= low_thresh;
         if (high_thresh > 0) high_thresh *= high_thresh;
     }
+
     int low = cvFloor(low_thresh);
     int high = cvFloor(high_thresh);
 
-    ptrdiff_t mapstep = cols + 2;
-    AutoBuffer<uchar> buffer((cols+2)*(rows+2) + cn * mapstep * 3 * sizeof(int));
-
-    int* mag_buf[3];
-    mag_buf[0] = (int*)(uchar*)buffer;
-    mag_buf[1] = mag_buf[0] + mapstep*cn;
-    mag_buf[2] = mag_buf[1] + mapstep*cn;
-    memset(mag_buf[0], 0, /* cn* */mapstep*sizeof(int));
-
-    uchar* map = (uchar*)(mag_buf[2] + mapstep*cn);
-    memset(map, 1, mapstep);
-    memset(map + mapstep*(rows + 1), 1, mapstep);
-
-    int maxsize = std::max(1 << 10, cols * rows / 10);
-    std::vector<uchar*> stack(maxsize);
-    uchar **stack_top = &stack[0];
-    uchar **stack_bottom = &stack[0];
-
-    /* sector numbers
-       (Top-Left Origin)
-
-        1   2   3
-         *  *  *
-          * * *
-        0*******0
-          * * *
-         *  *  *
-        3   2   1
-    */
+    std::deque<uchar*> stack;
+    Mat map;
 
-    #define CANNY_PUSH(d)    *(d) = uchar(2), *stack_top++ = (d)
-    #define CANNY_POP(d)     (d) = *--stack_top
-
-#if CV_SIMD128
-    bool haveSIMD = hasSIMD128();
-#endif
-
-    // calculate magnitude and angle of gradient, perform non-maxima suppression.
-    // fill the map with one of the following values:
-    //   0 - the pixel might belong to an edge
-    //   1 - the pixel can not belong to an edge
-    //   2 - the pixel does belong to an edge
-    for (int i = 0; i <= rows; i++)
-    {
-        int* _norm = mag_buf[(i > 0) + 1] + 1;
-        if (i < rows)
-        {
-            short* _dx = dx.ptr<short>(i);
-            short* _dy = dy.ptr<short>(i);
-
-            if (!L2gradient)
-            {
-                int j = 0, width = cols * cn;
-#if CV_SIMD128
-                if (haveSIMD)
-                {
-                    for ( ; j <= width - 8; j += 8)
-                    {
-                        v_int16x8 v_dx = v_load((const short*)(_dx + j));
-                        v_int16x8 v_dy = v_load((const short*)(_dy + j));
-
-                        v_int32x4 v_dx0, v_dx1, v_dy0, v_dy1;
-                        v_expand(v_dx, v_dx0, v_dx1);
-                        v_expand(v_dy, v_dy0, v_dy1);
-
-                        v_dx0 = v_reinterpret_as_s32(v_abs(v_dx0));
-                        v_dx1 = v_reinterpret_as_s32(v_abs(v_dx1));
-                        v_dy0 = v_reinterpret_as_s32(v_abs(v_dy0));
-                        v_dy1 = v_reinterpret_as_s32(v_abs(v_dy1));
-
-                        v_store(_norm + j, v_dx0 + v_dy0);
-                        v_store(_norm + j + 4, v_dx1 + v_dy1);
-                    }
-                }
-#endif
-                for ( ; j < width; ++j)
-                    _norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
-            }
-            else
-            {
-                int j = 0, width = cols * cn;
-#if CV_SIMD128
-                if (haveSIMD)
-                {
-                    for ( ; j <= width - 8; j += 8)
-                    {
-                        v_int16x8 v_dx = v_load((const short*)(_dx + j));
-                        v_int16x8 v_dy = v_load((const short*)(_dy + j));
-
-                        v_int16x8 v_dx_dy0, v_dx_dy1;
-                        v_zip(v_dx, v_dy, v_dx_dy0, v_dx_dy1);
-
-                        v_int32x4 v_dst0 = v_dotprod(v_dx_dy0, v_dx_dy0);
-                        v_int32x4 v_dst1 = v_dotprod(v_dx_dy1, v_dx_dy1);
-
-                        v_store(_norm + j, v_dst0);
-                        v_store(_norm + j + 4, v_dst1);
-                    }
-                }
-#endif
-                for ( ; j < width; ++j)
-                    _norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
-            }
-
-            if (cn > 1)
-            {
-                for(int j = 0, jn = 0; j < cols; ++j, jn += cn)
-                {
-                    int maxIdx = jn;
-                    for(int k = 1; k < cn; ++k)
-                        if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
-                    _norm[j] = _norm[maxIdx];
-                    _dx[j] = _dx[maxIdx];
-                    _dy[j] = _dy[maxIdx];
-                }
-            }
-            _norm[-1] = _norm[cols] = 0;
-        }
-        else
-            memset(_norm-1, 0, /* cn* */mapstep*sizeof(int));
-
-        // at the very beginning we do not have a complete ring
-        // buffer of 3 magnitude rows for non-maxima suppression
-        if (i == 0)
-            continue;
-
-        uchar* _map = map + mapstep*i + 1;
-        _map[-1] = _map[cols] = 1;
-
-        int* _mag = mag_buf[1] + 1; // take the central row
-        ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
-        ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];
-
-        const short* _x = dx.ptr<short>(i-1);
-        const short* _y = dy.ptr<short>(i-1);
-
-        if ((stack_top - stack_bottom) + cols > maxsize)
-        {
-            int sz = (int)(stack_top - stack_bottom);
-            maxsize = std::max(maxsize * 3/2, sz + cols);
-            stack.resize(maxsize);
-            stack_bottom = &stack[0];
-            stack_top = stack_bottom + sz;
-        }
-
-#define CANNY_SHIFT 15
-        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
-
-        int prev_flag = 0, j = 0;
-#if CV_SIMD128
-        if (haveSIMD)
-        {
-            v_int32x4 v_low = v_setall_s32(low);
-            v_int8x16 v_one = v_setall_s8(1);
-
-            for (; j <= cols - 16; j += 16)
-            {
-                v_int32x4 v_m1 = v_load((const int*)(_mag + j));
-                v_int32x4 v_m2 = v_load((const int*)(_mag + j + 4));
-                v_int32x4 v_m3 = v_load((const int*)(_mag + j + 8));
-                v_int32x4 v_m4 = v_load((const int*)(_mag + j + 12));
-
-                v_store((signed char*)(_map + j), v_one);
-
-                v_int32x4 v_cmp1 = v_m1 > v_low;
-                v_int32x4 v_cmp2 = v_m2 > v_low;
-                v_int32x4 v_cmp3 = v_m3 > v_low;
-                v_int32x4 v_cmp4 = v_m4 > v_low;
-
-                v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
-                v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
-
-                v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
-                unsigned int mask = v_signmask(v_cmp);
-
-                if (mask)
-                {
-                    int m, k = j;
-
-                    for (; mask; ++k, mask >>= 1)
-                    {
-                        if (mask & 0x00000001)
-                        {
-                            m = _mag[k];
-                            int xs = _x[k];
-                            int ys = _y[k];
-                            int x = std::abs(xs);
-                            int y = std::abs(ys) << CANNY_SHIFT;
-
-                            int tg22x = x * TG22;
-
-                            if (y < tg22x)
-                            {
-                                if (m > _mag[k - 1] && m >= _mag[k + 1]) goto ocv_canny_push_sse;
-                            }
-                            else
-                            {
-                                int tg67x = tg22x + (x << (CANNY_SHIFT + 1));
-                                if (y > tg67x)
-                                {
-                                    if (m > _mag[k + magstep2] && m >= _mag[k + magstep1]) goto ocv_canny_push_sse;
-                                } else
-                                {
-                                    int s = (xs ^ ys) < 0 ? -1 : 1;
-                                    if (m > _mag[k + magstep2 - s] && m > _mag[k + magstep1 + s]) goto ocv_canny_push_sse;
-                                }
-                            }
-                        }
-
-                        prev_flag = 0;
-                        continue;
-
-ocv_canny_push_sse:
-                        // _map[k-mapstep] is short-circuited at the start because previous thread is
-                        // responsible for initializing it.
-                        if (!prev_flag && m > high && _map[k-mapstep] != 2)
-                        {
-                            CANNY_PUSH(_map + k);
-                            prev_flag = 1;
-                        } else
-                            _map[k] = 0;
-
-                    }
-
-                    if (prev_flag && ((k < j+16) || (k < cols && _mag[k] <= high)))
-                        prev_flag = 0;
-                }
-            }
-        }
-#endif
-        for (; j < cols; j++)
-        {
-            int m = _mag[j];
-
-            if (m > low)
-            {
-                int xs = _x[j];
-                int ys = _y[j];
-                int x = std::abs(xs);
-                int y = std::abs(ys) << CANNY_SHIFT;
-
-                int tg22x = x * TG22;
-
-                if (y < tg22x)
-                {
-                    if (m > _mag[j-1] && m >= _mag[j+1]) goto __ocv_canny_push;
-                }
-                else
-                {
-                    int tg67x = tg22x + (x << (CANNY_SHIFT+1));
-                    if (y > tg67x)
-                    {
-                        if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __ocv_canny_push;
-                    }
-                    else
-                    {
-                        int s = (xs ^ ys) < 0 ? -1 : 1;
-                        if (m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __ocv_canny_push;
-                    }
-                }
-            }
-            prev_flag = 0;
-            _map[j] = uchar(1);
-            continue;
-__ocv_canny_push:
-            if (!prev_flag && m > high && _map[j-mapstep] != 2)
-            {
-                CANNY_PUSH(_map + j);
-                prev_flag = 1;
-            }
-            else
-                _map[j] = 0;
-        }
+    // Minimum number of threads should be 1, maximum should not exceed number of CPU's, because of overhead
+    int numOfThreads = std::max(1, std::min(getNumThreads(), getNumberOfCPUs()));
+    if (dx.rows / numOfThreads < 3)
+        numOfThreads = std::max(1, dx.rows / 3);
 
-        // scroll the ring buffer
-        _mag = mag_buf[0];
-        mag_buf[0] = mag_buf[1];
-        mag_buf[1] = mag_buf[2];
-        mag_buf[2] = _mag;
-    }
+    parallel_for_(Range(0, dx.rows), parallelCanny(dx, dy, map, stack, low, high, L2gradient), numOfThreads);
 
     // now track the edges (hysteresis thresholding)
-    while (stack_top > stack_bottom)
-    {
-        uchar* m;
-        if ((stack_top - stack_bottom) + 8 > maxsize)
-        {
-            int sz = (int)(stack_top - stack_bottom);
-            maxsize = maxsize * 3/2;
-            stack.resize(maxsize);
-            stack_bottom = &stack[0];
-            stack_top = stack_bottom + sz;
-        }
+    ptrdiff_t mapstep = map.cols;
 
-        CANNY_POP(m);
-
-        if (!m[-1])         CANNY_PUSH(m - 1);
-        if (!m[1])          CANNY_PUSH(m + 1);
-        if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
-        if (!m[-mapstep])   CANNY_PUSH(m - mapstep);
-        if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
-        if (!m[mapstep-1])  CANNY_PUSH(m + mapstep - 1);
-        if (!m[mapstep])    CANNY_PUSH(m + mapstep);
-        if (!m[mapstep+1])  CANNY_PUSH(m + mapstep + 1);
+    while (!stack.empty())
+    {
+        uchar* m = stack.back();
+        stack.pop_back();
+
+        if (!m[-mapstep-1]) CANNY_PUSH((m-mapstep-1), stack);
+        if (!m[-mapstep])   CANNY_PUSH((m-mapstep), stack);
+        if (!m[-mapstep+1]) CANNY_PUSH((m-mapstep+1), stack);
+        if (!m[-1])         CANNY_PUSH((m-1), stack);
+        if (!m[1])          CANNY_PUSH((m+1), stack);
+        if (!m[mapstep-1])  CANNY_PUSH((m+mapstep-1), stack);
+        if (!m[mapstep])    CANNY_PUSH((m+mapstep), stack);
+        if (!m[mapstep+1])  CANNY_PUSH((m+mapstep+1), stack);
     }
 
-    parallel_for_(Range(0, dst.rows), finalPass(map, dst, mapstep), dst.total()/(double)(1<<16));
+    parallel_for_(Range(0, dx.rows), finalPass(map, dst), dx.total()/(double)(1<<16));
 }
 
 } // namespace cv
diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp
index b7b9a5be45..dc8905cdf9 100644
--- a/modules/imgproc/src/contours.cpp
+++ b/modules/imgproc/src/contours.cpp
@@ -39,6 +39,7 @@
 //
 //M*/
 #include "precomp.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
 /* initializes 8-element array for fast access to 3x3 neighborhood of a pixel */
 #define  CV_INIT_3X3_DELTAS( deltas, step, nch )            \
@@ -50,33 +51,6 @@
 static const CvPoint icvCodeDeltas[8] =
     { CvPoint(1, 0), CvPoint(1, -1), CvPoint(0, -1), CvPoint(-1, -1), CvPoint(-1, 0), CvPoint(-1, 1), CvPoint(0, 1), CvPoint(1, 1) };
 
-#if CV_SSE2
-static
-inline unsigned int trailingZeros(unsigned int value) {
-    CV_DbgAssert(value != 0); // undefined for zero input (https://en.wikipedia.org/wiki/Find_first_set)
-#if defined(_MSC_VER)
-#if (_MSC_VER < 1700)
-    unsigned long index = 0;
-    _BitScanForward(&index, value);
-    return (unsigned int)index;
-#else
-    return _tzcnt_u32(value);
-#endif
-#elif defined(__GNUC__) || defined(__GNUG__)
-    return __builtin_ctz(value);
-#elif defined(__ICC) || defined(__INTEL_COMPILER)
-    return _bit_scan_forward(value);
-#elif defined(__clang__)
-    return llvm.cttz.i32(value, true);
-#else
-    static const int MultiplyDeBruijnBitPosition[32] = {
-        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
-        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
-    return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
-#endif
-}
-#endif
-
 CV_IMPL void
 cvStartReadChainPoints( CvChain * chain, CvChainPtReader * reader )
 {
@@ -1097,12 +1071,12 @@ cvFindNextContour( CvContourScanner scanner )
                         mask2 ^= 0x0000ffff;
 
                         if (mask1) {
-                            p = img[(x += trailingZeros(mask1))];
+                            p = img[(x += cv::trailingZeros32(mask1))];
                             goto _next_contour;
                         }
 
                         if (mask2) {
-                            p = img[(x += trailingZeros(mask2 << 16))];
+                            p = img[(x += cv::trailingZeros32(mask2 << 16))];
                             goto _next_contour;
                         }
                     }
@@ -1113,7 +1087,7 @@ cvFindNextContour( CvContourScanner scanner )
                         unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v_p, v_prev)) ^ 0x0000ffff;
 
                         if (mask) {
-                            p = img[(x += trailingZeros(mask))];
+                            p = img[(x += cv::trailingZeros32(mask))];
                             goto _next_contour;
                         }
                         x += 16;
@@ -1394,12 +1368,12 @@ inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j, bool h
             mask2 ^= 0x0000ffff;
 
             if (mask1) {
-                j += trailingZeros(mask1);
+                j += cv::trailingZeros32(mask1);
                 return j;
             }
 
             if (mask2) {
-                j += trailingZeros(mask2 << 16);
+                j += cv::trailingZeros32(mask2 << 16);
                 return j;
             }
         }
@@ -1410,7 +1384,7 @@ inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j, bool h
             unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v_p, v_zero)) ^ 0x0000ffff;
 
             if (mask) {
-                j += trailingZeros(mask);
+                j += cv::trailingZeros32(mask);
                 return j;
             }
             j += 16;
@@ -1443,12 +1417,12 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j, bool hav
             unsigned int mask2 = _mm_movemask_epi8(v_cmp2);
 
             if (mask1) {
-                j += trailingZeros(mask1);
+                j += cv::trailingZeros32(mask1);
                 return j;
             }
 
             if (mask2) {
-                j += trailingZeros(mask2 << 16);
+                j += cv::trailingZeros32(mask2 << 16);
                 return j;
             }
         }
@@ -1459,7 +1433,7 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j, bool hav
             unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v_p, v_zero));
 
             if (mask) {
-                j += trailingZeros(mask);
+                j += cv::trailingZeros32(mask);
                 return j;
             }
             j += 16;