add Canny to ocl module

13 years ago · fa5113f303
parent 6161a3335c
commit fa5113f303
4 changed files with 1357 additions and 0 deletions
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@ -895,6 +895,34 @@ namespace cv
 		// Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
 		CV_EXPORTS void matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf);

+		///////////////////////////////////////////// Canny /////////////////////////////////////////////
+		struct CV_EXPORTS CannyBuf;
+
+		//! compute edges of the input image using Canny operator
+		// Support CV_8UC1 only
+		// The two thresholds are swapped internally when low_thresh > high_thresh, and edges is
+		// (re)created as a CV_8U image with the size of the input.
+		// L2gradient selects sqrt(dx^2 + dy^2) as the gradient magnitude instead of |dx| + |dy|.
+		// The overloads without a CannyBuf build a temporary one on every call; pass a CannyBuf
+		// explicitly to keep the intermediate buffers alive between calls.
+		CV_EXPORTS void Canny(const oclMat& image, oclMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+		CV_EXPORTS void Canny(const oclMat& image, CannyBuf& buf, oclMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+		// Variants taking precomputed derivatives: dx and dy must both be CV_32SC1 and have the same size.
+		CV_EXPORTS void Canny(const oclMat& dx, const oclMat& dy, oclMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+		CV_EXPORTS void Canny(const oclMat& dx, const oclMat& dy, CannyBuf& buf, oclMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+
+		// Working buffers used by the ocl::Canny overloads. Keeping one instance around lets
+		// repeated Canny calls reuse the device allocations made by create().
+		struct CV_EXPORTS CannyBuf
+		{
+			// Empty buffer set; create() is called by Canny before use.
+			CannyBuf() {}
+			// Allocates the buffers for an image of image_size right away.
+			explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}
+			// Wraps already computed derivatives (both CV_32SC1, same size) and allocates the remaining buffers.
+			CannyBuf(const oclMat& dx_, const oclMat& dy_);
+
+			// (Re)allocates every buffer for the given image size; apperture_size == 3 additionally
+			// allocates dx_buf/dy_buf, other positive sizes create the derivative filters instead.
+			void create(const Size& image_size, int apperture_size = 3);
+
+			// Releases the oclMat buffers held by this object.
+			void release();
+
+			// x / y derivatives, CV_32SC1, image sized
+			oclMat dx, dy;
+			// outputs of the Sobel row pass (only allocated for apperture_size == 3)
+			oclMat dx_buf, dy_buf;
+			// CV_32FC1, (height + 2) x (width + 2): gradient magnitude with a one pixel border
+			oclMat edgeBuf;
+			// 1 x (width * height) CV_16UC2 buffers handed to the hysteresis kernels
+			oclMat trackBuf1, trackBuf2;
+			// 1 x 1 CV_32SC1 counter shared with the hysteresis kernels
+			oclMat counter;
+			// derivative filters, created lazily for positive aperture sizes other than 3
+			Ptr<FilterEngine_GPU> filterDX, filterDY;
+		};
+
 #ifdef HAVE_CLAMDFFT
            ///////////////////////////////////////// clAmdFft related /////////////////////////////////////////
            // the two functions must be called before/after run any fft library functions.
--- a/modules/ocl/src/canny.cpp
+++ b/modules/ocl/src/canny.cpp
@ -0,0 +1,419 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <iomanip>
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+#if !defined (HAVE_OPENCL)
+// Stubs for builds without OpenCL: every entry point only reports the missing backend.
+// The default arguments are specified once, on the declarations in ocl.hpp; repeating them on
+// these out-of-line definitions is ill-formed, and the parameters are unnamed because unused.
+void cv::ocl::Canny(const oclMat&, oclMat&, double, double, int, bool) { throw_nogpu(); }
+void cv::ocl::Canny(const oclMat&, CannyBuf&, oclMat&, double, double, int, bool) { throw_nogpu(); }
+void cv::ocl::Canny(const oclMat&, const oclMat&, oclMat&, double, double, bool) { throw_nogpu(); }
+void cv::ocl::Canny(const oclMat&, const oclMat&, CannyBuf&, oclMat&, double, double, bool) { throw_nogpu(); }
+#else
+
+namespace cv
+{
+	namespace ocl
+	{
+		///////////////////////////OpenCL kernel strings///////////////////////////
+		// Source text of the Canny kernels; defined in another translation unit and
+		// handed to openCLExecuteKernel by the launchers in this file.
+		extern const char *imgproc_canny;
+	}
+}
+
+// Builds a buffer set around precomputed derivatives. dx_ and dy_ are stored as dx / dy
+// (oclMat copies) and must both be CV_32SC1 with identical sizes.
+cv::ocl::CannyBuf::CannyBuf(const oclMat& dx_, const oclMat& dy_) : dx(dx_), dy(dy_)
+{
+    CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size());
+
+    // aperture -1: neither the row-pass buffers nor the derivative filters are set up
+    create(dx_.size(), -1);
+}
+
+// (Re)allocates the working buffers for an image of size image_size.
+//   apperture_size == 3 : also allocates the Sobel row-pass buffers dx_buf / dy_buf
+//   apperture_size  > 0 : creates the 32F derivative filters if they do not exist yet
+//   apperture_size <= 0 : derivatives are supplied by the caller, nothing extra is set up
+void cv::ocl::CannyBuf::create(const Size& image_size, int apperture_size)
+{
+    dx.create(image_size, CV_32SC1);
+    dy.create(image_size, CV_32SC1);
+
+    if (apperture_size == 3)
+    {
+        dx_buf.create(image_size, CV_32SC1);
+        dy_buf.create(image_size, CV_32SC1);
+    }
+    else if (apperture_size > 0)
+    {
+        // NOTE: the filters are built once and then kept, so a later call with a different
+        // aperture size keeps using the filters created for the first one.
+        if (!filterDX)
+        {
+            filterDX = createDerivFilter_GPU(CV_32F, CV_32F, 1, 0, apperture_size, BORDER_REPLICATE);
+        }
+        if (!filterDY)
+        {
+            filterDY = createDerivFilter_GPU(CV_32F, CV_32F, 0, 1, apperture_size, BORDER_REPLICATE);
+        }
+    }
+
+    // one extra pixel on every side so the 3x3 neighbourhood tests never leave the buffer
+    edgeBuf.create(image_size.height + 2, image_size.width + 2, CV_32FC1);
+
+    // one CV_16UC2 entry per pixel for each of the two hysteresis work lists
+    trackBuf1.create(1, image_size.width * image_size.height, CV_16UC2);
+    trackBuf2.create(1, image_size.width * image_size.height, CV_16UC2);
+
+    counter.create(1, 1, CV_32SC1);
+}
+
+// Drops every oclMat buffer owned by this object; the derivative filters are left untouched.
+void cv::ocl::CannyBuf::release()
+{
+    // hysteresis state
+    counter.release();
+    trackBuf2.release();
+    trackBuf1.release();
+
+    // magnitude / edge map
+    edgeBuf.release();
+
+    // derivative images and their row-pass intermediates
+    dy_buf.release();
+    dx_buf.release();
+    dy.release();
+    dx.release();
+}
+
+// Host-side launchers for the kernels in imgproc_canny; each one is defined further down in this file.
+namespace cv { namespace ocl {
+	namespace canny
+	{
+        // horizontal half of the separable 3x3 Sobel filter
+        void calcSobelRowPass_gpu(const oclMat& src, oclMat& dx_buf, oclMat& dy_buf, int rows, int cols);
+
+        // vertical half of the 3x3 Sobel filter plus gradient magnitude (buffered version)
+        void calcMagnitude_gpu(const oclMat& dx_buf, const oclMat& dy_buf, oclMat& dx, oclMat& dy, oclMat& mag, int rows, int cols, bool L2Grad);
+        // gradient magnitude from ready-made derivatives
+        void calcMagnitude_gpu(const oclMat& dx, const oclMat& dy, oclMat& mag, int rows, int cols, bool L2Grad);
+
+        // non-maximum suppression and thresholding into edge types 0 / 1 / 2
+        void calcMap_gpu(oclMat& dx, oclMat& dy, oclMat& mag, oclMat& map, int rows, int cols, float low_thresh, float high_thresh);
+
+        // first hysteresis pass
+        void edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, oclMat& counter, int rows, int cols);
+
+        // repeated hysteresis passes, run until the counter read back from the device is zero
+        void edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, oclMat& counter, int rows, int cols);
+
+        // converts the edge map into the output image
+        void getEdges_gpu(oclMat& map, oclMat& dst, int rows, int cols);
+	}
+}}// cv::ocl
+
+namespace
+{
+    // Runs the stages that follow the magnitude computation: non-maximum suppression,
+    // local and global hysteresis, and extraction of the final edge image into dst.
+    // buf.dx, buf.dy and buf.edgeBuf (magnitudes) must already be filled in.
+    void CannyCaller(CannyBuf& buf, oclMat& dst, float low_thresh, float high_thresh)
+    {
+        using namespace ::cv::ocl::canny;
+
+        // calcMap reads the magnitudes of neighbouring pixels while writing integer edge types.
+        // Handing it edgeBuf as both input and output lets one work-item overwrite a magnitude
+        // that a neighbouring work-item has not read yet, so the map gets a buffer of its own
+        // (same geometry as edgeBuf, zeroed so the one pixel border reads as "no edge").
+        // This costs one device allocation per call.
+        oclMat mapBuf;
+        mapBuf.create(buf.edgeBuf.rows, buf.edgeBuf.cols, CV_32SC1);
+        mapBuf.setTo(Scalar::all(0));
+
+        calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, mapBuf, dst.rows, dst.cols, low_thresh, high_thresh);
+
+        edgesHysteresisLocal_gpu(mapBuf, buf.trackBuf1, buf.counter, dst.rows, dst.cols);
+
+        edgesHysteresisGlobal_gpu(mapBuf, buf.trackBuf1, buf.trackBuf2, buf.counter, dst.rows, dst.cols);
+
+        getEdges_gpu(mapBuf, dst, dst.rows, dst.cols);
+    }
+}
+
+// Convenience overload: allocates a temporary CannyBuf for this call and forwards to the
+// buffer-taking overload.
+void cv::ocl::Canny(const oclMat& src, oclMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
+{
+    CannyBuf buf(src.size(), apperture_size);
+    Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
+}
+
+// Canny edge detection on an 8-bit single channel image, using buf for all intermediates.
+//   src            CV_8UC1 input
+//   buf            working buffers, (re)created here for src.size()
+//   dst            output, created as CV_8U with the size of src
+//   low_thresh,
+//   high_thresh    hysteresis thresholds (swapped when given in the wrong order)
+//   apperture_size 3 uses the dedicated Sobel kernels, other positive sizes use derivative filters
+//   L2gradient     selects the L2 norm instead of |dx| + |dy| for the magnitude
+void cv::ocl::Canny(const oclMat& src, CannyBuf& buf, oclMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
+{
+    using namespace ::cv::ocl::canny;
+
+    CV_Assert(src.type() == CV_8UC1);
+
+    if( low_thresh > high_thresh )
+        std::swap( low_thresh, high_thresh );
+
+    dst.create(src.size(), CV_8U);
+    dst.setTo(Scalar::all(0));
+
+    buf.create(src.size(), apperture_size);
+    buf.edgeBuf.setTo(Scalar::all(0));
+    buf.counter.setTo(Scalar::all(0));
+
+    if (apperture_size == 3)
+    {
+        calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols);
+
+        calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+    }
+    else
+    {
+        // create() only builds the filters for positive aperture sizes; fail with a clear
+        // error instead of dereferencing an empty Ptr below.
+        CV_Assert(!buf.filterDX.empty() && !buf.filterDY.empty());
+
+        // FIXME:
+        // current ocl implementation requires the src and dst having same type
+        // convertTo is time consuming so this may be optimized later.
+        oclMat src_omat32f;
+        src.convertTo(src_omat32f, CV_32F); // FIXME
+
+        buf.filterDX->apply(src_omat32f, buf.dx);
+        buf.filterDY->apply(src_omat32f, buf.dy);
+
+        buf.dx.convertTo(buf.dx, CV_32S); // FIXME
+        buf.dy.convertTo(buf.dy, CV_32S); // FIXME
+
+        calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+    }
+    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+}
+// Convenience overload for precomputed derivatives: wraps dx / dy in a temporary CannyBuf
+// and forwards to the buffer-taking overload.
+void cv::ocl::Canny(const oclMat& dx, const oclMat& dy, oclMat& dst, double low_thresh, double high_thresh, bool L2gradient)
+{
+    CannyBuf buf(dx, dy);
+    Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
+}
+
+// Canny edge detection starting from precomputed derivatives.
+//   dx, dy   CV_32SC1 derivative images of identical size
+//   buf      working buffers; dx / dy are stored in it and the rest is (re)created
+//   dst      output, created as CV_8U with the size of dx
+//   low_thresh / high_thresh are swapped when passed in the wrong order
+void cv::ocl::Canny(const oclMat& dx, const oclMat& dy, CannyBuf& buf, oclMat& dst, double low_thresh, double high_thresh, bool L2gradient)
+{
+    using namespace ::cv::ocl::canny;
+
+    CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
+
+    if( low_thresh > high_thresh )
+    {
+        std::swap( low_thresh, high_thresh);
+    }
+
+    // output image starts out with no edges
+    dst.create(dx.size(), CV_8U);
+    dst.setTo(Scalar::all(0));
+
+    // adopt the caller's derivatives, then size the remaining buffers (aperture -1: no filters)
+    buf.dx = dx;
+    buf.dy = dy;
+    buf.create(dx.size(), -1);
+
+    buf.edgeBuf.setTo(Scalar::all(0));
+    buf.counter.setTo(Scalar::all(0));
+
+    calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
+
+    const float low  = static_cast<float>(low_thresh);
+    const float high = static_cast<float>(high_thresh);
+    CannyCaller(buf, dst, low, high);
+}
+
+// Launches the "calcSobelRowPass" kernel over a cols x rows grid with 16x16 work-groups.
+// src is the input image; dx_buf / dy_buf receive the row-pass results.
+void canny::calcSobelRowPass_gpu(const oclMat& src, oclMat& dx_buf, oclMat& dy_buf, int rows, int cols)
+{
+	Context *clCxt = src.clCxt;
+	string kernelName = "calcSobelRowPass";
+	vector< pair<size_t, const void *> > args;
+
+	// argument order must match the kernel signature
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dx_buf.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dy_buf.data));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.offset));
+
+	// explicit casts: brace-initialising a size_t from an int is a narrowing conversion
+	size_t globalThreads[3] = {static_cast<size_t>(cols), static_cast<size_t>(rows), 1};
+	size_t localThreads[3]  = {16, 16, 1};
+	openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+// Launches the "calcMagnitude_buf" kernel (3x3 Sobel path) over a cols x rows grid with
+// 16x16 work-groups. dx_buf / dy_buf are the row-pass results; dx, dy and mag are outputs.
+// L2Grad adds the "-D L2GRAD" build option, which switches the kernel's magnitude formula.
+void canny::calcMagnitude_gpu(const oclMat& dx_buf, const oclMat& dy_buf, oclMat& dx, oclMat& dy, oclMat& mag, int rows, int cols, bool L2Grad)
+{
+	Context *clCxt = dx_buf.clCxt;
+	string kernelName = "calcMagnitude_buf";
+	vector< pair<size_t, const void *> > args;
+
+	// argument order must match the kernel signature
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dx_buf.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dy_buf.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dx.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dy.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&mag.data));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dx.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dx.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dy.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dy.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&mag.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&mag.offset));
+
+	// explicit casts: brace-initialising a size_t from an int is a narrowing conversion
+	size_t globalThreads[3] = {static_cast<size_t>(cols), static_cast<size_t>(rows), 1};
+	size_t localThreads[3]  = {16, 16, 1};
+
+	// 15 bytes hold "-D L2GRAD" plus the terminator with room to spare
+	char build_options [15] = "";
+	if(L2Grad)
+	{
+		strcat(build_options, "-D L2GRAD");
+	}
+	openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+}
+// Launches the "calcMagnitude" kernel over a cols x rows grid with 16x16 work-groups.
+// dx / dy are ready-made derivative images, mag receives the gradient magnitude.
+// L2Grad adds the "-D L2GRAD" build option, which switches the kernel's magnitude formula.
+void canny::calcMagnitude_gpu(const oclMat& dx, const oclMat& dy, oclMat& mag, int rows, int cols, bool L2Grad)
+{
+	Context *clCxt = dx.clCxt;
+	string kernelName = "calcMagnitude";
+	vector< pair<size_t, const void *> > args;
+
+	// argument order must match the kernel signature
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dx.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dy.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&mag.data));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dx.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dx.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dy.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dy.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&mag.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&mag.offset));
+
+	// explicit casts: brace-initialising a size_t from an int is a narrowing conversion
+	size_t globalThreads[3] = {static_cast<size_t>(cols), static_cast<size_t>(rows), 1};
+	size_t localThreads[3]  = {16, 16, 1};
+
+	// 15 bytes hold "-D L2GRAD" plus the terminator with room to spare
+	char build_options [15] = "";
+	if(L2Grad)
+	{
+		strcat(build_options, "-D L2GRAD");
+	}
+	openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+}
+
+// Launches the non-maximum suppression kernel over a cols x rows grid.
+// dx / dy are the derivatives, mag the bordered magnitude image, map the bordered edge-type
+// output. "calcMap" (16x16 work-groups) is only selected when CALCMAP_FIXED is defined to a
+// non-zero value; otherwise "calcMap_2" runs with 256x1 work-groups.
+void canny::calcMap_gpu(oclMat& dx, oclMat& dy, oclMat& mag, oclMat& map, int rows, int cols, float low_thresh, float high_thresh)
+{
+	Context *clCxt = dx.clCxt;
+
+	vector< pair<size_t, const void *> > args;
+
+	// argument order must match the kernel signature
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dx.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dy.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&mag.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+	args.push_back( make_pair( sizeof(cl_float), (void *)&low_thresh));
+	args.push_back( make_pair( sizeof(cl_float), (void *)&high_thresh));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dx.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dx.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dy.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dy.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&mag.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&mag.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
+
+	// explicit casts: brace-initialising a size_t from an int is a narrowing conversion
+	size_t globalThreads[3] = {static_cast<size_t>(cols), static_cast<size_t>(rows), 1};
+#if CALCMAP_FIXED
+	string kernelName = "calcMap";
+	size_t localThreads[3]  = {16, 16, 1};
+#else
+	string kernelName = "calcMap_2";
+	size_t localThreads[3]  = {256, 1, 1};
+#endif
+	openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+// Launches the "edgesHysteresisLocal" kernel over a cols x rows grid with 16x16 work-groups.
+// map is the bordered edge-type image; st1 and counter are passed through to the kernel.
+void canny::edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, oclMat& counter, int rows, int cols)
+{
+	Context *clCxt = map.clCxt;
+	string kernelName = "edgesHysteresisLocal";
+	vector< pair<size_t, const void *> > args;
+
+	// argument order must match the kernel signature
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
+
+	// explicit casts: brace-initialising a size_t from an int is a narrowing conversion
+	size_t globalThreads[3] = {static_cast<size_t>(cols), static_cast<size_t>(rows), 1};
+	size_t localThreads[3]  = {16, 16, 1};
+
+	openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+// Repeats the "edgesHysteresisGlobal" kernel until the device-side counter reads back as zero.
+// Every round launches one 128-wide work-group per pending entry (laid out on a grid that is
+// at most 65535 groups wide), then swaps st1 and st2 so the list written in one round is the
+// list read in the next. Note that the swap acts on the caller's matrices.
+void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, oclMat& counter, int rows, int cols)
+{
+	// counter is created as CV_32SC1, so it is read back through its real element type
+	unsigned int count = static_cast<unsigned int>(Mat(counter).at<int>(0));
+
+	Context *clCxt = map.clCxt;
+	string kernelName = "edgesHysteresisGlobal";
+	vector< pair<size_t, const void *> > args;
+	size_t localThreads[3]  = {128, 1, 1};
+
+	const unsigned int maxGroupsX = 65535u;
+
+	while(count > 0)
+	{
+		// the kernel counts the entries it appends for the next round
+		counter.setTo(0);
+
+		const size_t groupsX = std::min(count, maxGroupsX);
+		const size_t groupsY = (count + maxGroupsX - 1) / maxGroupsX; // ceil(count / 65535)
+		size_t globalThreads[3] = {groupsX * 128, groupsY, 1};
+
+		args.clear();
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&st2.data));
+		args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&count));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
+		args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
+
+		openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+
+		count = static_cast<unsigned int>(Mat(counter).at<int>(0));
+		std::swap(st1, st2);
+	}
+}
+
+// Launches the "getEdges" kernel over a cols x rows grid with 16x16 work-groups.
+// map is the bordered edge-type image, dst the output edge image.
+void canny::getEdges_gpu(oclMat& map, oclMat& dst, int rows, int cols)
+{
+	Context *clCxt = map.clCxt;
+	string kernelName = "getEdges";
+	vector< pair<size_t, const void *> > args;
+
+	// argument order must match the kernel signature
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
+	args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
+
+	// explicit casts: brace-initialising a size_t from an int is a narrowing conversion
+	size_t globalThreads[3] = {static_cast<size_t>(cols), static_cast<size_t>(rows), 1};
+	size_t localThreads[3]  = {16, 16, 1};
+
+	openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+#endif // HAVE_OPENCL
--- a/modules/ocl/src/kernels/imgproc_canny.cl
+++ b/modules/ocl/src/kernels/imgproc_canny.cl
@ -0,0 +1,798 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+// Gradient magnitude from the two integer derivatives.
+// With L2GRAD defined at build time the L2 norm is used, otherwise the L1 norm.
+#ifdef L2GRAD
+inline float calc(int x, int y)
+{
+	// square in float: x * x + y * y in int overflows for the large derivative values
+	// produced by wide filter apertures
+	const float fx = (float)x;
+	const float fy = (float)y;
+	return sqrt(fx * fx + fy * fy);
+}
+#else
+inline float calc(int x, int y)
+{
+	return (float)abs(x) + (float)abs(y);
+}
+#endif // L2GRAD
+
+// Row pass of the separable 3x3 Sobel operator (the only kernel size supported here).
+// The 2D operator factors into a smoothing filter h and a difference filter h':
+// h (-1) =  1, h (0) =  2, h (1) =  1
+// h'(-1) = -1, h'(0) =  0, h'(1) =  1
+// This kernel applies h' (into dx_buf) and h (into dy_buf) along x; the column pass
+// is done by calcMagnitude_buf.
+//
+// src		input 8bit single channel image data
+// dx_buf	output dx buffer
+// dy_buf	output dy buffer
+// Expects 16x16 work-groups; steps and offsets are given in bytes.
+__kernel
+void calcSobelRowPass
+(
+	__global const uchar * src,
+	__global int * dx_buf,
+	__global int * dy_buf,
+	int rows,
+	int cols,
+	int src_step,
+	int src_offset,
+	int dx_buf_step,
+	int dx_buf_offset,
+	int dy_buf_step,
+	int dy_buf_offset
+)
+{
+	// src is addressed in bytes already (uchar), the int buffers need element units
+	dx_buf_step   /= sizeof(*dx_buf);
+	dx_buf_offset /= sizeof(*dx_buf);
+	dy_buf_step   /= sizeof(*dy_buf);
+	dy_buf_offset /= sizeof(*dy_buf);
+
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+
+	int lidx = get_local_id(0);
+	int lidy = get_local_id(1);
+
+	// 16 rows of 16 pixels plus one halo pixel on each side
+	__local int smem[16][18];
+
+	// Every work-item loads (with coordinates clamped to the image) so that
+	//  - the barrier below is reached by the whole work-group, and
+	//  - tiles hanging over the right / bottom edge see replicated border pixels
+	//    instead of whatever lies behind the image.
+	const int src_row = min(gidy, rows - 1) * src_step + src_offset;
+
+	smem[lidy][lidx + 1] = src[min(gidx, cols - 1) + src_row];
+	if(lidx == 0)
+	{
+		smem[lidy][0]  = src[min(max(gidx - 1, 0), cols - 1) + src_row];
+		smem[lidy][17] = src[min(gidx + 16,        cols - 1) + src_row];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(gidy < rows && gidx < cols)
+	{
+		dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] =
+			-smem[lidy][lidx] + smem[lidy][lidx + 2];
+		dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] =
+			 smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
+	}
+}
+
+// Column pass of the separable 3x3 Sobel operator plus gradient magnitude.
+// This is the buffered version (3x3 sobel).
+//
+// dx_buf		dx buffer, calculated from calcSobelRowPass
+// dy_buf		dy buffer, calculated from calcSobelRowPass
+// dx			derivative in x direction, output
+// dy			derivative in y direction, output
+// mag			gradient magnitude, output; written at (x + 1, y + 1) because the
+//				magnitude image carries a one pixel border
+// Expects 16x16 work-groups; steps and offsets are given in bytes.
+__kernel
+void calcMagnitude_buf
+(
+	__global const int * dx_buf,
+	__global const int * dy_buf,
+	__global int * dx,
+	__global int * dy,
+	__global float * mag,
+	int rows,
+	int cols,
+	int dx_buf_step,
+	int dx_buf_offset,
+	int dy_buf_step,
+	int dy_buf_offset,
+	int dx_step,
+	int dx_offset,
+	int dy_step,
+	int dy_offset,
+	int mag_step,
+	int mag_offset
+)
+{
+	// bytes -> elements
+	dx_buf_step    /= sizeof(*dx_buf);
+	dx_buf_offset  /= sizeof(*dx_buf);
+	dy_buf_step    /= sizeof(*dy_buf);
+	dy_buf_offset  /= sizeof(*dy_buf);
+	dx_step    /= sizeof(*dx);
+	dx_offset  /= sizeof(*dx);
+	dy_step    /= sizeof(*dy);
+	dy_offset  /= sizeof(*dy);
+	mag_step   /= sizeof(*mag);
+	mag_offset /= sizeof(*mag);
+
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+
+	int lidx = get_local_id(0);
+	int lidy = get_local_id(1);
+
+	// 16 columns of 16 pixels plus one halo pixel above and below
+	__local int sdx[18][16];
+	__local int sdy[18][16];
+
+	// Every work-item loads (with coordinates clamped to the image) so that
+	//  - the barrier below is reached by the whole work-group, and
+	//  - tiles hanging over the right / bottom edge see replicated border values
+	//    instead of reading rows or columns outside the buffers.
+	const int cx = min(gidx, cols - 1);
+	const int cy = min(gidy, rows - 1);
+
+	sdx[lidy + 1][lidx] = dx_buf[cx + cy * dx_buf_step + dx_buf_offset];
+	sdy[lidy + 1][lidx] = dy_buf[cx + cy * dy_buf_step + dy_buf_offset];
+	if(lidy == 0)
+	{
+		const int above = min(max(gidy - 1, 0), rows - 1);
+		const int below = min(gidy + 16,        rows - 1);
+
+		sdx[0][lidx]  = dx_buf[cx + above * dx_buf_step + dx_buf_offset];
+		sdx[17][lidx] = dx_buf[cx + below * dx_buf_step + dx_buf_offset];
+
+		sdy[0][lidx]  = dy_buf[cx + above * dy_buf_step + dy_buf_offset];
+		sdy[17][lidx] = dy_buf[cx + below * dy_buf_step + dy_buf_offset];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(gidx < cols && gidy < rows)
+	{
+		// smoothing filter down the column for dx, difference filter for dy
+		int x =  sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
+		int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
+
+		dx[gidx + gidy * dx_step + dx_offset] = x;
+		dy[gidx + gidy * dy_step + dy_offset] = y;
+
+		mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y);
+	}
+}
+
+// Gradient magnitude from ready-made derivative images.
+// This is the non-buffered version (used when the filter is not the 3x3 sobel).
+//
+// dx			derivative in x direction, input
+// dy			derivative in y direction, input
+// mag			gradient magnitude, output; written at (x + 1, y + 1) because the
+//				magnitude image carries a one pixel border
+// Steps and offsets are given in bytes.
+__kernel
+void calcMagnitude
+(
+	__global const int * dx,
+	__global const int * dy,
+	__global float * mag,
+	int rows,
+	int cols,
+	int dx_step,
+	int dx_offset,
+	int dy_step,
+	int dy_offset,
+	int mag_step,
+	int mag_offset
+)
+{
+	// bytes -> elements
+	dx_step    /= sizeof(*dx);
+	dx_offset  /= sizeof(*dx);
+	dy_step    /= sizeof(*dy);
+	dy_offset  /= sizeof(*dy);
+	mag_step   /= sizeof(*mag);
+	mag_offset /= sizeof(*mag);
+
+	const int col = get_global_id(0);
+	const int row = get_global_id(1);
+
+	// work-items padding the grid beyond the image have nothing to do
+	if(row >= rows || col >= cols)
+	{
+		return;
+	}
+
+	const int gx = dx[col + row * dx_step + dx_offset];
+	const int gy = dy[col + row * dy_step + dy_offset];
+
+	mag[(col + 1) + (row + 1) * mag_step + mag_offset] = calc(gx, gy);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// 0.4142135623730950488016887242097 is tan(22.5)
+// CANNY_SHIFT is the number of fractional bits of the fixed point representation;
+// TG22 is tan(22.5 degrees) in that representation, rounded to nearest.
+#define CANNY_SHIFT 15
+#define TG22        (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
+
+// First pass of edge detection and non-maximum suppression (local memory version,
+// written for 16x16 work-groups).
+// edgetype is set to for each pixel:
+// 0 - below low thres, not an edge
+// 1 - maybe an edge
+// 2 - is an edge, either magnitude is greater than high thres, or
+//     Given estimates of the image gradients, a search is then carried out
+//     to determine if the gradient magnitude assumes a local maximum in the gradient direction.
+//     if the rounded gradient angle is zero degrees (i.e. the edge is in the north-south direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the west and east directions,
+//     if the rounded gradient angle is 90 degrees (i.e. the edge is in the east-west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north and south directions,
+//     if the rounded gradient angle is 135 degrees (i.e. the edge is in the north east-south west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north west and south east directions,
+//     if the rounded gradient angle is 45 degrees (i.e. the edge is in the north west-south east direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north east and south west directions.
+//
+// dx, dy		derivatives of x and y direction
+// mag			magnitudes calculated from calcMagnitude function
+// map			output containing raw edge types
+//
+// NOTE(review): the *_offset arguments are converted to element units below but never
+// added to any index, so this kernel addresses every buffer as if its offset were zero.
+__kernel
+void calcMap
+(
+	__global const int * dx,
+	__global const int * dy, 
+	__global const float * mag,
+	__global int * map,
+	int rows,
+	int cols,
+	float low_thresh,
+	float high_thresh,
+	int dx_step,
+	int dx_offset,
+	int dy_step,
+	int dy_offset,
+	int mag_step,
+	int mag_offset,
+	int map_step,
+	int map_offset
+)
+{
+	// bytes -> elements
+	dx_step    /= sizeof(*dx);
+	dx_offset  /= sizeof(*dx);
+	dy_step    /= sizeof(*dy);
+	dy_offset  /= sizeof(*dy);
+	mag_step   /= sizeof(*mag);
+	mag_offset /= sizeof(*mag);
+	map_step   /= sizeof(*map);
+	map_offset /= sizeof(*map);
+
+	// 16x16 tile of magnitudes plus a one pixel halo on every side
+	__local float smem[18][18];
+
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+
+	int lidx = get_local_id(0);
+	int lidy = get_local_id(1);
+
+	// global ids rounded down to a multiple of 16: origin of this work-group's tile
+	int grp_idx = get_global_id(0) & 0xFFFFF0;
+	int grp_idy = get_global_id(1) & 0xFFFFF0;
+
+	// the 256 work-items are re-indexed as rows of 18 to fill the 18x18 tile:
+	// the first pass covers tile rows 0..13 (252 elements), the second pass rows 14..17
+	int tid = lidx + lidy * 16;
+	int lx = tid % 18;
+	int ly = tid / 18;
+	if(ly < 14)
+	{
+		smem[ly][lx] = mag[grp_idx + lx + (grp_idy + ly) * mag_step];
+	}
+	if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
+	{
+		smem[ly + 14][lx] = mag[grp_idx + lx + (grp_idy + ly + 14) * mag_step];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(gidy < rows && gidx < cols)
+	{
+		int x = dx[gidx + gidy * dx_step];
+		int y = dy[gidx + gidy * dy_step];
+		// s is -1 when dx and dy have opposite signs; it picks which diagonal is tested
+		const int s = (x ^ y) < 0 ? -1 : 1;
+		// magnitude of this pixel (tile coordinates are shifted by the halo)
+		const float m = smem[lidy + 1][lidx + 1];
+		x = abs(x);
+		y = abs(y);
+
+		// 0 - the pixel can not belong to an edge
+		// 1 - the pixel might belong to an edge
+		// 2 - the pixel does belong to an edge
+		int edge_type = 0;
+		if(m > low_thresh)
+		{
+			// fixed point comparisons of |dy| against |dx| * tan(22.5) and
+			// |dx| * (tan(22.5) + 2) select the direction to test
+			const int tg22x = x * TG22;
+			const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
+			y <<= CANNY_SHIFT;
+			if(y < tg22x)
+			{
+				// compare with the left and right neighbours
+				if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
+				{
+					edge_type = 1 + (int)(m > high_thresh);
+				}
+			}
+			else if (y > tg67x)
+			{
+				// compare with the upper and lower neighbours
+				if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
+				{
+					edge_type = 1 + (int)(m > high_thresh);
+				}
+			}
+			else
+			{
+				// compare with the two diagonal neighbours selected by s
+				if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
+				{
+					edge_type = 1 + (int)(m > high_thresh);
+				}
+			}
+		}
+		// the map carries the same one pixel border as the magnitude image
+		map[gidx + 1 + (gidy + 1) * map_step] = edge_type;
+	}
+}
+
+// Non-maximum suppression and thresholding, non local memory version.
+// Writes one edge type per pixel into map:
+// 0 - the pixel can not belong to an edge
+// 1 - the pixel might belong to an edge
+// 2 - the pixel does belong to an edge
+//
+// dx, dy		derivatives of x and y direction
+// mag			gradient magnitudes, stored with a one pixel border
+// map			output edge types, stored with a one pixel border
+// Steps and offsets are given in bytes. mag and map should be distinct buffers: the
+// magnitudes of neighbouring pixels are read while edge types are being written.
+__kernel
+void calcMap_2 
+(
+	__global const int * dx,
+	__global const int * dy, 
+	__global const float * mag,
+	__global int * map,
+	int rows,
+	int cols,
+	float low_thresh,
+	float high_thresh,
+	int dx_step,
+	int dx_offset,
+	int dy_step,
+	int dy_offset,
+	int mag_step,
+	int mag_offset,
+	int map_step,
+	int map_offset
+)
+{
+	// bytes -> elements
+	dx_step    /= sizeof(*dx);
+	dx_offset  /= sizeof(*dx);
+	dy_step    /= sizeof(*dy);
+	dy_offset  /= sizeof(*dy);
+	mag_step   /= sizeof(*mag);
+	mag_offset /= sizeof(*mag);
+	map_step   /= sizeof(*map);
+	map_offset /= sizeof(*map);
+
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+
+	if(gidy < rows && gidx < cols)
+	{
+		// the offsets are applied to every access, as in the magnitude kernels, so that
+		// buffers that do not start at offset zero are addressed correctly
+		int x = dx[gidx + gidy * dx_step + dx_offset];
+		int y = dy[gidx + gidy * dy_step + dy_offset];
+		// s is -1 when dx and dy have opposite signs; it picks which diagonal is tested
+		const int s = (x ^ y) < 0 ? -1 : 1;
+		// index of this pixel inside the bordered magnitude image
+		const int c = (gidx + 1) + (gidy + 1) * mag_step + mag_offset;
+		const float m = mag[c];
+		x = abs(x);
+		y = abs(y);
+
+		int edge_type = 0;
+		if(m > low_thresh)
+		{
+			// fixed point comparisons of |dy| against |dx| * tan(22.5) and
+			// |dx| * (tan(22.5) + 2) select the direction to test
+			const int tg22x = x * TG22;
+			const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
+			y <<= CANNY_SHIFT;
+			if(y < tg22x)
+			{
+				// left and right neighbours
+				if(m > mag[c - 1] && m >= mag[c + 1])
+				{
+					edge_type = 1 + (int)(m > high_thresh);
+				}
+			}
+			else if (y > tg67x)
+			{
+				// upper and lower neighbours
+				if(m > mag[c - mag_step] && m >= mag[c + mag_step])
+				{
+					edge_type = 1 + (int)(m > high_thresh);
+				}
+			}
+			else
+			{
+				// the two diagonal neighbours selected by s
+				if(m > mag[c - mag_step - s] && m > mag[c + mag_step + s])
+				{
+					edge_type = 1 + (int)(m > high_thresh);
+				}
+			}
+		}
+		map[(gidx + 1) + (gidy + 1) * map_step + map_offset] = edge_type;
+	}
+}
+
+// [256, 1, 1] threaded, local memory version of calcMap.
+//
+// A one-dimensional group of 256 work-items handles one 16x16 block of pixels: the local id is
+// split into (lidx, lidy) inside the block and get_global_id(1) selects the block. The 18x18
+// magnitude tile around the block is staged in local memory first, then every work-item
+// classifies its pixel:
+//   0 - the pixel can not belong to an edge
+//   1 - the pixel might belong to an edge
+//   2 - the pixel does belong to an edge
+//
+// Every *_step / *_offset argument is divided by the element size below, i.e. it is expected in
+// bytes. The offsets are applied to every access so that buffers with a non-zero offset are
+// addressed correctly.
+//
+// NOTE(review): the block position is derived from cols/16, which looks like it requires cols to
+// be a multiple of 16 (and >= 16) -- confirm against the host-side launch code.
+// NOTE(review): the first 14 tile rows are loaded without a bounds check -- confirm that mag is
+// padded accordingly.
+__kernel
+void calcMap_3
+(
+	__global const int * dx,
+	__global const int * dy,
+	__global const float * mag,
+	__global int * map,
+	int rows,
+	int cols,
+	float low_thresh,
+	float high_thresh,
+	int dx_step,
+	int dx_offset,
+	int dy_step,
+	int dy_offset,
+	int mag_step,
+	int mag_offset,
+	int map_step,
+	int map_offset
+)
+{
+	// bytes -> elements
+	dx_step    /= sizeof(*dx);
+	dx_offset  /= sizeof(*dx);
+	dy_step    /= sizeof(*dy);
+	dy_offset  /= sizeof(*dy);
+	mag_step   /= sizeof(*mag);
+	mag_offset /= sizeof(*mag);
+	map_step   /= sizeof(*map);
+	map_offset /= sizeof(*map);
+
+	__local float smem[18][18];
+
+	// position of this work-item inside the 16x16 block
+	const int lidx = get_local_id(0) % 16;
+	const int lidy = get_local_id(0) / 16;
+
+	// identifies which block of pixels is currently processed
+	const int grp_ind = get_global_id(1);
+
+	// top-left corner of the block in image coordinates
+	const int grp_idx = (grp_ind % (cols/16)) * 16;
+	const int grp_idy = (grp_ind / (cols/16)) * 16;
+
+	const int gidx = grp_idx + lidx;
+	const int gidy = grp_idy + lidy;
+
+	// cooperative load of the 18x18 tile: 256 work-items cover rows 0..13 in the first pass,
+	// the first 72 of them cover rows 14..17 in the second pass
+	const int tid = get_global_id(0) % 256;
+	const int lx = tid % 18;
+	const int ly = tid / 18;
+	if(ly < 14)
+	{
+		smem[ly][lx] = mag[mag_offset + grp_idx + lx + (grp_idy + ly) * mag_step];
+	}
+	if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
+	{
+		smem[ly + 14][lx] = mag[mag_offset + grp_idx + lx + (grp_idy + ly + 14) * mag_step];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(gidy < rows && gidx < cols)
+	{
+		int x = dx[dx_offset + gidx + gidy * dx_step];
+		int y = dy[dy_offset + gidx + gidy * dy_step];
+		// s is -1 when dx and dy have opposite signs; it selects which diagonal is compared
+		const int s = (x ^ y) < 0 ? -1 : 1;
+		const float m = smem[lidy + 1][lidx + 1];
+		x = abs(x);
+		y = abs(y);
+
+		int edge_type = 0;
+		if(m > low_thresh)
+		{
+			// TG22 / CANNY_SHIFT are defined earlier in this file; presumably fixed-point
+			// tangent thresholds that split the gradient direction into three sectors
+			const int tg22x = x * TG22;
+			const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
+			y <<= CANNY_SHIFT;
+
+			int is_local_max;
+			if(y < tg22x)
+			{
+				// compare with left / right neighbours
+				is_local_max = m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2];
+			}
+			else if(y > tg67x)
+			{
+				// compare with upper / lower neighbours
+				is_local_max = m > smem[lidy][lidx + 1] && m >= smem[lidy + 2][lidx + 1];
+			}
+			else
+			{
+				// compare along the diagonal selected by s
+				is_local_max = m > smem[lidy][lidx + 1 - s] && m > smem[lidy + 2][lidx + 1 + s];
+			}
+
+			if(is_local_max)
+			{
+				edge_type = 1 + (int)(m > high_thresh);
+			}
+		}
+		map[map_offset + gidx + 1 + (gidy + 1) * map_step] = edge_type;
+	}
+}
+
+#undef CANNY_SHIFT
+#undef TG22
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Do hysteresis for pixels whose edge type is 1.
+//
+// If a candidate pixel (edge type 1) has a neighbour pixel (in its 3x3 area) of type 2, it is believed to be part of an edge
+// and is marked as an edge. Each work-item iterates 16 times to connect local edges inside its 16x16 block.
+// A pixel that ends up as an edge is then tested for nearby potential edge points (type 1). If there are any, counter is
+// incremented by 1 and the pixel location is stored in st. These potential candidates are processed further by the next kernel.
+//
+// map		raw edge type results calculated by calcMap; pixel (x, y) is stored at (x + 1, y + 1)
+// st		the potential edge points found in this kernel call, stored as (x + 1, y + 1)
+// counter	the number of potential edge points
+// map_step, map_offset are divided by the element size below, i.e. they are expected in bytes.
+//
+// NOTE(review): the propagation loop reads neighbouring smem cells that other work-items may be writing in the same
+// iteration and contains no barrier -- confirm that this ordering is acceptable.
+// NOTE(review): st is indexed with the value returned by atomic_inc without a capacity check -- confirm that the host
+// allocates st large enough.
+__kernel
+void edgesHysteresisLocal
+(
+	__global int * map,
+	__global ushort2 * st,
+	volatile __global unsigned int * counter,
+	int rows,
+	int cols,
+	int map_step,
+	int map_offset
+)
+{
+	map_step   /= sizeof(*map);
+	map_offset /= sizeof(*map);
+
+	// apply the offset once so that the tile load and the write-back address the same data
+	map += map_offset;
+
+	__local int smem[18][18];
+
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+
+	int lidx = get_local_id(0);
+	int lidy = get_local_id(1);
+
+	// global ids rounded down to a multiple of 16: top-left corner of the block
+	int grp_idx = get_global_id(0) & 0xFFFFF0;
+	int grp_idy = get_global_id(1) & 0xFFFFF0;
+
+	// cooperative load of the 18x18 tile: rows 0..13 in the first pass, rows 14..17 in the second
+	int tid = lidx + lidy * 16;
+	int lx = tid % 18;
+	int ly = tid / 18;
+	if(ly < 14)
+	{
+		smem[ly][lx] = map[grp_idx + lx + (grp_idy + ly) * map_step];
+	}
+	if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
+	{
+		smem[ly + 14][lx] = map[grp_idx + lx + (grp_idy + ly + 14) * map_step];
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(gidy < rows && gidx < cols)
+	{
+		int n;
+
+		// promote a candidate to an edge as soon as one of its 8 neighbours is an edge
+		#pragma unroll
+		for (int k = 0; k < 16; ++k)
+		{
+			n = 0;
+
+			if (smem[lidy + 1][lidx + 1] == 1)
+			{
+				n += smem[lidy    ][lidx    ] == 2;
+				n += smem[lidy    ][lidx + 1] == 2;
+				n += smem[lidy    ][lidx + 2] == 2;
+
+				n += smem[lidy + 1][lidx    ] == 2;
+				n += smem[lidy + 1][lidx + 2] == 2;
+
+				n += smem[lidy + 2][lidx    ] == 2;
+				n += smem[lidy + 2][lidx + 1] == 2;
+				n += smem[lidy + 2][lidx + 2] == 2;
+			}
+
+			if (n > 0)
+				smem[lidy + 1][lidx + 1] = 2;
+		}
+
+		const int e = smem[lidy + 1][lidx + 1];
+		map[gidx + 1 + (gidy + 1) * map_step] = e;
+
+		// an edge pixel with remaining candidates around it is queued for the global pass
+		n = 0;
+		if(e == 2)
+		{
+			n += smem[lidy    ][lidx    ] == 1;
+			n += smem[lidy    ][lidx + 1] == 1;
+			n += smem[lidy    ][lidx + 2] == 1;
+
+			n += smem[lidy + 1][lidx    ] == 1;
+			n += smem[lidy + 1][lidx + 2] == 1;
+
+			n += smem[lidy + 2][lidx    ] == 1;
+			n += smem[lidy + 2][lidx + 1] == 1;
+			n += smem[lidy + 2][lidx + 2] == 1;
+		}
+
+		if(n > 0)
+		{
+			unsigned int ind = atomic_inc(counter);
+			st[ind] = (ushort2)(gidx + 1, gidy + 1);
+		}
+	}
+}
+
+// x (c_dx) and y (c_dy) displacements of the eight neighbours of a pixel, row by row.
+__constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
+__constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
+
+// Global hysteresis pass over the candidate positions collected in st1.
+//
+// Each work-group picks one entry of st1 (selected by its group id). The first 8 work-items test the 8 neighbours of
+// that position; every neighbour of type 1 is promoted to type 2 and pushed on a local stack. The stack is then
+// drained in rounds: each popped position is expanded by 8 work-items in the same way. Whatever remains on the
+// stack when the loop stops is appended to st2 and counter is advanced accordingly.
+//
+// map		edge type map; positions in st1 / st2 are direct (x, y) indices into it
+// st1		input positions, count entries are valid
+// st2		output positions for the next pass
+// counter	number of positions written to st2
+// map_step, map_offset are divided by the element size below, i.e. they are expected in bytes.
+//
+// NOTE(review): the drain loop and the final copy use get_num_groups(0) for the stack headroom, the portion size and
+// the copy stride; get_local_size(0) may have been intended -- confirm against the host-side launch configuration.
+// NOTE(review): atomic_add returns the value before the addition, so "ind - s_counter" starts below the range
+// reserved by this group -- confirm that this is intended.
+#define stack_size 512
+__kernel
+void edgesHysteresisGlobal
+(
+	__global int * map,
+	__global ushort2 * st1,
+	__global ushort2 * st2,
+	volatile __global int * counter,
+	int rows,
+	int cols,
+	int count,
+	int map_step,
+	int map_offset
+)
+{
+	map_step   /= sizeof(*map);
+	map_offset /= sizeof(*map);
+
+	// apply the offset once so that every access below addresses the same data
+	map += map_offset;
+
+	int lidx = get_local_id(0);
+
+	int grp_idx = get_group_id(0);
+	int grp_idy = get_group_id(1);
+
+	volatile __local unsigned int s_counter;	// number of entries on the local stack
+	__local unsigned int s_ind;			// base index in st2 for this group
+
+	__local ushort2 s_st[stack_size];
+
+	if(lidx == 0)
+	{
+		s_counter = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// linear group index selects the st1 entry handled by this group
+	int ind = grp_idy * get_num_groups(0) + grp_idx;
+
+	if(ind < count)
+	{
+		ushort2 pos = st1[ind];
+		if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+		{
+			// seed the stack with the candidate neighbours of the start position
+			if (lidx < 8)
+			{
+				pos.x += c_dx[lidx];
+				pos.y += c_dy[lidx];
+
+				if (map[pos.x + pos.y * map_step] == 1)
+				{
+					map[pos.x + pos.y * map_step] = 2;
+
+					ind = atomic_inc(&s_counter);
+
+					s_st[ind] = pos;
+				}
+			}
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			while (s_counter > 0 && s_counter <= stack_size - get_num_groups(0))
+			{
+				// 8 consecutive work-items share one popped position
+				const int subTaskIdx = lidx >> 3;
+				// both operands cast to the same type so that the min overload is unambiguous
+				const int portion = min((unsigned int)s_counter, (unsigned int)(get_num_groups(0) >> 3));
+
+				pos.x = pos.y = 0;
+
+				if (subTaskIdx < portion)
+					pos = s_st[s_counter - 1 - subTaskIdx];
+				barrier(CLK_LOCAL_MEM_FENCE);
+
+				if (lidx == 0)
+					s_counter -= portion;
+				barrier(CLK_LOCAL_MEM_FENCE);
+
+				// pos stays (0, 0) for work-items without a task, which fails this test
+				if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+				{
+					pos.x += c_dx[lidx & 7];
+					pos.y += c_dy[lidx & 7];
+
+					if (map[pos.x + pos.y * map_step] == 1)
+					{
+						map[pos.x + pos.y * map_step] = 2;
+
+						ind = atomic_inc(&s_counter);
+
+						s_st[ind] = pos;
+					}
+				}
+				barrier(CLK_LOCAL_MEM_FENCE);
+			}
+
+			// hand the unprocessed positions over to the next pass
+			if (s_counter > 0)
+			{
+				if (lidx == 0)
+				{
+					ind = atomic_add(counter, s_counter);
+					s_ind = ind - s_counter;
+				}
+				barrier(CLK_LOCAL_MEM_FENCE);
+
+				ind = s_ind;
+
+				for (int i = lidx; i < s_counter; i += get_num_groups(0))
+				{
+					st2[ind + i] = s_st[i];
+				}
+			}
+		}
+	}
+}
+#undef stack_size
+
+// Get the edge result. Pixels whose edge type is 2 are marked as edge points and set to 255, all others to 0.
+// map		edge type mappings; pixel (x, y) is stored at (x + 1, y + 1)
+// dst		edge output, one uchar per pixel
+// map_step / map_offset are divided by the element size below, i.e. they are expected in bytes.
+// dst_step / dst_offset are used as they are because dst elements are one byte wide.
+__kernel
+void getEdges
+(
+	__global const int * map,
+	__global uchar * dst,
+	int rows,
+	int cols,
+	int map_step,
+	int map_offset,
+	int dst_step,
+	int dst_offset
+)
+{
+	map_step   /= sizeof(*map);
+	map_offset /= sizeof(*map);
+
+	const int gidx = get_global_id(0);
+	const int gidy = get_global_id(1);
+
+	if(gidy < rows && gidx < cols)
+	{
+		const int edge_type = map[map_offset + gidx + 1 + (gidy + 1) * map_step];
+
+		// edge types are 0, 1 or 2: the integer division yields 0, 0 or 1, the negation 0, 0 or -1,
+		// and the conversion to uchar turns -1 into 255
+		dst[dst_offset + gidx + gidy * dst_step] = (uchar)(-(edge_type / 2));
+	}
+}
--- a/modules/ocl/test/test_canny.cpp
+++ b/modules/ocl/test/test_canny.cpp
@ -0,0 +1,112 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+// Absolute path of the input image read by the accuracy test below. The paths are hard-coded
+// per platform and must point to an existing image on the machine that runs the test.
+#ifdef WIN32
+#define FILTER_IMAGE "C:/Users/Public/Pictures/Sample Pictures/Penguins.jpg"
+#else
+#define FILTER_IMAGE "/Users/Test/Valve_original.PNG" // user need to specify a valid image path
+#endif
+// Set to 1 to display the CPU and OpenCL edge maps side by side in a window.
+#define SHOW_RESULT 0
+
+////////////////////////////////////////////////////////
+// Canny
+
+// Parameter wrappers: aperture size and L2-gradient flag forwarded to Canny.
+IMPLEMENT_PARAM_CLASS(AppertureSize, int);
+IMPLEMENT_PARAM_CLASS(L2gradient, bool);
+
+// Fixture shared by the Canny tests: stores the test parameters and queries the OpenCL devices.
+PARAM_TEST_CASE(Canny, AppertureSize, L2gradient)
+{
+    int apperture_size;
+    bool useL2gradient;
+
+    cv::Mat edges_gold;
+    std::vector<cv::ocl::Info> oclinfo;
+
+    virtual void SetUp()
+    {
+        // getDevice fills oclinfo; its return value is asserted to be positive
+        const int device_count = getDevice(oclinfo);
+        CV_Assert(device_count > 0);
+
+        apperture_size = GET_PARAM(0);
+        useL2gradient = GET_PARAM(1);
+    }
+};
+
+// Runs cv::ocl::Canny and the CPU cv::Canny on the same 512x384 grayscale image and expects
+// the two edge maps to be similar within 1e-2.
+TEST_P(Canny, Accuracy)
+{
+    cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());
+
+    const double low_thresh = 50.0;
+    const double high_thresh = 100.0;
+
+    // both implementations work on the resized image
+    cv::resize(img, img, cv::Size(512, 384));
+    cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img);
+
+    cv::ocl::oclMat edges;
+    cv::ocl::Canny(ocl_img, edges, low_thresh, high_thresh, apperture_size, useL2gradient);
+
+    // CPU reference result
+    cv::Mat edges_gold;
+    cv::Canny(img, edges_gold, low_thresh, high_thresh, apperture_size, useL2gradient);
+
+#if SHOW_RESULT
+    // copy both results next to each other into one image and display it
+    cv::Mat edges_x2, ocl_edges(edges);
+    edges_x2.create(edges.rows, edges.cols * 2, edges.type());
+    edges_x2.setTo(0);
+    cv::add(edges_gold,cv::Mat(edges_x2,cv::Rect(0,0,edges_gold.cols,edges_gold.rows)), cv::Mat(edges_x2,cv::Rect(0,0,edges_gold.cols,edges_gold.rows)));
+    cv::add(ocl_edges,cv::Mat(edges_x2,cv::Rect(edges_gold.cols,0,edges_gold.cols,edges_gold.rows)), cv::Mat(edges_x2,cv::Rect(edges_gold.cols,0,edges_gold.cols,edges_gold.rows)));
+    cv::namedWindow("Canny result (left: cpu, right: ocl)");
+    cv::imshow("Canny result (left: cpu, right: ocl)", edges_x2);
+    cv::waitKey();
+#endif // SHOW_RESULT
+    EXPECT_MAT_SIMILAR(edges_gold, edges, 1e-2);
+}
+
+// Instantiate the Canny tests for aperture sizes 3 and 5, each with and without the L2 gradient.
+INSTANTIATE_TEST_CASE_P(ocl_ImgProc, Canny, testing::Combine(
+    testing::Values(AppertureSize(3), AppertureSize(5)),
+    testing::Values(L2gradient(false), L2gradient(true))));