Add blend, columnsum, pyrUp/down, matchTemplate for ocl module

13 years ago · 0db69fbe19
parent 064d022a4b
commit 0db69fbe19
16 changed files with 4085 additions and 1 deletions
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@ -859,6 +859,43 @@ namespace cv
        void benchmark_copy_offset_stride(const oclMat &src, oclMat &dst);
        void benchmark_ILP();
 		//! computes vertical sum, supports only CV_32FC1 images
 		CV_EXPORTS void columnSum(const oclMat& src, oclMat& sum);
 		//! performs linear blending of two images
 		//! to avoid accuracy errors sum of weigths shouldn't be very close to zero
 		// supports only CV_8UC1 source type
 		CV_EXPORTS void blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2, oclMat& result);
 		/////////////////////////////// Pyramid /////////////////////////////////////
 		CV_EXPORTS void pyrDown(const oclMat& src, oclMat& dst);
 		//! upsamples the source image and then smoothes it
 		CV_EXPORTS void pyrUp(const cv::ocl::oclMat& src,cv::ocl::oclMat& dst);
 		///////////////////////////////////////// match_template /////////////////////////////////////////////////////////////
 		struct CV_EXPORTS MatchTemplateBuf
 		{
 			Size user_block_size;
 			oclMat imagef, templf;
 			std::vector<oclMat> images;
 			std::vector<oclMat> image_sums;
 			std::vector<oclMat> image_sqsums;
 		};
 		//! computes the proximity map for the raster template and the image where the template is searched for
 		// Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
 		// Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
 		CV_EXPORTS void matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method);
 		//! computes the proximity map for the raster template and the image where the template is searched for
 		// Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
 		// Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
 		CV_EXPORTS void matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf);
    }
 }
 #include "opencv2/ocl/matrix_operations.hpp"
--- a/modules/ocl/src/blend.cpp
+++ b/modules/ocl/src/blend.cpp
@ -0,0 +1,98 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Nathan, liujun@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include "precomp.hpp"
 #include <iomanip>
 using namespace cv;
 using namespace cv::ocl;
 using namespace std;
 #if !defined (HAVE_OPENCL)
 void cv::ocl::blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2,
                            oclMat& result){throw_nogpu();}
 #else
 namespace cv 
 {
 	namespace ocl 
 	{
        ////////////////////////////////////OpenCL kernel strings//////////////////////////
        extern const char *blend_linear;
 	}
 }
 void cv::ocl::blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2,
                            oclMat& result)
 {
 	cv::ocl::Context *ctx = img1.clCxt;
 	assert(ctx == img2.clCxt && ctx == weights1.clCxt && ctx == weights2.clCxt);
 	int channels = img1.channels();
 	int depth = img1.depth();
 	int rows = img1.rows;
 	int cols = img1.cols;
 	int istep = img1.step;
 	int wstep = weights1.step;
 	size_t globalSize[] = {cols * channels, rows, 1};
 	size_t localSize[] = {16, 16, 1};
 	vector< pair<size_t, const void *> > args;
 	if(globalSize[0]!=0)
 	{
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data ));
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&img1.data ));
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&img2.data ));
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&weights1.data ));
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&weights2.data ));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&rows ));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&istep ));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&wstep ));
 		std::string kernelName = "BlendLinear";
 		openCLExecuteKernel(ctx, &blend_linear, kernelName, globalSize, localSize, args, channels, depth);
 	}
 }
 #endif
--- a/modules/ocl/src/columnsum.cpp
+++ b/modules/ocl/src/columnsum.cpp
@ -0,0 +1,91 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Chunpeng Zhang, chunpeng@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include <iomanip>
 #include "precomp.hpp"
 using namespace cv;
 using namespace cv::ocl;
 using namespace std;
 #if !defined(HAVE_OPENCL)
 void cv::ocl::columnSum(const oclMat& src,oclMat& dst){ throw_nogpu(); }
 #else /*!HAVE_OPENCL */
 namespace cv 
 { 
 	namespace ocl
 	{
 		extern const char* imgproc_columnsum;
 	}
 }
 void cv::ocl::columnSum(const oclMat& src,oclMat& dst)
 {
 	CV_Assert(src.type() == CV_32FC1 && dst.type() == CV_32FC1 && src.size() == dst.size());
 	Context *clCxt = src.clCxt;                                        
 	const std::string kernelName = "columnSum";
 	std::vector< pair<size_t, const void *> > args;
 	args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));		
 	args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));			
 	args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));		
 	args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));			
 	args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));		
 	args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step));		
 	size_t globalThreads[3] = {dst.cols, dst.rows, 1};					
 	size_t localThreads[3]  = {16, 16, 1};		
 	openCLExecuteKernel(clCxt, &imgproc_columnsum, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
 }
 #endif 
--- a/modules/ocl/src/kernels/blend_linear.cl
+++ b/modules/ocl/src/kernels/blend_linear.cl
@ -0,0 +1,196 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, MulticoreWare Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Liu Liujun, liujun@multicorewareinc.com 
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other GpuMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 __kernel void BlendLinear_C1_D0(
 	__global uchar *dst,
 	__global uchar *img1,
 	__global uchar *img2,
 	__global float *weight1,
 	__global float *weight2,
 	int rows,
 	int cols,
 	int istep,
 	int wstep
 	)
 {
 	int idx = get_global_id(0);
 	int idy = get_global_id(1);
 	if (idx < cols && idy < rows)
 	{
 		int pos = idy * istep + idx;
 		int wpos = idy * (wstep /sizeof(float)) + idx;
 		float w1 = weight1[wpos];
 		float w2 = weight2[wpos];
 		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
 	}
 }
 __kernel void BlendLinear_C3_D0(
 	__global uchar *dst,
 	__global uchar *img1,
 	__global uchar *img2,
 	__global float *weight1,
 	__global float *weight2,
 	int rows,
 	int cols,
 	int istep,
 	int wstep
 	)
 {
 	int idx = get_global_id(0);
 	int idy = get_global_id(1);
 	int x = idx / 3;
 	int y = idy;
 	if (x < cols && y < rows)
 	{
 		int pos = idy * istep + idx;
 		int wpos = idy * (wstep /sizeof(float)) + x;
 		float w1 = weight1[wpos];
 		float w2 = weight2[wpos];
 		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
 	}
 }
 __kernel void BlendLinear_C4_D0(
 	__global uchar *dst,
 	__global uchar *img1,
 	__global uchar *img2,
 	__global float *weight1,
 	__global float *weight2,
 	int rows,
 	int cols,
 	int istep,
 	int wstep
 	)
 {
 	int idx = get_global_id(0);
 	int idy = get_global_id(1);
 	int x = idx / 4;
 	int y = idy;
 	if (x < cols && y < rows)
 	{
 		int pos = idy * istep + idx;
 		int wpos = idy * (wstep /sizeof(float)) + x;
 		float w1 = weight1[wpos];
 		float w2 = weight2[wpos];
 		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
 	}
 }
 __kernel void BlendLinear_C1_D5(
 	__global float *dst,
 	__global float *img1,
 	__global float *img2,
 	__global float *weight1,
 	__global float *weight2,
 	int rows,
 	int cols,
 	int istep,
 	int wstep
 	)
 {
 	int idx = get_global_id(0);
 	int idy = get_global_id(1);
 	if (idx < cols && idy < rows)
 	{
 		int pos = idy * (istep / sizeof(float)) + idx;
 		int wpos = idy * (wstep /sizeof(float)) + idx;
 		float w1 = weight1[wpos];
 		float w2 = weight2[wpos];
 		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
 	}
 }
 __kernel void BlendLinear_C3_D5(
 	__global float *dst,
 	__global float *img1,
 	__global float *img2,
 	__global float *weight1,
 	__global float *weight2,
 	int rows,
 	int cols,
 	int istep,
 	int wstep
 	)
 {
 	int idx = get_global_id(0);
 	int idy = get_global_id(1);
 	int x = idx / 3;
 	int y = idy;
 	if (x < cols && y < rows)
 	{
 		int pos = idy * (istep / sizeof(float)) + idx;
 		int wpos = idy * (wstep /sizeof(float)) + x;
 		float w1 = weight1[wpos];
 		float w2 = weight2[wpos];
 		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
 	}
 }
 __kernel void BlendLinear_C4_D5(
 	__global float *dst,
 	__global float *img1,
 	__global float *img2,
 	__global float *weight1,
 	__global float *weight2,
 	int rows,
 	int cols,
 	int istep,
 	int wstep
 	)
 {
 	int idx = get_global_id(0);
 	int idy = get_global_id(1);
 	int x = idx / 4;
 	int y = idy;
 	if (x < cols && y < rows)
 	{
 		int pos = idy * (istep / sizeof(float)) + idx;
 		int wpos = idy * (wstep /sizeof(float)) + x;
 		float w1 = weight1[wpos];
 		float w2 = weight2[wpos];
 		dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
 	}
 }
--- a/modules/ocl/src/kernels/imgproc_columnsum.cl
+++ b/modules/ocl/src/kernels/imgproc_columnsum.cl
@ -0,0 +1,80 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Chunpeng Zhang chunpeng@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #if defined (__ATI__)
 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 #elif defined (__NVIDIA__)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
 ////////////////////////////////////////////////////////////////////
 ///////////////////////// columnSum ////////////////////////////////
 ////////////////////////////////////////////////////////////////////
 /// CV_32FC1
 __kernel void columnSum_C1_D5(__global float* src,__global float* dst,int srcCols,int srcRows,int srcStep,int dstStep)
 {
 	const int x = get_global_id(0);
 	srcStep >>= 2;
 	dstStep >>= 2;
 	if (x < srcCols)
    {
 		int srcIdx = x ;
 		int dstIdx = x ;
        float sum = 0;
        for (int y = 0; y < srcRows; ++y)
        {
 			sum += src[srcIdx];
            dst[dstIdx] = sum;
 			srcIdx += srcStep;
 			dstIdx += dstStep;	
        }
 	}
 }
--- a/modules/ocl/src/kernels/match_template.cl
+++ b/modules/ocl/src/kernels/match_template.cl
@ -0,0 +1,824 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Peng Xiao, pengxiao@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #if defined (__ATI__)
 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 #elif defined (__NVIDIA__)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
 #if !defined(USE_SQR_INTEGRAL) && (defined (__ATI__) || defined (__NVIDIA__))
 #define TYPE_IMAGE_SQSUM double
 #else
 #define TYPE_IMAGE_SQSUM ulong
 #endif
 //////////////////////////////////////////////////
 // utilities
 #define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx + img_sqsums_offset + ox)
 #define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox)
 // normAcc* are accurate normalization routines which make GPU matchTemplate
 // consistent with CPU one
 float normAcc(float num, float denum)
 {
 	if(fabs(num) < denum)
 	{
 		return num / denum;
 	}
 	if(fabs(num) < denum * 1.125f)
 	{
 		return num > 0 ? 1 : -1;
 	}
 	return 0;
 }
 float normAcc_SQDIFF(float num, float denum)
 {
 	if(fabs(num) < denum)
 	{
 		return num / denum;
 	}
 	if(fabs(num) < denum * 1.125f)
 	{
 		return num > 0 ? 1 : -1;
 	}
 	return 1;
 }
 //////////////////////////////////////////////////////////////////////
 // normalize
 __kernel 
 void normalizeKernel_C1_D0
 (
 	__global const TYPE_IMAGE_SQSUM * img_sqsums,
 	__global float * res,
 	ulong tpl_sqsum,
 	int res_rows,
 	int res_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int img_sqsums_offset,
 	int img_sqsums_step,
 	int res_offset,
 	int res_step
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	img_sqsums_step /= sizeof(*img_sqsums);
 	img_sqsums_offset /= sizeof(*img_sqsums);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		float image_sqsum_ = (float)(
 			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
 			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
 		res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum));
 	}
 }
 __kernel 
 void matchTemplate_Prepared_SQDIFF_C1_D0
 (
 	__global const TYPE_IMAGE_SQSUM * img_sqsums,
 	__global float * res,
 	ulong tpl_sqsum,
 	int res_rows,
 	int res_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int img_sqsums_offset,
 	int img_sqsums_step,
 	int res_offset,
 	int res_step
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	img_sqsums_step /= sizeof(*img_sqsums);
 	img_sqsums_offset /= sizeof(*img_sqsums);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		float image_sqsum_ = (float)(
 			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
 			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
 		res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum;
 	}
 }
 __kernel 
 void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
 (
 	__global const TYPE_IMAGE_SQSUM * img_sqsums,
 	__global float * res,
 	ulong tpl_sqsum,
 	int res_rows,
 	int res_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int img_sqsums_offset,
 	int img_sqsums_step,
 	int res_offset,
 	int res_step
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	img_sqsums_step /= sizeof(*img_sqsums);
 	img_sqsums_offset /= sizeof(*img_sqsums);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		float image_sqsum_ = (float)(
 			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
 			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
 		res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum,
 										sqrt(image_sqsum_ * tpl_sqsum));
 	}
 }
 //////////////////////////////////////////////////
 // SQDIFF
 __kernel 
 void matchTemplate_Naive_SQDIFF_C1_D0
 (
 	__global const uchar * img,
 	__global const uchar * tpl,
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int img_offset,
 	int tpl_offset,
 	int res_offset,
 	int img_step,
 	int tpl_step,
 	int res_step
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	int i,j;
 	int delta;
 	int sum = 0;
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		for(i = 0; i < tpl_rows; i ++)
 		{
 			// get specific rows of img data
 			__global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
 			__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
 			for(j = 0; j < tpl_cols; j ++)
 			{
 				delta = img_ptr[j] - tpl_ptr[j];
 				sum   = mad24(delta, delta, sum);
 			}
 		}
 		res[res_idx] = sum;
 	}
 }
 __kernel 
 void matchTemplate_Naive_SQDIFF_C1_D5
 (
 	__global const float * img,
 	__global const float * tpl,
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int img_offset,
 	int tpl_offset,
 	int res_offset,
 	int img_step,
 	int tpl_step,
 	int res_step
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	int i,j;
 	float delta;
 	float sum = 0;
 	img_step   /= sizeof(*img);
 	img_offset /= sizeof(*img);
 	tpl_step   /= sizeof(*tpl);
 	tpl_offset /= sizeof(*tpl);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		for(i = 0; i < tpl_rows; i ++)
 		{
 			// get specific rows of img data
 			__global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
 			__global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
 			for(j = 0; j < tpl_cols; j ++)
 			{
 				delta = img_ptr[j] - tpl_ptr[j];
 				sum   = mad(delta, delta, sum);
 			}
 		}
 		res[res_idx] = sum;
 	}
 }
 __kernel 
 void matchTemplate_Naive_SQDIFF_C4_D0
 (
 	__global const uchar4 * img,
 	__global const uchar4 * tpl,
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int img_offset,
 	int tpl_offset,
 	int res_offset,
 	int img_step,
 	int tpl_step,
 	int res_step
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	int i,j;
 	int4 delta;
 	int4 sum = (int4)(0, 0, 0, 0);
 	img_step   /= sizeof(*img);
 	img_offset /= sizeof(*img);
 	tpl_step   /= sizeof(*tpl);
 	tpl_offset /= sizeof(*tpl);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		for(i = 0; i < tpl_rows; i ++)
 		{
 			// get specific rows of img data
 			__global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
 			__global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
 			for(j = 0; j < tpl_cols; j ++)
 			{
 				//delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect
 				delta.x = img_ptr[j].x - tpl_ptr[j].x;
 				delta.y = img_ptr[j].y - tpl_ptr[j].y;
 				delta.z = img_ptr[j].z - tpl_ptr[j].z;
 				delta.w = img_ptr[j].w - tpl_ptr[j].w;
 				sum   = mad24(delta, delta, sum);
 			}
 		}
 		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
 	}
 }
 __kernel 
 void matchTemplate_Naive_SQDIFF_C4_D5
 (
 	__global const float4 * img,
 	__global const float4 * tpl,
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int img_offset,
 	int tpl_offset,
 	int res_offset,
 	int img_step,
 	int tpl_step,
 	int res_step
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	int i,j;
 	float4 delta;
 	float4 sum = (float4)(0, 0, 0, 0);
 	img_step   /= sizeof(*img);
 	img_offset /= sizeof(*img);
 	tpl_step   /= sizeof(*tpl);
 	tpl_offset /= sizeof(*tpl);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		for(i = 0; i < tpl_rows; i ++)
 		{
 			// get specific rows of img data
 			__global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
 			__global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
 			for(j = 0; j < tpl_cols; j ++)
 			{
 				//delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect
 				delta.x = img_ptr[j].x - tpl_ptr[j].x;
 				delta.y = img_ptr[j].y - tpl_ptr[j].y;
 				delta.z = img_ptr[j].z - tpl_ptr[j].z;
 				delta.w = img_ptr[j].w - tpl_ptr[j].w;
 				sum   = mad(delta, delta, sum);
 			}
 		}
 		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
 	}
 }
 //////////////////////////////////////////////////
 // CCORR
 __kernel 
 void matchTemplate_Naive_CCORR_C1_D0
 (
 	__global const uchar * img,
 	__global const uchar * tpl,
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int img_offset,
 	int tpl_offset,
 	int res_offset,
 	int img_step,
 	int tpl_step,
 	int res_step
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	int i,j;
 	int sum = 0;
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		for(i = 0; i < tpl_rows; i ++)
 		{
 			// get specific rows of img data
 			__global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
 			__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
 			for(j = 0; j < tpl_cols; j ++)
 			{
 				sum = mad24(img_ptr[j], tpl_ptr[j], sum);
 			}
 		}
 		res[res_idx] = sum;
 	}
 }
 __kernel 
 void matchTemplate_Naive_CCORR_C1_D5
 (
 	__global const float * img,
 	__global const float * tpl,
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int img_offset,
 	int tpl_offset,
 	int res_offset,
 	int img_step,
 	int tpl_step,
 	int res_step
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	int i,j;
 	float sum = 0;
 	img_step   /= sizeof(*img);
 	img_offset /= sizeof(*img);
 	tpl_step   /= sizeof(*tpl);
 	tpl_offset /= sizeof(*tpl);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		for(i = 0; i < tpl_rows; i ++)
 		{
 			// get specific rows of img data
 			__global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
 			__global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
 			for(j = 0; j < tpl_cols; j ++)
 			{
 				sum = mad(img_ptr[j], tpl_ptr[j], sum);
 			}
 		}
 		res[res_idx] = sum;
 	}
 }
 __kernel 
 void matchTemplate_Naive_CCORR_C4_D0
 (
 	__global const uchar4 * img,
 	__global const uchar4 * tpl,
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int img_offset,
 	int tpl_offset,
 	int res_offset,
 	int img_step,
 	int tpl_step,
 	int res_step
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	int i,j;
 	int4 sum = (int4)(0, 0, 0, 0);
 	img_step   /= sizeof(*img);
 	img_offset /= sizeof(*img);
 	tpl_step   /= sizeof(*tpl);
 	tpl_offset /= sizeof(*tpl);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		for(i = 0; i < tpl_rows; i ++)
 		{
 			// get specific rows of img data
 			__global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
 			__global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
 			for(j = 0; j < tpl_cols; j ++)
 			{
 				sum   = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum);
 			}
 		}
 		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
 	}
 }
 __kernel 
 void matchTemplate_Naive_CCORR_C4_D5
 (
 	__global const float4 * img,
 	__global const float4 * tpl,
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int img_offset,
 	int tpl_offset,
 	int res_offset,
 	int img_step,
 	int tpl_step,
 	int res_step
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	int i,j;
 	float4 sum = (float4)(0, 0, 0, 0);
 	img_step   /= sizeof(*img);
 	img_offset /= sizeof(*img);
 	tpl_step   /= sizeof(*tpl);
 	tpl_offset /= sizeof(*tpl);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		for(i = 0; i < tpl_rows; i ++)
 		{
 			// get specific rows of img data
 			__global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
 			__global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
 			for(j = 0; j < tpl_cols; j ++)
 			{
 				sum = mad(convert_float4(img_ptr[j]), convert_float4(tpl_ptr[j]), sum);
 			}
 		}
 		res[res_idx] = sum.x + sum.y + sum.z + sum.w;
 	}
 }
 //////////////////////////////////////////////////
 // CCOFF
 __kernel 
 void matchTemplate_Prepared_CCOFF_C1_D0
 (
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int res_offset,
 	int res_step,
 	__global const uint * img_sums,
 	int img_sums_offset,
 	int img_sums_step,
 	float tpl_sum
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	img_sums_offset   /= sizeof(*img_sums);
 	img_sums_step     /= sizeof(*img_sums);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		float sum = (float)(
 			(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
 		  - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
 		res[res_idx] -= sum * tpl_sum;
 	}
 }
 __kernel 
 void matchTemplate_Prepared_CCOFF_C4_D0
 (
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int res_offset,
 	int res_step,
 	__global const uint * img_sums_c0,
 	__global const uint * img_sums_c1,
 	__global const uint * img_sums_c2,
 	__global const uint * img_sums_c3,
 	int img_sums_offset,
 	int img_sums_step,
 	float tpl_sum_c0,
 	float tpl_sum_c1,
 	float tpl_sum_c2,
 	float tpl_sum_c3
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	img_sums_offset   /= sizeof(*img_sums_c0);
 	img_sums_step     /= sizeof(*img_sums_c0);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		float ccorr = res[res_idx];
 		ccorr -= tpl_sum_c0*(float)(
 			(img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
 		  - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
 		ccorr -= tpl_sum_c1*(float)(
 			(img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
 		  - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
 		ccorr -= tpl_sum_c2*(float)(
 			(img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
 		  - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
 		ccorr -= tpl_sum_c3*(float)(
 			(img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
 		  - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
 		res[res_idx] = ccorr;
 	}
 }
 __kernel
 void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
 (
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int res_offset,
 	int res_step,
 	float weight,
 	__global const uint * img_sums,
 	int img_sums_offset,
 	int img_sums_step,
 	__global const TYPE_IMAGE_SQSUM * img_sqsums,
 	int img_sqsums_offset,
 	int img_sqsums_step,
 	float tpl_sum,
 	float tpl_sqsum
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	img_sqsums_step   /= sizeof(*img_sqsums);
 	img_sqsums_offset /= sizeof(*img_sqsums);
 	img_sums_offset   /= sizeof(*img_sums);
 	img_sums_step     /= sizeof(*img_sums);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		float image_sum_ =  (float)(
 			(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
 		  - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
 		float image_sqsum_ = (float)(
 			(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
 			(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
 		res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum,
 							   sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_)));
 	}
 }
 __kernel
 void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
 (
 	__global float * res,
 	int img_rows,
 	int img_cols,
 	int tpl_rows,
 	int tpl_cols, 
 	int res_rows,
 	int res_cols,
 	int res_offset,
 	int res_step,
 	float weight,
 	__global const uint * img_sums_c0,
 	__global const uint * img_sums_c1,
 	__global const uint * img_sums_c2,
 	__global const uint * img_sums_c3,
 	int img_sums_offset,
 	int img_sums_step,
 	__global const TYPE_IMAGE_SQSUM * img_sqsums_c0,
 	__global const TYPE_IMAGE_SQSUM * img_sqsums_c1,
 	__global const TYPE_IMAGE_SQSUM * img_sqsums_c2,
 	__global const TYPE_IMAGE_SQSUM * img_sqsums_c3,
 	int img_sqsums_offset,
 	int img_sqsums_step,
 	float tpl_sum_c0,
 	float tpl_sum_c1,
 	float tpl_sum_c2,
 	float tpl_sum_c3,
 	float tpl_sqsum
 )
 {
 	int gidx = get_global_id(0);
 	int gidy = get_global_id(1);
 	img_sqsums_step   /= sizeof(*img_sqsums_c0);
 	img_sqsums_offset /= sizeof(*img_sqsums_c0);
 	img_sums_offset   /= sizeof(*img_sums_c0);
 	img_sums_step     /= sizeof(*img_sums_c0);
 	res_step   /= sizeof(*res);
 	res_offset /= sizeof(*res);
 	int res_idx = mad24(gidy, res_step, res_offset + gidx);
 	if(gidx < res_cols && gidy < res_rows)
 	{
 		float image_sum_c0 =  (float)(
 			(img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
 		  - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
 		float image_sum_c1 =  (float)(
 			(img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
 		  - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
 		float image_sum_c2 =  (float)(
 			(img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
 		  - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
 		float image_sum_c3 =  (float)(
 			(img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
 		  - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
 		float image_sqsum_c0 = (float)(
 			(img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
 			(img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
 		float image_sqsum_c1 = (float)(
 			(img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
 			(img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
 		float image_sqsum_c2 = (float)(
 			(img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
 			(img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
 		float image_sqsum_c3 = (float)(
 			(img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
 			(img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));
 		float num = res[res_idx] - 
 			image_sum_c0 * tpl_sum_c0 -
 			image_sum_c1 * tpl_sum_c1 -
 			image_sum_c2 * tpl_sum_c2 -
 			image_sum_c3 * tpl_sum_c3;
 		float denum = sqrt( tpl_sqsum * (
 			image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
 			image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
 			image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
 			image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3)
 			);
 		res[res_idx] = normAcc(num, denum);
 	}
 }
--- a/modules/ocl/src/kernels/pyr_down.cl
+++ b/modules/ocl/src/kernels/pyr_down.cl
@ -0,0 +1,500 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Dachuan Zhao, dachuan@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 uchar round_uchar_uchar(uchar v)
 { 
 	return v;
 }
 uchar round_uchar_int(int v)
 { 
    return (uchar)((uint)v <= 255 ? v : v > 0 ? 255 : 0); 
 }
 uchar round_uchar_float(float v)
 { 
 	if(v - convert_int_sat_rte(v) > 1e-6 || v - convert_int_sat_rte(v) < -1e-6)
 	{
 		if(((int)v + 1) - (v + 0.5f) < 1e-6 && ((int)v + 1) - (v + 0.5f) > -1e-6)
 		{
 			v = (int)v + 0.51f;
 		}
 	}
    int iv = convert_int_sat_rte(v);
    return round_uchar_int(iv); 
 }
 uchar4 round_uchar4_uchar4(uchar4 v)
 { 
 	return v;
 }
 uchar4 round_uchar4_int4(int4 v)
 { 
 	uchar4 result;
 	result.x = (uchar)(v.x <= 255 ? v.x : v.x > 0 ? 255 : 0); 
 	result.y = (uchar)(v.y <= 255 ? v.y : v.y > 0 ? 255 : 0); 
 	result.z = (uchar)(v.z <= 255 ? v.z : v.z > 0 ? 255 : 0); 
 	result.w = (uchar)(v.w <= 255 ? v.w : v.w > 0 ? 255 : 0); 
    return result; 
 }
 uchar4 round_uchar4_float4(float4 v)
 { 
 	if(v.x - convert_int_sat_rte(v.x) > 1e-6 || v.x - convert_int_sat_rte(v.x) < -1e-6)
 	{
 		if(((int)(v.x) + 1) - (v.x + 0.5f) < 1e-6 && ((int)(v.x) + 1) - (v.x + 0.5f) > -1e-6)
 		{
 			v.x = (int)(v.x) + 0.51f;
 		}
 	}
 	if(v.y - convert_int_sat_rte(v.y) > 1e-6 || v.y - convert_int_sat_rte(v.y) < -1e-6)
 	{
 		if(((int)(v.y) + 1) - (v.y + 0.5f) < 1e-6 && ((int)(v.y) + 1) - (v.y + 0.5f) > -1e-6)
 		{
 			v.y = (int)(v.y) + 0.51f;
 		}
 	}
 	if(v.z - convert_int_sat_rte(v.z) > 1e-6 || v.z - convert_int_sat_rte(v.z) < -1e-6)
 	{
 		if(((int)(v.z) + 1) - (v.z + 0.5f) < 1e-6 && ((int)(v.z) + 1) - (v.z + 0.5f) > -1e-6)
 		{
 			v.z = (int)(v.z) + 0.51f;
 		}
 	}
 	if(v.w - convert_int_sat_rte(v.w) > 1e-6 || v.w - convert_int_sat_rte(v.w) < -1e-6)
 	{
 		if(((int)(v.w) + 1) - (v.w + 0.5f) < 1e-6 && ((int)(v.w) + 1) - (v.w + 0.5f) > -1e-6)
 		{
 			v.w = (int)(v.w) + 0.51f;
 		}
 	}
    int4 iv = convert_int4_sat_rte(v);
    return round_uchar4_int4(iv); 
 }
 int idx_row_low(int y, int last_row)
 {
 	if(y < 0)
 	{
 		y = -y;
 	}
    return y % (last_row + 1);
 }
 int idx_row_high(int y, int last_row) 
 {
 	int i;
 	int j;
 	if(last_row - y < 0)
 	{
 		i = (y - last_row);
 	}
 	else
 	{
 		i = (last_row - y);
 	}
 	if(last_row - i < 0)
 	{
 		j = i - last_row;
 	}
 	else
 	{
 		j = last_row - i;
 	}
    return j % (last_row + 1);
 }
 int idx_row(int y, int last_row)
 {
    return idx_row_low(idx_row_high(y, last_row), last_row);
 }
 int idx_col_low(int x, int last_col)
 {
 	if(x < 0)
 	{
 		x = -x;
 	}
    return x % (last_col + 1);
 }
 int idx_col_high(int x, int last_col) 
 {
 	int i;
 	int j;
 	if(last_col - x < 0)
 	{
 		i = (x - last_col);
 	}
 	else
 	{
 		i = (last_col - x);
 	}
 	if(last_col - i < 0)
 	{
 		j = i - last_col;
 	}
 	else
 	{
 		j = last_col - i;
 	}
    return j % (last_col + 1);
 }
 int idx_col(int x, int last_col)
 {
    return idx_col_low(idx_col_high(x, last_col), last_col);
 }
 __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstOffset, int dstCols)
 {
    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
    const int y = get_group_id(1);
    __local float smem[256 + 4];
    float sum;
    const int src_y = 2*y;
    const int last_row = srcRows - 1;
    const int last_col = srcCols - 1;
    sum = 0;
    sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)]);
    sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)]);
    sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(x, last_col)]);
    sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)]);
    sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)]);
    smem[2 + get_local_id(0)] = sum;
    if (get_local_id(0) < 2)
    {
        const int left_x = x - 2;
        sum = 0;
        sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)]);
 		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)]);
 		sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(left_x, last_col)]);
 		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)]);
 		sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)]);
        smem[get_local_id(0)] = sum;
    }
    if (get_local_id(0) > 253)
    {
        const int right_x = x + 2;
        sum = 0;
        sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)]);
 		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)]);
 		sum = sum + 0.375f  * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(right_x, last_col)]);
 		sum = sum + 0.25f   * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)]);
 		sum = sum + 0.0625f * round_uchar_uchar(((__global uchar*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)]);
        smem[4 + get_local_id(0)] = sum;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_local_id(0) < 128)
    {
        const int tid2 = get_local_id(0) * 2;
        sum = 0;
        sum = sum + 0.0625f * smem[2 + tid2 - 2];
        sum = sum + 0.25f   * smem[2 + tid2 - 1];
        sum = sum + 0.375f  * smem[2 + tid2    ];
        sum = sum + 0.25f   * smem[2 + tid2 + 1];
        sum = sum + 0.0625f * smem[2 + tid2 + 2];
        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
        if (dst_x < dstCols)
            dst[y * dstStep + dst_x] = round_uchar_float(sum);
    }
 }
 __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstOffset, int dstCols)
 {
    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
    const int y = get_group_id(1);
    __local float4 smem[256 + 4];
    float4 sum;
    const int src_y = 2*y;
    const int last_row = srcRows - 1;
    const int last_col = srcCols - 1;
 	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
 	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
 	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
    sum = 0;
 	sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)]));
 	sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)]));
 	sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(x, last_col)]));
 	sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)]));
 	sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)]));
 	smem[2 + get_local_id(0)] = sum;
 	if (get_local_id(0) < 2)
 	{
 		const int left_x = x - 2;
 		sum = 0;
 		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
 		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
 		sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
 		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
 		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)]));
 		smem[get_local_id(0)] = sum;
 	}
 	if (get_local_id(0) > 253)
 	{
 		const int right_x = x + 2;
 		sum = 0;
 		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
 		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
 		sum = sum + co1  * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
 		sum = sum + co2   * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
 		sum = sum + co3 * convert_float4(round_uchar4_uchar4(((__global uchar4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)]));
 		smem[4 + get_local_id(0)] = sum;
 	}
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_local_id(0) < 128)
    {
        const int tid2 = get_local_id(0) * 2;
        sum = 0;
        sum = sum + co3 * smem[2 + tid2 - 2];
        sum = sum + co2   * smem[2 + tid2 - 1];
        sum = sum + co1  * smem[2 + tid2    ];
        sum = sum + co2   * smem[2 + tid2 + 1];
        sum = sum + co3 * smem[2 + tid2 + 2];
        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
        if (dst_x < dstCols)
            dst[y * dstStep / 4 + dst_x] = round_uchar4_float4(sum);
    }
 }
 __kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global float *dst, int dstStep, int dstOffset, int dstCols)
 {
    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
    const int y = get_group_id(1);
    __local float smem[256 + 4];
    float sum;
    const int src_y = 2*y;
    const int last_row = srcRows - 1;
    const int last_col = srcCols - 1;
    sum = 0;
    sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(x, last_col)];
    sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(x, last_col)];
    sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(x, last_col)];
    sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(x, last_col)];
    sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(x, last_col)];
    smem[2 + get_local_id(0)] = sum;
    if (get_local_id(0) < 2)
    {
        const int left_x = x - 2;
        sum = 0;
        sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(left_x, last_col)];
 		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(left_x, last_col)];
 		sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(left_x, last_col)];
 		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(left_x, last_col)];
 		sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(left_x, last_col)];
        smem[get_local_id(0)] = sum;
    }
    if (get_local_id(0) > 253)
    {
        const int right_x = x + 2;
        sum = 0;
        sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[idx_col(right_x, last_col)];
 		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[idx_col(right_x, last_col)];
 		sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[idx_col(right_x, last_col)];
 		sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[idx_col(right_x, last_col)];
 		sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[idx_col(right_x, last_col)];
        smem[4 + get_local_id(0)] = sum;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_local_id(0) < 128)
    {
        const int tid2 = get_local_id(0) * 2;
        sum = 0;
        sum = sum + 0.0625f * smem[2 + tid2 - 2];
        sum = sum + 0.25f   * smem[2 + tid2 - 1];
        sum = sum + 0.375f  * smem[2 + tid2    ];
        sum = sum + 0.25f   * smem[2 + tid2 + 1];
        sum = sum + 0.0625f * smem[2 + tid2 + 2];
        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
        if (dst_x < dstCols)
            dst[y * dstStep / 4 + dst_x] = sum;
    }
 }
 __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcOffset, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstOffset, int dstCols)
 {
    const int x = get_group_id(0) * get_local_size(0) + get_local_id(0);
    const int y = get_group_id(1);
    __local float4 smem[256 + 4];
    float4 sum;
    const int src_y = 2*y;
    const int last_row = srcRows - 1;
    const int last_col = srcCols - 1;
 	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
 	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
 	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
    sum = 0;
 	sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(x, last_col)];
 	sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(x, last_col)];
 	sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(x, last_col)];
 	sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(x, last_col)];
 	sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(x, last_col)];
 	smem[2 + get_local_id(0)] = sum;
 	if (get_local_id(0) < 2)
 	{
 		const int left_x = x - 2;
 		sum = 0;
 		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
 		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
 		sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(left_x, last_col)];
 		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
 		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(left_x, last_col)];
 		smem[get_local_id(0)] = sum;
 	}
 	if (get_local_id(0) > 253)
 	{
 		const int right_x = x + 2;
 		sum = 0;
 		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
 		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
 		sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[idx_col(right_x, last_col)];
 		sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
 		sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[idx_col(right_x, last_col)];
 		smem[4 + get_local_id(0)] = sum;
 	}
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_local_id(0) < 128)
    {
        const int tid2 = get_local_id(0) * 2;
        sum = 0;
        sum = sum + co3 * smem[2 + tid2 - 2];
        sum = sum + co2   * smem[2 + tid2 - 1];
        sum = sum + co1  * smem[2 + tid2    ];
        sum = sum + co2   * smem[2 + tid2 + 1];
        sum = sum + co3 * smem[2 + tid2 + 2];
        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
        if (dst_x < dstCols)
            dst[y * dstStep / 16 + dst_x] = sum;
    }
 }
--- a/modules/ocl/src/kernels/pyr_up.cl
+++ b/modules/ocl/src/kernels/pyr_up.cl
@ -0,0 +1,750 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Zhang Chunpeng	chunpeng@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 //#pragma OPENCL EXTENSION cl_amd_printf : enable
 uchar get_valid_uchar(uchar data)
 {
 	return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0);
 }
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////  CV_8UC1  //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
 				int srcRows,int dstRows,int srcCols,int dstCols,
 				int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
 	__local float s_srcPatch[10][10];
 	__local float s_dstPatch[20][16];
 	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
 	{
 		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
 		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
 		srcx = abs(srcx);
 		srcx = min(srcCols - 1,srcx);
 		srcy = abs(srcy);
 		srcy = min(srcRows -1 ,srcy);
 		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	float sum = 0;
 	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
 	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
 	const bool  eveny = ((get_local_id(1) & 1) == 0);
 	const int tidx = get_local_id(0);
 	if(eveny)
 	{
 		sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];	
  	}
 	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
 	if (get_local_id(1) < 2)
    {
 		sum = 0;
        if (eveny)
        {
 			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
         }
         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
     }
     if (get_local_id(1) > 13)
     {
 		sum = 0;
 		if (eveny)
 		{
 			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
 			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
 			sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
 			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
 			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
 		}
 		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
     }
 	barrier(CLK_LOCAL_MEM_FENCE);
 	sum = 0;
    const int tidy = get_local_id(1);
    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
    if ((x < dstCols) && (y < dstRows))
 		dst[x + y * dstStep] = (float)(4.0f * sum);
 }
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////  CV_16UC1  /////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
 				int srcRows,int dstRows,int srcCols,int dstCols,
 				int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
 	__local float s_srcPatch[10][10];
 	__local float s_dstPatch[20][16];
 	srcStep = srcStep >> 1;
 	dstStep = dstStep >> 1;
 	srcOffset = srcOffset >> 1;
 	dstOffset = dstOffset >> 1;
 	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
 	{
 		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
 		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
 		srcx = abs(srcx);
 		srcx = min(srcCols - 1,srcx);
 		srcy = abs(srcy);
 		srcy = min(srcRows -1 ,srcy);
 		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	float sum = 0;
 	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
 	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
 	const bool  eveny = ((get_local_id(1) & 1) == 0);
 	const int tidx = get_local_id(0);
 	if(eveny)
 	{
 		sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];	
  	}
 	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
 	if (get_local_id(1) < 2)
    {
 		sum = 0;
        if (eveny)
        {
 			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
         }
         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
     }
     if (get_local_id(1) > 13)
     {
 		sum = 0;
 		if (eveny)
 		{
 			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
 			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
 			sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
 			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
 			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
 		}
 		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
     }
 	barrier(CLK_LOCAL_MEM_FENCE);
 	sum = 0;
    const int tidy = get_local_id(1);
    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
    if ((x < dstCols) && (y < dstRows))
 		dst[x + y * dstStep] = (float)(4.0f * sum);
 }
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////  CV_32FC1  /////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
 				int srcRows,int dstRows,int srcCols,int dstCols,
 				int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
 	__local float s_srcPatch[10][10];
 	__local float s_dstPatch[20][16];
 	srcOffset = srcOffset >> 2;
 	dstOffset = dstOffset >> 2;
 	srcStep = srcStep >> 2;
 	dstStep = dstStep >> 2;
 	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
 	{
 		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
 		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
 		srcx = abs(srcx);
 		srcx = min(srcCols - 1,srcx);
 		srcy = abs(srcy);
 		srcy = min(srcRows -1 ,srcy);
 		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	float sum = 0;
 	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
 	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
 	const bool  eveny = ((get_local_id(1) & 1) == 0);
 	const int tidx = get_local_id(0);
 	if(eveny)
 	{
 		sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];	
  	}
 	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
 	if (get_local_id(1) < 2)
    {
 		sum = 0;
        if (eveny)
        {
 			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
         }
         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
     }
     if (get_local_id(1) > 13)
     {
 		sum = 0;
 		if (eveny)
 		{
 			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
 			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
 			sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
 			sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
 			sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
 		}
 		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
     }
 	barrier(CLK_LOCAL_MEM_FENCE);
 	sum = 0;
    const int tidy = get_local_id(1);
    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
    if ((x < dstCols) && (y < dstRows))
 		dst[x + y * dstStep] = (float)(4.0f * sum);
 }
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////  CV_8UC4  //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 float4 covert_uchar4_to_float4(uchar4 data)
 {
 	float4 f4Data = {0,0,0,0};
 	f4Data.x = (float)data.x;
 	f4Data.y = (float)data.y;
 	f4Data.z = (float)data.z;
 	f4Data.w = (float)data.w;
 	return f4Data;
 }
 uchar4 convert_float4_to_uchar4(float4 data)
 {
 	uchar4 u4Data;
 	u4Data.x = get_valid_uchar(data.x);
 	u4Data.y = get_valid_uchar(data.y);
 	u4Data.z = get_valid_uchar(data.z);
 	u4Data.w = get_valid_uchar(data.w);
 	return u4Data;
 }
 float4 int_x_float4(int leftOpr,float4 rightOpr)
 {
 	float4 result = {0,0,0,0};
 	result.x = rightOpr.x * leftOpr;
 	result.y = rightOpr.y * leftOpr;
 	result.z = rightOpr.z * leftOpr;
 	result.w = rightOpr.w * leftOpr;
 	return result;
 }
 float4 float4_x_float4(float4 leftOpr,float4 rightOpr)
 {
 	float4 result;
 	result.x = leftOpr.x * rightOpr.x;
 	result.y = leftOpr.y * rightOpr.y;
 	result.z = leftOpr.z * rightOpr.z;
 	result.w = leftOpr.w * rightOpr.w;
 	return result;
 }
 __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
 				int srcRows,int dstRows,int srcCols,int dstCols,
 				int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
 	__local float4 s_srcPatch[10][10];
 	__local float4 s_dstPatch[20][16];
 	srcOffset >>= 2;
 	dstOffset >>= 2;
 	srcStep >>= 2;
 	dstStep >>= 2;
 	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
 	{
 		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
 		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
 		srcx = abs(srcx);
 		srcx = min(srcCols - 1,srcx);
 		srcy = abs(srcy);
 		srcy = min(srcRows -1 ,srcy);
 		s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	float4 sum = (float4)(0,0,0,0);
 	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
 	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
 	const bool  eveny = ((get_local_id(1) & 1) == 0);
 	const int tidx = get_local_id(0);
 	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
 	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
 	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
 	if(eveny)
 	{
 		sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);	
  	}
 	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
 	if (get_local_id(1) < 2)
    {
 		sum = 0;
        if (eveny)
        {
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
         }
         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
     }
     if (get_local_id(1) > 13)
     {
 		sum = 0;
 		if (eveny)
 		{
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
 		}
 		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
     }
 	barrier(CLK_LOCAL_MEM_FENCE);
 	sum = 0;
    const int tidy = get_local_id(1);
    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
    if ((x < dstCols) && (y < dstRows))
    {
 		dst[x + y * dstStep] = convert_float4_to_uchar4(int_x_float4(4.0f,sum));
 	}
 } 
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////  CV_16UC4 //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 float4 covert_ushort4_to_float4(ushort4 data)
 {
 	float4 f4Data = {0,0,0,0};
 	f4Data.x = (float)data.x;
 	f4Data.y = (float)data.y;
 	f4Data.z = (float)data.z;
 	f4Data.w = (float)data.w;
 	return f4Data;
 }
 ushort4 convert_float4_to_ushort4(float4 data)
 {
 	ushort4 u4Data;
 	u4Data.x = (float)data.x;
 	u4Data.y = (float)data.y;
 	u4Data.z = (float)data.z;
 	u4Data.w = (float)data.w;
 	return u4Data;
 }
 __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
 				int srcRows,int dstRows,int srcCols,int dstCols,
 				int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
 	__local float4 s_srcPatch[10][10];
 	__local float4 s_dstPatch[20][16];
 	srcOffset >>= 3;
 	dstOffset >>= 3;
 	srcStep >>= 3;
 	dstStep >>= 3;
 	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
 	{
 		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
 		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
 		srcx = abs(srcx);
 		srcx = min(srcCols - 1,srcx);
 		srcy = abs(srcy);
 		srcy = min(srcRows -1 ,srcy);
 		s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_ushort4_to_float4(src[srcx + srcy * srcStep]);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	float4 sum = (float4)(0,0,0,0);
 	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
 	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
 	const bool  eveny = ((get_local_id(1) & 1) == 0);
 	const int tidx = get_local_id(0);
 	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
 	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
 	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
 	if(eveny)
 	{
 		sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);	
  	}
 	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
 	if (get_local_id(1) < 2)
    {
 		sum = 0;
        if (eveny)
        {
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
         }
         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
     }
     if (get_local_id(1) > 13)
     {
 		sum = 0;
 		if (eveny)
 		{
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
 		}
 		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
     }
 	barrier(CLK_LOCAL_MEM_FENCE);
 	sum = 0;
    const int tidy = get_local_id(1);
    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
    if ((x < dstCols) && (y < dstRows))
    {
 		dst[x + y * dstStep] = convert_float4_to_ushort4(int_x_float4(4.0f,sum));
 	}
 }
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////  CV_32FC4 //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
 				int srcRows,int dstRows,int srcCols,int dstCols,
 				int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
 	const int x = get_global_id(0);
 	const int y = get_global_id(1);
 	__local float4 s_srcPatch[10][10];
 	__local float4 s_dstPatch[20][16];
 	srcOffset >>= 4;
 	dstOffset >>= 4;
 	srcStep >>= 4;
 	dstStep >>= 4;
 	if( get_local_id(0) < 10 && get_local_id(1) < 10 )
 	{
 		int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
 		int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
 		srcx = abs(srcx);
 		srcx = min(srcCols - 1,srcx);
 		srcy = abs(srcy);
 		srcy = min(srcRows -1 ,srcy);
 		s_srcPatch[get_local_id(1)][get_local_id(0)] = (float4)(src[srcx + srcy * srcStep]);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	float4 sum = (float4)(0,0,0,0);
 	const int evenFlag = (int)((get_local_id(0) & 1) == 0);
 	const int oddFlag = (int)((get_local_id(0) & 1) != 0);
 	const bool  eveny = ((get_local_id(1) & 1) == 0);
 	const int tidx = get_local_id(0);
 	float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
 	float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
 	float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
 	if(eveny)
 	{
 		sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);	
  	}
 	s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
 	if (get_local_id(1) < 2)
    {
 		sum = 0;
        if (eveny)
        {
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
         }
         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
     }
     if (get_local_id(1) > 13)
     {
 		sum = 0;
 		if (eveny)
 		{
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
 			sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
 		}
 		s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
     }
 	barrier(CLK_LOCAL_MEM_FENCE);
 	sum = 0;
    const int tidy = get_local_id(1);
    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
    if ((x < dstCols) && (y < dstRows))
    {
 		dst[x + y * dstStep] = 4.0f * sum;
 	}
 }
--- a/modules/ocl/src/match_template.cpp
+++ b/modules/ocl/src/match_template.cpp
@ -0,0 +1,560 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Peng Xiao, pengxiao@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include <iomanip>
 #include "precomp.hpp"
 using namespace cv;
 using namespace cv::ocl;
 using namespace std;
 #define EXT_FP64 0
 #if !defined (HAVE_OPENCL)
 void cv::ocl::matchTemplate(const oclMat&, const oclMat&, oclMat&) { throw_nogpu(); }
 #else
 //helper routines
 namespace cv
 {
 	namespace ocl
 	{
 		///////////////////////////OpenCL kernel strings///////////////////////////
 		extern const char *match_template;
 	}
 }
 namespace cv { namespace ocl
 {
 	void matchTemplate_SQDIFF(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
 	void matchTemplate_SQDIFF_NORMED(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
 	void matchTemplate_CCORR(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
 	void matchTemplate_CCORR_NORMED(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
 	void matchTemplate_CCOFF(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
 	void matchTemplate_CCOFF_NORMED(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
 	void matchTemplateNaive_SQDIFF(
 		const oclMat& image, const oclMat& templ, oclMat& result, int cn);
 	void matchTemplateNaive_CCORR(
 		const oclMat& image, const oclMat& templ, oclMat& result, int cn);
 	// Evaluates optimal template's area threshold. If 
 	// template's area is less  than the threshold, we use naive match 
 	// template version, otherwise FFT-based (if available)
 	int getTemplateThreshold(int method, int depth)
 	{
 		switch (method)
 		{
 		case CV_TM_CCORR: 
 			if (depth == CV_32F) return 250;
 			if (depth == CV_8U) return 300;
 			break;
 		case CV_TM_SQDIFF:
 			if (depth == CV_32F) return MAXSHORT; // do naive SQDIFF for CV_32F
 			if (depth == CV_8U) return 300;
 			break;
 		}
 		CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode");
 		return 0;
 	}
 	//////////////////////////////////////////////////////////////////////
 	// SQDIFF
 	void matchTemplate_SQDIFF(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 	{
 		result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 		if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
 		{
 			matchTemplateNaive_SQDIFF(image, templ, result, image.channels());
 			return;
 		}
 		else
 		{
 			// TODO
 			CV_Error(CV_StsBadArg, "Not supported yet for this size template");
 		}
 	}
 	void matchTemplate_SQDIFF_NORMED(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 	{
 		matchTemplate_CCORR(image,templ,result,buf);
 		buf.image_sums.resize(1);
 		buf.image_sqsums.resize(1);
 		integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
 #if EXT_FP64 && SQRSUM_FIXED
 		unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 #else
 		Mat sqr_mat = templ.reshape(1);
 		unsigned long long templ_sqsum = (unsigned long long)sum(sqr_mat.mul(sqr_mat))[0];
 #endif
 		Context *clCxt = image.clCxt;
 		string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
 		vector< pair<size_t, const void *> > args;
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 		args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 		size_t globalThreads[3] = {result.cols, result.rows, 1};
 		size_t localThreads[3]  = {32, 8, 1};
 		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 	}
 	void matchTemplateNaive_SQDIFF(
 		const oclMat& image, const oclMat& templ, oclMat& result, int cn)
 	{
 		CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 			|| (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F);
 		CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
 		CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 		Context *clCxt = image.clCxt;
 		string kernelName = "matchTemplate_Naive_SQDIFF";
 		vector< pair<size_t, const void *> > args;
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 		size_t globalThreads[3] = {result.cols, result.rows, 1};
 		size_t localThreads[3]  = {32, 8, 1};
 		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
 	}
 	//////////////////////////////////////////////////////////////////////
 	// CCORR
 	void matchTemplate_CCORR(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 	{
 		result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 		if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
 		{
 			matchTemplateNaive_CCORR(image, templ, result, image.channels());
 			return;
 		}
 		else
 		{
 			CV_Error(CV_StsBadArg, "Not supported yet for this size template");
 			if(image.depth() == CV_8U && templ.depth() == CV_8U)
 			{
 				image.convertTo(buf.imagef, CV_32F);
 				templ.convertTo(buf.templf, CV_32F);
 			}
 			CV_Assert(image.channels() == 1);
 			oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.channels()));
 			filter2D(buf.imagef,o_result,CV_32F,buf.templf, Point(0,0));
 			result = o_result(Rect(0,0,image.rows - templ.rows + 1, image.cols - templ.cols + 1));
 		}
 	}
 	void matchTemplate_CCORR_NORMED(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 	{
 		matchTemplate_CCORR(image,templ,result,buf);
 		buf.image_sums.resize(1);
 		buf.image_sqsums.resize(1);
 		integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
 #if EXT_FP64 && SQRSUM_FIXED
 		unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 #elif EXT_FP64
 		oclMat templ_c1 = templ.reshape(1);
 		multiply(templ_c1, templ_c1, templ_c1);
 		unsigned long long templ_sqsum = (unsigned long long)sum(templ_c1)[0];
 #else
 		Mat m_templ_c1 = templ.reshape(1);
 		multiply(m_templ_c1, m_templ_c1, m_templ_c1);
 		unsigned long long templ_sqsum = (unsigned long long)sum(m_templ_c1)[0];
 #endif
 		Context *clCxt = image.clCxt;
 		string kernelName = "normalizeKernel";
 		vector< pair<size_t, const void *> > args;
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 		args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 		size_t globalThreads[3] = {result.cols, result.rows, 1};
 		size_t localThreads[3]  = {32, 8, 1};
 		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 	}
 	void matchTemplateNaive_CCORR(
 		const oclMat& image, const oclMat& templ, oclMat& result, int cn)
 	{
 		CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 			|| (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F);
 		CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
 		CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 		Context *clCxt = image.clCxt;
 		string kernelName = "matchTemplate_Naive_CCORR";
 		vector< pair<size_t, const void *> > args;
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 		size_t globalThreads[3] = {result.cols, result.rows, 1};
 		size_t localThreads[3]  = {32, 8, 1};
 		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
 	}
 	//////////////////////////////////////////////////////////////////////
 	// CCOFF
 	void matchTemplate_CCOFF(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 	{
 		CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
 		matchTemplate_CCORR(image,templ,result,buf);
 		Context *clCxt = image.clCxt;
 		string kernelName;
 		kernelName = "matchTemplate_Prepared_CCOFF";
 		size_t globalThreads[3] = {result.cols, result.rows, 1};
 		size_t localThreads[3]  = {32, 8, 1};
 		vector< pair<size_t, const void *> > args;
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); 
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 		// to be continued in the following section
 		if(image.channels() == 1)
 		{
 			buf.image_sums.resize(1);
 			// FIXME: temp fix for incorrect integral kernel
 			oclMat tmp_oclmat;
 			integral(image, buf.image_sums[0], tmp_oclmat);
 			float templ_sum = 0;
 #if EXT_FP64
 			templ_sum = (float)sum(templ)[0] / templ.size().area();
 #else
 			Mat o_templ = templ;
 			templ_sum = (float)sum(o_templ)[0] / o_templ.size().area(); // temp fix for non-double supported machine
 #endif
 			args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 			args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
 		}
 		else
 		{
 			Vec4f templ_sum = Vec4f::all(0);
 #if EXT_FP64
 			split(image,buf.images);
 			templ_sum = sum(templ) / templ.size().area();
 #else 
 			// temp fix for non-double supported machine
 			Mat o_templ = templ, o_image = image;
 			vector<Mat> o_mat_vector;
 			o_mat_vector.resize(image.channels());
 			buf.images.resize(image.channels());
 			split(o_image, o_mat_vector);
 			for(int i = 0; i < o_mat_vector.size(); i ++)
 			{
 				buf.images[i] = oclMat(o_mat_vector[i]);
 			}
 			templ_sum = sum(o_templ) / templ.size().area();
 #endif
 			buf.image_sums.resize(buf.images.size());
 			for(int i = 0; i < image.channels(); i ++)
 			{
 				// FIXME: temp fix for incorrect integral kernel
 				oclMat omat_temp;
 				integral(buf.images[i], buf.image_sums[i], omat_temp);
 			}
 			switch(image.channels())
 			{
 			case 4:
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
 				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
 				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
 				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
 				break;
 			default:
 				CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
 				break;
 			}
 		}
 		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
 	}
 	void matchTemplate_CCOFF_NORMED(
 		const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 	{
 		image.convertTo(buf.imagef, CV_32F);
 		templ.convertTo(buf.templf, CV_32F);
 		matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
 		float scale = 1.f/templ.size().area();
 		Context *clCxt = image.clCxt;
 		string kernelName;
 		kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
 		size_t globalThreads[3] = {result.cols, result.rows, 1};
 		size_t localThreads[3]  = {32, 8, 1};
 		vector< pair<size_t, const void *> > args;
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); 
 		args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 		args.push_back( make_pair( sizeof(cl_float),(void *)&scale) );
 		// to be continued in the following section
 		if(image.channels() == 1)
 		{
 			buf.image_sums.resize(1);
 			buf.image_sqsums.resize(1);
 			integral(image, buf.image_sums[0], buf.image_sqsums[0]);
 			float templ_sum = 0;
 			float templ_sqsum = 0;
 #if EXT_FP64
 			templ_sum   = (float)sum(templ)[0];
 #if SQRSUM_FIXED
 			templ_sqsum = sqrSum(templ);
 #else
 			oclMat templ_sqr = templ;
 			multiply(templ,templ, templ_sqr);
 			templ_sqsum  = sum(templ_sqr)[0];
 #endif //SQRSUM_FIXED
 			templ_sqsum -= scale * templ_sum * templ_sum;
 			templ_sum   *= scale;
 #else
 			// temp fix for non-double supported machine
 			Mat o_templ = templ;
 			templ_sum   = (float)sum(o_templ)[0]; 
 			templ_sqsum = sum(o_templ.mul(o_templ))[0] - scale * templ_sum * templ_sum;
 			templ_sum  *= scale;
 #endif
 			args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 			args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 			args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 			args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
 			args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum) );
 		}
 		else
 		{
 			Vec4f templ_sum   = Vec4f::all(0);
 			Vec4f templ_sqsum = Vec4f::all(0);
 #if EXT_FP64
 			split(image,buf.images);
 			templ_sum   = sum(templ);
 #if SQRSUM_FIXED
 			templ_sqsum = sqrSum(templ);
 #else
 			oclMat templ_sqr = templ;
 			multiply(templ,templ, templ_sqr);
 			templ_sqsum  = sum(templ_sqr);
 #endif //SQRSUM_FIXED
 			templ_sqsum -= scale * templ_sum * templ_sum;
 #else 
 			// temp fix for non-double supported machine
 			Mat o_templ = templ, o_image = image;
 			vector<Mat> o_mat_vector;
 			o_mat_vector.resize(image.channels());
 			buf.images.resize(image.channels());
 			split(o_image, o_mat_vector);
 			for(int i = 0; i < o_mat_vector.size(); i ++)
 			{
 				buf.images[i] = oclMat(o_mat_vector[i]);
 			}
 			templ_sum    = sum(o_templ);
 			templ_sqsum  = sum(o_templ.mul(o_templ));
 #endif
 			float templ_sqsum_sum = 0;
 			for(int i = 0; i < image.channels(); i ++)
 			{
 				templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
 			}
 			templ_sum   *= scale;
 			buf.image_sums.resize(buf.images.size());
 			buf.image_sqsums.resize(buf.images.size());
 			for(int i = 0; i < image.channels(); i ++)
 			{
 				integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
 			}
 			switch(image.channels())
 			{
 			case 4:
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
 				args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
 				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 				args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
 				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
 				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
 				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
 				args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum_sum) );
 				break;
 			default:
 				CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
 				break;
 			}
 		}
 		openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
 	}
 }/*ocl*/} /*cv*/
 void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method)
 {
 	MatchTemplateBuf buf;
 	matchTemplate(image,templ, result, method,buf);
 }
 void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf)
 {
 	CV_Assert(image.type() == templ.type());
 	CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
 	typedef void (*Caller)(const oclMat&, const oclMat&, oclMat&, MatchTemplateBuf&);
 	const Caller callers[] = { 
 		::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED, 
 		::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED, 
 		::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
 	};
 	Caller caller = callers[method];
 	CV_Assert(caller);
 	caller(image, templ, result, buf);
 }
 #endif //
--- a/modules/ocl/src/pyrdown.cpp
+++ b/modules/ocl/src/pyrdown.cpp
@ -0,0 +1,115 @@
 #include "precomp.hpp"
 using namespace cv;
 using namespace cv::ocl;
 using namespace std;
 using std::cout;
 using std::endl;
 namespace cv
 {
    namespace ocl
    {
        ///////////////////////////OpenCL kernel strings///////////////////////////
        extern const char *pyr_down;
    }
 }
 //////////////////////////////////////////////////////////////////////////////
 /////////////////////// add subtract multiply divide /////////////////////////
 //////////////////////////////////////////////////////////////////////////////
 template<typename T>
 void pyrdown_run(const oclMat &src, const oclMat &dst)
 {
    CV_Assert(src.cols / 2 == dst.cols && src.rows / 2 == dst.rows);
    CV_Assert(src.type() == dst.type());
    CV_Assert(src.depth() != CV_8S);
    Context  *clCxt = src.clCxt;
    //int channels = dst.channels();
    //int depth = dst.depth();
    string kernelName = "pyrDown";
    //int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
    //    {4, 0, 4, 4, 1, 1, 1},
    //    {4, 0, 4, 4, 1, 1, 1},
    //    {4, 0, 4, 4, 1, 1, 1}
    //};
    //size_t vector_length = vector_lengths[channels-1][depth];
    //int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
    size_t localThreads[3]  = { 256, 1, 1 };
    size_t globalThreads[3] = { src.cols, dst.rows, 1};
    //int dst_step1 = dst.cols * dst.elemSize();
    vector<pair<size_t , const void *> > args;
    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
    openCLExecuteKernel(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
 }
 void pyrdown_run(const oclMat &src, const oclMat &dst)
 {
 	switch(src.depth())
 	{
 	case 0:
 	    pyrdown_run<unsigned char>(src, dst);
 		break;
 	case 1:
 	    pyrdown_run<char>(src, dst);
 		break;
 	case 2:
 	    pyrdown_run<unsigned short>(src, dst);
 		break;
 	case 3:
 	    pyrdown_run<short>(src, dst);
 		break;
 	case 4:
 	    pyrdown_run<int>(src, dst);
 		break;
 	case 5:
 	    pyrdown_run<float>(src, dst);
 		break;
 	case 6:
 	    pyrdown_run<double>(src, dst);
 		break;
 	default:
 		break;
 	}
 }
 //////////////////////////////////////////////////////////////////////////////
 // pyrDown
 void cv::ocl::pyrDown(const oclMat& src, oclMat& dst)
 {
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
 	//src.step = src.rows;
    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
 	//dst.step = dst.rows;
    pyrdown_run(src, dst);
 }
--- a/modules/ocl/src/pyrup.cpp
+++ b/modules/ocl/src/pyrup.cpp
@ -0,0 +1,88 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //		Zhang Chunpeng chunpeng@multicorewareinc.com
 //    
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 /* Haar features calculation */
 //#define EMU
 #include "precomp.hpp"
 using namespace cv;
 using namespace cv::ocl;
 using namespace std;
 #ifndef HAVE_OPENCL
 void cv::ocl::pyrUp(const oclMat&, GpuMat&, oclMat&) { throw_nogpu(); }
 #else
 namespace cv { namespace ocl 
 { 
 	extern const char *pyr_up;
 	void pyrUp(const cv::ocl::oclMat& src,cv::ocl::oclMat& dst)
 	{
 		dst.create(src.rows * 2, src.cols * 2, src.type());
 		Context *clCxt = src.clCxt;
 		const std::string kernelName = "pyrUp";
 		std::vector< pair<size_t, const void *> > args;
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
 		args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));
 		args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step));
 		size_t globalThreads[3] = {dst.cols, dst.rows, 1};
 		size_t localThreads[3]  = {16, 16, 1};
 		openCLExecuteKernel(clCxt, &pyr_up, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
 	}
 }};
 #endif // HAVE_OPENCL
--- a/modules/ocl/test/test_blend.cpp
+++ b/modules/ocl/test/test_blend.cpp
@ -0,0 +1,82 @@
 #include "precomp.hpp"
 #include <iomanip>
 using namespace cv;
 using namespace cv::ocl;
 using namespace cvtest;
 using namespace testing;
 using namespace std;
 template <typename T>
 void blendLinearGold(const cv::Mat& img1, const cv::Mat& img2, const cv::Mat& weights1, const cv::Mat& weights2, cv::Mat& result_gold)
 {
    result_gold.create(img1.size(), img1.type());
    int cn = img1.channels();
    for (int y = 0; y < img1.rows; ++y)
    {
        const float* weights1_row = weights1.ptr<float>(y);
        const float* weights2_row = weights2.ptr<float>(y);
        const T* img1_row = img1.ptr<T>(y);
        const T* img2_row = img2.ptr<T>(y);
        T* result_gold_row = result_gold.ptr<T>(y);
        for (int x = 0; x < img1.cols * cn; ++x)
        {
            float w1 = weights1_row[x / cn];
            float w2 = weights2_row[x / cn];
            result_gold_row[x] = static_cast<T>((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f));
        }
    }
 }
 PARAM_TEST_CASE(Blend, cv::Size, MatType/*, UseRoi*/)
 {
    //cv::gpu::DeviceInfo devInfo;
    cv::Size size;
    int type;
    bool useRoi;
    virtual void SetUp()
    {
        //devInfo = GET_PARAM(0);
        size = GET_PARAM(0);
        type = GET_PARAM(1);
        /*useRoi = GET_PARAM(3);*/
        //cv::gpu::setDevice(devInfo.deviceID());
    }
 };
 TEST_P(Blend, Accuracy)
 {
    int depth = CV_MAT_DEPTH(type);
    cv::Mat img1 = randomMat(size, type, 0.0, depth == CV_8U ? 255.0 : 1.0);
    cv::Mat img2 = randomMat(size, type, 0.0, depth == CV_8U ? 255.0 : 1.0);
    cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
    cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
 	cv::ocl::oclMat gimg1(size, type), gimg2(size, type), gweights1(size, CV_32F), gweights2(size, CV_32F);
 	cv::ocl::oclMat dst(size, type);
 	gimg1.upload(img1);
 	gimg2.upload(img2);
 	gweights1.upload(weights1);
 	gweights2.upload(weights2);
 	cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, dst);
 	cv::Mat result;
    cv::Mat result_gold;
 	dst.download(result);
    if (depth == CV_8U)
        blendLinearGold<uchar>(img1, img2, weights1, weights2, result_gold);
    else
        blendLinearGold<float>(img1, img2, weights1, weights2, result_gold);
    EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1 : 1e-5f, NULL)
 }
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine(
 	DIFFERENT_SIZES,
 	testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4))
 ));
--- a/modules/ocl/test/test_columnsum.cpp
+++ b/modules/ocl/test/test_columnsum.cpp
@ -0,0 +1,105 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //	   Chunpeng Zhang chunpeng@multicorewareinc.com
 //    
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include "precomp.hpp"
 #include <iomanip>
 ///////////////////////////////////////////////////////////////////////////////
 /// ColumnSum
 #ifdef HAVE_OPENCL
 ////////////////////////////////////////////////////////////////////////
 // ColumnSum
 PARAM_TEST_CASE(ColumnSum, cv::Size, bool )
 {
    cv::Size size;
    cv::Mat src;
 	bool useRoi;
    virtual void SetUp()
    {
        size = GET_PARAM(0);
 		useRoi = GET_PARAM(1);
    }
 };
 TEST_P(ColumnSum, Accuracy)
 {
    cv::Mat src = randomMat(size, CV_32FC1);
 	//cv::Mat src(size,CV_32FC1);
 	//cv::ocl::oclMat d_dst = ::createMat(size,src.type(),useRoi);
 	cv::ocl::oclMat d_dst = loadMat(src,useRoi);
    cv::ocl::columnSum(loadMat(src,useRoi),d_dst);
    cv::Mat dst(d_dst);
    for (int j = 0; j < src.cols; ++j)
    {
        float gold = src.at<float>(0, j);
        float res = dst.at<float>(0, j);
        ASSERT_NEAR(res, gold, 1e-5);
    }
    for (int i = 1; i < src.rows; ++i)
    {
        for (int j = 0; j < src.cols; ++j)
        {
            float gold = src.at<float>(i, j) += src.at<float>(i - 1, j);
            float res = dst.at<float>(i, j);
            ASSERT_NEAR(res, gold, 1e-5);
        }
    }
 }
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ColumnSum, testing::Combine(
 						DIFFERENT_SIZES,testing::Values(Inverse(false),Inverse(true))));
 #endif 
--- a/modules/ocl/test/test_match_template.cpp
+++ b/modules/ocl/test/test_match_template.cpp
@ -0,0 +1,166 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Peng Xiao, pengxiao@multicorewareinc.com
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include "precomp.hpp"
 #define PERF_TEST 0
 ////////////////////////////////////////////////////////////////////////////////
 // MatchTemplate
 #define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF_NORMED))
 IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
 const char* TEMPLATE_METHOD_NAMES[6] = {"TM_SQDIFF", "TM_SQDIFF_NORMED", "TM_CCORR", "TM_CCORR_NORMED", "TM_CCOEFF", "TM_CCOEFF_NORMED"};
 PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMethod)
 {
    cv::Size size;
    cv::Size templ_size;
    int cn;
    int method;
    virtual void SetUp()
    {
        size = GET_PARAM(0);
        templ_size = GET_PARAM(1);
        cn = GET_PARAM(2);
        method = GET_PARAM(3);
    }
 };
 TEST_P(MatchTemplate8U, Accuracy)
 {
 	std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
 	std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl;
 	std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl;
 	std::cout << "Channels: " << cn << std::endl;
 	cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
    cv::ocl::oclMat dst, ocl_image(image), ocl_templ(templ);
 	cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
    cv::Mat dst_gold;
    cv::matchTemplate(image, templ, dst_gold, method);
 	char sss [100] = "";
 	cv::Mat mat_dst;
 	dst.download(mat_dst);
    EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss);
 #if PERF_TEST
 	{
 		P_TEST_FULL({}, {cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);}, {});
 		P_TEST_FULL({}, {cv::matchTemplate(image, templ, dst_gold, method);}, {});
 	}
 #endif // PERF_TEST
 }
 PARAM_TEST_CASE(MatchTemplate32F, cv::Size, TemplateSize, Channels, TemplateMethod)
 {
    cv::Size size;
    cv::Size templ_size;
    int cn;
    int method;
    virtual void SetUp()
    {
        size = GET_PARAM(0);
        templ_size = GET_PARAM(1);
        cn = GET_PARAM(2);
        method = GET_PARAM(3);
    }
 };
 TEST_P(MatchTemplate32F, Accuracy)
 {
    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));
    cv::ocl::oclMat dst, ocl_image(image), ocl_templ(templ);
 	cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
    cv::Mat dst_gold;
    cv::matchTemplate(image, templ, dst_gold, method);
 	char sss [100] = "";
 	cv::Mat mat_dst;
 	dst.download(mat_dst);
    EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss);
 #if PERF_TEST
 	{
 		std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
 		std::cout << "Image Size: (" << size.width << ", " << size.height << ")"<< std::endl;
 		std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")"<< std::endl;
 		std::cout << "Channels: " << cn << std::endl;
 		P_TEST_FULL({}, {cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);}, {});
 		P_TEST_FULL({}, {cv::matchTemplate(image, templ, dst_gold, method);}, {});
 	}
 #endif // PERF_TEST
 }
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, 
 	testing::Combine(
    DIFFERENT_SIZES,
    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
    testing::Values(Channels(1), Channels(4)),
 	ALL_TEMPLATE_METHODS
 	)
 );
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
    DIFFERENT_SIZES,
    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
    testing::Values(Channels(1), Channels(4)),
    testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
--- a/modules/ocl/test/test_pyrdown.cpp
+++ b/modules/ocl/test/test_pyrdown.cpp
@ -0,0 +1,295 @@
 ///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Dachuan Zhao, dachuan@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 //#define PRINT_CPU_TIME 1000
 //#define PRINT_TIME
 #include "precomp.hpp"
 #include <iomanip>
 #ifdef HAVE_OPENCL
 using namespace cv;
 using namespace cv::ocl;
 using namespace cvtest;
 using namespace testing;
 using namespace std;
 PARAM_TEST_CASE(PyrDown, MatType, bool)
 {
    int type;
    cv::Scalar val;
    //src mat
    cv::Mat mat1;
    cv::Mat mat2;
    cv::Mat mask;
    cv::Mat dst;
    cv::Mat dst1; //bak, for two outputs
    // set up roi
    int roicols;
    int roirows;
    int src1x;
    int src1y;
    int src2x;
    int src2y;
    int dstx;
    int dsty;
    int maskx;
    int masky;
    //src mat with roi
    cv::Mat mat1_roi;
    cv::Mat mat2_roi;
    cv::Mat mask_roi;
    cv::Mat dst_roi;
    cv::Mat dst1_roi; //bak
    std::vector<cv::ocl::Info> oclinfo;
    //ocl dst mat for testing
    cv::ocl::oclMat gdst_whole;
    cv::ocl::oclMat gdst1_whole; //bak
    //ocl mat with roi
    cv::ocl::oclMat gmat1;
    cv::ocl::oclMat gmat2;
    cv::ocl::oclMat gdst;
    cv::ocl::oclMat gdst1;   //bak
    cv::ocl::oclMat gmask;
    virtual void SetUp()
    {
        type = GET_PARAM(0);
        cv::RNG &rng = TS::ptr()->get_rng();
        cv::Size size(MWIDTH, MHEIGHT);
        mat1 = randomMat(rng, size, type, 5, 16, false);
        mat2 = randomMat(rng, size, type, 5, 16, false);
        dst  = randomMat(rng, size, type, 5, 16, false);
        dst1  = randomMat(rng, size, type, 5, 16, false);
        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
        //int devnums = getDevice(oclinfo);
        //CV_Assert(devnums > 0);
        //if you want to use undefault device, set it here
        //setDevice(oclinfo[0]);
    }
 	void Cleanup()
 	{
 		mat1.release();
 		mat2.release();
 		mask.release();
 		dst.release();
 		dst1.release();
 		mat1_roi.release();
 		mat2_roi.release();
 		mask_roi.release();
 		dst_roi.release();
 		dst1_roi.release();
 		gdst_whole.release();
 		gdst1_whole.release();
 		gmat1.release();
 		gmat2.release();
 		gdst.release();
 		gdst1.release();
 		gmask.release();
 	}
    void random_roi()
    {
        cv::RNG &rng = TS::ptr()->get_rng();
 #ifdef RANDOMROI
        //randomize ROI
        roicols = rng.uniform(1, mat1.cols);
        roirows = rng.uniform(1, mat1.rows);
        src1x   = rng.uniform(0, mat1.cols - roicols);
        src1y   = rng.uniform(0, mat1.rows - roirows);
        dstx    = rng.uniform(0, dst.cols  - roicols);
        dsty    = rng.uniform(0, dst.rows  - roirows);
 #else
        roicols = mat1.cols;
        roirows = mat1.rows;
        src1x = 0;
        src1y = 0;
        dstx = 0;
        dsty = 0;
 #endif
        maskx   = rng.uniform(0, mask.cols - roicols);
        masky   = rng.uniform(0, mask.rows - roirows);
        src2x   = rng.uniform(0, mat2.cols - roicols);
        src2y   = rng.uniform(0, mat2.rows - roirows);
        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
        gdst_whole = dst;
        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
        gdst1_whole = dst1;
        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
        gmat1 = mat1_roi;
        gmat2 = mat2_roi;
        gmask = mask_roi; //end
    }
 };
 #define VARNAME(A) string(#A);
 void PrePrint()
 {
 		//for(int i = 0; i < MHEIGHT; i++)
 		//{
 		//	printf("(%d) ", i);
 		//	for(int k = 0; k < MWIDTH; k++)
 		//	{
 		//		printf("%d ", mat1_roi.data[i * MHEIGHT + k]);
 		//	}
 		//	printf("\n");
 		//}
 }
 void PostPrint()
 {
 		//dst_roi.convertTo(dst_roi,CV_32S);
 		//cpu_dst.convertTo(cpu_dst,CV_32S);
 		//dst_roi -= cpu_dst;
 		//cpu_dst -= dst_roi;
 		//for(int i = 0; i < MHEIGHT / 2; i++)
 		//{
 		//	printf("(%d) ", i);
 		//	for(int k = 0; k < MWIDTH / 2; k++)
 		//	{
 		//		if(gmat1.depth() == 0)
 		//		{
 		//			if(gmat1.channels() == 1)
 		//			{
 		//				printf("%d ", dst_roi.data[i * MHEIGHT / 2 + k]);
 		//			}
 		//			else
 		//			{
 		//				printf("%d ", ((unsigned*)dst_roi.data)[i * MHEIGHT / 2 + k]);
 		//			}
 		//		}
 		//		else if(gmat1.depth() == 5)
 		//		{
 		//			printf("%.6f ", ((float*)dst_roi.data)[i * MHEIGHT / 2 + k]);
 		//		}
 		//	}
 		//	printf("\n");
 		//}
 		//for(int i = 0; i < MHEIGHT / 2; i++)
 		//{
 		//	printf("(%d) ", i);
 		//	for(int k = 0; k < MWIDTH / 2; k++)
 		//	{
 		//		if(gmat1.depth() == 0)
 		//		{
 		//			if(gmat1.channels() == 1)
 		//			{
 		//				printf("%d ", cpu_dst.data[i * MHEIGHT / 2 + k]);
 		//			}
 		//			else
 		//			{
 		//				printf("%d ", ((unsigned*)cpu_dst.data)[i * MHEIGHT / 2 + k]);
 		//			}
 		//		}
 		//		else if(gmat1.depth() == 5)
 		//		{
 		//			printf("%.6f ", ((float*)cpu_dst.data)[i * MHEIGHT / 2 + k]);
 		//		}
 		//	}
 		//	printf("\n");
 		//}
 }
 ////////////////////////////////PyrDown/////////////////////////////////////////////////
 //struct PyrDown : ArithmTestBase {};
 TEST_P(PyrDown, Mat)
 {
    for(int j = 0; j < LOOP_TIMES; j++)
    {
        random_roi();
 		cv::pyrDown(mat1_roi, dst_roi);
 		cv::ocl::pyrDown(gmat1, gdst);
        cv::Mat cpu_dst;
        gdst.download(cpu_dst);
        char s[1024];
        sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
 		EXPECT_MAT_NEAR(dst_roi, cpu_dst, dst_roi.depth() == CV_32F ? 1e-5f : 1.0f, s);
 		Cleanup();
    }
 }
 //********test****************
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine(
                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
                            Values(false))); // Values(false) is the reserved parameter
 #endif // HAVE_OPENCL
--- a/modules/ocl/test/test_pyrup.cpp
+++ b/modules/ocl/test/test_pyrup.cpp
@ -0,0 +1,97 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Zhang Chunpeng chunpeng@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other oclMaterials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include "precomp.hpp"
 #include "opencv2/core/core.hpp"
 #ifdef HAVE_OPENCL
 PARAM_TEST_CASE(PyrUp,cv::Size,int)
 {
 	cv::Size size;
 	int type;
 	virtual void SetUp()
 	{
 		size = GET_PARAM(0);
 		type = GET_PARAM(1);
 	}
 };
 TEST_P(PyrUp,Accuracy)
 {
 	cv::Mat src = randomMat(size,type);
 	cv::Mat dst_gold;
 	cv::pyrUp(src,dst_gold);
 	cv::ocl::oclMat dst;
 	cv::ocl::oclMat srcMat(src);
 	cv::ocl::pyrUp(srcMat,dst);
 	char s[100]={0};
 	EXPECT_MAT_NEAR(dst_gold, dst, (src.depth() == CV_32F ? 1e-4f : 1.0),s);	
 }
 #if 1
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
    testing::Values(cv::Size(32, 32)),
    testing::Values(MatType(CV_8UC1),MatType(CV_16UC1),MatType(CV_32FC1),MatType(CV_8UC4),
 	MatType(CV_16UC4),MatType(CV_32FC4))));
 #endif
 #endif // HAVE_OPENCL