diff --git a/modules/ocl/doc/image_filtering.rst b/modules/ocl/doc/image_filtering.rst index 92a6c575f4..147ebc3da4 100644 --- a/modules/ocl/doc/image_filtering.rst +++ b/modules/ocl/doc/image_filtering.rst @@ -287,7 +287,7 @@ ocl::createSeparableLinearFilter_GPU ---------------------------------------- Creates a separable linear filter engine. -.. ocv:function:: Ptr ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT) +.. ocv:function:: Ptr ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param srcType: Source array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported. @@ -303,6 +303,8 @@ Creates a separable linear filter engine. :param bordertype: Pixel extrapolation method. + :param imgSize: Source image size to choose optimal method for processing. + .. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`ocl::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter` @@ -334,7 +336,7 @@ ocl::createDerivFilter_GPU ------------------------------ Creates a filter engine for the generalized Sobel operator. -.. ocv:function:: Ptr ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT ) +.. ocv:function:: Ptr ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param srcType: Source image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported. @@ -348,6 +350,8 @@ Creates a filter engine for the generalized Sobel operator. :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. + :param imgSize: Source image size to choose optimal method for processing. + .. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter` @@ -405,7 +409,7 @@ ocl::createGaussianFilter_GPU --------------------------------- Creates a Gaussian filter engine. -.. ocv:function:: Ptr ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT) +.. ocv:function:: Ptr ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param type: Source and destination image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported. @@ -417,6 +421,8 @@ Creates a Gaussian filter engine. :param bordertype: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. + :param imgSize: Source image size to choose optimal method for processing. + .. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter` ocl::GaussianBlur diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index af42136303..d144a042e8 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -706,17 +706,17 @@ namespace cv //! returns the separable linear filter engine CV_EXPORTS Ptr createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, - const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT); + const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1)); //! returns the separable filter engine with the specified filters CV_EXPORTS Ptr createSeparableFilter_GPU(const Ptr &rowFilter, const Ptr &columnFilter); //! returns the Gaussian filter engine - CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT); + CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1)); //! returns filter engine for the generalized Sobel operator - CV_EXPORTS Ptr createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT ); + CV_EXPORTS Ptr createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ); //! applies Laplacian operator to the image // supports only ksize = 1 and ksize = 3 @@ -869,7 +869,6 @@ namespace cv CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT); CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT); - /////////////////////////////////// ML /////////////////////////////////////////// //! Compute closest centers for each lines in source and lable it after center's index diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 4f9802cb71..35aa226de6 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -739,6 +739,135 @@ void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &ke f->apply(src, dst); } +const int optimizedSepFilterLocalSize = 16; +static void sepFilter2D_SinglePass(const oclMat &src, oclMat &dst, + const Mat &row_kernel, const Mat &col_kernel, int bordertype = BORDER_DEFAULT) +{ + size_t lt2[3] = {optimizedSepFilterLocalSize, optimizedSepFilterLocalSize, 1}; + size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1}; + + unsigned int src_pitch = src.step; + unsigned int dst_pitch = dst.step; + + int src_offset_x = (src.offset % src.step) / src.elemSize(); + int src_offset_y = src.offset / src.step; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y )); + + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.offset )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholecols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholerows )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows )); + + string option = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d",(int)lt2[0], (int)lt2[1], + row_kernel.rows / 2, col_kernel.rows / 2 ); + + option += " -D KERNEL_MATRIX_X="; + for(int i=0; i( &row_kernel.at(i) ) ); + option += "0x0"; + + option += " -D KERNEL_MATRIX_Y="; + for(int i=0; i( &col_kernel.at(i) ) ); + option += "0x0"; + + switch(src.type()) + { + case CV_8UC1: + option += " -D SRCTYPE=uchar -D CONVERT_SRCTYPE=convert_float -D WORKTYPE=float"; + break; + case CV_32FC1: + option += " -D SRCTYPE=float -D CONVERT_SRCTYPE= -D WORKTYPE=float"; + break; + case CV_8UC2: + option += " -D SRCTYPE=uchar2 -D CONVERT_SRCTYPE=convert_float2 -D WORKTYPE=float2"; + break; + case CV_32FC2: + option += " -D SRCTYPE=float2 -D CONVERT_SRCTYPE= -D WORKTYPE=float2"; + break; + case CV_8UC3: + option += " -D SRCTYPE=uchar3 -D CONVERT_SRCTYPE=convert_float3 -D WORKTYPE=float3"; + break; + case CV_32FC3: + option += " -D SRCTYPE=float3 -D CONVERT_SRCTYPE= -D WORKTYPE=float3"; + break; + case CV_8UC4: + option += " -D SRCTYPE=uchar4 -D CONVERT_SRCTYPE=convert_float4 -D WORKTYPE=float4"; + break; + case CV_32FC4: + option += " -D SRCTYPE=float4 -D CONVERT_SRCTYPE= -D WORKTYPE=float4"; + break; + default: + CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!"); + break; + } + switch(dst.type()) + { + case CV_8UC1: + option += " -D DSTTYPE=uchar -D CONVERT_DSTTYPE=convert_uchar_sat"; + break; + case CV_8UC2: + option += " -D DSTTYPE=uchar2 -D CONVERT_DSTTYPE=convert_uchar2_sat"; + break; + case CV_8UC3: + option += " -D DSTTYPE=uchar3 -D CONVERT_DSTTYPE=convert_uchar3_sat"; + break; + case CV_8UC4: + option += " -D DSTTYPE=uchar4 -D CONVERT_DSTTYPE=convert_uchar4_sat"; + break; + case CV_32FC1: + option += " -D DSTTYPE=float -D CONVERT_DSTTYPE="; + break; + case CV_32FC2: + option += " -D DSTTYPE=float2 -D CONVERT_DSTTYPE="; + break; + case CV_32FC3: + option += " -D DSTTYPE=float3 -D CONVERT_DSTTYPE="; + break; + case CV_32FC4: + option += " -D DSTTYPE=float4 -D CONVERT_DSTTYPE="; + break; + default: + CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!"); + break; + } + switch(bordertype) + { + case cv::BORDER_CONSTANT: + option += " -D BORDER_CONSTANT"; + break; + case cv::BORDER_REPLICATE: + option += " -D BORDER_REPLICATE"; + break; + case cv::BORDER_REFLECT: + option += " -D BORDER_REFLECT"; + break; + case cv::BORDER_REFLECT101: + option += " -D BORDER_REFLECT_101"; + break; + case cv::BORDER_WRAP: + option += " -D BORDER_WRAP"; + break; + default: + CV_Error(CV_StsBadFlag, "BORDER type is not supported!"); + break; + } + + openCLExecuteKernel(src.clCxt, &filtering_sep_filter_singlepass, "sep_filter_singlepass", gt2, lt2, args, + -1, -1, option.c_str() ); +} + //////////////////////////////////////////////////////////////////////////////////////////////////// // SeparableFilter @@ -788,6 +917,35 @@ Ptr cv::ocl::createSeparableFilter_GPU(const Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter)); } +namespace +{ +class SingleStepSeparableFilterEngine_GPU : public FilterEngine_GPU +{ +public: + SingleStepSeparableFilterEngine_GPU( const Mat &rowKernel_, const Mat &columnKernel_, const int btype ) + { + bordertype = btype; + rowKernel = rowKernel_; + columnKernel = columnKernel_; + } + + virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) + { + normalizeROI(roi, Size(rowKernel.rows, columnKernel.rows), Point(-1,-1), src.size()); + + oclMat srcROI = src(roi); + oclMat dstROI = dst(roi); + + sepFilter2D_SinglePass(src, dst, rowKernel, columnKernel, bordertype); + } + + Mat rowKernel; + Mat columnKernel; + int bordertype; +}; +} + + static void GPUFilterBox(const oclMat &src, oclMat &dst, Size &ksize, const Point anchor, const int borderType) { @@ -1241,17 +1399,32 @@ Ptr cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in } Ptr cv::ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, - const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype) + const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype, Size imgSize ) { int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(srcType); int bdepth = std::max(std::max(sdepth, ddepth), CV_32F); int bufType = CV_MAKETYPE(bdepth, cn); + Context* clCxt = Context::getContext(); + + //if image size is non-degenerate and large enough + //and if filter support is reasonable to satisfy larger local memory requirements, + //then we can use single pass routine to avoid extra runtime calls overhead + if( clCxt && clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && + rowKernel.rows <= 21 && columnKernel.rows <= 21 && + (rowKernel.rows & 1) == 1 && (columnKernel.rows & 1) == 1 && + imgSize.width > optimizedSepFilterLocalSize + (rowKernel.rows>>1) && + imgSize.height > optimizedSepFilterLocalSize + (columnKernel.rows>>1) ) + { + return Ptr(new SingleStepSeparableFilterEngine_GPU(rowKernel, columnKernel, bordertype)); + } + else + { + Ptr rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype); + Ptr columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta); - Ptr rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype); - Ptr columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta); - - return createSeparableFilter_GPU(rowFilter, columnFilter); + return createSeparableFilter_GPU(rowFilter, columnFilter); + } } void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype) @@ -1275,16 +1448,16 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); - Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype); + Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype, src.size()); f->apply(src, dst); } -Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType) +Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType, Size imgSize ) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); return createSeparableLinearFilter_GPU(srcType, dstType, - kx, ky, Point(-1, -1), 0, borderType); + kx, ky, Point(-1, -1), 0, borderType, imgSize); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1354,7 +1527,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d //////////////////////////////////////////////////////////////////////////////////////////////////// // Gaussian Filter -Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype) +Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype, Size imgSize) { int depth = CV_MAT_DEPTH(type); @@ -1381,7 +1554,7 @@ Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do else ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F)); - return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype); + return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype, imgSize); } void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype) @@ -1417,7 +1590,7 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si dst.create(src.size(), src.type()); - Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype); + Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype, src.size()); f->apply(src, dst); } diff --git a/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl b/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl new file mode 100644 index 0000000000..c6555bff0f --- /dev/null +++ b/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl @@ -0,0 +1,185 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2013, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////Macro for border type//////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define EXTRAPOLATE(x, maxV) +#elif defined BORDER_REPLICATE +//aaaaaa|abcdefgh|hhhhhhh +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = max(min((x), (maxV) - 1), 0); \ + } +#elif defined BORDER_WRAP +//cdefgh|abcdefgh|abcdefg +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = ( (x) + (maxV) ) % (maxV); \ + } +#elif defined BORDER_REFLECT +//fedcba|abcdefgh|hgfedcb +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \ + } +#elif defined BORDER_REFLECT_101 +//gfedcb|abcdefgh|gfedcba +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \ + } +#else +#error No extrapolation method +#endif + +#define SRC(_x,_y) CONVERT_SRCTYPE(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x]) + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y)) +#else +#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) +#endif + +#define DST(_x,_y) (((global DSTTYPE*)(Dst+DstOffset+(_y)*DstPitch))[_x]) + +//horizontal and vertical filter kernels +//should be defined on host during compile time to avoid overhead +__constant uint mat_kernelX[] = {KERNEL_MATRIX_X}; +__constant uint mat_kernelY[] = {KERNEL_MATRIX_Y}; + +__kernel __attribute__((reqd_work_group_size(BLK_X,BLK_Y,1))) void sep_filter_singlepass + ( + __global uchar* Src, + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* Dst, + const int DstOffset, + const uint DstPitch, + int width, + int height, + int dstWidth, + int dstHeight + ) +{ + //RADIUSX, RADIUSY are filter dimensions + //BLK_X, BLK_Y are local wrogroup sizes + //all these should be defined on host during compile time + //first lsmem array for source pixels used in first pass, + //second lsmemDy for storing first pass results + __local WORKTYPE lsmem[BLK_Y+2*RADIUSY][BLK_X+2*RADIUSX]; + __local WORKTYPE lsmemDy[BLK_Y][BLK_X+2*RADIUSX]; + + //get local and global ids - used as image and local memory array indexes + int lix = get_local_id(0); + int liy = get_local_id(1); + + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + + //calculate pixel position in source image taking image offset into account + int srcX = x + srcOffsetX - RADIUSX; + int srcY = y + srcOffsetY - RADIUSY; + int xb = srcX; + int yb = srcY; + + //extrapolate coordinates, if needed + //and read my own source pixel into local memory + //with account for extra border pixels, which will be read by starting workitems + int clocY = liy; + int cSrcY = srcY; + do + { + int yb = cSrcY; + EXTRAPOLATE(yb, (height)); + + int clocX = lix; + int cSrcX = srcX; + do + { + int xb = cSrcX; + EXTRAPOLATE(xb,(width)); + lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 ); + + clocX += BLK_X; + cSrcX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + + clocY += BLK_Y; + cSrcY += BLK_Y; + } + while(clocY < BLK_Y+(RADIUSY*2)); + barrier(CLK_LOCAL_MEM_FENCE); + + //do vertical filter pass + //and store intermediate results to second local memory array + int i; + WORKTYPE sum = 0.0f; + int clocX = lix; + do + { + sum = 0.0f; + for(i=0; i<=2*RADIUSY; i++) + sum = mad(lsmem[liy+i][clocX], as_float(mat_kernelY[i]), sum); + lsmemDy[liy][clocX] = sum; + clocX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + barrier(CLK_LOCAL_MEM_FENCE); + + //if this pixel happened to be out of image borders because of global size rounding, + //then just return + if( x >= dstWidth || y >=dstHeight ) return; + + //do second horizontal filter pass + //and calculate final result + sum = 0.0f; + for(i=0; i<=2*RADIUSX; i++) + sum = mad(lsmemDy[liy][lix+i], as_float(mat_kernelX[i]), sum); + + //store result into destination image + DST(x,y) = CONVERT_DSTTYPE(sum); +}