From d17142b83d67501a03a2ec2149ab0c397b6dff14 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Fri, 28 Mar 2014 21:46:03 +0400 Subject: [PATCH 1/6] Prototype OCL version of gaussian blur with integer arithmetic --- .../imgproc/src/opencl/gaussian_blur_8u.cl | 189 ++++++++++++++++++ modules/imgproc/src/smooth.cpp | 77 +++++++ modules/imgproc/test/ocl/test_filters.cpp | 18 +- 3 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 modules/imgproc/src/opencl/gaussian_blur_8u.cl diff --git a/modules/imgproc/src/opencl/gaussian_blur_8u.cl b/modules/imgproc/src/opencl/gaussian_blur_8u.cl new file mode 100644 index 0000000000..268d8b7c1b --- /dev/null +++ b/modules/imgproc/src/opencl/gaussian_blur_8u.cl @@ -0,0 +1,189 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2014, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////Macro for border type//////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef BORDER_CONSTANT +// CCCCCC|abcdefgh|CCCCCCC +#define EXTRAPOLATE(x, maxV) +#elif defined BORDER_REPLICATE +// aaaaaa|abcdefgh|hhhhhhh +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = max(min((x), (maxV) - 1), 0); \ + } +#elif defined BORDER_WRAP +// cdefgh|abcdefgh|abcdefg +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = ( (x) + (maxV) ) % (maxV); \ + } +#elif defined BORDER_REFLECT +// fedcba|abcdefgh|hgfedcb +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \ + } +#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101 +// gfedcb|abcdefgh|gfedcba +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \ + } +#else +#error No extrapolation method +#endif + +#if CN != 3 +#define loadpix(addr) *(__global const srcT *)(addr) +#define storepix(val, addr) *(__global dstT *)(addr) = val +#define SRCSIZE (int)sizeof(srcT) +#define DSTSIZE (int)sizeof(dstT) +#else +#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) +#define SRCSIZE (int)sizeof(srcT1)*3 +#define DSTSIZE (int)sizeof(dstT1)*3 +#endif + +#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x))) + +#ifdef BORDER_CONSTANT +// CCCCCC|abcdefgh|CCCCCCC +#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y)) +#else +#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) +#endif + +#define noconvert + +// horizontal and vertical filter kernels +// should be defined on host during compile time to avoid overhead +#define DIG(a) a, +__constant int mat_kernelX[] = { KERNEL_MATRIX_X }; +__constant int mat_kernelY[] = { KERNEL_MATRIX_Y }; + +__kernel void gaussian_blur_8u(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width, + __global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) +{ + // RADIUSX, RADIUSY are filter dimensions + // BLK_X, BLK_Y are local wrogroup sizes + // all these should be defined on host during compile time + // first lsmem array for source pixels used in first pass, + // second lsmemDy for storing first pass results + __local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX]; + __local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX]; + + // get local and global ids - used as image and local memory array indexes + int lix = get_local_id(0); + int liy = get_local_id(1); + + int x = get_global_id(0); + int y = get_global_id(1); + + // calculate pixel position in source image taking image offset into account + int srcX = x + srcOffsetX - RADIUSX; + int srcY = y + srcOffsetY - RADIUSY; + int xb = srcX; + int yb = srcY; + + // extrapolate coordinates, if needed + // and read my own source pixel into local memory + // with account for extra border pixels, which will be read by starting workitems + int clocY = liy; + int cSrcY = srcY; + do + { + int yb = cSrcY; + EXTRAPOLATE(yb, (height)); + + int clocX = lix; + int cSrcX = srcX; + do + { + int xb = cSrcX; + EXTRAPOLATE(xb,(width)); + lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 ); + + clocX += BLK_X; + cSrcX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + + clocY += BLK_Y; + cSrcY += BLK_Y; + } + while (clocY < BLK_Y+(RADIUSY*2)); + barrier(CLK_LOCAL_MEM_FENCE); + + // do vertical filter pass + // and store intermediate results to second local memory array + int i, clocX = lix; + WT sum = 0; + do + { + sum = 0; + for (i=0; i<=2*RADIUSY; i++) + sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum); + lsmemDy[liy][clocX] = sum; + clocX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + barrier(CLK_LOCAL_MEM_FENCE); + + // if this pixel happened to be out of image borders because of global size rounding, + // then just return + if( x >= dst_cols || y >=dst_rows ) + return; + + // do second horizontal filter pass + // and calculate final result + sum = 0; + for (i=0; i<=2*RADIUSX; i++) + sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); + + sum = sum >> (GAUSSIAN_COEF_BITS * 2); + + //store result into destination image + storepix(convertToDstT(sum), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset))); +} diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 6a18af5c2d..864fec7971 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -42,6 +42,7 @@ #include "precomp.hpp" #include "opencl_kernels.hpp" +#include /* * This file includes the code, contributed by Simon Perreault @@ -1069,6 +1070,73 @@ static void createGaussianKernels( Mat & kx, Mat & ky, int type, Size ksize, ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) ); } +#define GAUSSIAN_COEF_BITS 11 + +static bool GaussianBlur_8u(InputArray _src, OutputArray _dst, Size ksize, + double sigma1, double sigma2, + int borderType) +{ + int type = _src.type(); + Mat kx, ky; + createGaussianKernels(kx, ky, CV_64F, ksize, sigma1, sigma2); + Mat kx_8u, ky_8u; + + int scale_coef = 1 << GAUSSIAN_COEF_BITS; + kx.convertTo(kx_8u, CV_32S, scale_coef); + ky.convertTo(ky_8u, CV_32S, scale_coef); + + kx_8u.reshape(1, 1); + ky_8u.reshape(1, 1); + + Size size = _src.size(), wholeSize; + Point origin; + int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), + esz = CV_ELEM_SIZE(stype), wdepth = CV_32S, + ddepth = sdepth; + size_t src_step = _src.step(), src_offset = _src.offset(); + + if ((src_offset % src_step) % esz != 0 || !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE || + borderType == BORDER_REFLECT || borderType == BORDER_WRAP || + borderType == BORDER_REFLECT_101)) + return false; + + size_t lt2[2] = { 16, 16 }; + size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] * (1 + (size.height - 1) / lt2[1]) }; + + char cvt[2][40]; + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", + "BORDER_REFLECT_101" }; + + String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" + " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" + " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d -D GAUSSIAN_COEF_BITS=%d", (int)lt2[0], (int)lt2[1], + kx.rows / 2, kx.rows / 2, + ocl::kernelToStr(kx_8u, CV_32S, "KERNEL_MATRIX_X").c_str(), + ocl::kernelToStr(ky_8u, CV_32S, "KERNEL_MATRIX_Y").c_str(), + ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), + ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(stype), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn, GAUSSIAN_COEF_BITS); + + ocl::Kernel k("gaussian_blur_8u", ocl::imgproc::gaussian_blur_8u_oclsrc, opts); + if (k.empty()) + return false; + + UMat src = _src.getUMat(); + _dst.create(size, stype); + UMat dst = _dst.getUMat(); + + int src_offset_x = static_cast((src_offset % src_step) / esz); + int src_offset_y = static_cast(src_offset / src_step); + + src.locateROI(wholeSize, origin); + + k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y, + wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst)); + + return k.run(2, gt2, lt2, false); +} + } cv::Ptr cv::createGaussianFilter( int type, Size ksize, @@ -1082,6 +1150,8 @@ cv::Ptr cv::createGaussianFilter( int type, Size ksize, } + + void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, double sigma1, double sigma2, int borderType ) @@ -1126,6 +1196,13 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, } #endif + if (type == CV_8U) + { + CV_OCL_RUN_(_dst.isUMat() && _src.dims() <= 2 && + (!(borderType & BORDER_ISOLATED) || _src.offset() == 0), + GaussianBlur_8u(_src, _dst, ksize, sigma1, sigma2, borderType)) + } + Mat kx, ky; createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType ); diff --git a/modules/imgproc/test/ocl/test_filters.cpp b/modules/imgproc/test/ocl/test_filters.cpp index 09b215108e..a43a7712f1 100644 --- a/modules/imgproc/test/ocl/test_filters.cpp +++ b/modules/imgproc/test/ocl/test_filters.cpp @@ -219,7 +219,23 @@ OCL_TEST_P(GaussianBlurTest, Mat) OCL_OFF(cv::GaussianBlur(src_roi, dst_roi, Size(ksize, ksize), sigma1, sigma2, borderType)); OCL_ON(cv::GaussianBlur(usrc_roi, udst_roi, Size(ksize, ksize), sigma1, sigma2, borderType)); - Near(CV_MAT_DEPTH(type) == CV_8U ? 3 : 5e-5, false); + + if (checkNorm2(dst_roi, udst_roi) > 2 && CV_MAT_DEPTH(type) == CV_8U) + { + Mat udst = udst_roi.getMat(ACCESS_READ); + Mat diff; + absdiff(dst_roi, udst, diff); + int nonZero = countNonZero(diff); + double max; + Point maxn; + minMaxLoc(diff, (double*)0, &max, (Point*) 0, &maxn); + + uchar a = dst_roi.at(maxn); + uchar b = udst.at(maxn); + + } + + Near(CV_MAT_DEPTH(type) == CV_8U ? 2 : 5e-5, false); } } From 10a52220f0ff40de659c124d7ba0ebef42a25da1 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Mon, 31 Mar 2014 16:45:15 +0400 Subject: [PATCH 2/6] Added integer arithmetic to sepFilter2D --- modules/imgproc/src/filter.cpp | 72 +++++++++++++++------- modules/imgproc/src/opencl/filterSepCol.cl | 7 ++- modules/imgproc/src/opencl/filterSepRow.cl | 3 +- modules/imgproc/src/smooth.cpp | 12 ++-- modules/imgproc/test/ocl/test_filters.cpp | 13 ++-- 5 files changed, 73 insertions(+), 34 deletions(-) diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index 2bc6b8a706..d81f8affb8 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -3275,6 +3275,7 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; Size bufSize = buf.size(); + int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) return false; @@ -3306,11 +3307,11 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX radiusX, (int)localsize[0], (int)localsize[1], cn, btype, extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), - ocl::convertTypeStr(sdepth, CV_32F, cn, cvt), - ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F), + ocl::typeToStr(type), ocl::typeToStr(buf_type), + ocl::convertTypeStr(sdepth, bdepth, cn, cvt), + ocl::typeToStr(sdepth), ocl::typeToStr(bdepth), doubleSupport ? " -D DOUBLE_SUPPORT" : ""); - build_options += ocl::kernelToStr(kernelX, CV_32F); + build_options += ocl::kernelToStr(kernelX, bdepth); Size srcWholeSize; Point srcOffset; src.locateROI(srcWholeSize, srcOffset); @@ -3337,7 +3338,7 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX return k.run(2, globalsize, localsize, false); } -static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor) +static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, int bits) { bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if (dst.depth() == CV_64F && !doubleSupport) @@ -3352,6 +3353,7 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype); Size sz = dst.size(); + int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1]; globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; @@ -3359,13 +3361,13 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY char cvt[40]; cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" " -D srcT=%s -D dstT=%s -D convertToDstT=%s" - " -D srcT1=%s -D dstT1=%s%s", + " -D srcT1=%s -D dstT1=%s -D BITS=%d%s", anchor, (int)localsize[0], (int)localsize[1], cn, - ocl::typeToStr(buf.type()), ocl::typeToStr(dtype), - ocl::convertTypeStr(CV_32F, ddepth, cn, cvt), - ocl::typeToStr(CV_32F), ocl::typeToStr(ddepth), - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); - build_options += ocl::kernelToStr(kernelY, CV_32F); + ocl::typeToStr(buf_type), ocl::typeToStr(dtype), + ocl::convertTypeStr(bdepth, ddepth, cn, cvt), + ocl::typeToStr(bdepth), ocl::typeToStr(ddepth), + bits, doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + build_options += ocl::kernelToStr(kernelY, bdepth); ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options); @@ -3457,13 +3459,13 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, if (ddepth < 0) ddepth = sdepth; - CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && - imgSize.width > optimizedSepFilterLocalSize + (kernelX.cols >> 1) && - imgSize.height > optimizedSepFilterLocalSize + (kernelY.cols >> 1) && - (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(-1, -1) && - (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), - ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, - borderType & ~BORDER_ISOLATED, ddepth), true) + //CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && + // imgSize.width > optimizedSepFilterLocalSize + (kernelX.cols >> 1) && + // imgSize.height > optimizedSepFilterLocalSize + (kernelY.cols >> 1) && + // (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(-1, -1) && + // (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), + // ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, + // borderType & ~BORDER_ISOLATED, ddepth), true) if (anchor.x < 0) anchor.x = kernelX.cols >> 1; @@ -3474,19 +3476,45 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, Size srcWholeSize; Point srcOffset; src.locateROI(srcWholeSize, srcOffset); - bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && - src.cols % 4 == 0 && src.step % 4 == 0; + //bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && + // src.cols % 4 == 0 && src.step % 4 == 0; + bool fast8uc1 = false; + + int rtype = getKernelType(kernelX, + kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x)); + int ctype = getKernelType(kernelY, + kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y)); + + int bdepth = CV_32F; + int bits = 0; + + if( sdepth == CV_8U && + ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && + ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && + ddepth == CV_8U))) + { + bdepth = CV_32S; + bits = 8; + _kernelX.getMat().convertTo( kernelX, CV_32S, 1 << bits ); + _kernelY.getMat().convertTo( kernelY, CV_32S, 1 << bits ); + kernelX = kernelX.reshape(1,1); + kernelY = kernelY.reshape(1,1); + bits *= 2; + delta *= (1 << bits); + } Size srcSize = src.size(); Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); - UMat buf(bufSize, CV_32FC(cn)); + UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1)) return false; + Mat buffer = buf.getMat(ACCESS_READ); + _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); UMat dst = _dst.getUMat(); - return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y); + return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, bits); } #endif diff --git a/modules/imgproc/src/opencl/filterSepCol.cl b/modules/imgproc/src/opencl/filterSepCol.cl index 29514cc21f..94730d8787 100644 --- a/modules/imgproc/src/opencl/filterSepCol.cl +++ b/modules/imgproc/src/opencl/filterSepCol.cl @@ -60,7 +60,7 @@ #endif #define DIG(a) a, -__constant float mat_kernel[] = { COEFF }; +__constant srcT1 mat_kernel[] = { COEFF }; __kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta) @@ -97,8 +97,13 @@ __kernel void col_filter(__global const uchar * src, int src_step, int src_offse temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x]; temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x]; sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); + //sum += temp[0]*mat_kernel[RADIUSY - i] + temp[1] * mat_kernel[RADIUSY + i]; } +#if BITS > 0 + sum = sum >> BITS; +#endif + // write the result to dst if (x < dst_cols && y < dst_rows) { diff --git a/modules/imgproc/src/opencl/filterSepRow.cl b/modules/imgproc/src/opencl/filterSepRow.cl index 726de448e4..8deec35ae3 100644 --- a/modules/imgproc/src/opencl/filterSepRow.cl +++ b/modules/imgproc/src/opencl/filterSepRow.cl @@ -138,7 +138,7 @@ #endif #define DIG(a) a, -__constant float mat_kernel[] = { COEFF }; +__constant dstT1 mat_kernel[] = { COEFF }; __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y, int src_cols, int src_rows, int src_whole_cols, int src_whole_rows, @@ -356,6 +356,7 @@ __kernel void row_filter(__global const uchar * src, int src_step, int src_offse temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i]; temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i]; sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]); + //sum += convertToDstT(temp[0])*mat_kernel[RADIUSX - i] + convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]; } // write the result to dst diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 864fec7971..e2365cd203 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1196,12 +1196,12 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, } #endif - if (type == CV_8U) - { - CV_OCL_RUN_(_dst.isUMat() && _src.dims() <= 2 && - (!(borderType & BORDER_ISOLATED) || _src.offset() == 0), - GaussianBlur_8u(_src, _dst, ksize, sigma1, sigma2, borderType)) - } + //if (type == CV_8U) + //{ + // CV_OCL_RUN_(_dst.isUMat() && _src.dims() <= 2 && + // (!(borderType & BORDER_ISOLATED) || _src.offset() == 0), + // GaussianBlur_8u(_src, _dst, ksize, sigma1, sigma2, borderType)) + //} Mat kx, ky; createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); diff --git a/modules/imgproc/test/ocl/test_filters.cpp b/modules/imgproc/test/ocl/test_filters.cpp index a43a7712f1..aee1f08b1a 100644 --- a/modules/imgproc/test/ocl/test_filters.cpp +++ b/modules/imgproc/test/ocl/test_filters.cpp @@ -209,7 +209,7 @@ typedef FilterTestBase GaussianBlurTest; OCL_TEST_P(GaussianBlurTest, Mat) { - for (int j = 0; j < test_loop_times; j++) + for (int j = 0; j < test_loop_times + 100; j++) { random_roi(); @@ -222,7 +222,8 @@ OCL_TEST_P(GaussianBlurTest, Mat) if (checkNorm2(dst_roi, udst_roi) > 2 && CV_MAT_DEPTH(type) == CV_8U) { - Mat udst = udst_roi.getMat(ACCESS_READ); + std::cout << "i = " << j << std::endl; + Mat uudst = udst_roi.getMat(ACCESS_READ); Mat diff; absdiff(dst_roi, udst, diff); int nonZero = countNonZero(diff); @@ -231,11 +232,15 @@ OCL_TEST_P(GaussianBlurTest, Mat) minMaxLoc(diff, (double*)0, &max, (Point*) 0, &maxn); uchar a = dst_roi.at(maxn); - uchar b = udst.at(maxn); + uchar b = uudst.at(maxn); + std::cout << "dst_roi" << dst_roi << std::endl; + std::cout << "udst_roi" << uudst << std::endl; } - Near(CV_MAT_DEPTH(type) == CV_8U ? 2 : 5e-5, false); + + + Near(CV_MAT_DEPTH(type) == CV_8U ? 1 : 5e-5, false); } } From a3825acee4b1ab9462f05af190d5827447f01efd Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Tue, 1 Apr 2014 11:27:43 +0400 Subject: [PATCH 3/6] Small refactoring --- modules/imgproc/src/filter.cpp | 67 +++---- modules/imgproc/src/opencl/filterSepCol.cl | 10 +- modules/imgproc/src/opencl/filterSepRow.cl | 6 +- .../imgproc/src/opencl/gaussian_blur_8u.cl | 189 ------------------ modules/imgproc/src/smooth.cpp | 77 ------- modules/imgproc/test/ocl/test_filters.cpp | 25 +-- 6 files changed, 46 insertions(+), 328 deletions(-) delete mode 100644 modules/imgproc/src/opencl/gaussian_blur_8u.cl diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index d81f8affb8..7d870d72f8 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -3269,8 +3269,10 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, return k.run(2, globalsize, localsize, false); } +const int shift_bits = 8; + static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor, - int borderType, int ddepth, bool fast8uc1) + int borderType, int ddepth, bool fast8uc1, bool int_arithm) { int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; @@ -3303,14 +3305,15 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX char cvt[40]; cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s", + " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s", radiusX, (int)localsize[0], (int)localsize[1], cn, btype, extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", ocl::typeToStr(type), ocl::typeToStr(buf_type), ocl::convertTypeStr(sdepth, bdepth, cn, cvt), ocl::typeToStr(sdepth), ocl::typeToStr(bdepth), - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + doubleSupport ? " -D DOUBLE_SUPPORT" : "", + int_arithm ? " -D INTEGER_ARITHMETIC" : ""); build_options += ocl::kernelToStr(kernelX, bdepth); Size srcWholeSize; Point srcOffset; @@ -3338,7 +3341,7 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX return k.run(2, globalsize, localsize, false); } -static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, int bits) +static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm) { bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if (dst.depth() == CV_64F && !doubleSupport) @@ -3361,12 +3364,13 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY char cvt[40]; cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" " -D srcT=%s -D dstT=%s -D convertToDstT=%s" - " -D srcT1=%s -D dstT1=%s -D BITS=%d%s", + " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s", anchor, (int)localsize[0], (int)localsize[1], cn, ocl::typeToStr(buf_type), ocl::typeToStr(dtype), ocl::convertTypeStr(bdepth, ddepth, cn, cvt), ocl::typeToStr(bdepth), ocl::typeToStr(ddepth), - bits, doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + 2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "", + int_arithm ? " -D INTEGER_ARITHMETIC" : ""); build_options += ocl::kernelToStr(kernelY, bdepth); ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, @@ -3459,62 +3463,55 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, if (ddepth < 0) ddepth = sdepth; - //CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && - // imgSize.width > optimizedSepFilterLocalSize + (kernelX.cols >> 1) && - // imgSize.height > optimizedSepFilterLocalSize + (kernelY.cols >> 1) && - // (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(-1, -1) && - // (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), - // ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, - // borderType & ~BORDER_ISOLATED, ddepth), true) - if (anchor.x < 0) anchor.x = kernelX.cols >> 1; if (anchor.y < 0) anchor.y = kernelY.cols >> 1; - UMat src = _src.getUMat(); - Size srcWholeSize; Point srcOffset; - src.locateROI(srcWholeSize, srcOffset); - - //bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && - // src.cols % 4 == 0 && src.step % 4 == 0; - bool fast8uc1 = false; - int rtype = getKernelType(kernelX, kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x)); int ctype = getKernelType(kernelY, kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y)); int bdepth = CV_32F; - int bits = 0; - + bool int_arithm = false; if( sdepth == CV_8U && ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && ddepth == CV_8U))) { bdepth = CV_32S; - bits = 8; - _kernelX.getMat().convertTo( kernelX, CV_32S, 1 << bits ); - _kernelY.getMat().convertTo( kernelY, CV_32S, 1 << bits ); - kernelX = kernelX.reshape(1,1); - kernelY = kernelY.reshape(1,1); - bits *= 2; - delta *= (1 << bits); + _kernelX.getMat().reshape(1,1).convertTo( kernelX, CV_32S, 1 << shift_bits ); + _kernelY.getMat().reshape(1,1).convertTo( kernelY, CV_32S, 1 << shift_bits ); + int_arithm = true; } + CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && !int_arithm && + imgSize.width > optimizedSepFilterLocalSize + anchor.x && + imgSize.height > optimizedSepFilterLocalSize + anchor.y && + (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && + anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && + (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), + ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, + borderType & ~BORDER_ISOLATED, ddepth), true) + + UMat src = _src.getUMat(); + Size srcWholeSize; Point srcOffset; + src.locateROI(srcWholeSize, srcOffset); + + bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && + src.cols % 4 == 0 && src.step % 4 == 0 && !int_arithm; + Size srcSize = src.size(); Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); - if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1)) + if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm)) return false; - Mat buffer = buf.getMat(ACCESS_READ); - _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); UMat dst = _dst.getUMat(); - return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, bits); + return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm); } #endif diff --git a/modules/imgproc/src/opencl/filterSepCol.cl b/modules/imgproc/src/opencl/filterSepCol.cl index 94730d8787..13595058ff 100644 --- a/modules/imgproc/src/opencl/filterSepCol.cl +++ b/modules/imgproc/src/opencl/filterSepCol.cl @@ -3,6 +3,7 @@ // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2014, Itseez, Inc, all rights reserved. // Third party copyrights are property of their respective owners. // // @Authors @@ -96,12 +97,15 @@ __kernel void col_filter(__global const uchar * src, int src_step, int src_offse { temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x]; temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x]; +#ifndef INTEGER_ARITHMETIC sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); - //sum += temp[0]*mat_kernel[RADIUSY - i] + temp[1] * mat_kernel[RADIUSY + i]; +#else + sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); +#endif } -#if BITS > 0 - sum = sum >> BITS; +#ifdef INTEGER_ARITHMETIC + sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; #endif // write the result to dst diff --git a/modules/imgproc/src/opencl/filterSepRow.cl b/modules/imgproc/src/opencl/filterSepRow.cl index 8deec35ae3..890eeb8cbb 100644 --- a/modules/imgproc/src/opencl/filterSepRow.cl +++ b/modules/imgproc/src/opencl/filterSepRow.cl @@ -3,6 +3,7 @@ // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2014, Itseez, Inc, all rights reserved. // Third party copyrights are property of their respective owners. // // @Authors @@ -355,8 +356,11 @@ __kernel void row_filter(__global const uchar * src, int src_step, int src_offse { temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i]; temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i]; +#ifndef INTEGER_ARITHMETIC sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]); - //sum += convertToDstT(temp[0])*mat_kernel[RADIUSX - i] + convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]; +#else + sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]); +#endif } // write the result to dst diff --git a/modules/imgproc/src/opencl/gaussian_blur_8u.cl b/modules/imgproc/src/opencl/gaussian_blur_8u.cl deleted file mode 100644 index 268d8b7c1b..0000000000 --- a/modules/imgproc/src/opencl/gaussian_blur_8u.cl +++ /dev/null @@ -1,189 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2014, Intel Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////Macro for border type//////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////// - -#ifdef BORDER_CONSTANT -// CCCCCC|abcdefgh|CCCCCCC -#define EXTRAPOLATE(x, maxV) -#elif defined BORDER_REPLICATE -// aaaaaa|abcdefgh|hhhhhhh -#define EXTRAPOLATE(x, maxV) \ - { \ - (x) = max(min((x), (maxV) - 1), 0); \ - } -#elif defined BORDER_WRAP -// cdefgh|abcdefgh|abcdefg -#define EXTRAPOLATE(x, maxV) \ - { \ - (x) = ( (x) + (maxV) ) % (maxV); \ - } -#elif defined BORDER_REFLECT -// fedcba|abcdefgh|hgfedcb -#define EXTRAPOLATE(x, maxV) \ - { \ - (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \ - } -#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101 -// gfedcb|abcdefgh|gfedcba -#define EXTRAPOLATE(x, maxV) \ - { \ - (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \ - } -#else -#error No extrapolation method -#endif - -#if CN != 3 -#define loadpix(addr) *(__global const srcT *)(addr) -#define storepix(val, addr) *(__global dstT *)(addr) = val -#define SRCSIZE (int)sizeof(srcT) -#define DSTSIZE (int)sizeof(dstT) -#else -#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) -#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) -#define SRCSIZE (int)sizeof(srcT1)*3 -#define DSTSIZE (int)sizeof(dstT1)*3 -#endif - -#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x))) - -#ifdef BORDER_CONSTANT -// CCCCCC|abcdefgh|CCCCCCC -#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y)) -#else -#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) -#endif - -#define noconvert - -// horizontal and vertical filter kernels -// should be defined on host during compile time to avoid overhead -#define DIG(a) a, -__constant int mat_kernelX[] = { KERNEL_MATRIX_X }; -__constant int mat_kernelY[] = { KERNEL_MATRIX_Y }; - -__kernel void gaussian_blur_8u(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width, - __global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) -{ - // RADIUSX, RADIUSY are filter dimensions - // BLK_X, BLK_Y are local wrogroup sizes - // all these should be defined on host during compile time - // first lsmem array for source pixels used in first pass, - // second lsmemDy for storing first pass results - __local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX]; - __local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX]; - - // get local and global ids - used as image and local memory array indexes - int lix = get_local_id(0); - int liy = get_local_id(1); - - int x = get_global_id(0); - int y = get_global_id(1); - - // calculate pixel position in source image taking image offset into account - int srcX = x + srcOffsetX - RADIUSX; - int srcY = y + srcOffsetY - RADIUSY; - int xb = srcX; - int yb = srcY; - - // extrapolate coordinates, if needed - // and read my own source pixel into local memory - // with account for extra border pixels, which will be read by starting workitems - int clocY = liy; - int cSrcY = srcY; - do - { - int yb = cSrcY; - EXTRAPOLATE(yb, (height)); - - int clocX = lix; - int cSrcX = srcX; - do - { - int xb = cSrcX; - EXTRAPOLATE(xb,(width)); - lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 ); - - clocX += BLK_X; - cSrcX += BLK_X; - } - while(clocX < BLK_X+(RADIUSX*2)); - - clocY += BLK_Y; - cSrcY += BLK_Y; - } - while (clocY < BLK_Y+(RADIUSY*2)); - barrier(CLK_LOCAL_MEM_FENCE); - - // do vertical filter pass - // and store intermediate results to second local memory array - int i, clocX = lix; - WT sum = 0; - do - { - sum = 0; - for (i=0; i<=2*RADIUSY; i++) - sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum); - lsmemDy[liy][clocX] = sum; - clocX += BLK_X; - } - while(clocX < BLK_X+(RADIUSX*2)); - barrier(CLK_LOCAL_MEM_FENCE); - - // if this pixel happened to be out of image borders because of global size rounding, - // then just return - if( x >= dst_cols || y >=dst_rows ) - return; - - // do second horizontal filter pass - // and calculate final result - sum = 0; - for (i=0; i<=2*RADIUSX; i++) - sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); - - sum = sum >> (GAUSSIAN_COEF_BITS * 2); - - //store result into destination image - storepix(convertToDstT(sum), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset))); -} diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index e2365cd203..6a18af5c2d 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -42,7 +42,6 @@ #include "precomp.hpp" #include "opencl_kernels.hpp" -#include /* * This file includes the code, contributed by Simon Perreault @@ -1070,73 +1069,6 @@ static void createGaussianKernels( Mat & kx, Mat & ky, int type, Size ksize, ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) ); } -#define GAUSSIAN_COEF_BITS 11 - -static bool GaussianBlur_8u(InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, - int borderType) -{ - int type = _src.type(); - Mat kx, ky; - createGaussianKernels(kx, ky, CV_64F, ksize, sigma1, sigma2); - Mat kx_8u, ky_8u; - - int scale_coef = 1 << GAUSSIAN_COEF_BITS; - kx.convertTo(kx_8u, CV_32S, scale_coef); - ky.convertTo(ky_8u, CV_32S, scale_coef); - - kx_8u.reshape(1, 1); - ky_8u.reshape(1, 1); - - Size size = _src.size(), wholeSize; - Point origin; - int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), - esz = CV_ELEM_SIZE(stype), wdepth = CV_32S, - ddepth = sdepth; - size_t src_step = _src.step(), src_offset = _src.offset(); - - if ((src_offset % src_step) % esz != 0 || !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE || - borderType == BORDER_REFLECT || borderType == BORDER_WRAP || - borderType == BORDER_REFLECT_101)) - return false; - - size_t lt2[2] = { 16, 16 }; - size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] * (1 + (size.height - 1) / lt2[1]) }; - - char cvt[2][40]; - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", - "BORDER_REFLECT_101" }; - - String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" - " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" - " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d -D GAUSSIAN_COEF_BITS=%d", (int)lt2[0], (int)lt2[1], - kx.rows / 2, kx.rows / 2, - ocl::kernelToStr(kx_8u, CV_32S, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(ky_8u, CV_32S, "KERNEL_MATRIX_Y").c_str(), - ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(stype), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn, GAUSSIAN_COEF_BITS); - - ocl::Kernel k("gaussian_blur_8u", ocl::imgproc::gaussian_blur_8u_oclsrc, opts); - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, stype); - UMat dst = _dst.getUMat(); - - int src_offset_x = static_cast((src_offset % src_step) / esz); - int src_offset_y = static_cast(src_offset / src_step); - - src.locateROI(wholeSize, origin); - - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y, - wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst)); - - return k.run(2, gt2, lt2, false); -} - } cv::Ptr cv::createGaussianFilter( int type, Size ksize, @@ -1150,8 +1082,6 @@ cv::Ptr cv::createGaussianFilter( int type, Size ksize, } - - void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, double sigma1, double sigma2, int borderType ) @@ -1196,13 +1126,6 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, } #endif - //if (type == CV_8U) - //{ - // CV_OCL_RUN_(_dst.isUMat() && _src.dims() <= 2 && - // (!(borderType & BORDER_ISOLATED) || _src.offset() == 0), - // GaussianBlur_8u(_src, _dst, ksize, sigma1, sigma2, borderType)) - //} - Mat kx, ky; createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType ); diff --git a/modules/imgproc/test/ocl/test_filters.cpp b/modules/imgproc/test/ocl/test_filters.cpp index aee1f08b1a..bf45a72fd6 100644 --- a/modules/imgproc/test/ocl/test_filters.cpp +++ b/modules/imgproc/test/ocl/test_filters.cpp @@ -209,7 +209,7 @@ typedef FilterTestBase GaussianBlurTest; OCL_TEST_P(GaussianBlurTest, Mat) { - for (int j = 0; j < test_loop_times + 100; j++) + for (int j = 0; j < test_loop_times; j++) { random_roi(); @@ -219,28 +219,7 @@ OCL_TEST_P(GaussianBlurTest, Mat) OCL_OFF(cv::GaussianBlur(src_roi, dst_roi, Size(ksize, ksize), sigma1, sigma2, borderType)); OCL_ON(cv::GaussianBlur(usrc_roi, udst_roi, Size(ksize, ksize), sigma1, sigma2, borderType)); - - if (checkNorm2(dst_roi, udst_roi) > 2 && CV_MAT_DEPTH(type) == CV_8U) - { - std::cout << "i = " << j << std::endl; - Mat uudst = udst_roi.getMat(ACCESS_READ); - Mat diff; - absdiff(dst_roi, udst, diff); - int nonZero = countNonZero(diff); - double max; - Point maxn; - minMaxLoc(diff, (double*)0, &max, (Point*) 0, &maxn); - - uchar a = dst_roi.at(maxn); - uchar b = uudst.at(maxn); - - std::cout << "dst_roi" << dst_roi << std::endl; - std::cout << "udst_roi" << uudst << std::endl; - } - - - - Near(CV_MAT_DEPTH(type) == CV_8U ? 1 : 5e-5, false); + Near(CV_MAT_DEPTH(type) >= CV_32F ? 5e-5 : 1, false); } } From fc10ffefb838343d91be2b3252eee11292cc9c9f Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Thu, 3 Apr 2014 12:04:35 +0400 Subject: [PATCH 4/6] Enabled integer arithmetic for row_filter_C1_D0 --- modules/imgproc/src/filter.cpp | 13 ++++++------- modules/imgproc/src/opencl/filterSepRow.cl | 20 ++++++++++++++++---- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index 7d870d72f8..d9469530f3 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -3475,14 +3475,13 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, int bdepth = CV_32F; bool int_arithm = false; - if( sdepth == CV_8U && - ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ddepth == CV_8U))) + if( sdepth == CV_8U && ddepth == CV_8U && + rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && + ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) { bdepth = CV_32S; - _kernelX.getMat().reshape(1,1).convertTo( kernelX, CV_32S, 1 << shift_bits ); - _kernelY.getMat().reshape(1,1).convertTo( kernelY, CV_32S, 1 << shift_bits ); + kernelX.convertTo( kernelX, CV_32S, 1 << shift_bits ); + kernelY.convertTo( kernelY, CV_32S, 1 << shift_bits ); int_arithm = true; } @@ -3500,7 +3499,7 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, src.locateROI(srcWholeSize, srcOffset); bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && - src.cols % 4 == 0 && src.step % 4 == 0 && !int_arithm; + src.cols % 4 == 0 && src.step % 4 == 0; Size srcSize = src.size(); Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); diff --git a/modules/imgproc/src/opencl/filterSepRow.cl b/modules/imgproc/src/opencl/filterSepRow.cl index 890eeb8cbb..472ac4c91e 100644 --- a/modules/imgproc/src/opencl/filterSepRow.cl +++ b/modules/imgproc/src/opencl/filterSepRow.cl @@ -141,6 +141,14 @@ #define DIG(a) a, __constant dstT1 mat_kernel[] = { COEFF }; +#ifndef INTEGER_ARITHMETIC +#define dstT4 float4 +#define convertDstVec convert_float4 +#else +#define dstT4 int4 +#define convertDstVec convert_int4 +#endif + __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y, int src_cols, int src_rows, int src_whole_cols, int src_whole_rows, __global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows, @@ -156,7 +164,7 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel int start_y = y + src_offset_y - radiusy; int start_addr = mad24(start_y, src_step_in_pixel, start_x); - float4 sum; + dstT4 sum; uchar4 temp[READ_TIMES_ROW]; __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1]; @@ -250,19 +258,23 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel barrier(CLK_LOCAL_MEM_FENCE); // read pixels from lds and calculate the result - sum = convert_float4(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX]; + sum = convertDstVec(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX]; for (int i = 1; i <= RADIUSX; ++i) { temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i); temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i); - sum += mad(convert_float4(temp[0]), mat_kernel[RADIUSX-i], convert_float4(temp[1]) * mat_kernel[RADIUSX + i]); +#ifndef INTEGER_ARITHMETIC + sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]); +#else + sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]); +#endif } start_addr = mad24(y, dst_step_in_pixel, x); // write the result to dst if ((x+3 Date: Thu, 3 Apr 2014 16:53:57 +0400 Subject: [PATCH 5/6] Enabled integer arithmetic for filterSepSinglePass --- modules/imgproc/src/filter.cpp | 19 ++++++++++--------- .../src/opencl/filterSep_singlePass.cl | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index d9469530f3..e2a1964466 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -3388,12 +3388,12 @@ const int optimizedSepFilterLocalSize = 16; static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, Mat row_kernel, Mat col_kernel, - double delta, int borderType, int ddepth) + double delta, int borderType, int ddepth, int bdepth, bool int_arithm) { Size size = _src.size(), wholeSize; Point origin; int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), - esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), CV_32F), + esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth), dtype = CV_MAKE_TYPE(ddepth, cn); size_t src_step = _src.step(), src_offset = _src.offset(); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; @@ -3413,14 +3413,15 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" - " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d", (int)lt2[0], (int)lt2[1], - row_kernel.cols / 2, col_kernel.cols / 2, - ocl::kernelToStr(row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(), + " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d -D SHIFT_BITS=%d%s", + (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2, + ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(), + ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(), ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn, 2*shift_bits, + int_arithm ? " -D INTEGER_ARITHMETIC" : ""); ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); if (k.empty()) @@ -3485,14 +3486,14 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, int_arithm = true; } - CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && !int_arithm && + CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && imgSize.width > optimizedSepFilterLocalSize + anchor.x && imgSize.height > optimizedSepFilterLocalSize + anchor.y && (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, - borderType & ~BORDER_ISOLATED, ddepth), true) + borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true) UMat src = _src.getUMat(); Size srcWholeSize; Point srcOffset; diff --git a/modules/imgproc/src/opencl/filterSep_singlePass.cl b/modules/imgproc/src/opencl/filterSep_singlePass.cl index e75574035f..5fbf763e1e 100644 --- a/modules/imgproc/src/opencl/filterSep_singlePass.cl +++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl @@ -100,8 +100,8 @@ // horizontal and vertical filter kernels // should be defined on host during compile time to avoid overhead #define DIG(a) a, -__constant float mat_kernelX[] = { KERNEL_MATRIX_X }; -__constant float mat_kernelY[] = { KERNEL_MATRIX_Y }; +__constant WT mat_kernelX[] = { KERNEL_MATRIX_X }; +__constant WT mat_kernelY[] = { KERNEL_MATRIX_Y }; __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width, __global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta) @@ -159,12 +159,16 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int // do vertical filter pass // and store intermediate results to second local memory array int i, clocX = lix; - WT sum = 0.0f; + WT sum = (WT) 0; do { - sum = 0.0f; + sum = (WT) 0; for (i=0; i<=2*RADIUSY; i++) +#ifndef INTEGER_ARITHMETIC sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum); +#else + sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum); +#endif lsmemDy[liy][clocX] = sum; clocX += BLK_X; } @@ -180,8 +184,13 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int // and calculate final result sum = 0.0f; for (i=0; i<=2*RADIUSX; i++) +#ifndef INTEGER_ARITHMETIC sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); +#else + sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum); + sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; +#endif // store result into destination image storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset))); } From a66db67b83bf57a59b9d2aca1177d86d0ed92986 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Mon, 7 Apr 2014 10:36:13 +0400 Subject: [PATCH 6/6] Attempt to improve performance --- modules/imgproc/src/filter.cpp | 10 +++++----- modules/imgproc/src/opencl/filterSep_singlePass.cl | 7 +++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index e2a1964466..141e8e9f48 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -3413,15 +3413,15 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" - " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d -D SHIFT_BITS=%d%s", + " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s", (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2, ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(), ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(), ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn, 2*shift_bits, - int_arithm ? " -D INTEGER_ARITHMETIC" : ""); + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth), + cn, 2*shift_bits, int_arithm ? " -D INTEGER_ARITHMETIC" : ""); ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); if (k.empty()) @@ -3481,8 +3481,8 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) { bdepth = CV_32S; - kernelX.convertTo( kernelX, CV_32S, 1 << shift_bits ); - kernelY.convertTo( kernelY, CV_32S, 1 << shift_bits ); + kernelX.convertTo( kernelX, bdepth, 1 << shift_bits ); + kernelY.convertTo( kernelY, bdepth, 1 << shift_bits ); int_arithm = true; } diff --git a/modules/imgproc/src/opencl/filterSep_singlePass.cl b/modules/imgproc/src/opencl/filterSep_singlePass.cl index 5fbf763e1e..b8b812df46 100644 --- a/modules/imgproc/src/opencl/filterSep_singlePass.cl +++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl @@ -100,8 +100,8 @@ // horizontal and vertical filter kernels // should be defined on host during compile time to avoid overhead #define DIG(a) a, -__constant WT mat_kernelX[] = { KERNEL_MATRIX_X }; -__constant WT mat_kernelY[] = { KERNEL_MATRIX_Y }; +__constant WT1 mat_kernelX[] = { KERNEL_MATRIX_X }; +__constant WT1 mat_kernelY[] = { KERNEL_MATRIX_Y }; __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width, __global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta) @@ -124,8 +124,6 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int // calculate pixel position in source image taking image offset into account int srcX = x + srcOffsetX - RADIUSX; int srcY = y + srcOffsetY - RADIUSY; - int xb = srcX; - int yb = srcY; // extrapolate coordinates, if needed // and read my own source pixel into local memory @@ -191,6 +189,7 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; #endif + // store result into destination image storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset))); }