diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index d548168491..24f222e253 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -42,6 +42,7 @@ #include "precomp.hpp" #include "opencl_kernels.hpp" +#include /****************************************************************************************\ Base Image Filter @@ -3314,6 +3315,246 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, } return kernel.run(2, globalsize, localsize, true); } + +static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType, bool sync) +{ + int type = src.type(); + int cn = CV_MAT_CN(type); + int sdepth = CV_MAT_DEPTH(type); + Size bufSize = buf.size(); + +#ifdef ANDROID + size_t localsize[2] = {16, 10}; +#else + size_t localsize[2] = {16, 16}; +#endif + size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]}; + if (CV_8U == sdepth) + { + switch (cn) + { + case 1: + globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; + break; + case 2: + globalsize[0] = DIVUP((bufSize.width + 1) >> 1, localsize[0]) * localsize[0]; + break; + case 4: + globalsize[0] = DIVUP(bufSize.width, localsize[0]) * localsize[0]; + break; + } + } + + int radiusX = anchor; + int radiusY = (int)((buf.rows - src.rows) >> 1); + + bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; + const char* btype = NULL; + switch (borderType & ~BORDER_ISOLATED) + { + case BORDER_CONSTANT: + btype = "BORDER_CONSTANT"; + break; + case BORDER_REPLICATE: + btype = "BORDER_REPLICATE"; + break; + case BORDER_REFLECT: + btype = "BORDER_REFLECT"; + break; + case BORDER_WRAP: + btype = "BORDER_WRAP"; + break; + case BORDER_REFLECT101: + btype = "BORDER_REFLECT_101"; + break; + default: + return false; + } + + bool extra_extrapolation = src.rows < ((-radiusY + globalsize[1]) >> 1) + 1; + extra_extrapolation |= src.rows < radiusY; + extra_extrapolation |= src.cols < ((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1; + extra_extrapolation |= src.cols < radiusX; + char build_options[1024]; + sprintf(build_options, "-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s", + radiusX, (int)localsize[0], (int)localsize[1], cn, + btype, + extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", + isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); + + Size srcWholeSize; Point srcOffset; + src.locateROI(srcWholeSize, srcOffset); + + std::stringstream strKernel; + strKernel << "row_filter"; + if (-1 != cn) + strKernel << "_C" << cn; + if (-1 != sdepth) + strKernel << "_D" << sdepth; + + ocl::Kernel kernelRow; + if (!kernelRow.create(strKernel.str().c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, build_options)) + return false; + + int idxArg = 0; + idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrReadOnly(src)); + idxArg = kernelRow.set(idxArg, (int)(src.step / src.elemSize())); + + idxArg = kernelRow.set(idxArg, srcOffset.x); + idxArg = kernelRow.set(idxArg, srcOffset.y); + idxArg = kernelRow.set(idxArg, src.cols); + idxArg = kernelRow.set(idxArg, src.rows); + idxArg = kernelRow.set(idxArg, srcWholeSize.width); + idxArg = kernelRow.set(idxArg, srcWholeSize.height); + + idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrWriteOnly(buf)); + idxArg = kernelRow.set(idxArg, (int)(buf.step / buf.elemSize())); + idxArg = kernelRow.set(idxArg, buf.cols); + idxArg = kernelRow.set(idxArg, buf.rows); + idxArg = kernelRow.set(idxArg, radiusY); + idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrReadOnly(kernelX.getUMat(ACCESS_READ))); + + return kernelRow.run(2, globalsize, localsize, sync); +} + +static bool ocl_sepColFilter2D(UMat &buf, UMat &dst, Mat &kernelY, int anchor, bool sync) +{ +#ifdef ANDROID + size_t localsize[2] = {16, 10}; +#else + size_t localsize[2] = {16, 16}; +#endif + size_t globalsize[2] = {0, 0}; + + int type = dst.type(); + int cn = CV_MAT_CN(type); + int ddepth = CV_MAT_DEPTH(type); + Size sz = dst.size(); + + globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1]; + + char build_options[1024]; + if (CV_8U == ddepth) + { + switch (cn) + { + case 1: + globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; + sprintf(build_options, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, (int)localsize[0], (int)localsize[1], cn, "float", "uchar", "convert_uchar_sat"); + break; + case 2: + globalsize[0] = DIVUP((sz.width + 1) / 2, localsize[0]) * localsize[0]; + sprintf(build_options, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, (int)localsize[0], (int)localsize[1], cn, "float2", "uchar2", "convert_uchar2_sat"); + break; + case 3: + case 4: + globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; + sprintf(build_options, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, (int)localsize[0], (int)localsize[1], cn, "float4", "uchar4", "convert_uchar4_sat"); + break; + } + } + else + { + globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; + switch (dst.type()) + { + case CV_32SC1: + sprintf(build_options, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, (int)localsize[0], (int)localsize[1], cn, "float", "int", "convert_int_sat"); + break; + case CV_32SC3: + case CV_32SC4: + sprintf(build_options, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, (int)localsize[0], (int)localsize[1], cn, "float4", "int4", "convert_int4_sat"); + break; + case CV_32FC1: + sprintf(build_options, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, (int)localsize[0], (int)localsize[1], cn, "float", "float", ""); + break; + case CV_32FC3: + case CV_32FC4: + sprintf(build_options, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", + anchor, (int)localsize[0], (int)localsize[1], cn, "float4", "float4", ""); + break; + } + } + + ocl::Kernel kernelCol; + if (!kernelCol.create("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options)) + return false; + + int idxArg = 0; + idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrReadOnly(buf)); + idxArg = kernelCol.set(idxArg, (int)(buf.step / buf.elemSize())); + idxArg = kernelCol.set(idxArg, buf.cols); + idxArg = kernelCol.set(idxArg, buf.rows); + + idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); + idxArg = kernelCol.set(idxArg, (int)(dst.offset / dst.elemSize())); + idxArg = kernelCol.set(idxArg, (int)(dst.step / dst.elemSize())); + idxArg = kernelCol.set(idxArg, dst.cols); + idxArg = kernelCol.set(idxArg, dst.rows); + idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrReadOnly(kernelY.getUMat(ACCESS_READ))); + + return kernelCol.run(2, globalsize, localsize, sync); +} + +static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernelX, InputArray _kernelY, Point anchor, + double delta, int borderType ) +{ + if (abs(delta)> FLT_MIN) + return false; + + int type = _src.type(); + if ((CV_8UC1 != type) && (CV_8UC4 == type) && + (CV_32FC1 != type) && (CV_32FC4 == type)) + return false; + + int cn = CV_MAT_CN(type); + + Mat kernelX = _kernelX.getMat().reshape(1, 1); + if (1 != (kernelX.cols % 2)) + return false; + Mat kernelY = _kernelY.getMat().reshape(1, 1); + if (1 != (kernelY.cols % 2)) + return false; + + int sdepth = CV_MAT_DEPTH(type); + if( anchor.x < 0 ) + anchor.x = kernelX.cols >> 1; + if( anchor.y < 0 ) + anchor.y = kernelY.cols >> 1; + + if( ddepth < 0 ) + ddepth = sdepth; + else if (ddepth != sdepth) + return false; + + UMat src = _src.getUMat(); + Size srcWholeSize; Point srcOffset; + src.locateROI(srcWholeSize, srcOffset); + if ( (0 != (srcOffset.x % 4)) || + (0 != (src.cols % 4)) || + (0 != ((src.step / src.elemSize()) % 4)) + ) + { + return false; + } + + Size srcSize = src.size(); + Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); + UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn)); + if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, true)) + return false; + + _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); + UMat dst = _dst.getUMat(); + return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y, true); +} } cv::Ptr cv::getLinearFilter(int srcType, int dstType, @@ -3481,6 +3722,10 @@ void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY, Point anchor, double delta, int borderType ) { + bool use_opencl = ocl::useOpenCL() && _dst.isUMat(); + if( use_opencl && ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType)) + return; + Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat(); if( ddepth < 0 ) diff --git a/modules/imgproc/src/opencl/filterSepCol.cl b/modules/imgproc/src/opencl/filterSepCol.cl new file mode 100644 index 0000000000..c990a6ca19 --- /dev/null +++ b/modules/imgproc/src/opencl/filterSepCol.cl @@ -0,0 +1,116 @@ +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Niko Li, newlife20080214@gmail.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +// + +#define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1) +#define RADIUS 1 +#if CN ==1 +#define ALIGN (((RADIUS)+3)>>2<<2) +#elif CN==2 +#define ALIGN (((RADIUS)+1)>>1<<1) +#elif CN==3 +#define ALIGN (((RADIUS)+3)>>2<<2) +#elif CN==4 +#define ALIGN (RADIUS) +#define READ_TIMES_ROW ((2*(RADIUS+LSIZE0)-1)/LSIZE0) +#endif + +/********************************************************************************** +These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur. +Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle +kernel must be in the center. ROI is not supported either. +Each kernels read 4 elements(not 4 pixels), save them to LDS and read the data needed +from LDS to calculate the result. +The length of the convovle kernel supported is only related to the MAX size of LDS, +which is HW related. +Niko +6/29/2011 +The info above maybe obsolete. +***********************************************************************************/ + + +__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter + (__global const GENTYPE_SRC * restrict src, + const int src_step_in_pixel, + const int src_whole_cols, + const int src_whole_rows, + __global GENTYPE_DST * dst, + const int dst_offset_in_pixel, + const int dst_step_in_pixel, + const int dst_cols, + const int dst_rows, + __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1))))) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + int l_x = get_local_id(0); + int l_y = get_local_id(1); + + int start_addr = mad24(y, src_step_in_pixel, x); + int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); + + int i; + GENTYPE_SRC sum, temp[READ_TIMES_COL]; + __local GENTYPE_SRC LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1]; + + //read pixels from src + for(i = 0;i>2<<2) +#elif CN==2 +#define ALIGN (((RADIUS)+1)>>1<<1) +#elif CN==3 +#define ALIGN (((RADIUS)+3)>>2<<2) +#elif CN==4 +#define ALIGN (RADIUS) +#endif + +#ifdef BORDER_REPLICATE +//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) +#endif + +#ifdef BORDER_REFLECT +//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) +#endif + +#ifdef BORDER_REFLECT_101 +//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) +#endif + +//blur function does not support BORDER_WRAP +#ifdef BORDER_WRAP +//BORDER_WRAP: cdefgh|abcdefgh|abcdefg +#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) +#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) +#endif + +#ifdef EXTRA_EXTRAPOLATION // border > src image size + #ifdef BORDER_CONSTANT + #define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2) + #elif defined BORDER_REPLICATE + #define EXTRAPOLATE(t, minT, maxT) \ + { \ + t = max(min(t, (maxT) - 1), (minT)); \ + } + #elif defined BORDER_WRAP + #define EXTRAPOLATE(x, minT, maxT) \ + { \ + if (t < (minT)) \ + t -= ((t - (maxT) + 1) / (maxT)) * (maxT); \ + if (t >= (maxT)) \ + t %= (maxT); \ + } + #elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) + #define EXTRAPOLATE_(t, minT, maxT, delta) \ + { \ + if ((maxT) - (minT) == 1) \ + t = (minT); \ + else \ + do \ + { \ + if (t < (minT)) \ + t = (minT) - (t - (minT)) - 1 + delta; \ + else \ + t = (maxT) - 1 - (t - (maxT)) - delta; \ + } \ + while (t >= (maxT) || t < (minT)); \ + \ + } + #ifdef BORDER_REFLECT + #define EXTRAPOLATE(t, minT, maxT) EXTRAPOLATE_(t, minT, maxT, 0) + #elif defined(BORDER_REFLECT_101) + #define EXTRAPOLATE(t, minT, maxT) EXTRAPOLATE_(t, minT, maxT, 1) + #endif + #else + #error No extrapolation method + #endif //BORDER_.... +#else //EXTRA_EXTRAPOLATION + #ifdef BORDER_CONSTANT + #define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2) + #else + #define EXTRAPOLATE(t, minT, maxT) \ + { \ + int _delta = t - (minT); \ + _delta = ADDR_L(_delta, 0, (maxT) - (minT)); \ + _delta = ADDR_R(_delta, (maxT) - (minT), _delta); \ + t = _delta + (minT); \ + } + #endif //BORDER_CONSTANT +#endif //EXTRA_EXTRAPOLATION + +/********************************************************************************** +These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur. +Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle +kernel must be in the center. ROI is not supported either. +For channels =1,2,4, each kernels read 4 elements(not 4 pixels), and for channels =3, +the kernel read 4 pixels, save them to LDS and read the data needed from LDS to +calculate the result. +The length of the convovle kernel supported is related to the LSIZE0 and the MAX size +of LDS, which is HW related. +For channels = 1,3 the RADIUS is no more than LSIZE0*2 +For channels = 2, the RADIUS is no more than LSIZE0 +For channels = 4, arbitary RADIUS is supported unless the LDS is not enough +Niko +6/29/2011 +The info above maybe obsolete. +***********************************************************************************/ + +__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0 + (__global uchar * restrict src, + int src_step_in_pixel, + int src_offset_x, int src_offset_y, + int src_cols, int src_rows, + int src_whole_cols, int src_whole_rows, + __global float * dst, + int dst_step_in_pixel, + int dst_cols, int dst_rows, + int radiusy, + __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) +{ + int x = get_global_id(0)<<2; + int y = get_global_id(1); + int l_x = get_local_id(0); + int l_y = get_local_id(1); + + int start_x = x+src_offset_x - RADIUSX & 0xfffffffc; + int offset = src_offset_x - RADIUSX & 3; + int start_y = y + src_offset_y - radiusy; + int start_addr = mad24(start_y, src_step_in_pixel, start_x); + int i; + float4 sum; + uchar4 temp[READ_TIMES_ROW]; + + __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; +#ifdef BORDER_CONSTANT + int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); + + // read pixels from src + for (i = 0; i < READ_TIMES_ROW; i++) + { + int current_addr = start_addr+i*LSIZE0*4; + current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; + temp[i] = *(__global uchar4*)&src[current_addr]; + } + + // judge if read out of boundary +#ifdef BORDER_ISOLATED + for (i = 0; isrc_offset_x + src_cols)| (start_y= src_offset_y + src_rows); +#else + int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows); +#endif + int4 index[READ_TIMES_ROW]; + int4 addr; + int s_y; + + if (not_all_in_range) + { + // judge if read out of boundary + for (i = 0; i < READ_TIMES_ROW; i++) + { + index[i] = (int4)(start_x+i*LSIZE0*4) + (int4)(0, 1, 2, 3); +#ifdef BORDER_ISOLATED + EXTRAPOLATE(index[i].x, src_offset_x, src_offset_x + src_cols); + EXTRAPOLATE(index[i].y, src_offset_x, src_offset_x + src_cols); + EXTRAPOLATE(index[i].z, src_offset_x, src_offset_x + src_cols); + EXTRAPOLATE(index[i].w, src_offset_x, src_offset_x + src_cols); +#else + EXTRAPOLATE(index[i].x, 0, src_whole_cols); + EXTRAPOLATE(index[i].y, 0, src_whole_cols); + EXTRAPOLATE(index[i].z, 0, src_whole_cols); + EXTRAPOLATE(index[i].w, 0, src_whole_cols); +#endif + } + s_y = start_y; +#ifdef BORDER_ISOLATED + EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); +#else + EXTRAPOLATE(s_y, 0, src_whole_rows); +#endif + + // read pixels from src + for (i = 0; i 0)) ? current_addr : 0; + temp[i] = src[current_addr]; + } + + //judge if read out of boundary +#ifdef BORDER_ISOLATED + for (i = 0; i 0)) ? current_addr : 0; + temp[i] = src[current_addr]; + } + + // judge if read out of boundary +#ifdef BORDER_ISOLATED + for (i = 0; i 0)) ? current_addr : 0; + temp[i] = src[current_addr]; + } + + // judge if read out of boundary +#ifdef BORDER_ISOLATED + for (i = 0; i