Merge pull request #2491 from ilya-lavrenov:tapi_sep_filter

pull/2515/merge
Andrey Pavlenko 11 years ago committed by OpenCV Buildbot
commit d8c018289a
  1. 2
      modules/core/include/opencv2/core/mat.hpp
  2. 2
      modules/core/include/opencv2/core/ocl.hpp
  3. 34
      modules/core/src/matrix.cpp
  4. 8
      modules/core/src/ocl.cpp
  5. 2
      modules/imgproc/perf/opencl/perf_filters.cpp
  6. 272
      modules/imgproc/src/filter.cpp
  7. 101
      modules/imgproc/src/opencl/filterSepCol.cl
  8. 386
      modules/imgproc/src/opencl/filterSepRow.cl
  9. 187
      modules/imgproc/src/opencl/filterSep_singlePass.cl
  10. 2
      modules/imgproc/test/ocl/test_filters.cpp
  11. 22
      modules/imgproc/test/ocl/test_sepfilter2D.cpp
  12. 4
      modules/superres/src/btv_l1.cpp

@ -118,6 +118,8 @@ public:
virtual int kind() const;
virtual int dims(int i=-1) const;
virtual int cols(int i=-1) const;
virtual int rows(int i=-1) const;
virtual Size size(int i=-1) const;
virtual int sizend(int* sz, int i=-1) const;
virtual bool sameSize(const _InputArray& arr) const;

@ -592,7 +592,7 @@ protected:
CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
CV_EXPORTS const char* typeToStr(int t);
CV_EXPORTS const char* memopTypeToStr(int t);
CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1);
CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL);
CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info);
CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),

@ -1416,6 +1416,16 @@ int _InputArray::kind() const
return flags & KIND_MASK;
}
int _InputArray::rows(int i) const
{
return size(i).height;
}
int _InputArray::cols(int i) const
{
return size(i).width;
}
Size _InputArray::size(int i) const
{
int k = kind();
@ -2078,45 +2088,45 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
}
void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransposed, int fixedDepthMask) const
void _OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTransposed, int fixedDepthMask) const
{
int k = kind();
if( k == MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((Mat*)obj)->size.operator()() == Size(cols, rows));
CV_Assert(!fixedSize() || ((Mat*)obj)->size.operator()() == Size(_cols, _rows));
CV_Assert(!fixedType() || ((Mat*)obj)->type() == mtype);
((Mat*)obj)->create(rows, cols, mtype);
((Mat*)obj)->create(_rows, _cols, mtype);
return;
}
if( k == UMAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((UMat*)obj)->size.operator()() == Size(cols, rows));
CV_Assert(!fixedSize() || ((UMat*)obj)->size.operator()() == Size(_cols, _rows));
CV_Assert(!fixedType() || ((UMat*)obj)->type() == mtype);
((UMat*)obj)->create(rows, cols, mtype);
((UMat*)obj)->create(_rows, _cols, mtype);
return;
}
if( k == GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(cols, rows));
CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(_cols, _rows));
CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
((cuda::GpuMat*)obj)->create(rows, cols, mtype);
((cuda::GpuMat*)obj)->create(_rows, _cols, mtype);
return;
}
if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(cols, rows));
CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(_cols, _rows));
CV_Assert(!fixedType() || ((ogl::Buffer*)obj)->type() == mtype);
((ogl::Buffer*)obj)->create(rows, cols, mtype);
((ogl::Buffer*)obj)->create(_rows, _cols, mtype);
return;
}
if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(cols, rows));
CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(_cols, _rows));
CV_Assert(!fixedType() || ((cuda::CudaMem*)obj)->type() == mtype);
((cuda::CudaMem*)obj)->create(rows, cols, mtype);
((cuda::CudaMem*)obj)->create(_rows, _cols, mtype);
return;
}
int sizes[] = {rows, cols};
int sizes[] = {_rows, _cols};
create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
}

@ -4307,7 +4307,7 @@ static std::string kerToStr(const Mat & k)
return stream.str();
}
String kernelToStr(InputArray _kernel, int ddepth)
String kernelToStr(InputArray _kernel, int ddepth, const char * name)
{
Mat kernel = _kernel.getMat().reshape(1, 1);
@ -4318,13 +4318,13 @@ String kernelToStr(InputArray _kernel, int ddepth)
if (ddepth != depth)
kernel.convertTo(kernel, ddepth);
typedef std::string (*func_t)(const Mat &);
static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>,kerToStr<short>,
typedef std::string (* func_t)(const Mat &);
static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
const func_t func = funcs[depth];
CV_Assert(func != 0);
return cv::format(" -D COEFF=%s", func(kernel).c_str());
return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
}
#define PROCESS_SRC(src) \

@ -211,7 +211,7 @@ OCL_PERF_TEST_P(SobelFixture, Sobel,
OCL_TEST_CYCLE() cv::Sobel(src, dst, -1, dx, dy);
SANITY_CHECK(dst);
SANITY_CHECK(dst, 1e-6);
}
///////////// Scharr ////////////////////////

@ -3134,7 +3134,7 @@ template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFi
// b e h b e h 0 0
// c f i c f i 0 0
template <typename T>
static int _prepareKernelFilter2D(std::vector<T>& data, const Mat &kernel)
static int _prepareKernelFilter2D(std::vector<T> & data, const Mat & kernel)
{
Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value);
int size_y_aligned = ROUNDUP(kernel.rows * 2, 4);
@ -3317,200 +3317,224 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
return kernel.run(2, globalsize, localsize, true);
}
static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType, bool sync)
static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
int borderType, int ddepth, bool fast8uc1)
{
int type = src.type();
int cn = CV_MAT_CN(type);
int sdepth = CV_MAT_DEPTH(type);
int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
Size bufSize = buf.size();
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
return false;
#ifdef ANDROID
size_t localsize[2] = {16, 10};
#else
size_t localsize[2] = {16, 16};
#endif
size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]};
if (CV_8U == sdepth)
{
switch (cn)
{
case 1:
globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];
break;
case 2:
globalsize[0] = DIVUP((bufSize.width + 1) >> 1, localsize[0]) * localsize[0];
break;
case 4:
globalsize[0] = DIVUP(bufSize.width, localsize[0]) * localsize[0];
break;
}
}
if (fast8uc1)
globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];
int radiusX = anchor;
int radiusY = (int)((buf.rows - src.rows) >> 1);
int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1;
bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
const char* btype = NULL;
switch (borderType & ~BORDER_ISOLATED)
{
case BORDER_CONSTANT:
btype = "BORDER_CONSTANT";
break;
case BORDER_REPLICATE:
btype = "BORDER_REPLICATE";
break;
case BORDER_REFLECT:
btype = "BORDER_REFLECT";
break;
case BORDER_WRAP:
btype = "BORDER_WRAP";
break;
case BORDER_REFLECT101:
btype = "BORDER_REFLECT_101";
break;
default:
return false;
}
bool isolated = (borderType & BORDER_ISOLATED) != 0;
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" },
* const btype = borderMap[borderType & ~BORDER_ISOLATED];
bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1;
extra_extrapolation |= src.rows < radiusY;
extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1;
extra_extrapolation |= src.cols < radiusX;
cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s",
radiusX, (int)localsize[0], (int)localsize[1], cn,
btype,
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
char cvt[40];
cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
" -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s",
radiusX, (int)localsize[0], (int)localsize[1], cn, btype,
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
ocl::convertTypeStr(sdepth, CV_32F, cn, cvt),
ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
build_options += ocl::kernelToStr(kernelX, CV_32F);
Size srcWholeSize; Point srcOffset;
src.locateROI(srcWholeSize, srcOffset);
std::stringstream strKernel;
strKernel << "row_filter";
if (-1 != cn)
strKernel << "_C" << cn;
if (-1 != sdepth)
strKernel << "_D" << sdepth;
String kernelName("row_filter");
if (fast8uc1)
kernelName += "_C1_D0";
ocl::Kernel kernelRow;
if (!kernelRow.create(strKernel.str().c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
build_options))
ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
build_options);
if (k.empty())
return false;
int idxArg = 0;
idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
idxArg = kernelRow.set(idxArg, (int)(src.step / src.elemSize()));
idxArg = kernelRow.set(idxArg, srcOffset.x);
idxArg = kernelRow.set(idxArg, srcOffset.y);
idxArg = kernelRow.set(idxArg, src.cols);
idxArg = kernelRow.set(idxArg, src.rows);
idxArg = kernelRow.set(idxArg, srcWholeSize.width);
idxArg = kernelRow.set(idxArg, srcWholeSize.height);
idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrWriteOnly(buf));
idxArg = kernelRow.set(idxArg, (int)(buf.step / buf.elemSize()));
idxArg = kernelRow.set(idxArg, buf.cols);
idxArg = kernelRow.set(idxArg, buf.rows);
idxArg = kernelRow.set(idxArg, radiusY);
return kernelRow.run(2, globalsize, localsize, sync);
if (fast8uc1)
k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x,
srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()),
buf.cols, buf.rows, radiusY);
else
k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x,
srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY);
return k.run(2, globalsize, localsize, false);
}
static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor, bool sync)
static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, int anchor)
{
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if (dst.depth() == CV_64F && !doubleSupport)
return false;
#ifdef ANDROID
size_t localsize[2] = {16, 10};
size_t localsize[2] = { 16, 10 };
#else
size_t localsize[2] = {16, 16};
size_t localsize[2] = { 16, 16 };
#endif
size_t globalsize[2] = {0, 0};
size_t globalsize[2] = { 0, 0 };
int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype);
Size sz = dst.size();
globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
if (dtype == CV_8UC2)
globalsize[0] = DIVUP((sz.width + 1) / 2, localsize[0]) * localsize[0];
else
globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
char cvt[40];
cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
anchor, (int)localsize[0], (int)localsize[1], cn, ocl::typeToStr(buf.type()),
ocl::typeToStr(dtype), ocl::convertTypeStr(CV_32F, ddepth, cn, cvt));
cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
" -D srcT=%s -D dstT=%s -D convertToDstT=%s"
" -D srcT1=%s -D dstT1=%s%s",
anchor, (int)localsize[0], (int)localsize[1], cn,
ocl::typeToStr(buf.type()), ocl::typeToStr(dtype),
ocl::convertTypeStr(CV_32F, ddepth, cn, cvt),
ocl::typeToStr(CV_32F), ocl::typeToStr(ddepth),
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
build_options += ocl::kernelToStr(kernelY, CV_32F);
ocl::Kernel kernelCol;
if (!kernelCol.create("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options))
ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc,
build_options);
if (k.empty())
return false;
int idxArg = 0;
idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrReadOnly(buf));
idxArg = kernelCol.set(idxArg, (int)(buf.step / buf.elemSize()));
idxArg = kernelCol.set(idxArg, buf.cols);
idxArg = kernelCol.set(idxArg, buf.rows);
idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
idxArg = kernelCol.set(idxArg, (int)(dst.offset / dst.elemSize()));
idxArg = kernelCol.set(idxArg, (int)(dst.step / dst.elemSize()));
idxArg = kernelCol.set(idxArg, dst.cols);
idxArg = kernelCol.set(idxArg, dst.rows);
return kernelCol.run(2, globalsize, localsize, sync);
k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst));
return k.run(2, globalsize, localsize, false);
}
const int optimizedSepFilterLocalSize = 16;
static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
Mat row_kernel, Mat col_kernel,
int borderType, int ddepth)
{
Size size = _src.size(), wholeSize;
Point origin;
int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), CV_32F),
dtype = CV_MAKE_TYPE(ddepth, cn);
size_t src_step = _src.step(), src_offset = _src.offset();
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if ((src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
!(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE ||
borderType == BORDER_REFLECT || borderType == BORDER_WRAP ||
borderType == BORDER_REFLECT_101))
return false;
size_t lt2[2] = { optimizedSepFilterLocalSize, optimizedSepFilterLocalSize };
size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] * (1 + (size.height - 1) / lt2[1]) };
char cvt[2][40];
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
"BORDER_REFLECT_101" };
String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
" -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
" -D %s -D srcT1=%s -D dstT1=%s -D CN=%d", (int)lt2[0], (int)lt2[1],
row_kernel.cols / 2, col_kernel.cols / 2,
ocl::kernelToStr(row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(),
ocl::kernelToStr(col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(),
ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts);
if (k.empty())
return false;
UMat src = _src.getUMat();
_dst.create(size, dtype);
UMat dst = _dst.getUMat();
int src_offset_x = static_cast<int>((src_offset % src_step) / esz);
int src_offset_y = static_cast<int>(src_offset / src_step);
src.locateROI(wholeSize, origin);
k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y,
wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst));
return k.run(2, gt2, lt2, false);
}
static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
InputArray _kernelX, InputArray _kernelY, Point anchor,
double delta, int borderType )
{
const ocl::Device & d = ocl::Device::getDefault();
Size imgSize = _src.size();
if (abs(delta)> FLT_MIN)
return false;
int type = _src.type();
if ( !( (type == CV_8UC1 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC4) &&
(ddepth == CV_32F || ddepth == CV_16S || ddepth == CV_8U || ddepth < 0) ) )
int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
if (cn > 4)
return false;
int cn = CV_MAT_CN(type);
Mat kernelX = _kernelX.getMat().reshape(1, 1);
if (1 != (kernelX.cols % 2))
if (kernelX.cols % 2 != 1)
return false;
Mat kernelY = _kernelY.getMat().reshape(1, 1);
if (1 != (kernelY.cols % 2))
if (kernelY.cols % 2 != 1)
return false;
int sdepth = CV_MAT_DEPTH(type);
if( anchor.x < 0 )
if (ddepth < 0)
ddepth = sdepth;
CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
imgSize.width > optimizedSepFilterLocalSize + (kernelX.cols >> 1) &&
imgSize.height > optimizedSepFilterLocalSize + (kernelY.cols >> 1) &&
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(-1, -1) &&
(d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, borderType, ddepth), true)
if (anchor.x < 0)
anchor.x = kernelX.cols >> 1;
if( anchor.y < 0 )
if (anchor.y < 0)
anchor.y = kernelY.cols >> 1;
if( ddepth < 0 )
ddepth = sdepth;
UMat src = _src.getUMat();
Size srcWholeSize; Point srcOffset;
src.locateROI(srcWholeSize, srcOffset);
if ( (0 != (srcOffset.x % 4)) ||
(0 != (src.cols % 4)) ||
(0 != ((src.step / src.elemSize()) % 4))
)
return false;
bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
src.cols % 4 == 0 && src.step % 4 == 0;
Size srcSize = src.size();
Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn));
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, false))
UMat buf(bufSize, CV_32FC(cn));
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1))
return false;
_dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
UMat dst = _dst.getUMat();
return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y, false);
return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y);
}
#endif

@ -34,47 +34,36 @@
//
//
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
#define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)
#define RADIUS 1
#if CN ==1
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==2
#define ALIGN (((RADIUS)+1)>>1<<1)
#elif CN==3
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==4
#define ALIGN (RADIUS)
#define READ_TIMES_ROW ((2*(RADIUS+LSIZE0)-1)/LSIZE0)
#endif
#define noconvert
/**********************************************************************************
These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur.
Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle
kernel must be in the center. ROI is not supported either.
Each kernels read 4 elements(not 4 pixels), save them to LDS and read the data needed
from LDS to calculate the result.
The length of the convovle kernel supported is only related to the MAX size of LDS,
which is HW related.
Niko
6/29/2011
The info above maybe obsolete.
***********************************************************************************/
#if CN != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
#define SRCSIZE (int)sizeof(srcT)
#define DSTSIZE (int)sizeof(dstT)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define SRCSIZE (int)sizeof(srcT1)*3
#define DSTSIZE (int)sizeof(dstT1)*3
#endif
#define DIG(a) a,
__constant float mat_kernel[] = { COEFF };
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
(__global const GENTYPE_SRC * restrict src,
const int src_step_in_pixel,
const int src_whole_cols,
const int src_whole_rows,
__global GENTYPE_DST * dst,
const int dst_offset_in_pixel,
const int dst_step_in_pixel,
const int dst_cols,
const int dst_rows)
__kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols,
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -82,38 +71,38 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_addr = mad24(y, src_step_in_pixel, x);
int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
int start_addr = mad24(y, src_step, x * SRCSIZE);
int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE);
int i;
GENTYPE_SRC sum, temp[READ_TIMES_COL];
__local GENTYPE_SRC LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];
srcT sum, temp[READ_TIMES_COL];
__local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
// read pixels from src
for (int i = 0; i < READ_TIMES_COL; ++i)
{
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
int current_addr = mad24(i, LSIZE1 * src_step, start_addr);
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
}
//save pixels to lds
for(i = 0;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
temp[i] = loadpix(src + current_addr);
}
// save pixels to lds
for (int i = 0; i < READ_TIMES_COL; ++i)
LDS_DAT[mad24(i, LSIZE1, l_y)][l_x] = temp[i];
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
for(i=1;i<=RADIUSY;i++)
// read pixels from lds and calculate the result
sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY];
for (int i = 1; i <= RADIUSY; ++i)
{
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
// write the result to dst
if (x < dst_cols && y < dst_rows)
{
start_addr = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel);
dst[start_addr] = convert_to_DST(sum);
start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset));
storepix(convertToDstT(sum), dst + start_addr);
}
}

@ -34,41 +34,37 @@
//
//
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
#define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0) //for c4 only
#define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)
//#pragma OPENCL EXTENSION cl_amd_printf : enable
#define RADIUS 1
#if CN ==1
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==2
#define ALIGN (((RADIUS)+1)>>1<<1)
#elif CN==3
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==4
#define ALIGN (RADIUS)
#endif
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
// BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
// BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
// BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#endif
//blur function does not support BORDER_WRAP
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
// BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#endif
@ -127,65 +123,56 @@
#endif //BORDER_CONSTANT
#endif //EXTRA_EXTRAPOLATION
/**********************************************************************************
These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur.
Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle
kernel must be in the center. ROI is not supported either.
For channels =1,2,4, each kernels read 4 elements(not 4 pixels), and for channels =3,
the kernel read 4 pixels, save them to LDS and read the data needed from LDS to
calculate the result.
The length of the convovle kernel supported is related to the LSIZE0 and the MAX size
of LDS, which is HW related.
For channels = 1,3 the RADIUS is no more than LSIZE0*2
For channels = 2, the RADIUS is no more than LSIZE0
For channels = 4, arbitary RADIUS is supported unless the LDS is not enough
Niko
6/29/2011
The info above maybe obsolete.
***********************************************************************************/
#define noconvert
#if CN != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
#define SRCSIZE (int)sizeof(srcT)
#define DSTSIZE (int)sizeof(dstT)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define SRCSIZE (int)sizeof(srcT1)*3
#define DSTSIZE (int)sizeof(dstT1)*3
#endif
#define DIG(a) a,
__constant float mat_kernel[] = { COEFF };
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
(__global uchar * restrict src,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int src_cols, int src_rows,
int src_whole_cols, int src_whole_rows,
__global float * dst,
int dst_step_in_pixel,
int dst_cols, int dst_rows,
int radiusy)
__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,
__global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows,
int radiusy)
{
int x = get_global_id(0)<<2;
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x - RADIUSX & 0xfffffffc;
int start_x = x + src_offset_x - RADIUSX & 0xfffffffc;
int offset = src_offset_x - RADIUSX & 3;
int start_y = y + src_offset_y - radiusy;
int start_addr = mad24(start_y, src_step_in_pixel, start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
// read pixels from src
for (i = 0; i < READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
int current_addr = start_addr+i*LSIZE0*4;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = *(__global uchar4*)&src[current_addr];
int current_addr = mad24(i, LSIZE0 << 2, start_addr);
current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0;
temp[i] = *(__global const uchar4 *)&src[current_addr];
}
// judge if read out of boundary
#ifdef BORDER_ISOLATED
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
temp[i].x = ELEM(start_x+i*LSIZE0*4, src_offset_x, src_offset_x + src_cols, 0, temp[i].x);
temp[i].y = ELEM(start_x+i*LSIZE0*4+1, src_offset_x, src_offset_x + src_cols, 0, temp[i].y);
@ -194,7 +181,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
temp[i] = ELEM(start_y, src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]);
}
#else
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
temp[i].x = ELEM(start_x+i*LSIZE0*4, 0, src_whole_cols, 0, temp[i].x);
temp[i].y = ELEM(start_x+i*LSIZE0*4+1, 0, src_whole_cols, 0, temp[i].y);
@ -209,16 +196,15 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
#else
int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
#endif
int4 index[READ_TIMES_ROW];
int4 addr;
int4 index[READ_TIMES_ROW], addr;
int s_y;
if (not_all_in_range)
{
// judge if read out of boundary
for (i = 0; i < READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
index[i] = (int4)(start_x+i*LSIZE0*4) + (int4)(0, 1, 2, 3);
index[i] = (int4)(mad24(i, LSIZE0 << 2, start_x)) + (int4)(0, 1, 2, 3);
#ifdef BORDER_ISOLATED
EXTRAPOLATE(index[i].x, src_offset_x, src_offset_x + src_cols);
EXTRAPOLATE(index[i].y, src_offset_x, src_offset_x + src_cols);
@ -231,6 +217,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
EXTRAPOLATE(index[i].w, 0, src_whole_cols);
#endif
}
s_y = start_y;
#ifdef BORDER_ISOLATED
EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
@ -239,9 +226,9 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
#endif
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]);
addr = mad24((int4)s_y, (int4)src_step_in_pixel, index[i]);
temp[i].x = src[addr.x];
temp[i].y = src[addr.y];
temp[i].z = src[addr.z];
@ -251,26 +238,26 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
else
{
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
for (int i = 0; i < READ_TIMES_ROW; ++i)
temp[i] = *(__global uchar4*)&src[mad24(i, LSIZE0 << 2, start_addr)];
}
#endif //BORDER_CONSTANT
// save pixels to lds
for (i = 0; i<READ_TIMES_ROW; i++)
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
for (int i = 0; i < READ_TIMES_ROW; ++i)
LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i];
barrier(CLK_LOCAL_MEM_FENCE);
// read pixels from lds and calculate the result
sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX];
for (i=1; i<=RADIUSX; i++)
sum = convert_float4(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX];
for (int i = 1; i <= RADIUSX; ++i)
{
temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
sum += convert_float4(temp[0]) * mat_kernel[RADIUSX-i] + convert_float4(temp[1]) * mat_kernel[RADIUSX+i];
sum += mad(convert_float4(temp[0]), mat_kernel[RADIUSX-i], convert_float4(temp[1]) * mat_kernel[RADIUSX + i]);
}
start_addr = mad24(y,dst_step_in_pixel,x);
start_addr = mad24(y, dst_step_in_pixel, x);
// write the result to dst
if ((x+3<dst_cols) & (y<dst_rows))
@ -290,247 +277,58 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
dst[start_addr] = sum.x;
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0
(__global uchar4 * restrict src,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int src_cols, int src_rows,
int src_whole_cols, int src_whole_rows,
__global float4 * dst,
int dst_step_in_pixel,
int dst_cols, int dst_rows,
int radiusy)
__kernel void row_filter(__global const uchar * src, int src_step, int src_offset_x, int src_offset_y,
int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,
__global uchar * dst, int dst_step, int dst_cols, int dst_rows,
int radiusy)
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
}
//judge if read out of boundary
#ifdef BORDER_ISOLATED
for (i = 0; i<READ_TIMES_ROW; i++)
{
temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (uchar4)0, temp[i]);
temp[i]= ELEM(start_y, src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]);
}
#else
for (i = 0; i<READ_TIMES_ROW; i++)
{
temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (uchar4)0, temp[i]);
temp[i]= ELEM(start_y, 0, src_whole_rows, (uchar4)0, temp[i]);
}
#endif
#else
int index[READ_TIMES_ROW];
int s_x,s_y;
// judge if read out of boundary
for (i = 0; i<READ_TIMES_ROW; i++)
{
s_x = start_x+i*LSIZE0;
s_y = start_y;
#ifdef BORDER_ISOLATED
EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
#else
EXTRAPOLATE(s_x, 0, src_whole_cols);
EXTRAPOLATE(s_y, 0, src_whole_rows);
#endif
index[i]=mad24(s_y, src_step_in_pixel, s_x);
}
//read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
temp[i] = src[index[i]];
#endif //BORDER_CONSTANT
//save pixels to lds
for (i = 0; i<READ_TIMES_ROW; i++)
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
barrier(CLK_LOCAL_MEM_FENCE);
int start_x = x + src_offset_x - RADIUSX;
int start_y = y + src_offset_y - radiusy;
int start_addr = mad24(start_y, src_step, start_x * SRCSIZE);
//read pixels from lds and calculate the result
sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX];
for (i=1; i<=RADIUSX; i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
//write the result to dst
if (x<dst_cols && y<dst_rows)
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
}
}
dstT sum;
srcT temp[READ_TIMES_ROW];
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5
(__global float * restrict src,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int src_cols, int src_rows,
int src_whole_cols, int src_whole_rows,
__global float * dst,
int dst_step_in_pixel,
int dst_cols, int dst_rows,
int radiusy)
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float sum;
float temp[READ_TIMES_ROW];
__local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
__local srcT LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE);
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
int current_addr = mad24(i, LSIZE0 * SRCSIZE, start_addr);
current_addr = current_addr < end_addr && current_addr >= 0 ? current_addr : 0;
temp[i] = loadpix(src + current_addr);
}
// judge if read out of boundary
#ifdef BORDER_ISOLATED
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float)0,temp[i]);
temp[i]= ELEM(start_y, src_offset_y, src_offset_y + src_rows, (float)0,temp[i]);
temp[i] = ELEM(mad24(i, LSIZE0, start_x), src_offset_x, src_offset_x + src_cols, (srcT)(0), temp[i]);
temp[i] = ELEM(start_y, src_offset_y, src_offset_y + src_rows, (srcT)(0), temp[i]);
}
#else
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float)0,temp[i]);
temp[i]= ELEM(start_y, 0, src_whole_rows, (float)0,temp[i]);
temp[i] = ELEM(mad24(i, LSIZE0, start_x), 0, src_whole_cols, (srcT)(0), temp[i]);
temp[i] = ELEM(start_y, 0, src_whole_rows, (srcT)(0), temp[i]);
}
#endif
#else // BORDER_CONSTANT
int index[READ_TIMES_ROW];
int s_x,s_y;
// judge if read out of boundary
for (i = 0; i<READ_TIMES_ROW; i++)
{
s_x = start_x + i*LSIZE0, s_y = start_y;
#ifdef BORDER_ISOLATED
EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
#else
EXTRAPOLATE(s_x, 0, src_whole_cols);
EXTRAPOLATE(s_y, 0, src_whole_rows);
#endif
index[i]=mad24(s_y, src_step_in_pixel, s_x);
}
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
temp[i] = src[index[i]];
#endif// BORDER_CONSTANT
//save pixels to lds
for (i = 0; i<READ_TIMES_ROW; i++)
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
barrier(CLK_LOCAL_MEM_FENCE);
// read pixels from lds and calculate the result
sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
for (i=1; i<=RADIUSX; i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
}
// write the result to dst
if (x<dst_cols && y<dst_rows)
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
}
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5
(__global float4 * restrict src,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int src_cols, int src_rows,
int src_whole_cols, int src_whole_rows,
__global float4 * dst,
int dst_step_in_pixel,
int dst_cols, int dst_rows,
int radiusy)
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
float4 temp[READ_TIMES_ROW];
__local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
}
int index[READ_TIMES_ROW], s_x, s_y;
// judge if read out of boundary
#ifdef BORDER_ISOLATED
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float4)0,temp[i]);
temp[i]= ELEM(start_y, src_offset_y, src_offset_y + src_rows, (float4)0,temp[i]);
}
#else
for (i = 0; i<READ_TIMES_ROW; i++)
{
temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float4)0,temp[i]);
temp[i]= ELEM(start_y, 0, src_whole_rows, (float4)0,temp[i]);
}
#endif
#else
int index[READ_TIMES_ROW];
int s_x,s_y;
s_x = mad24(i, LSIZE0, start_x);
s_y = start_y;
// judge if read out of boundary
for (i = 0; i<READ_TIMES_ROW; i++)
{
s_x = start_x + i*LSIZE0, s_y = start_y;
#ifdef BORDER_ISOLATED
EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
@ -538,32 +336,32 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
EXTRAPOLATE(s_x, 0, src_whole_cols);
EXTRAPOLATE(s_y, 0, src_whole_rows);
#endif
index[i]=mad24(s_y,src_step_in_pixel,s_x);
index[i] = mad24(s_y, src_step, s_x * SRCSIZE);
}
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
temp[i] = src[index[i]];
#endif
for (int i = 0; i < READ_TIMES_ROW; ++i)
temp[i] = loadpix(src + index[i]);
#endif // BORDER_CONSTANT
// save pixels to lds
for (i = 0; i<READ_TIMES_ROW; i++)
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
for (int i = 0; i < READ_TIMES_ROW; ++i)
LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i];
barrier(CLK_LOCAL_MEM_FENCE);
// read pixels from lds and calculate the result
sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
for (i=1; i<=RADIUSX; i++)
sum = convertToDstT(LDS_DAT[l_y][l_x + RADIUSX]) * mat_kernel[RADIUSX];
for (int i = 1; i <= RADIUSX; ++i)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];
temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
}
// write the result to dst
if (x<dst_cols && y<dst_rows)
if (x < dst_cols && y < dst_rows)
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
start_addr = mad24(y, dst_step, x * DSTSIZE);
storepix(sum, dst + start_addr);
}
}

@ -0,0 +1,187 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2014, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef BORDER_CONSTANT
// CCCCCC|abcdefgh|CCCCCCC
#define EXTRAPOLATE(x, maxV)
#elif defined BORDER_REPLICATE
// aaaaaa|abcdefgh|hhhhhhh
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = max(min((x), (maxV) - 1), 0); \
}
#elif defined BORDER_WRAP
// cdefgh|abcdefgh|abcdefg
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = ( (x) + (maxV) ) % (maxV); \
}
#elif defined BORDER_REFLECT
// fedcba|abcdefgh|hgfedcb
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
}
#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
// gfedcb|abcdefgh|gfedcba
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
}
#else
#error No extrapolation method
#endif
#if CN != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
#define SRCSIZE (int)sizeof(srcT)
#define DSTSIZE (int)sizeof(dstT)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define SRCSIZE (int)sizeof(srcT1)*3
#define DSTSIZE (int)sizeof(dstT1)*3
#endif
#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))
#ifdef BORDER_CONSTANT
// CCCCCC|abcdefgh|CCCCCCC
#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
#else
#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
#endif
#define noconvert
// horizontal and vertical filter kernels
// should be defined on host during compile time to avoid overhead
#define DIG(a) a,
__constant float mat_kernelX[] = { KERNEL_MATRIX_X };
__constant float mat_kernelY[] = { KERNEL_MATRIX_Y };
__kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,
__global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
{
// RADIUSX, RADIUSY are filter dimensions
// BLK_X, BLK_Y are local wrogroup sizes
// all these should be defined on host during compile time
// first lsmem array for source pixels used in first pass,
// second lsmemDy for storing first pass results
__local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX];
__local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX];
// get local and global ids - used as image and local memory array indexes
int lix = get_local_id(0);
int liy = get_local_id(1);
int x = get_global_id(0);
int y = get_global_id(1);
// calculate pixel position in source image taking image offset into account
int srcX = x + srcOffsetX - RADIUSX;
int srcY = y + srcOffsetY - RADIUSY;
int xb = srcX;
int yb = srcY;
// extrapolate coordinates, if needed
// and read my own source pixel into local memory
// with account for extra border pixels, which will be read by starting workitems
int clocY = liy;
int cSrcY = srcY;
do
{
int yb = cSrcY;
EXTRAPOLATE(yb, (height));
int clocX = lix;
int cSrcX = srcX;
do
{
int xb = cSrcX;
EXTRAPOLATE(xb,(width));
lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );
clocX += BLK_X;
cSrcX += BLK_X;
}
while(clocX < BLK_X+(RADIUSX*2));
clocY += BLK_Y;
cSrcY += BLK_Y;
}
while (clocY < BLK_Y+(RADIUSY*2));
barrier(CLK_LOCAL_MEM_FENCE);
// do vertical filter pass
// and store intermediate results to second local memory array
int i, clocX = lix;
WT sum = 0.0f;
do
{
sum = 0.0f;
for (i=0; i<=2*RADIUSY; i++)
sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
lsmemDy[liy][clocX] = sum;
clocX += BLK_X;
}
while(clocX < BLK_X+(RADIUSX*2));
barrier(CLK_LOCAL_MEM_FENCE);
// if this pixel happened to be out of image borders because of global size rounding,
// then just return
if( x >= dst_cols || y >=dst_rows )
return;
// do second horizontal filter pass
// and calculate final result
sum = 0.0f;
for (i=0; i<=2*RADIUSX; i++)
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
//store result into destination image
storepix(convertToDstT(sum), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));
}

@ -306,7 +306,7 @@ OCL_TEST_P(MorphologyEx, Mat)
(int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \
(int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version
#define FILTER_TYPES Values(CV_8UC1, CV_8UC2, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4)
#define FILTER_TYPES Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4)
OCL_INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine(
Values((MatType)CV_8UC1),

@ -75,33 +75,24 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
void random_roi()
{
Size ksize = randomSize(kernelMinSize, kernelMaxSize);
if (1 != (ksize.width % 2))
if (1 != ksize.width % 2)
ksize.width++;
if (1 != (ksize.height % 2))
if (1 != ksize.height % 2)
ksize.height++;
Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1);
temp = randomMat(Size(1, ksize.height), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1);
Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
int rest = roiSize.width % 4;
if (0 != rest)
roiSize.width += (4 - rest);
Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
rest = srcBorder.lef % 4;
if (0 != rest)
srcBorder.lef += (4 - rest);
rest = srcBorder.rig % 4;
if (0 != rest)
srcBorder.rig += (4 - rest);
randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
anchor.x = -1;
anchor.y = -1;
anchor.x = anchor.y = -1;
UMAT_UPLOAD_INPUT_PARAMETER(src);
UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
@ -115,7 +106,7 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
OCL_TEST_P(SepFilter2D, Mat)
{
for (int j = 0; j < test_loop_times; j++)
for (int j = 0; j < test_loop_times + 3; j++)
{
random_roi();
@ -126,11 +117,10 @@ OCL_TEST_P(SepFilter2D, Mat)
}
}
OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SepFilter2D,
Combine(
Values(CV_8U, CV_32F),
Values(1, 4),
OCL_ALL_CHANNELS,
Values(
(BorderType)BORDER_CONSTANT,
(BorderType)BORDER_REPLICATE,

@ -1014,10 +1014,8 @@ namespace
return;
#ifdef HAVE_OPENCL
if (isUmat_ && curFrame_.channels() == 1)
if (isUmat_)
curFrame_.copyTo(ucurFrame_);
else
isUmat_ = false;
#endif
++storePos_;

Loading…
Cancel
Save