simplified cv::sepFilter2D OpenCL part

pull/2491/head
Ilya Lavrenov 11 years ago
parent 82e6edfba2
commit b449b0bf71
  1. 137
      modules/imgproc/src/filter.cpp
  2. 62
      modules/imgproc/src/opencl/filterSepCol.cl
  3. 373
      modules/imgproc/src/opencl/filterSepRow.cl
  4. 11
      modules/imgproc/src/opencl/filterSep_singlePass.cl
  5. 4
      modules/imgproc/test/ocl/test_sepfilter2D.cpp

@ -41,6 +41,7 @@
//M*/
#include "precomp.hpp"
#define CV_OPENCL_RUN_ASSERT
#include "opencl_kernels.hpp"
#include <sstream>
@ -3317,11 +3318,9 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
return kernel.run(2, globalsize, localsize, true);
}
static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType, bool sync)
static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType)
{
int type = src.type();
int cn = CV_MAT_CN(type);
int sdepth = CV_MAT_DEPTH(type);
int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
Size bufSize = buf.size();
#ifdef ANDROID
@ -3329,27 +3328,14 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor,
#else
size_t localsize[2] = {16, 16};
#endif
size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]};
if (CV_8U == sdepth)
{
switch (cn)
{
case 1:
globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];
break;
case 2:
globalsize[0] = DIVUP((bufSize.width + 1) >> 1, localsize[0]) * localsize[0];
break;
case 4:
globalsize[0] = DIVUP(bufSize.width, localsize[0]) * localsize[0];
break;
}
}
if (type == CV_8UC1)
globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];
int radiusX = anchor;
int radiusY = (int)((buf.rows - src.rows) >> 1);
int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1;
bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
bool isolated = (borderType & BORDER_ISOLATED) != 0;
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" },
* const btype = borderMap[borderType & ~BORDER_ISOLATED];
@ -3358,49 +3344,38 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor,
extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1;
extra_extrapolation |= src.cols < radiusX;
cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s",
radiusX, (int)localsize[0], (int)localsize[1], cn,
btype,
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
char cvt[40];
cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
" -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s",
radiusX, (int)localsize[0], (int)localsize[1], cn, btype,
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
ocl::convertTypeStr(sdepth, CV_32F, cn, cvt),
ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F));
build_options += ocl::kernelToStr(kernelX, CV_32F);
Size srcWholeSize; Point srcOffset;
src.locateROI(srcWholeSize, srcOffset);
std::stringstream strKernel;
strKernel << "row_filter";
if (-1 != cn)
strKernel << "_C" << cn;
if (-1 != sdepth)
strKernel << "_D" << sdepth;
String kernelName("row_filter");
if (type == CV_8UC1)
kernelName += "_C1_D0";
ocl::Kernel kernelRow;
if (!kernelRow.create(strKernel.str().c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
build_options))
ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
build_options);
if (k.empty())
return false;
int idxArg = 0;
idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
idxArg = kernelRow.set(idxArg, (int)(src.step / src.elemSize()));
idxArg = kernelRow.set(idxArg, srcOffset.x);
idxArg = kernelRow.set(idxArg, srcOffset.y);
idxArg = kernelRow.set(idxArg, src.cols);
idxArg = kernelRow.set(idxArg, src.rows);
idxArg = kernelRow.set(idxArg, srcWholeSize.width);
idxArg = kernelRow.set(idxArg, srcWholeSize.height);
idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrWriteOnly(buf));
idxArg = kernelRow.set(idxArg, (int)(buf.step / buf.elemSize()));
idxArg = kernelRow.set(idxArg, buf.cols);
idxArg = kernelRow.set(idxArg, buf.rows);
idxArg = kernelRow.set(idxArg, radiusY);
return kernelRow.run(2, globalsize, localsize, sync);
k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x,
srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()),
buf.cols, buf.rows, radiusY);
return k.run(2, globalsize, localsize, false);
}
static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor, bool sync)
static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor)
{
#ifdef ANDROID
size_t localsize[2] = {16, 10};
@ -3420,28 +3395,23 @@ static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anc
globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
char cvt[40];
cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
anchor, (int)localsize[0], (int)localsize[1], cn, ocl::typeToStr(buf.type()),
ocl::typeToStr(dtype), ocl::convertTypeStr(CV_32F, ddepth, cn, cvt));
cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
" -D srcT=%s -D dstT=%s -D convertToDstT=%s",
anchor, (int)localsize[0], (int)localsize[1], cn,
ocl::typeToStr(buf.type()), ocl::typeToStr(dtype),
ocl::convertTypeStr(CV_32F, ddepth, cn, cvt));
build_options += ocl::kernelToStr(kernelY, CV_32F);
ocl::Kernel kernelCol;
if (!kernelCol.create("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options))
ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc,
build_options);
if (k.empty())
return false;
int idxArg = 0;
idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrReadOnly(buf));
idxArg = kernelCol.set(idxArg, (int)(buf.step / buf.elemSize()));
idxArg = kernelCol.set(idxArg, buf.cols);
idxArg = kernelCol.set(idxArg, buf.rows);
idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
idxArg = kernelCol.set(idxArg, (int)(dst.offset / dst.elemSize()));
idxArg = kernelCol.set(idxArg, (int)(dst.step / dst.elemSize()));
idxArg = kernelCol.set(idxArg, dst.cols);
idxArg = kernelCol.set(idxArg, dst.rows);
return kernelCol.run(2, globalsize, localsize, sync);
k.args(ocl::KernelArg::PtrReadOnly(buf), (int)(buf.step / buf.elemSize()), buf.cols,
buf.rows, ocl::KernelArg::PtrWriteOnly(dst), (int)(dst.offset / dst.elemSize()),
(int)(dst.step / dst.elemSize()), dst.cols, dst.rows);
return k.run(2, globalsize, localsize, false);
}
const int optimizedSepFilterLocalSize = 16;
@ -3473,12 +3443,14 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
" -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
" -D %s", (int)lt2[0], (int)lt2[1], _row_kernel.size().height / 2, _col_kernel.size().height / 2,
" -D %s -D srcT1=%s -D dstT1=%s -D cn=%d", (int)lt2[0], (int)lt2[1],
_row_kernel.size().height / 2, _col_kernel.size().height / 2,
ocl::kernelToStr(_row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(),
ocl::kernelToStr(_col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(),
ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType]);
ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts);
if (k.empty())
@ -3529,10 +3501,13 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
if (ddepth < 0)
ddepth = sdepth;
CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 &&
imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) &&
imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1),
ocl_sepFilter2D_SinglePass(_src, _dst, _kernelX, _kernelY, borderType, ddepth), true)
// printf("%d %d\n", imgSize.width, optimizedSepFilterLocalSize + (kernelX.rows >> 1));
// printf("%d %d\n", imgSize.height, optimizedSepFilterLocalSize + (kernelY.rows >> 1));
// CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 &&
// imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) &&
// imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1),
// ocl_sepFilter2D_SinglePass(_src, _dst, _kernelX, _kernelY, borderType, ddepth), true)
UMat src = _src.getUMat();
Size srcWholeSize; Point srcOffset;
@ -3546,12 +3521,12 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
Size srcSize = src.size();
Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn));
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, false))
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType))
return false;
_dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
UMat dst = _dst.getUMat();
return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y, false);
return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y);
}
#endif

@ -36,16 +36,6 @@
#define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)
#define RADIUS 1
#if CN ==1
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==2
#define ALIGN (((RADIUS)+1)>>1<<1)
#elif CN==3
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==4
#define ALIGN (RADIUS)
#define READ_TIMES_ROW ((2*(RADIUS+LSIZE0)-1)/LSIZE0)
#endif
#define noconvert
@ -65,16 +55,8 @@ The info above maybe obsolete.
#define DIG(a) a,
__constant float mat_kernel[] = { COEFF };
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
(__global const GENTYPE_SRC * restrict src,
const int src_step_in_pixel,
const int src_whole_cols,
const int src_whole_rows,
__global GENTYPE_DST * dst,
const int dst_offset_in_pixel,
const int dst_step_in_pixel,
const int dst_cols,
const int dst_rows)
__kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int src_whole_cols, int src_whole_rows,
__global dstT * dst, int dst_offset_in_pixel, int dst_step_in_pixel, int dst_cols, int dst_rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -85,35 +67,35 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
int start_addr = mad24(y, src_step_in_pixel, x);
int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
int i;
GENTYPE_SRC sum, temp[READ_TIMES_COL];
__local GENTYPE_SRC LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];
srcT sum, temp[READ_TIMES_COL];
__local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
// read pixels from src
for (int i = 0; i < READ_TIMES_COL; ++i)
{
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
int current_addr = mad24(i, LSIZE1 * src_step_in_pixel, start_addr);
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
}
//save pixels to lds
for(i = 0;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
}
// save pixels to lds
for (int i = 0; i < READ_TIMES_COL; ++i)
LDS_DAT[mad24(i, LSIZE1, l_y)][l_x] = temp[i];
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
for(i=1;i<=RADIUSY;i++)
// read pixels from lds and calculate the result
sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY];
for (int i = 1; i <= RADIUSY; ++i)
{
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
// write the result to dst
if (x < dst_cols && y < dst_rows)
{
start_addr = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel);
dst[start_addr] = convert_to_DST(sum);
dst[start_addr] = convertToDstT(sum);
}
}

@ -35,40 +35,28 @@
//
#define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0) //for c4 only
#define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)
//#pragma OPENCL EXTENSION cl_amd_printf : enable
#define RADIUS 1
#if CN ==1
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==2
#define ALIGN (((RADIUS)+1)>>1<<1)
#elif CN==3
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==4
#define ALIGN (RADIUS)
#endif
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
// BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
// BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
// BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#endif
//blur function does not support BORDER_WRAP
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
// BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#endif
@ -127,65 +115,56 @@
#endif //BORDER_CONSTANT
#endif //EXTRA_EXTRAPOLATION
/**********************************************************************************
These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur.
Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle
kernel must be in the center. ROI is not supported either.
For channels =1,2,4, each kernels read 4 elements(not 4 pixels), and for channels =3,
the kernel read 4 pixels, save them to LDS and read the data needed from LDS to
calculate the result.
The length of the convovle kernel supported is related to the LSIZE0 and the MAX size
of LDS, which is HW related.
For channels = 1,3 the RADIUS is no more than LSIZE0*2
For channels = 2, the RADIUS is no more than LSIZE0
For channels = 4, arbitary RADIUS is supported unless the LDS is not enough
Niko
6/29/2011
The info above maybe obsolete.
***********************************************************************************/
#define noconvert
#if cn != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
#define SRCSIZE ((int)sizeof(srcT))
#define DSTSIZE ((int)sizeof(dstT))
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define SRCSIZE ((int)sizeof(srcT1)*3)
#define DSTSIZE ((int)sizeof(dstT1)*3)
#endif
#define DIG(a) a,
__constant float mat_kernel[] = { COEFF };
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
(__global uchar * restrict src,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int src_cols, int src_rows,
int src_whole_cols, int src_whole_rows,
__global float * dst,
int dst_step_in_pixel,
int dst_cols, int dst_rows,
int radiusy)
__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,
__global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows,
int radiusy)
{
int x = get_global_id(0)<<2;
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x - RADIUSX & 0xfffffffc;
int start_x = x + src_offset_x - RADIUSX & 0xfffffffc;
int offset = src_offset_x - RADIUSX & 3;
int start_y = y + src_offset_y - radiusy;
int start_addr = mad24(start_y, src_step_in_pixel, start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
// read pixels from src
for (i = 0; i < READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
int current_addr = start_addr+i*LSIZE0*4;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = *(__global uchar4*)&src[current_addr];
int current_addr = mad24(i, LSIZE0 << 2, start_addr);
current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0;
temp[i] = *(__global const uchar4 *)&src[current_addr];
}
// judge if read out of boundary
#ifdef BORDER_ISOLATED
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
temp[i].x = ELEM(start_x+i*LSIZE0*4, src_offset_x, src_offset_x + src_cols, 0, temp[i].x);
temp[i].y = ELEM(start_x+i*LSIZE0*4+1, src_offset_x, src_offset_x + src_cols, 0, temp[i].y);
@ -194,7 +173,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
temp[i] = ELEM(start_y, src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]);
}
#else
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
temp[i].x = ELEM(start_x+i*LSIZE0*4, 0, src_whole_cols, 0, temp[i].x);
temp[i].y = ELEM(start_x+i*LSIZE0*4+1, 0, src_whole_cols, 0, temp[i].y);
@ -209,16 +188,15 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
#else
int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
#endif
int4 index[READ_TIMES_ROW];
int4 addr;
int4 index[READ_TIMES_ROW], addr;
int s_y;
if (not_all_in_range)
{
// judge if read out of boundary
for (i = 0; i < READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
index[i] = (int4)(start_x+i*LSIZE0*4) + (int4)(0, 1, 2, 3);
index[i] = (int4)(mad24(i, LSIZE0 << 2, start_x)) + (int4)(0, 1, 2, 3);
#ifdef BORDER_ISOLATED
EXTRAPOLATE(index[i].x, src_offset_x, src_offset_x + src_cols);
EXTRAPOLATE(index[i].y, src_offset_x, src_offset_x + src_cols);
@ -231,6 +209,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
EXTRAPOLATE(index[i].w, 0, src_whole_cols);
#endif
}
s_y = start_y;
#ifdef BORDER_ISOLATED
EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
@ -239,9 +218,9 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
#endif
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]);
addr = mad24((int4)s_y, (int4)src_step_in_pixel, index[i]);
temp[i].x = src[addr.x];
temp[i].y = src[addr.y];
temp[i].z = src[addr.z];
@ -251,26 +230,26 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
else
{
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
for (int i = 0; i < READ_TIMES_ROW; ++i)
temp[i] = *(__global uchar4*)&src[mad24(i, LSIZE0 << 2, start_addr)];
}
#endif //BORDER_CONSTANT
// save pixels to lds
for (i = 0; i<READ_TIMES_ROW; i++)
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
for (int i = 0; i < READ_TIMES_ROW; ++i)
LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i];
barrier(CLK_LOCAL_MEM_FENCE);
// read pixels from lds and calculate the result
sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX];
for (i=1; i<=RADIUSX; i++)
sum = convert_float4(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX];
for (int i = 1; i <= RADIUSX; ++i)
{
temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
sum += convert_float4(temp[0]) * mat_kernel[RADIUSX-i] + convert_float4(temp[1]) * mat_kernel[RADIUSX+i];
sum += mad(convert_float4(temp[0]), mat_kernel[RADIUSX-i], convert_float4(temp[1]) * mat_kernel[RADIUSX + i]);
}
start_addr = mad24(y,dst_step_in_pixel,x);
start_addr = mad24(y, dst_step_in_pixel, x);
// write the result to dst
if ((x+3<dst_cols) & (y<dst_rows))
@ -290,154 +269,58 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
dst[start_addr] = sum.x;
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0
(__global uchar4 * restrict src,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int src_cols, int src_rows,
int src_whole_cols, int src_whole_rows,
__global float4 * dst,
int dst_step_in_pixel,
int dst_cols, int dst_rows,
int radiusy)
__kernel void row_filter(__global const srcT * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,
__global dstT * dst, int dst_step_in_pixel, int dst_cols, int dst_rows,
int radiusy)
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
int start_x = x + src_offset_x - RADIUSX;
int start_y = y + src_offset_y - radiusy;
int start_addr = mad24(start_y, src_step_in_pixel, start_x);
dstT sum;
srcT temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
__local srcT LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
int current_addr = mad24(i, LSIZE0, start_addr);
current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0;
temp[i] = src[current_addr];
}
//judge if read out of boundary
// judge if read out of boundary
#ifdef BORDER_ISOLATED
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (uchar4)0, temp[i]);
temp[i]= ELEM(start_y, src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]);
temp[i] = ELEM(mad24(i, LSIZE0, start_x), src_offset_x, src_offset_x + src_cols, (srcT)(0), temp[i]);
temp[i] = ELEM(start_y, src_offset_y, src_offset_y + src_rows, (srcT)(0), temp[i]);
}
#else
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (uchar4)0, temp[i]);
temp[i]= ELEM(start_y, 0, src_whole_rows, (uchar4)0, temp[i]);
temp[i] = ELEM(mad24(i, LSIZE0, start_x), 0, src_whole_cols, (srcT)(0), temp[i]);
temp[i] = ELEM(start_y, 0, src_whole_rows, (srcT)(0), temp[i]);
}
#endif
#else
int index[READ_TIMES_ROW];
int s_x,s_y;
int s_x, s_y;
// judge if read out of boundary
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
{
s_x = start_x+i*LSIZE0;
s_x = mad24(i, LSIZE0, start_x);
s_y = start_y;
#ifdef BORDER_ISOLATED
EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
#else
EXTRAPOLATE(s_x, 0, src_whole_cols);
EXTRAPOLATE(s_y, 0, src_whole_rows);
#endif
index[i]=mad24(s_y, src_step_in_pixel, s_x);
}
//read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
temp[i] = src[index[i]];
#endif //BORDER_CONSTANT
//save pixels to lds
for (i = 0; i<READ_TIMES_ROW; i++)
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX];
for (i=1; i<=RADIUSX; i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
//write the result to dst
if (x<dst_cols && y<dst_rows)
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
}
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5
(__global float * restrict src,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int src_cols, int src_rows,
int src_whole_cols, int src_whole_rows,
__global float * dst,
int dst_step_in_pixel,
int dst_cols, int dst_rows,
int radiusy)
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float sum;
float temp[READ_TIMES_ROW];
__local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
}
// judge if read out of boundary
#ifdef BORDER_ISOLATED
for (i = 0; i<READ_TIMES_ROW; i++)
{
temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float)0,temp[i]);
temp[i]= ELEM(start_y, src_offset_y, src_offset_y + src_rows, (float)0,temp[i]);
}
#else
for (i = 0; i<READ_TIMES_ROW; i++)
{
temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float)0,temp[i]);
temp[i]= ELEM(start_y, 0, src_whole_rows, (float)0,temp[i]);
}
#endif
#else // BORDER_CONSTANT
int index[READ_TIMES_ROW];
int s_x,s_y;
// judge if read out of boundary
for (i = 0; i<READ_TIMES_ROW; i++)
{
s_x = start_x + i*LSIZE0, s_y = start_y;
#ifdef BORDER_ISOLATED
EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
@ -445,125 +328,31 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
EXTRAPOLATE(s_x, 0, src_whole_cols);
EXTRAPOLATE(s_y, 0, src_whole_rows);
#endif
index[i]=mad24(s_y, src_step_in_pixel, s_x);
}
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
temp[i] = src[index[i]];
#endif// BORDER_CONSTANT
//save pixels to lds
for (i = 0; i<READ_TIMES_ROW; i++)
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
barrier(CLK_LOCAL_MEM_FENCE);
// read pixels from lds and calculate the result
sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
for (i=1; i<=RADIUSX; i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
}
// write the result to dst
if (x<dst_cols && y<dst_rows)
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
index[i] = mad24(s_y, src_step_in_pixel, s_x);
}
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5
(__global float4 * restrict src,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int src_cols, int src_rows,
int src_whole_cols, int src_whole_rows,
__global float4 * dst,
int dst_step_in_pixel,
int dst_cols, int dst_rows,
int radiusy)
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
float4 temp[READ_TIMES_ROW];
__local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
}
// judge if read out of boundary
#ifdef BORDER_ISOLATED
for (i = 0; i<READ_TIMES_ROW; i++)
{
temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float4)0,temp[i]);
temp[i]= ELEM(start_y, src_offset_y, src_offset_y + src_rows, (float4)0,temp[i]);
}
#else
for (i = 0; i<READ_TIMES_ROW; i++)
{
temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float4)0,temp[i]);
temp[i]= ELEM(start_y, 0, src_whole_rows, (float4)0,temp[i]);
}
#endif
#else
int index[READ_TIMES_ROW];
int s_x,s_y;
// judge if read out of boundary
for (i = 0; i<READ_TIMES_ROW; i++)
{
s_x = start_x + i*LSIZE0, s_y = start_y;
#ifdef BORDER_ISOLATED
EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
#else
EXTRAPOLATE(s_x, 0, src_whole_cols);
EXTRAPOLATE(s_y, 0, src_whole_rows);
#endif
index[i]=mad24(s_y,src_step_in_pixel,s_x);
}
// read pixels from src
for (i = 0; i<READ_TIMES_ROW; i++)
for (int i = 0; i < READ_TIMES_ROW; ++i)
temp[i] = src[index[i]];
#endif
#endif // BORDER_CONSTANT
// save pixels to lds
for (i = 0; i<READ_TIMES_ROW; i++)
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
for (int i = 0; i < READ_TIMES_ROW; ++i)
LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i];
barrier(CLK_LOCAL_MEM_FENCE);
// read pixels from lds and calculate the result
sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
for (i=1; i<=RADIUSX; i++)
sum = convertToDstT(LDS_DAT[l_y][l_x + RADIUSX]) * mat_kernel[RADIUSX];
for (int i = 1; i <= RADIUSX; ++i)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];
temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
}
// write the result to dst
if (x<dst_cols && y<dst_rows)
if (x < dst_cols && y < dst_rows)
{
start_addr = mad24(y,dst_step_in_pixel,x);
start_addr = mad24(y, dst_step_in_pixel, x);
dst[start_addr] = sum;
}
}

@ -75,6 +75,7 @@
#endif
#define SRC(_x,_y) convertToWT(((global srcT*)(Src+(_y)*src_step))[_x])
#define DST(_x,_y) (((global dstT*)(Dst+dst_offset+(_y)*dst_step))[_x])
#ifdef BORDER_CONSTANT
// CCCCCC|abcdefgh|CCCCCCC
@ -83,8 +84,6 @@
#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
#endif
#define DST(_x,_y) (((global dstT*)(Dst+dst_offset+(_y)*dst_step))[_x])
#define noconvert
// horizontal and vertical filter kernels
@ -101,15 +100,15 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
// all these should be defined on host during compile time
// first lsmem array for source pixels used in first pass,
// second lsmemDy for storing first pass results
__local WT lsmem[BLK_Y+2*RADIUSY][BLK_X+2*RADIUSX];
__local WT lsmemDy[BLK_Y][BLK_X+2*RADIUSX];
__local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX];
__local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX];
// get local and global ids - used as image and local memory array indexes
int lix = get_local_id(0);
int liy = get_local_id(1);
int x = (int)get_global_id(0);
int y = (int)get_global_id(1);
int x = get_global_id(0);
int y = get_global_id(1);
// calculate pixel position in source image taking image offset into account
int srcX = x + srcOffsetX - RADIUSX;

@ -79,12 +79,14 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
ksize.width++;
if (1 != (ksize.height % 2))
ksize.height++;
Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1);
temp = randomMat(Size(1, ksize.height), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1);
Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
Size roiSize = randomSize(ksize.width + 16, MAX_VALUE, ksize.height + 20, MAX_VALUE);
std::cout << roiSize << std::endl;
int rest = roiSize.width % 4;
if (0 != rest)
roiSize.width += (4 - rest);

Loading…
Cancel
Save