Merge pull request #3413 from wangyan42164:denoising_opencl_improvement

pull/3420/head
Alexander Alekhin 10 years ago
commit a16a11f32b
  1. 12
      modules/photo/src/fast_nlmeans_denoising_opencl.hpp
  2. 22
      modules/photo/src/opencl/nlmeans.cl

@ -19,7 +19,8 @@ enum
{
BLOCK_ROWS = 32,
BLOCK_COLS = 32,
CTA_SIZE = 256
CTA_SIZE_INTEL = 64,
CTA_SIZE_DEFAULT = 256
};
static int divUp(int a, int b)
@ -70,6 +71,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
int templateWindowSize, int searchWindowSize)
{
int type = _src.type(), cn = CV_MAT_CN(type);
int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
Size size = _src.size();
if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 )
@ -86,12 +88,12 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
" -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
" -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
" -D convert_int_t=%s -D cn=%d -D CTA_SIZE2=%d -D convert_uchar_t=%s",
" -D convert_int_t=%s -D cn=%d -D convert_uchar_t=%s",
templateWindowSize, searchWindowSize, ocl::typeToStr(type),
ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, CTA_SIZE,
ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, ctaSize,
templateWindowHalfWize, searchWindowHalfSize,
ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn,
CTA_SIZE >> 1, ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
if (k.empty())
@ -120,7 +122,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
ocl::KernelArg::PtrReadOnly(almostDist2Weight),
ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
size_t globalsize[2] = { nblocksx * CTA_SIZE, nblocksy }, localsize[2] = { CTA_SIZE, 1 };
size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
return k.run(2, globalsize, localsize, false);
}

@ -206,22 +206,11 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
weighted_sum += (int_t)(weight) * src_value;
}
if (id >= CTA_SIZE2)
{
int id2 = id - CTA_SIZE2;
weights_local[id2] = weights;
weighted_sum_local[id2] = weighted_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (id < CTA_SIZE2)
{
weights_local[id] += weights;
weighted_sum_local[id] += weighted_sum;
}
weights_local[id] = weights;
weighted_sum_local[id] = weighted_sum;
barrier(CLK_LOCAL_MEM_FENCE);
for (int lsize = CTA_SIZE2 >> 1; lsize > 2; lsize >>= 1)
for (int lsize = CTA_SIZE >> 1; lsize > 2; lsize >>= 1)
{
if (id < lsize)
{
@ -252,8 +241,8 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int
int block_y = get_group_id(1);
int id = get_local_id(0), first;
__local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE2];
__local int_t weighted_sum[CTA_SIZE2];
__local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE];
__local int_t weighted_sum[CTA_SIZE];
int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
@ -281,7 +270,6 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int
first = (first + 1) % TEMPLATE_SIZE;
}
barrier(CLK_LOCAL_MEM_FENCE);
convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,
y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);

Loading…
Cancel
Save