Improve the performance of fast_nlmeans_denoising_opencl.

1. Remove unnecessary barriers.
2. Adjust CTA_SIZE based on the following cases for Intel platform:
   a) OCL_Photo_DenoisingGrayscale.DenoisingGrayscale
   b) OCL_Photo_DenoisingColored.DenoisingColored
pull/3413/head
Yan Wang 10 years ago
parent 77be6d64c1
commit a8b4c99ed6
  1. 12
      modules/photo/src/fast_nlmeans_denoising_opencl.hpp
  2. 22
      modules/photo/src/opencl/nlmeans.cl

@ -19,7 +19,8 @@ enum
{ {
BLOCK_ROWS = 32, BLOCK_ROWS = 32,
BLOCK_COLS = 32, BLOCK_COLS = 32,
CTA_SIZE = 256 CTA_SIZE_INTEL = 64,
CTA_SIZE_DEFAULT = 256
}; };
static int divUp(int a, int b) static int divUp(int a, int b)
@ -70,6 +71,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
int templateWindowSize, int searchWindowSize) int templateWindowSize, int searchWindowSize)
{ {
int type = _src.type(), cn = CV_MAT_CN(type); int type = _src.type(), cn = CV_MAT_CN(type);
int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
Size size = _src.size(); Size size = _src.size();
if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 ) if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 )
@ -86,12 +88,12 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d" String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
" -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d" " -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
" -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d" " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
" -D convert_int_t=%s -D cn=%d -D CTA_SIZE2=%d -D convert_uchar_t=%s", " -D convert_int_t=%s -D cn=%d -D convert_uchar_t=%s",
templateWindowSize, searchWindowSize, ocl::typeToStr(type), templateWindowSize, searchWindowSize, ocl::typeToStr(type),
ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, CTA_SIZE, ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, ctaSize,
templateWindowHalfWize, searchWindowHalfSize, templateWindowHalfWize, searchWindowHalfSize,
ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn, ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn,
CTA_SIZE >> 1, ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1])); ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts); ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
if (k.empty()) if (k.empty())
@ -120,7 +122,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
ocl::KernelArg::PtrReadOnly(almostDist2Weight), ocl::KernelArg::PtrReadOnly(almostDist2Weight),
ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift); ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
size_t globalsize[2] = { nblocksx * CTA_SIZE, nblocksy }, localsize[2] = { CTA_SIZE, 1 }; size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
return k.run(2, globalsize, localsize, false); return k.run(2, globalsize, localsize, false);
} }

@ -206,22 +206,11 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
weighted_sum += (int_t)(weight) * src_value; weighted_sum += (int_t)(weight) * src_value;
} }
if (id >= CTA_SIZE2) weights_local[id] = weights;
{ weighted_sum_local[id] = weighted_sum;
int id2 = id - CTA_SIZE2;
weights_local[id2] = weights;
weighted_sum_local[id2] = weighted_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (id < CTA_SIZE2)
{
weights_local[id] += weights;
weighted_sum_local[id] += weighted_sum;
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int lsize = CTA_SIZE2 >> 1; lsize > 2; lsize >>= 1) for (int lsize = CTA_SIZE >> 1; lsize > 2; lsize >>= 1)
{ {
if (id < lsize) if (id < lsize)
{ {
@ -252,8 +241,8 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int
int block_y = get_group_id(1); int block_y = get_group_id(1);
int id = get_local_id(0), first; int id = get_local_id(0), first;
__local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE2]; __local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE];
__local int_t weighted_sum[CTA_SIZE2]; __local int_t weighted_sum[CTA_SIZE];
int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols); int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows); int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
@ -281,7 +270,6 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int
first = (first + 1) % TEMPLATE_SIZE; first = (first + 1) % TEMPLATE_SIZE;
} }
barrier(CLK_LOCAL_MEM_FENCE);
convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset, convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,
y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift); y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);

Loading…
Cancel
Save