From a8b4c99ed6560b24f53524f6b156ff36dd9c54ca Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Wed, 12 Nov 2014 18:47:48 +0800 Subject: [PATCH] Improve the performance of fast_nlmeans_denoising_opencl. 1. Remove unnecessary barriers. 2. Adjust CTA_SIZE based on the following cases for Intel platform: a) OCL_Photo_DenoisingGrayscale.DenoisingGrayscale b) OCL_Photo_DenoisingColored.DenoisingColored --- .../src/fast_nlmeans_denoising_opencl.hpp | 12 +++++----- modules/photo/src/opencl/nlmeans.cl | 22 +++++-------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp index ae173905d8..1cdd8fa494 100644 --- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -19,7 +19,8 @@ enum { BLOCK_ROWS = 32, BLOCK_COLS = 32, - CTA_SIZE = 256 + CTA_SIZE_INTEL = 64, + CTA_SIZE_DEFAULT = 256 }; static int divUp(int a, int b) @@ -70,6 +71,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, int templateWindowSize, int searchWindowSize) { int type = _src.type(), cn = CV_MAT_CN(type); + int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT; Size size = _src.size(); if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 ) @@ -86,12 +88,12 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d" " -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d" " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d" - " -D convert_int_t=%s -D cn=%d -D CTA_SIZE2=%d -D convert_uchar_t=%s", + " -D convert_int_t=%s -D cn=%d -D convert_uchar_t=%s", templateWindowSize, searchWindowSize, ocl::typeToStr(type), - ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, CTA_SIZE, + ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, ctaSize, templateWindowHalfWize, searchWindowHalfSize, ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn, - CTA_SIZE >> 1, ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1])); + ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1])); ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts); if (k.empty()) @@ -120,7 +122,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, ocl::KernelArg::PtrReadOnly(almostDist2Weight), ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift); - size_t globalsize[2] = { nblocksx * CTA_SIZE, nblocksy }, localsize[2] = { CTA_SIZE, 1 }; + size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 }; return k.run(2, globalsize, localsize, false); } diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl index 67a6ec61f8..152f4ddcfd 100644 --- a/modules/photo/src/opencl/nlmeans.cl +++ b/modules/photo/src/opencl/nlmeans.cl @@ -206,22 +206,11 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off weighted_sum += (int_t)(weight) * src_value; } - if (id >= CTA_SIZE2) - { - int id2 = id - CTA_SIZE2; - weights_local[id2] = weights; - weighted_sum_local[id2] = weighted_sum; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (id < CTA_SIZE2) - { - weights_local[id] += weights; - weighted_sum_local[id] += weighted_sum; - } + weights_local[id] = weights; + weighted_sum_local[id] = weighted_sum; barrier(CLK_LOCAL_MEM_FENCE); - for (int lsize = CTA_SIZE2 >> 1; lsize > 2; lsize >>= 1) + for (int lsize = CTA_SIZE >> 1; lsize > 2; lsize >>= 1) { if (id < lsize) { @@ -252,8 +241,8 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int int block_y = get_group_id(1); int id = get_local_id(0), first; - __local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE2]; - __local int_t weighted_sum[CTA_SIZE2]; + __local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE]; + __local int_t weighted_sum[CTA_SIZE]; int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols); int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows); @@ -281,7 +270,6 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int first = (first + 1) % TEMPLATE_SIZE; } - barrier(CLK_LOCAL_MEM_FENCE); convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset, y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);