From 653b99c9bd1e02712105c344ca16c51c0c89a0be Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 24 Feb 2014 18:38:33 +0400 Subject: [PATCH] new scheme of sqrSum --- modules/imgproc/src/opencl/match_template.cl | 50 ++++++++++++-------- modules/imgproc/src/templmatch.cpp | 35 ++++++++------ 2 files changed, 52 insertions(+), 33 deletions(-) diff --git a/modules/imgproc/src/opencl/match_template.cl b/modules/imgproc/src/opencl/match_template.cl index ef7ba2d2d5..7c80b3c163 100644 --- a/modules/imgproc/src/opencl/match_template.cl +++ b/modules/imgproc/src/opencl/match_template.cl @@ -68,35 +68,47 @@ inline float normAcc_SQDIFF(float num, float denum) #ifdef CALC_SUM -__kernel void calcSum(__global const uchar * templateptr, int template_step, int template_offset, - int template_rows, int template_cols, __global float * result) +__kernel void calcSum(__global const uchar * srcptr, int src_step, int src_offset, + int cols, int total, __global float * dst) { - __global const T * template = (__global const T *)(templateptr + template_offset); + int lid = get_local_id(0), id = get_global_id(0); - WT res = (WT)(0); + __local WT localmem[WGS2_ALIGNED]; + WT accumulator = (WT)(0), tmp; - for (int y = 0; y < template_rows; ++y) + for ( ; id < total; id += WGS) { - for (int x = 0; x < template_cols; ++x) - { - WT value = convertToWT(template[x]); -#ifdef SUM_2 + int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(T), src_offset)); + __global const T * src = (__global const T *)(srcptr + src_index); + + tmp = convertToWT(src[0]); #if wdepth == 4 - res = mad24(value, value, res); -#else - res = mad(value, value, res); -#endif -#elif defined SUM_1 - res += value; + accumulator = mad24(tmp, tmp, accumulator); #else -#error "No operation is specified" + accumulator = mad(tmp, tmp, accumulator); #endif - } + } - template = (__global const T *)((__global const uchar *)template + template_step); + if (lid < WGS2_ALIGNED) + localmem[lid] = accumulator; + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED) + localmem[lid - WGS2_ALIGNED] += accumulator; + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1) + { + if (lid < lsize) + { + int lid2 = lsize + lid; + localmem[lid] += localmem[lid2]; + } + barrier(CLK_LOCAL_MEM_FENCE); } - result[0] = convertToDT(res); + if (lid == 0) + dst[0] = convertToDT(localmem[0]); } #elif defined CCORR diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp index 2298b12a54..ca132dd0f8 100644 --- a/modules/imgproc/src/templmatch.cpp +++ b/modules/imgproc/src/templmatch.cpp @@ -40,7 +40,6 @@ //M*/ #include "precomp.hpp" -#define CV_OPENCL_RUN_ASSERT #include "opencl_kernels.hpp" ////////////////////////////////////////////////// matchTemplate ////////////////////////////////////////////////////////// @@ -57,28 +56,36 @@ enum SUM_1 = 0, SUM_2 = 1 }; -static bool sumTemplate(InputArray _templ, UMat & result, int sum_type) +static bool sumTemplate(InputArray _src, UMat & result) { - CV_Assert(sum_type == SUM_1 || sum_type == SUM_2); - - int type = _templ.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); int wdepth = std::max(CV_32S, depth), wtype = CV_MAKE_TYPE(wdepth, cn); + size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); + + int wgs2_aligned = 1; + while (wgs2_aligned < (int)wgs) + wgs2_aligned <<= 1; + wgs2_aligned >>= 1; char cvt[40]; - const char * const sumTypeToStr[] = { "SUM_1", "SUM_2" }; ocl::Kernel k("calcSum", ocl::imgproc::match_template_oclsrc, - format("-D CALC_SUM -D %s -D T=%s -D WT=%s -D convertToWT=%s -D cn=%d -D wdepth=%d", - sumTypeToStr[sum_type], ocl::typeToStr(type), ocl::typeToStr(wtype), - ocl::convertTypeStr(depth, wdepth, cn, cvt), cn, wdepth)); + format("-D CALC_SUM -D T=%s -D WT=%s -D cn=%d -D convertToWT=%s -D WGS=%d -D WGS2_ALIGNED=%d -D wdepth=%d", + ocl::typeToStr(type), ocl::typeToStr(wtype), cn, + ocl::convertTypeStr(depth, wdepth, cn, cvt), + (int)wgs, wgs2_aligned, wdepth)); if (k.empty()) return false; + UMat src = _src.getUMat(); result.create(1, 1, CV_32FC1); - UMat templ = _templ.getUMat(); - k.args(ocl::KernelArg::ReadOnly(templ), ocl::KernelArg::PtrWriteOnly(result)); + ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), + resarg = ocl::KernelArg::PtrWriteOnly(result); + + k.args(srcarg, src.cols, (int)src.total(), resarg); - return k.runTask(false); + size_t globalsize = wgs; + return k.run(1, &globalsize, &wgs, false); } static bool matchTemplateNaive_CCORR(InputArray _image, InputArray _templ, OutputArray _result) @@ -123,7 +130,7 @@ static bool matchTemplate_CCORR_NORMED(InputArray _image, InputArray _templ, Out integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F); UMat templ_sqsum; - if (!sumTemplate(templ, templ_sqsum, SUM_2)) + if (!sumTemplate(templ, templ_sqsum)) return false; k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result), @@ -177,7 +184,7 @@ static bool matchTemplate_SQDIFF_NORMED(InputArray _image, InputArray _templ, Ou integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F); UMat templ_sqsum; - if (!sumTemplate(_templ, templ_sqsum, SUM_2)) + if (!sumTemplate(_templ, templ_sqsum)) return false; k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result),