From 7804d57f8bbc0a7bbf58335944d50ced10ecac8b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 28 May 2014 16:43:58 +0400 Subject: [PATCH] optimized index calculation --- modules/core/src/opencl/reduce.cl | 29 ++++++++++++++++++++++++----- modules/core/src/stat.cpp | 24 ++++++++++++++++-------- modules/core/src/umatrix.cpp | 6 ++++-- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index a697cbaecd..8fc2330d56 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -82,6 +82,12 @@ #define noconvert +#ifdef HAVE_MASK_CONT +#define MASK_INDEX int mask_index = id + mask_offset +#else +#define MASK_INDEX int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)) +#endif + #if cn != 3 #define loadpix(addr) *(__global const srcT *)(addr) #define storepix(val, addr) *(__global dstT *)(addr) = val @@ -130,15 +136,22 @@ #ifdef HAVE_MASK #define REDUCE_GLOBAL \ - int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \ + MASK_INDEX; \ if (mask[mask_index]) \ { \ dstT temp = convertToDT(loadpix(srcptr + src_index)); \ FUNC(accumulator, temp); \ } #elif defined OP_DOT + +#ifdef HAVE_SRC2_CONT +#define SRC2_INDEX int src2_index = mad24(id, srcTSIZE, src2_offset) +#else +#define SRC2_INDEX int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)) +#endif + #define REDUCE_GLOBAL \ - int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)); \ + SRC2_INDEX; \ dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ FUNC(accumulator, temp, temp2) #else @@ -183,7 +196,7 @@ #define DEFINE_ACCUMULATOR \ srcT maxval = MIN_VAL, temp #define REDUCE_GLOBAL \ - int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \ + MASK_INDEX; \ if (mask[mask_index]) \ { \ temp = loadpix(srcptr + src_index); \ @@ -270,7
+283,7 @@ #define REDUCE_GLOBAL \ temp = loadpix(srcptr + src_index); \ temploc = id; \ - int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols) * (int)sizeof(uchar)); \ + MASK_INDEX; \ __global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \ temp_mask = mask[0]; \ srcT temp_minval = minval, temp_maxval = maxval; \ @@ -305,12 +318,18 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset int gid = get_group_id(0); int id = get_global_id(0); + srcptr += src_offset; + DECLARE_LOCAL_MEM; DEFINE_ACCUMULATOR; for (int grain = groupnum * WGS; id < total; id += grain) { - int src_index = mad24(id / cols, src_step, mad24(id % cols, srcTSIZE, src_offset)); +#ifdef HAVE_SRC_CONT + int src_index = mul24(id, srcTSIZE); +#else + int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE)); +#endif REDUCE_GLOBAL; } diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 0584496887..37cd8eb81e 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -496,13 +496,15 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask char cvt[40]; ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d" - " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s", + " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s", ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn, ocl::convertTypeStr(depth, ddepth, cn, cvt), opMap[sum_op], (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", - haveMask ? " -D HAVE_MASK" : "")); + haveMask ? " -D HAVE_MASK" : "", + _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", + _mask.isContinuous() ?
" -D HAVE_MASK_CONT" : "")); if (k.empty()) return false; @@ -658,9 +660,11 @@ static bool ocl_countNonZero( InputArray _src, int & res ) wgs2_aligned >>= 1; ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D OP_COUNT_NON_ZERO -D WGS=%d -D WGS2_ALIGNED=%d%s", + format("-D srcT=%s -D OP_COUNT_NON_ZERO -D WGS=%d " + "-D WGS2_ALIGNED=%d%s%s", ocl::typeToStr(type), (int)wgs, - wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", + _src.isContinuous() ? " -D HAVE_SRC_CONT" : "")); if (k.empty()) return false; @@ -1301,9 +1305,11 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* wgs2_aligned <<= 1; wgs2_aligned >>= 1; - String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d -D WGS2_ALIGNED=%d%s", + String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s", depth, ocl::typeToStr(depth), _mask.empty() ? "" : "_MASK", (int)wgs, - wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", + _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", + _mask.isContinuous() ? " -D HAVE_MASK_CONT" : ""); ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts); if (k.empty()) @@ -2026,9 +2032,11 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, format("-D OP_NORM_INF_MASK -D HAVE_MASK -D DEPTH_%d" - " -D srcT=%s -D srcT1=%s -D WGS=%d -D cn=%d -D WGS2_ALIGNED=%d%s", + " -D srcT=%s -D srcT1=%s -D WGS=%d -D cn=%d -D WGS2_ALIGNED=%d%s%s%s", depth, ocl::typeToStr(type), ocl::typeToStr(depth), - wgs, cn, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + wgs, cn, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", + src.isContinuous() ? " -D HAVE_SRC_CONT" : "", + _mask.isContinuous() ?
" -D HAVE_MASK_CONT" : "")); if (k.empty()) return false; diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 0060492541..07dd07fbe2 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -853,9 +853,11 @@ static bool ocl_dot( InputArray _src1, InputArray _src2, double & res ) char cvt[40]; ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s", + format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s", ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth, ocl::convertTypeStr(depth, ddepth, 1, cvt), - (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", + (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", + _src1.isContinuous() ? " -D HAVE_SRC_CONT" : "", + _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "")); if (k.empty()) return false;