optimized index calculation

pull/2801/head
Ilya Lavrenov 11 years ago
parent c9db91ace5
commit 7804d57f8b
  1. 29
      modules/core/src/opencl/reduce.cl
  2. 24
      modules/core/src/stat.cpp
  3. 6
      modules/core/src/umatrix.cpp

@ -82,6 +82,12 @@
#define noconvert #define noconvert
#ifdef HAVE_MASK_CONT
#define MASK_INDEX int mask_index = id + mask_offset;
#else
#define MASK_INDEX int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols))
#endif
#if cn != 3 #if cn != 3
#define loadpix(addr) *(__global const srcT *)(addr) #define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val #define storepix(val, addr) *(__global dstT *)(addr) = val
@ -130,15 +136,22 @@
#ifdef HAVE_MASK #ifdef HAVE_MASK
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \ MASK_INDEX; \
if (mask[mask_index]) \ if (mask[mask_index]) \
{ \ { \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \ dstT temp = convertToDT(loadpix(srcptr + src_index)); \
FUNC(accumulator, temp); \ FUNC(accumulator, temp); \
} }
#elif defined OP_DOT #elif defined OP_DOT
#ifdef HAVE_SRC2_CONT
#define SRC2_INDEX int src2_index = mad24(id, srcTSIZE, src2_offset);
#else
#define SRC2_INDEX int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset))
#endif
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)); \ SRC2_INDEX; \
dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
FUNC(accumulator, temp, temp2) FUNC(accumulator, temp, temp2)
#else #else
@ -183,7 +196,7 @@
#define DEFINE_ACCUMULATOR \ #define DEFINE_ACCUMULATOR \
srcT maxval = MIN_VAL, temp srcT maxval = MIN_VAL, temp
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \ MASK_INDEX; \
if (mask[mask_index]) \ if (mask[mask_index]) \
{ \ { \
temp = loadpix(srcptr + src_index); \ temp = loadpix(srcptr + src_index); \
@ -270,7 +283,7 @@
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
temp = loadpix(srcptr + src_index); \ temp = loadpix(srcptr + src_index); \
temploc = id; \ temploc = id; \
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols) * (int)sizeof(uchar)); \ MASK_INDEX; \
__global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \ __global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \
temp_mask = mask[0]; \ temp_mask = mask[0]; \
srcT temp_minval = minval, temp_maxval = maxval; \ srcT temp_minval = minval, temp_maxval = maxval; \
@ -305,12 +318,18 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
int gid = get_group_id(0); int gid = get_group_id(0);
int id = get_global_id(0); int id = get_global_id(0);
srcptr += src_offset;
DECLARE_LOCAL_MEM; DECLARE_LOCAL_MEM;
DEFINE_ACCUMULATOR; DEFINE_ACCUMULATOR;
for (int grain = groupnum * WGS; id < total; id += grain) for (int grain = groupnum * WGS; id < total; id += grain)
{ {
int src_index = mad24(id / cols, src_step, mad24(id % cols, srcTSIZE, src_offset)); #ifdef HAVE_SRC_CONT
int src_index = mul24(id, srcTSIZE);
#else
int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
#endif
REDUCE_GLOBAL; REDUCE_GLOBAL;
} }

@ -496,13 +496,15 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
char cvt[40]; char cvt[40];
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d" format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
" -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s", " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s",
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(depth),
ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn, ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn,
ocl::convertTypeStr(depth, ddepth, cn, cvt), ocl::convertTypeStr(depth, ddepth, cn, cvt),
opMap[sum_op], (int)wgs, wgs2_aligned, opMap[sum_op], (int)wgs, wgs2_aligned,
doubleSupport ? " -D DOUBLE_SUPPORT" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
haveMask ? " -D HAVE_MASK" : "")); haveMask ? " -D HAVE_MASK" : "",
_src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
_mask.isContinuous() ? " -D HAVE_MASK_CONT" : ""));
if (k.empty()) if (k.empty())
return false; return false;
@ -658,9 +660,11 @@ static bool ocl_countNonZero( InputArray _src, int & res )
wgs2_aligned >>= 1; wgs2_aligned >>= 1;
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
format("-D srcT=%s -D OP_COUNT_NON_ZERO -D WGS=%d -D WGS2_ALIGNED=%d%s", format("-D srcT=%s -D OP_COUNT_NON_ZERO -D WGS=%d "
"-D WGS2_ALIGNED=%d%s%s",
ocl::typeToStr(type), (int)wgs, ocl::typeToStr(type), (int)wgs,
wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
_src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
if (k.empty()) if (k.empty())
return false; return false;
@ -1301,9 +1305,11 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
wgs2_aligned <<= 1; wgs2_aligned <<= 1;
wgs2_aligned >>= 1; wgs2_aligned >>= 1;
String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d -D WGS2_ALIGNED=%d%s", String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s",
depth, ocl::typeToStr(depth), _mask.empty() ? "" : "_MASK", (int)wgs, depth, ocl::typeToStr(depth), _mask.empty() ? "" : "_MASK", (int)wgs,
wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : ""); wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
_src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
_mask.isContinuous() ? " -D HAVE_MASK_CONT" : "");
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts); ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
if (k.empty()) if (k.empty())
@ -2026,9 +2032,11 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
format("-D OP_NORM_INF_MASK -D HAVE_MASK -D DEPTH_%d" format("-D OP_NORM_INF_MASK -D HAVE_MASK -D DEPTH_%d"
" -D srcT=%s -D srcT1=%s -D WGS=%d -D cn=%d -D WGS2_ALIGNED=%d%s", " -D srcT=%s -D srcT1=%s -D WGS=%d -D cn=%d -D WGS2_ALIGNED=%d%s%s%s",
depth, ocl::typeToStr(type), ocl::typeToStr(depth), depth, ocl::typeToStr(type), ocl::typeToStr(depth),
wgs, cn, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); wgs, cn, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
_mask.isContinuous() ? " -D HAVE_MASK_CONT" : ""));
if (k.empty()) if (k.empty())
return false; return false;

@ -853,9 +853,11 @@ static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
char cvt[40]; char cvt[40];
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s", format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s",
ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth, ocl::convertTypeStr(depth, ddepth, 1, cvt), ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth, ocl::convertTypeStr(depth, ddepth, 1, cvt),
(int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
_src1.isContinuous() ? " -D HAVE_SRC_CONT" : "",
_src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""));
if (k.empty()) if (k.empty())
return false; return false;

Loading…
Cancel
Save