optimized cv::norm with 2 args

pull/2837/head
Ilya Lavrenov 11 years ago
parent 1a7a262f74
commit 2040995801
  1. 81
      modules/core/src/opencl/minmaxloc.cl
  2. 219
      modules/core/src/opencl/reduce.cl
  3. 148
      modules/core/src/stat.cpp

@ -73,14 +73,26 @@
#define CALC_MAX(p, inc) #define CALC_MAX(p, inc)
#endif #endif
#ifdef OP_CALC2
#define CALC_MAX2(p) \
if (maxval2 < temp.p) \
maxval2 = temp.p
#else
#define CALC_MAX2(p)
#endif
#define CALC_P(p, inc) \ #define CALC_P(p, inc) \
CALC_MIN(p, inc) \ CALC_MIN(p, inc) \
CALC_MAX(p, inc) CALC_MAX(p, inc) \
CALC_MAX2(p)
__kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols, __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols,
int total, int groupnum, __global uchar * dstptr int total, int groupnum, __global uchar * dstptr
#ifdef HAVE_MASK #ifdef HAVE_MASK
, __global const uchar * mask, int mask_step, int mask_offset , __global const uchar * mask, int mask_step, int mask_offset
#endif
#ifdef HAVE_SRC2
, __global const uchar * src2ptr, int src2_step, int src2_offset
#endif #endif
) )
{ {
@ -92,36 +104,46 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#ifdef HAVE_MASK #ifdef HAVE_MASK
mask += mask_offset; mask += mask_offset;
#endif #endif
#ifdef HAVE_SRC2
src2ptr += src2_offset;
#endif
#ifdef NEED_MINVAL #ifdef NEED_MINVAL
__local dstT1 localmem_min[WGS2_ALIGNED]; __local dstT1 localmem_min[WGS2_ALIGNED];
dstT1 minval = MAX_VAL;
#ifdef NEED_MINLOC #ifdef NEED_MINLOC
__local uint localmem_minloc[WGS2_ALIGNED]; __local uint localmem_minloc[WGS2_ALIGNED];
uint minloc = INDEX_MAX;
#endif #endif
#endif #endif
#ifdef NEED_MAXVAL #ifdef NEED_MAXVAL
dstT1 maxval = MIN_VAL;
__local dstT1 localmem_max[WGS2_ALIGNED]; __local dstT1 localmem_max[WGS2_ALIGNED];
#ifdef NEED_MAXLOC #ifdef NEED_MAXLOC
__local uint localmem_maxloc[WGS2_ALIGNED]; __local uint localmem_maxloc[WGS2_ALIGNED];
uint maxloc = INDEX_MAX;
#endif
#endif #endif
#ifdef OP_CALC2
__local dstT1 localmem_max2[WGS2_ALIGNED];
dstT1 maxval2 = MIN_VAL;
#endif #endif
dstT1 minval = MAX_VAL, maxval = MIN_VAL;
dstT temp;
uint minloc = INDEX_MAX, maxloc = INDEX_MAX;
int src_index; int src_index;
#ifdef HAVE_MASK #ifdef HAVE_MASK
int mask_index; int mask_index;
#endif #endif
#ifdef HAVE_SRC2
int src2_index;
#endif
for (int grain = groupnum * WGS * kercn; id < total; id += grain) dstT temp;
{ #ifdef HAVE_SRC2
#ifdef HAVE_SRC_CONT dstT temp2;
src_index = mul24(id, (int)sizeof(srcT1));
#else
src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
#endif #endif
for (int grain = groupnum * WGS * kercn; id < total; id += grain)
{
#ifdef HAVE_MASK #ifdef HAVE_MASK
#ifdef HAVE_MASK_CONT #ifdef HAVE_MASK_CONT
mask_index = id; mask_index = id;
@ -131,7 +153,26 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
if (mask[mask_index]) if (mask[mask_index])
#endif #endif
{ {
#ifdef HAVE_SRC_CONT
src_index = mul24(id, (int)sizeof(srcT1));
#else
src_index = mad24(id / cols, src_step, mul24(id % cols, (int)sizeof(srcT1)));
#endif
temp = convertToDT(*(__global const srcT *)(srcptr + src_index)); temp = convertToDT(*(__global const srcT *)(srcptr + src_index));
#ifdef OP_ABS
temp = temp >= (dstT)(0) ? temp : -temp;
#endif
#ifdef HAVE_SRC2
#ifdef HAVE_SRC2_CONT
src2_index = mul24(id, (int)sizeof(srcT1));
#else
src2_index = mad24(id / cols, src2_step, mul24(id % cols, (int)sizeof(srcT1)));
#endif
temp2 = convertToDT(*(__global const srcT *)(src2ptr + src2_index));
temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
#endif
#if kercn == 1 #if kercn == 1
#ifdef NEED_MINVAL #ifdef NEED_MINVAL
if (minval > temp) if (minval > temp)
@ -150,6 +191,11 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
maxloc = id; maxloc = id;
#endif #endif
} }
#ifdef OP_CALC2
temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
if (maxval2 < temp2)
maxval2 = temp2;
#endif
#endif #endif
#elif kercn >= 2 #elif kercn >= 2
CALC_P(s0, 0) CALC_P(s0, 0)
@ -191,6 +237,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif #endif
#ifdef NEED_MAXLOC #ifdef NEED_MAXLOC
localmem_maxloc[lid] = maxloc; localmem_maxloc[lid] = maxloc;
#endif
#ifdef OP_CALC2
localmem_max2[lid] = maxval2;
#endif #endif
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -221,6 +270,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif #endif
localmem_max[lid3] = maxval; localmem_max[lid3] = maxval;
} }
#endif
#ifdef OP_CALC2
if (localmem_max2[lid3] < maxval2)
localmem_max2[lid3] = maxval2;
#endif #endif
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -254,6 +307,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif #endif
localmem_max[lid] = localmem_max[lid2]; localmem_max[lid] = localmem_max[lid2];
} }
#endif
#ifdef OP_CALC2
if (localmem_max2[lid] < localmem_max2[lid2])
localmem_max2[lid] = localmem_max2[lid2];
#endif #endif
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -276,6 +333,10 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
#endif #endif
#ifdef NEED_MAXLOC #ifdef NEED_MAXLOC
*(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0]; *(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];
#endif
#ifdef OP_CALC2
pos = mad24(groupnum, (int)sizeof(uint), pos);
*(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];
#endif #endif
} }
} }

@ -109,13 +109,22 @@
#endif #endif
#ifdef HAVE_MASK #ifdef HAVE_MASK
#ifdef HAVE_SRC2
#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset
#else
#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
#endif
#else
#ifdef HAVE_SRC2
#define EXTRA_PARAMS , __global const uchar * src2ptr, int src2_step, int src2_offset
#else #else
#define EXTRA_PARAMS #define EXTRA_PARAMS
#endif #endif
#endif
// accumulative reduction stuff // accumulative reduction stuff
#if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT
#ifdef OP_DOT #ifdef OP_DOT
#if ddepth <= 4 #if ddepth <= 4
#define FUNC(a, b, c) a = mad24(b, c, a) #define FUNC(a, b, c) a = mad24(b, c, a)
@ -137,18 +146,48 @@
#endif #endif
#endif #endif
#ifdef OP_CALC2
#define DECLARE_LOCAL_MEM \
__local dstT localmem[WGS2_ALIGNED]; \
__local dstT localmem2[WGS2_ALIGNED]
#define DEFINE_ACCUMULATOR \
dstT accumulator = (dstT)(0); \
dstT accumulator2 = (dstT)(0)
#else
#define DECLARE_LOCAL_MEM \ #define DECLARE_LOCAL_MEM \
__local dstT localmem[WGS2_ALIGNED] __local dstT localmem[WGS2_ALIGNED]
#define DEFINE_ACCUMULATOR \ #define DEFINE_ACCUMULATOR \
dstT accumulator = (dstT)(0) dstT accumulator = (dstT)(0)
#endif
#ifdef HAVE_SRC2
#ifdef OP_CALC2
#define PROCESS_ELEMS \
dstT temp = convertToDT(loadpix(srcptr + src_index)) - convertToDT(loadpix(src2ptr + src2_index)); \
dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp -= temp2; \
temp = temp > (dstT)(0) ? temp : -temp; \
FUNC(accumulator2, temp2); \
FUNC(accumulator, temp)
#else
#define PROCESS_ELEMS \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp)
#endif
#else
#define PROCESS_ELEMS \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
FUNC(accumulator, temp)
#endif
#ifdef HAVE_MASK #ifdef HAVE_MASK
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
MASK_INDEX; \ MASK_INDEX; \
if (mask[mask_index]) \ if (mask[mask_index]) \
{ \ { \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \ PROCESS_ELEMS; \
FUNC(accumulator, temp); \
} }
#elif defined OP_DOT #elif defined OP_DOT
@ -211,7 +250,158 @@
FUNC(accumulator, temp.sF, temp2.sF) FUNC(accumulator, temp.sF, temp2.sF)
#endif #endif
#else #else // sum or norm with 2 args
#ifdef HAVE_SRC2
#ifdef OP_CALC2 // norm relative
#if kercn == 1
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp); \
FUNC(accumulator2, temp2)
#elif kercn == 2
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator2, temp2.s0); \
FUNC(accumulator2, temp2.s1)
#elif kercn == 4
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3); \
FUNC(accumulator2, temp2.s0); \
FUNC(accumulator2, temp2.s1); \
FUNC(accumulator2, temp2.s2); \
FUNC(accumulator2, temp2.s3)
#elif kercn == 8
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3); \
FUNC(accumulator, temp.s4); \
FUNC(accumulator, temp.s5); \
FUNC(accumulator, temp.s6); \
FUNC(accumulator, temp.s7); \
FUNC(accumulator2, temp2.s0); \
FUNC(accumulator2, temp2.s1); \
FUNC(accumulator2, temp2.s2); \
FUNC(accumulator2, temp2.s3); \
FUNC(accumulator2, temp2.s4); \
FUNC(accumulator2, temp2.s5); \
FUNC(accumulator2, temp2.s6); \
FUNC(accumulator2, temp2.s7)
#elif kercn == 16
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3); \
FUNC(accumulator, temp.s4); \
FUNC(accumulator, temp.s5); \
FUNC(accumulator, temp.s6); \
FUNC(accumulator, temp.s7); \
FUNC(accumulator, temp.s8); \
FUNC(accumulator, temp.s9); \
FUNC(accumulator, temp.sA); \
FUNC(accumulator, temp.sB); \
FUNC(accumulator, temp.sC); \
FUNC(accumulator, temp.sD); \
FUNC(accumulator, temp.sE); \
FUNC(accumulator, temp.sF); \
FUNC(accumulator2, temp2.s0); \
FUNC(accumulator2, temp2.s1); \
FUNC(accumulator2, temp2.s2); \
FUNC(accumulator2, temp2.s3); \
FUNC(accumulator2, temp2.s4); \
FUNC(accumulator2, temp2.s5); \
FUNC(accumulator2, temp2.s6); \
FUNC(accumulator2, temp2.s7); \
FUNC(accumulator2, temp2.s8); \
FUNC(accumulator2, temp2.s9); \
FUNC(accumulator2, temp2.sA); \
FUNC(accumulator2, temp2.sB); \
FUNC(accumulator2, temp2.sC); \
FUNC(accumulator2, temp2.sD); \
FUNC(accumulator2, temp2.sE); \
FUNC(accumulator2, temp2.sF)
#endif
#else // norm with 2 args
#if kercn == 1
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp)
#elif kercn == 2
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1)
#elif kercn == 4
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3)
#elif kercn == 8
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3); \
FUNC(accumulator, temp.s4); \
FUNC(accumulator, temp.s5); \
FUNC(accumulator, temp.s6); \
FUNC(accumulator, temp.s7)
#elif kercn == 16
#define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
FUNC(accumulator, temp.s0); \
FUNC(accumulator, temp.s1); \
FUNC(accumulator, temp.s2); \
FUNC(accumulator, temp.s3); \
FUNC(accumulator, temp.s4); \
FUNC(accumulator, temp.s5); \
FUNC(accumulator, temp.s6); \
FUNC(accumulator, temp.s7); \
FUNC(accumulator, temp.s8); \
FUNC(accumulator, temp.s9); \
FUNC(accumulator, temp.sA); \
FUNC(accumulator, temp.sB); \
FUNC(accumulator, temp.sC); \
FUNC(accumulator, temp.sD); \
FUNC(accumulator, temp.sE); \
FUNC(accumulator, temp.sF)
#endif
#endif
#else // sum
#if kercn == 1 #if kercn == 1
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
@ -260,6 +450,7 @@
FUNC(accumulator, temp.sF) FUNC(accumulator, temp.sF)
#endif #endif
#endif #endif
#endif
#define SET_LOCAL_1 \ #define SET_LOCAL_1 \
localmem[lid] = accumulator localmem[lid] = accumulator
@ -325,6 +516,20 @@
accumulator += value.sF == zero ? zero : one accumulator += value.sF == zero ? zero : one
#endif #endif
#ifdef OP_CALC2
#define SET_LOCAL_1 \
localmem[lid] = accumulator; \
localmem2[lid] = accumulator2; \
#define REDUCE_LOCAL_1 \
localmem[lid - WGS2_ALIGNED] += accumulator; \
localmem2[lid - WGS2_ALIGNED] += accumulator2
#define REDUCE_LOCAL_2 \
localmem[lid] += localmem[lid2]; \
localmem2[lid] += localmem2[lid2]
#define CALC_RESULT \
storepix(localmem[0], dstptr + dstTSIZE * gid); \
storepix(localmem2[0], dstptr + mad24(groupnum, srcTSIZE, dstTSIZE * gid))
#else
#define SET_LOCAL_1 \ #define SET_LOCAL_1 \
localmem[lid] = accumulator localmem[lid] = accumulator
#define REDUCE_LOCAL_1 \ #define REDUCE_LOCAL_1 \
@ -333,6 +538,7 @@
localmem[lid] += localmem[lid2] localmem[lid] += localmem[lid2]
#define CALC_RESULT \ #define CALC_RESULT \
storepix(localmem[0], dstptr + dstTSIZE * gid) storepix(localmem[0], dstptr + dstTSIZE * gid)
#endif
// norm (NORM_INF) with cn > 1 and mask // norm (NORM_INF) with cn > 1 and mask
#elif defined OP_NORM_INF_MASK #elif defined OP_NORM_INF_MASK
@ -384,6 +590,13 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
int src_index = mul24(id, srcTSIZE); int src_index = mul24(id, srcTSIZE);
#else #else
int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE)); int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
#endif
#ifdef HAVE_SRC2
#ifdef HAVE_SRC2_CONT
int src2_index = mul24(id, srcTSIZE);
#else
int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
#endif
#endif #endif
REDUCE_GLOBAL; REDUCE_GLOBAL;
} }

@ -469,21 +469,25 @@ template <typename T> Scalar ocl_part_sum(Mat m)
enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS = 1, OCL_OP_SUM_SQR = 2 }; enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS = 1, OCL_OP_SUM_SQR = 2 };
static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray() ) static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(),
InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() )
{ {
CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR); CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, const ocl::Device & dev = ocl::Device::getDefault();
haveMask = _mask.kind() != _InputArray::NONE; bool doubleSupport = dev.doubleFPConfig() > 0,
haveMask = _mask.kind() != _InputArray::NONE,
haveSrc2 = _src2.kind() != _InputArray::NONE;
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1, kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1,
mcn = std::max(cn, kercn); mcn = std::max(cn, kercn);
CV_Assert(!haveSrc2 || _src2.type() == type);
if ( (!doubleSupport && depth == CV_64F) || cn > 4 ) if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
return false; return false;
int dbsize = ocl::Device::getDefault().maxComputeUnits(); int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1);
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); size_t wgs = dev.maxWorkGroupSize();
int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth), int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
dtype = CV_MAKE_TYPE(ddepth, cn); dtype = CV_MAKE_TYPE(ddepth, cn);
@ -497,7 +501,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" }; static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
char cvt[40]; char cvt[40];
String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d" String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
" -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d", " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s",
ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)), ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
ocl::typeToStr(ddepth), ddepth, cn, ocl::typeToStr(ddepth), ddepth, cn,
@ -506,30 +510,49 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
doubleSupport ? " -D DOUBLE_SUPPORT" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
haveMask ? " -D HAVE_MASK" : "", haveMask ? " -D HAVE_MASK" : "",
_src.isContinuous() ? " -D HAVE_SRC_CONT" : "", _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
_mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn); haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts); ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
if (k.empty()) if (k.empty())
return false; return false;
UMat src = _src.getUMat(), db(1, dbsize, dtype), mask = _mask.getUMat(); UMat src = _src.getUMat(), src2 = _src2.getUMat(),
db(1, dbsize, dtype), mask = _mask.getUMat();
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
dbarg = ocl::KernelArg::PtrWriteOnly(db), dbarg = ocl::KernelArg::PtrWriteOnly(db),
maskarg = ocl::KernelArg::ReadOnlyNoSize(mask); maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2);
if (haveMask) if (haveMask)
k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg, maskarg); {
if (haveSrc2)
k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg);
else
k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg);
}
else else
k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg); {
if (haveSrc2)
k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg);
else
k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg);
}
size_t globalsize = dbsize * wgs; size_t globalsize = ngroups * wgs;
if (k.run(1, &globalsize, &wgs, false)) if (k.run(1, &globalsize, &wgs, false))
{ {
typedef Scalar (*part_sum)(Mat m); typedef Scalar (*part_sum)(Mat m);
part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> }, part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
func = funcs[ddepth - CV_32S]; func = funcs[ddepth - CV_32S];
res = func(db.getMat(ACCESS_READ));
Mat mres = db.getMat(ACCESS_READ);
if (calc2)
const_cast<Scalar &>(res2) = func(mres.colRange(dbsize, dbsize));
res = func(mres.colRange(0, dbsize));
return true; return true;
} }
return false; return false;
@ -1396,18 +1419,21 @@ typedef void (*getMinMaxResFunc)(const Mat & db, double *minVal, double *maxVal,
int *minLoc, int *maxLoc, int gropunum, int cols); int *minLoc, int *maxLoc, int gropunum, int cols);
static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask, static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
int ddepth = -1, bool absValues = false) int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), bool calc2 = false)
{ {
CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) || CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
(_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) ); (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) );
const ocl::Device & dev = ocl::Device::getDefault(); const ocl::Device & dev = ocl::Device::getDefault();
bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(); bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
haveSrc2 = _src2.kind() != _InputArray::NONE;
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src)); kercn = haveMask ? 1 : std::min(4, ocl::predictOptimalVectorWidth(_src));
if (ddepth < 0) if (ddepth < 0)
ddepth = depth; ddepth = depth;
CV_Assert(!haveSrc2 || _src2.type() == type);
if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport) if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
return false; return false;
@ -1435,7 +1461,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
char cvt[40]; char cvt[40];
String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s" String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
" -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s" " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
" -D dstT1=%s -D dstT=%s -D convertToDT=%s%s", " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s",
depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs, depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned, ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
doubleSupport ? " -D DOUBLE_SUPPORT" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
@ -1444,7 +1470,9 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "", needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "", needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : ""); ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "",
haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts); ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
if (k.empty()) if (k.empty())
@ -1452,18 +1480,35 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S), int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S),
dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) + dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
(needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0)); (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) +
UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat(); (calc2 ? esz : 0));
UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();
if (cn > 1) if (cn > 1)
{
src = src.reshape(1); src = src.reshape(1);
src2 = src2.reshape(1);
}
if (!haveMask) if (haveSrc2)
k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), {
groupnum, ocl::KernelArg::PtrWriteOnly(db)); if (!haveMask)
k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2));
else
k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask),
ocl::KernelArg::ReadOnlyNoSize(src2));
}
else else
k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), {
groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask)); if (!haveMask)
k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
groupnum, ocl::KernelArg::PtrWriteOnly(db));
else
k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
}
size_t globalsize = groupnum * wgs; size_t globalsize = groupnum * wgs;
if (!k.run(1, &globalsize, &wgs, false)) if (!k.run(1, &globalsize, &wgs, false))
@ -2498,38 +2543,45 @@ namespace cv {
static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result ) static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
{ {
const ocl::Device & d = ocl::Device::getDefault(); Scalar sc1, sc2;
int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), rowsPerWI = d.isIntel() ? 4 : 1; int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
bool doubleSupport = d.doubleFPConfig() > 0; bool relative = (normType & NORM_RELATIVE) != 0,
bool relative = (normType & NORM_RELATIVE) != 0; normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;
normType &= ~NORM_RELATIVE; normType &= ~NORM_RELATIVE;
if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) || if ( !(normType == NORM_INF || normsum) )
(!doubleSupport && depth == CV_64F))
return false; return false;
int wdepth = std::max(CV_32S, depth); if (normsum)
char cvt[50]; {
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, if (!ocl_sum(_src1, sc1, normType == NORM_L2 || normType == NORM_L2SQR ?
format("-D BINARY_OP -D OP_ABSDIFF -D dstT=%s -D workT=dstT -D srcT1=%s -D srcT2=srcT1" OCL_OP_SUM_SQR : OCL_OP_SUM, _mask, _src2, relative, sc2))
" -D convertToDT=%s -D convertToWT1=convertToDT -D convertToWT2=convertToDT -D rowsPerWI=%d%s", return false;
ocl::typeToStr(wdepth), ocl::typeToStr(depth), }
ocl::convertTypeStr(depth, wdepth, 1, cvt), rowsPerWI, else
doubleSupport ? " -D DOUBLE_SUPPORT" : "")); {
if (k.empty()) if (!ocl_minMaxIdx(_src1, NULL, &result, NULL, NULL, _mask, std::max(CV_32S, depth),
return false; false, _src2, relative))
return false;
}
UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), diff(src1.size(), CV_MAKE_TYPE(wdepth, cn)); double s2 = 0;
k.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2), for (int i = 0; i < cn; ++i)
ocl::KernelArg::WriteOnly(diff, cn)); {
result += sc1[i];
if (relative)
s2 += sc2[i];
}
size_t globalsize[2] = { diff.cols * cn, (diff.rows + rowsPerWI - 1) / rowsPerWI }; if (normType == NORM_L2)
if (!k.run(2, globalsize, NULL, false)) {
return false; result = std::sqrt(result);
if (relative)
s2 = std::sqrt(s2);
}
result = cv::norm(diff, normType, _mask);
if (relative) if (relative)
result /= cv::norm(src2, normType, _mask) + DBL_EPSILON; result /= (s2 + DBL_EPSILON);
return true; return true;
} }

Loading…
Cancel
Save