3-channel reduction operations

pull/2470/head
Ilya Lavrenov 11 years ago
parent fd0ab8eba1
commit 9932cf41a4
  1. 16
      modules/core/perf/opencl/perf_arithm.cpp
  2. 48
      modules/core/src/opencl/reduce.cl
  3. 9
      modules/core/src/stat.cpp

@ -344,7 +344,7 @@ OCL_PERF_TEST_P(FlipFixture, Flip,
typedef Size_MatType MinMaxLocFixture; typedef Size_MatType MinMaxLocFixture;
OCL_PERF_TEST_P(MinMaxLocFixture, MinMaxLoc, OCL_PERF_TEST_P(MinMaxLocFixture, MinMaxLoc,
::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES)) ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
{ {
const Size_MatType_t params = GetParam(); const Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params); const Size srcSize = get<0>(params);
@ -380,7 +380,7 @@ typedef Size_MatType SumFixture;
OCL_PERF_TEST_P(SumFixture, Sum, OCL_PERF_TEST_P(SumFixture, Sum,
::testing::Combine(OCL_TEST_SIZES, ::testing::Combine(OCL_TEST_SIZES,
OCL_TEST_TYPES)) OCL_TEST_TYPES_134))
{ {
const Size_MatType_t params = GetParam(); const Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params); const Size srcSize = get<0>(params);
@ -652,7 +652,8 @@ OCL_PERF_TEST_P(SetIdentityFixture, SetIdentity,
typedef Size_MatType MeanStdDevFixture; typedef Size_MatType MeanStdDevFixture;
OCL_PERF_TEST_P(MeanStdDevFixture, MeanStdDev, OCL_PERF_TEST_P(MeanStdDevFixture, MeanStdDev,
::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES)) ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
OCL_TEST_TYPES_134))
{ {
const Size_MatType_t params = GetParam(); const Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params); const Size srcSize = get<0>(params);
@ -688,7 +689,8 @@ typedef std::tr1::tuple<Size, MatType, NormType> NormParams;
typedef TestBaseWithParam<NormParams> NormFixture; typedef TestBaseWithParam<NormParams> NormFixture;
OCL_PERF_TEST_P(NormFixture, Norm, OCL_PERF_TEST_P(NormFixture, Norm,
::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES, NormType::all())) ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
OCL_TEST_TYPES_134, NormType::all()))
{ {
const NormParams params = GetParam(); const NormParams params = GetParam();
const Size srcSize = get<0>(params); const Size srcSize = get<0>(params);
@ -711,7 +713,8 @@ OCL_PERF_TEST_P(NormFixture, Norm,
typedef Size_MatType UMatDotFixture; typedef Size_MatType UMatDotFixture;
OCL_PERF_TEST_P(UMatDotFixture, UMatDot, OCL_PERF_TEST_P(UMatDotFixture, UMatDot,
::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES)) ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
OCL_TEST_TYPES_134))
{ {
const Size_MatType_t params = GetParam(); const Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params); const Size srcSize = get<0>(params);
@ -820,7 +823,8 @@ typedef tuple<Size, MatType, NormalizeModes> NormalizeParams;
typedef TestBaseWithParam<NormalizeParams> NormalizeFixture; typedef TestBaseWithParam<NormalizeParams> NormalizeFixture;
OCL_PERF_TEST_P(NormalizeFixture, Normalize, OCL_PERF_TEST_P(NormalizeFixture, Normalize,
::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, NormalizeModes::all())) ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134,
NormalizeModes::all()))
{ {
const NormalizeParams params = GetParam(); const NormalizeParams params = GetParam();
const Size srcSize = get<0>(params); const Size srcSize = get<0>(params);

@ -52,6 +52,18 @@
#define noconvert #define noconvert
#if cn != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
#define srcTSIZE (int)sizeof(srcT)
#define dstTSIZE (int)sizeof(dstT)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define srcTSIZE ((int)sizeof(srcT1)*3)
#define dstTSIZE ((int)sizeof(dstT1)*3)
#endif
#ifdef HAVE_MASK #ifdef HAVE_MASK
#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
#else #else
@ -88,19 +100,20 @@
#ifdef HAVE_MASK #ifdef HAVE_MASK
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
dstT temp = convertToDT(src[0]); \
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \ int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \
if (mask[mask_index]) \ if (mask[mask_index]) \
FUNC(accumulator, temp) { \
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
FUNC(accumulator, temp); \
}
#elif defined OP_DOT #elif defined OP_DOT
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
int src2_index = mad24(id / cols, src2_step, mad24(id % cols, (int)sizeof(srcT), src2_offset)); \ int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)); \
__global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index); \ dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
dstT temp = convertToDT(src[0]), temp2 = convertToDT(src2[0]); \
FUNC(accumulator, temp, temp2) FUNC(accumulator, temp, temp2)
#else #else
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
dstT temp = convertToDT(src[0]); \ dstT temp = convertToDT(loadpix(srcptr + src_index)); \
FUNC(accumulator, temp) FUNC(accumulator, temp)
#endif #endif
@ -111,8 +124,7 @@
#define REDUCE_LOCAL_2 \ #define REDUCE_LOCAL_2 \
localmem[lid] += localmem[lid2] localmem[lid] += localmem[lid2]
#define CALC_RESULT \ #define CALC_RESULT \
__global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \ storepix(localmem[0], dstptr + dstTSIZE * gid)
dst[0] = localmem[0]
// countNonZero stuff // countNonZero stuff
#elif defined OP_COUNT_NON_ZERO #elif defined OP_COUNT_NON_ZERO
@ -123,7 +135,7 @@
dstT accumulator = (dstT)(0); \ dstT accumulator = (dstT)(0); \
srcT zero = (srcT)(0), one = (srcT)(1) srcT zero = (srcT)(0), one = (srcT)(1)
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
accumulator += src[0] == zero ? zero : one accumulator += loadpix(srcptr + src_index) == zero ? zero : one
#define SET_LOCAL_1 \ #define SET_LOCAL_1 \
localmem[lid] = accumulator localmem[lid] = accumulator
#define REDUCE_LOCAL_1 \ #define REDUCE_LOCAL_1 \
@ -131,8 +143,7 @@
#define REDUCE_LOCAL_2 \ #define REDUCE_LOCAL_2 \
localmem[lid] += localmem[lid2] localmem[lid] += localmem[lid2]
#define CALC_RESULT \ #define CALC_RESULT \
__global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \ storepix(localmem[0], dstptr + dstTSIZE * gid)
dst[0] = localmem[0]
// minMaxLoc stuff // minMaxLoc stuff
#elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK #elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK
@ -167,6 +178,8 @@
#define MAX_VAL DBL_MAX #define MAX_VAL DBL_MAX
#endif #endif
#define dstT srcT
#define DECLARE_LOCAL_MEM \ #define DECLARE_LOCAL_MEM \
__local srcT localmem_min[WGS2_ALIGNED]; \ __local srcT localmem_min[WGS2_ALIGNED]; \
__local srcT localmem_max[WGS2_ALIGNED]; \ __local srcT localmem_max[WGS2_ALIGNED]; \
@ -181,7 +194,7 @@
srcT temp; \ srcT temp; \
int temploc int temploc
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
temp = src[0]; \ temp = loadpix(srcptr + src_index); \
temploc = id; \ temploc = id; \
srcT temp_minval = minval, temp_maxval = maxval; \ srcT temp_minval = minval, temp_maxval = maxval; \
minval = min(minval, temp); \ minval = min(minval, temp); \
@ -217,10 +230,8 @@
localmem_maxloc[lid] : (max1 == max2) ? (max1 == oldmax) ? min(localmem_maxloc[lid2],localmem_maxloc[lid]) : \ localmem_maxloc[lid] : (max1 == max2) ? (max1 == oldmax) ? min(localmem_maxloc[lid2],localmem_maxloc[lid]) : \
localmem_maxloc[lid2] : localmem_maxloc[lid] localmem_maxloc[lid2] : localmem_maxloc[lid]
#define CALC_RESULT \ #define CALC_RESULT \
__global srcT * dstminval = (__global srcT *)(dstptr + (int)sizeof(srcT) * gid); \ storepix(localmem_min[0], dstptr + dstTSIZE * gid); \
__global srcT * dstmaxval = (__global srcT *)(dstptr2 + (int)sizeof(srcT) * gid); \ storepix(localmem_max[0], dstptr2 + dstTSIZE * gid); \
dstminval[0] = localmem_min[0]; \
dstmaxval[0] = localmem_max[0]; \
dstlocptr[gid] = localmem_minloc[0]; \ dstlocptr[gid] = localmem_minloc[0]; \
dstlocptr2[gid] = localmem_maxloc[0] dstlocptr2[gid] = localmem_maxloc[0]
@ -236,7 +247,7 @@
int temploc int temploc
#undef REDUCE_GLOBAL #undef REDUCE_GLOBAL
#define REDUCE_GLOBAL \ #define REDUCE_GLOBAL \
temp = src[0]; \ temp = loadpix(srcptr + src_index); \
temploc = id; \ temploc = id; \
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols) * (int)sizeof(uchar)); \ int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols) * (int)sizeof(uchar)); \
__global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \ __global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \
@ -278,8 +289,7 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
for (int grain = groupnum * WGS; id < total; id += grain) for (int grain = groupnum * WGS; id < total; id += grain)
{ {
int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(srcT), src_offset)); int src_index = mad24(id / cols, src_step, mad24(id % cols, srcTSIZE, src_offset));
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
REDUCE_GLOBAL; REDUCE_GLOBAL;
} }

@ -475,7 +475,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if ( (!doubleSupport && depth == CV_64F) || cn > 4 || cn == 3 ) if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
return false; return false;
int dbsize = ocl::Device::getDefault().maxComputeUnits(); int dbsize = ocl::Device::getDefault().maxComputeUnits();
@ -494,8 +494,11 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" }; static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
char cvt[40]; char cvt[40];
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s", format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
ocl::typeToStr(type), ocl::typeToStr(dtype), ddepth, ocl::convertTypeStr(depth, ddepth, cn, cvt), " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
ocl::typeToStr(type), ocl::typeToStr(depth),
ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn,
ocl::convertTypeStr(depth, ddepth, cn, cvt),
opMap[sum_op], (int)wgs, wgs2_aligned, opMap[sum_op], (int)wgs, wgs2_aligned,
doubleSupport ? " -D DOUBLE_SUPPORT" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
haveMask ? " -D HAVE_MASK" : "")); haveMask ? " -D HAVE_MASK" : ""));

Loading…
Cancel
Save