Merge pull request #2956 from ilya-lavrenov:tapi_accumulate

pull/3165/merge
Alexander Alekhin 10 years ago
commit b332152bef
  1. 20
      modules/core/include/opencv2/core/ocl.hpp
  2. 51
      modules/core/src/ocl.cpp
  3. 21
      modules/imgproc/src/accum.cpp
  4. 55
      modules/imgproc/src/opencl/accumulate.cl

@ -598,7 +598,27 @@ CV_EXPORTS const char* typeToStr(int t);
CV_EXPORTS const char* memopTypeToStr(int t); CV_EXPORTS const char* memopTypeToStr(int t);
CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL); CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL);
CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info); CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info);
enum OclVectorStrategy
{
// all matrices have its own vector width
OCL_VECTOR_OWN = 0,
// all matrices have maximal vector width among all matrices
// (useful for cases when matrices have different data types)
OCL_VECTOR_MAX = 1,
// default strategy
OCL_VECTOR_DEFAULT = OCL_VECTOR_OWN
};
CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(), CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray(),
OclVectorStrategy strat = OCL_VECTOR_DEFAULT);
// with OCL_VECTOR_MAX strategy
CV_EXPORTS int predictOptimalVectorWidthMax(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(), InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray()); InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray());

@ -4451,42 +4451,46 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
if (!src.empty()) \ if (!src.empty()) \
{ \ { \
CV_Assert(src.isMat() || src.isUMat()); \ CV_Assert(src.isMat() || src.isUMat()); \
int ctype = src.type(), ccn = CV_MAT_CN(ctype); \
Size csize = src.size(); \ Size csize = src.size(); \
cols.push_back(ccn * csize.width); \ int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
if (ctype != type) \ ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
if (cwidth < ckercn || ckercn <= 0) \
return 1; \
cols.push_back(cwidth); \
if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
return 1; \ return 1; \
offsets.push_back(src.offset()); \ offsets.push_back(src.offset()); \
steps.push_back(src.step()); \ steps.push_back(src.step()); \
dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
kercns.push_back(ckercn); \
} \ } \
} \ } \
while ((void)0, 0) while ((void)0, 0)
int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3, int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
InputArray src4, InputArray src5, InputArray src6, InputArray src4, InputArray src5, InputArray src6,
InputArray src7, InputArray src8, InputArray src9) InputArray src7, InputArray src8, InputArray src9,
OclVectorStrategy strat)
{ {
int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz1 = CV_ELEM_SIZE1(depth);
Size ssize = src1.size();
const ocl::Device & d = ocl::Device::getDefault(); const ocl::Device & d = ocl::Device::getDefault();
int ref_type = src1.type();
int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(), int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
d.preferredVectorWidthShort(), d.preferredVectorWidthShort(), d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(), d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
d.preferredVectorWidthDouble(), -1 }, kercn = vectorWidths[depth]; d.preferredVectorWidthDouble(), -1 };
// if the device says don't use vectors // if the device says don't use vectors
if (vectorWidths[0] == 1) if (vectorWidths[0] == 1)
{ {
// it's heuristic // it's heuristic
int vectorWidthsOthers[] = { 16, 16, 8, 8, 1, 1, 1, -1 }; vectorWidths[CV_8U] = vectorWidths[CV_8S] = 16;
kercn = vectorWidthsOthers[depth]; vectorWidths[CV_16U] = vectorWidths[CV_16S] = 8;
vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
} }
if (ssize.width * cn < kercn || kercn <= 0)
return 1;
std::vector<size_t> offsets, steps, cols; std::vector<size_t> offsets, steps, cols;
std::vector<int> dividers, kercns;
PROCESS_SRC(src1); PROCESS_SRC(src1);
PROCESS_SRC(src2); PROCESS_SRC(src2);
PROCESS_SRC(src3); PROCESS_SRC(src3);
@ -4498,27 +4502,24 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
PROCESS_SRC(src9); PROCESS_SRC(src9);
size_t size = offsets.size(); size_t size = offsets.size();
int wsz = kercn * esz1;
std::vector<int> dividers(size, wsz);
for (size_t i = 0; i < size; ++i) for (size_t i = 0; i < size; ++i)
while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % dividers[i] != 0) while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
dividers[i] >>= 1; dividers[i] >>= 1, kercns[i] >>= 1;
// default strategy // default strategy
for (size_t i = 0; i < size; ++i) int kercn = *std::min_element(kercns.begin(), kercns.end());
if (dividers[i] != wsz)
{
kercn = 1;
break;
}
// another strategy
// width = *std::min_element(dividers.begin(), dividers.end());
return kercn; return kercn;
} }
int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
InputArray src4, InputArray src5, InputArray src6,
InputArray src7, InputArray src8, InputArray src9)
{
return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
}
#undef PROCESS_SRC #undef PROCESS_SRC

@ -369,11 +369,10 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE || CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE ||
op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED); op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED);
int stype = _src.type(), cn = CV_MAT_CN(stype); const ocl::Device & dev = ocl::Device::getDefault();
int sdepth = CV_MAT_DEPTH(stype), ddepth = _dst.depth(); bool haveMask = !_mask.empty(), doubleSupport = dev.doubleFPConfig() > 0;
int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = _dst.depth();
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, int kercn = haveMask ? cn : ocl::predictOptimalVectorWidthMax(_src, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;
haveMask = !_mask.empty();
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
return false; return false;
@ -381,11 +380,13 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT", const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT",
"ACCUMULATE_WEIGHTED" }; "ACCUMULATE_WEIGHTED" };
char cvt[40];
ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc, ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc,
format("-D %s%s -D srcT=%s -D cn=%d -D dstT=%s%s", format("-D %s%s -D srcT1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=%s",
opMap[op_type], haveMask ? " -D HAVE_MASK" : "", opMap[op_type], haveMask ? " -D HAVE_MASK" : "",
ocl::typeToStr(sdepth), cn, ocl::typeToStr(ddepth), ocl::typeToStr(sdepth), kercn, ocl::typeToStr(ddepth),
doubleSupport ? " -D DOUBLE_SUPPORT" : "")); doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI,
ocl::convertTypeStr(sdepth, ddepth, 1, cvt)));
if (k.empty()) if (k.empty())
return false; return false;
@ -393,7 +394,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2), src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
dstarg = ocl::KernelArg::ReadWrite(dst), dstarg = ocl::KernelArg::ReadWrite(dst, cn, kercn),
maskarg = ocl::KernelArg::ReadOnlyNoSize(mask); maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
int argidx = k.set(0, srcarg); int argidx = k.set(0, srcarg);
@ -410,7 +411,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
if (haveMask) if (haveMask)
k.set(argidx, maskarg); k.set(argidx, maskarg);
size_t globalsize[2] = { src.cols, src.rows }; size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false); return k.run(2, globalsize, NULL, false);
} }

@ -13,13 +13,18 @@
#endif #endif
#endif #endif
#define SRC_TSIZE cn * (int)sizeof(srcT1)
#define DST_TSIZE cn * (int)sizeof(dstT1)
#define noconvert
__kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset, __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset,
#ifdef ACCUMULATE_PRODUCT #ifdef ACCUMULATE_PRODUCT
__global const uchar * src2ptr, int src2_step, int src2_offset, __global const uchar * src2ptr, int src2_step, int src2_offset,
#endif #endif
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
#ifdef ACCUMULATE_WEIGHTED #ifdef ACCUMULATE_WEIGHTED
, dstT alpha , dstT1 alpha
#endif #endif
#ifdef HAVE_MASK #ifdef HAVE_MASK
, __global const uchar * mask, int mask_step, int mask_offset , __global const uchar * mask, int mask_step, int mask_offset
@ -27,39 +32,59 @@ __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_of
) )
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * rowsPerWI;
if (x < dst_cols && y < dst_rows) if (x < dst_cols)
{ {
int src_index = mad24(y, src_step, src_offset + x * cn * (int)sizeof(srcT)); int src_index = mad24(y, src_step, mad24(x, SRC_TSIZE, src_offset));
#ifdef HAVE_MASK #ifdef HAVE_MASK
int mask_index = mad24(y, mask_step, mask_offset + x); int mask_index = mad24(y, mask_step, mask_offset + x);
mask += mask_index; mask += mask_index;
#endif #endif
int dst_index = mad24(y, dst_step, dst_offset + x * cn * (int)sizeof(dstT));
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
#ifdef ACCUMULATE_PRODUCT #ifdef ACCUMULATE_PRODUCT
int src2_index = mad24(y, src2_step, src2_offset + x * cn * (int)sizeof(srcT)); int src2_index = mad24(y, src2_step, mad24(x, SRC_TSIZE, src2_offset));
__global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);
#endif #endif
__global dstT * dst = (__global dstT *)(dstptr + dst_index); int dst_index = mad24(y, dst_step, mad24(x, DST_TSIZE, dst_offset));
#pragma unroll #pragma unroll
for (int c = 0; c < cn; ++c) for (int i = 0; i < rowsPerWI; ++i)
if (y < dst_rows)
{
__global const srcT1 * src = (__global const srcT1 *)(srcptr + src_index);
#ifdef ACCUMULATE_PRODUCT
__global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);
#endif
__global dstT1 * dst = (__global dstT1 *)(dstptr + dst_index);
#ifdef HAVE_MASK #ifdef HAVE_MASK
if (mask[0]) if (mask[0])
#endif #endif
#pragma unroll
for (int c = 0; c < cn; ++c)
{
#ifdef ACCUMULATE #ifdef ACCUMULATE
dst[c] += src[c]; dst[c] += convertToDT(src[c]);
#elif defined ACCUMULATE_SQUARE #elif defined ACCUMULATE_SQUARE
dst[c] += src[c] * src[c]; dstT1 val = convertToDT(src[c]);
dst[c] = fma(val, val, dst[c]);
#elif defined ACCUMULATE_PRODUCT #elif defined ACCUMULATE_PRODUCT
dst[c] += src[c] * src2[c]; dst[c] = fma(convertToDT(src[c]), convertToDT(src2[c]), dst[c]);
#elif defined ACCUMULATE_WEIGHTED #elif defined ACCUMULATE_WEIGHTED
dst[c] = (1 - alpha) * dst[c] + src[c] * alpha; dst[c] = fma(1 - alpha, dst[c], src[c] * alpha);
#else #else
#error "Unknown accumulation type" #error "Unknown accumulation type"
#endif #endif
} }
src_index += src_step;
#ifdef ACCUMULATE_PRODUCT
src2_index += src2_step;
#endif
#ifdef HAVE_MASK
mask += mask_step;
#endif
dst_index += dst_step;
++y;
}
}
} }

Loading…
Cancel
Save