diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index 5ab0d49b92..744cf2151e 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -598,9 +598,29 @@ CV_EXPORTS const char* typeToStr(int t); CV_EXPORTS const char* memopTypeToStr(int t); CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL); CV_EXPORTS void getPlatfomsInfo(std::vector& platform_info); + + +enum OclVectorStrategy +{ + // all matrices have its own vector width + OCL_VECTOR_OWN = 0, + // all matrices have maximal vector width among all matrices + // (useful for cases when matrices have different data types) + OCL_VECTOR_MAX = 1, + + // default strategy + OCL_VECTOR_DEFAULT = OCL_VECTOR_OWN +}; + CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(), InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(), - InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray()); + InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray(), + OclVectorStrategy strat = OCL_VECTOR_DEFAULT); + +// with OCL_VECTOR_MAX strategy +CV_EXPORTS int predictOptimalVectorWidthMax(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(), + InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(), + InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray()); CV_EXPORTS void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m); diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index fc36ee08b5..9192af8e87 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -4451,42 +4451,46 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name) if (!src.empty()) \ { \ CV_Assert(src.isMat() || src.isUMat()); \ - int ctype = src.type(), ccn = CV_MAT_CN(ctype); \ Size csize = src.size(); \ - cols.push_back(ccn * csize.width); \ - if (ctype != type) \ + int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \ + ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \ + if (cwidth < ckercn || ckercn <= 0) \ + return 1; \ + cols.push_back(cwidth); \ + if (strat == OCL_VECTOR_OWN && ctype != ref_type) \ return 1; \ offsets.push_back(src.offset()); \ steps.push_back(src.step()); \ + dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \ + kercns.push_back(ckercn); \ } \ } \ while ((void)0, 0) int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3, InputArray src4, InputArray src5, InputArray src6, - InputArray src7, InputArray src8, InputArray src9) + InputArray src7, InputArray src8, InputArray src9, + OclVectorStrategy strat) { - int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz1 = CV_ELEM_SIZE1(depth); - Size ssize = src1.size(); const ocl::Device & d = ocl::Device::getDefault(); + int ref_type = src1.type(); int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(), d.preferredVectorWidthShort(), d.preferredVectorWidthShort(), d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(), - d.preferredVectorWidthDouble(), -1 }, kercn = vectorWidths[depth]; + d.preferredVectorWidthDouble(), -1 }; // if the device says don't use vectors if (vectorWidths[0] == 1) { // it's heuristic - int vectorWidthsOthers[] = { 16, 16, 8, 8, 1, 1, 1, -1 }; - kercn = vectorWidthsOthers[depth]; + vectorWidths[CV_8U] = vectorWidths[CV_8S] = 16; + vectorWidths[CV_16U] = vectorWidths[CV_16S] = 8; + vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1; } - if (ssize.width * cn < kercn || kercn <= 0) - return 1; - std::vector offsets, steps, cols; + std::vector dividers, kercns; PROCESS_SRC(src1); PROCESS_SRC(src2); PROCESS_SRC(src3); @@ -4498,27 +4502,24 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3, PROCESS_SRC(src9); size_t size = offsets.size(); - int wsz = kercn * esz1; - std::vector dividers(size, wsz); for (size_t i = 0; i < size; ++i) - while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % dividers[i] != 0) - dividers[i] >>= 1; + while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0) + dividers[i] >>= 1, kercns[i] >>= 1; // default strategy - for (size_t i = 0; i < size; ++i) - if (dividers[i] != wsz) - { - kercn = 1; - break; - } - - // another strategy -// width = *std::min_element(dividers.begin(), dividers.end()); + int kercn = *std::min_element(kercns.begin(), kercns.end()); return kercn; } +int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3, + InputArray src4, InputArray src5, InputArray src6, + InputArray src7, InputArray src8, InputArray src9) +{ + return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX); +} + #undef PROCESS_SRC diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp index f2a47e3d32..9f23d3443b 100644 --- a/modules/imgproc/src/accum.cpp +++ b/modules/imgproc/src/accum.cpp @@ -369,11 +369,10 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE || op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED); - int stype = _src.type(), cn = CV_MAT_CN(stype); - int sdepth = CV_MAT_DEPTH(stype), ddepth = _dst.depth(); - - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, - haveMask = !_mask.empty(); + const ocl::Device & dev = ocl::Device::getDefault(); + bool haveMask = !_mask.empty(), doubleSupport = dev.doubleFPConfig() > 0; + int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = _dst.depth(); + int kercn = haveMask ? cn : ocl::predictOptimalVectorWidthMax(_src, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1; if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) return false; @@ -381,11 +380,13 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT", "ACCUMULATE_WEIGHTED" }; + char cvt[40]; ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc, - format("-D %s%s -D srcT=%s -D cn=%d -D dstT=%s%s", + format("-D %s%s -D srcT1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=%s", opMap[op_type], haveMask ? " -D HAVE_MASK" : "", - ocl::typeToStr(sdepth), cn, ocl::typeToStr(ddepth), - doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + ocl::typeToStr(sdepth), kercn, ocl::typeToStr(ddepth), + doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI, + ocl::convertTypeStr(sdepth, ddepth, 1, cvt))); if (k.empty()) return false; @@ -393,7 +394,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), src2arg = ocl::KernelArg::ReadOnlyNoSize(src2), - dstarg = ocl::KernelArg::ReadWrite(dst), + dstarg = ocl::KernelArg::ReadWrite(dst, cn, kercn), maskarg = ocl::KernelArg::ReadOnlyNoSize(mask); int argidx = k.set(0, srcarg); @@ -410,7 +411,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray if (haveMask) k.set(argidx, maskarg); - size_t globalsize[2] = { src.cols, src.rows }; + size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/imgproc/src/opencl/accumulate.cl b/modules/imgproc/src/opencl/accumulate.cl index a60d4d6d9d..f786f80385 100644 --- a/modules/imgproc/src/opencl/accumulate.cl +++ b/modules/imgproc/src/opencl/accumulate.cl @@ -13,13 +13,18 @@ #endif #endif +#define SRC_TSIZE cn * (int)sizeof(srcT1) +#define DST_TSIZE cn * (int)sizeof(dstT1) + +#define noconvert + __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset, #ifdef ACCUMULATE_PRODUCT __global const uchar * src2ptr, int src2_step, int src2_offset, #endif __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols #ifdef ACCUMULATE_WEIGHTED - , dstT alpha + , dstT1 alpha #endif #ifdef HAVE_MASK , __global const uchar * mask, int mask_step, int mask_offset @@ -27,39 +32,59 @@ __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_of ) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * rowsPerWI; - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int src_index = mad24(y, src_step, src_offset + x * cn * (int)sizeof(srcT)); + int src_index = mad24(y, src_step, mad24(x, SRC_TSIZE, src_offset)); #ifdef HAVE_MASK int mask_index = mad24(y, mask_step, mask_offset + x); mask += mask_index; #endif - int dst_index = mad24(y, dst_step, dst_offset + x * cn * (int)sizeof(dstT)); - - __global const srcT * src = (__global const srcT *)(srcptr + src_index); #ifdef ACCUMULATE_PRODUCT - int src2_index = mad24(y, src2_step, src2_offset + x * cn * (int)sizeof(srcT)); - __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index); + int src2_index = mad24(y, src2_step, mad24(x, SRC_TSIZE, src2_offset)); #endif - __global dstT * dst = (__global dstT *)(dstptr + dst_index); + int dst_index = mad24(y, dst_step, mad24(x, DST_TSIZE, dst_offset)); #pragma unroll - for (int c = 0; c < cn; ++c) + for (int i = 0; i < rowsPerWI; ++i) + if (y < dst_rows) + { + __global const srcT1 * src = (__global const srcT1 *)(srcptr + src_index); +#ifdef ACCUMULATE_PRODUCT + __global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index); +#endif + __global dstT1 * dst = (__global dstT1 *)(dstptr + dst_index); + #ifdef HAVE_MASK - if (mask[0]) + if (mask[0]) #endif + #pragma unroll + for (int c = 0; c < cn; ++c) + { #ifdef ACCUMULATE - dst[c] += src[c]; + dst[c] += convertToDT(src[c]); #elif defined ACCUMULATE_SQUARE - dst[c] += src[c] * src[c]; + dstT1 val = convertToDT(src[c]); + dst[c] = fma(val, val, dst[c]); #elif defined ACCUMULATE_PRODUCT - dst[c] += src[c] * src2[c]; + dst[c] = fma(convertToDT(src[c]), convertToDT(src2[c]), dst[c]); #elif defined ACCUMULATE_WEIGHTED - dst[c] = (1 - alpha) * dst[c] + src[c] * alpha; + dst[c] = fma(1 - alpha, dst[c], src[c] * alpha); #else #error "Unknown accumulation type" #endif + } + + src_index += src_step; +#ifdef ACCUMULATE_PRODUCT + src2_index += src2_step; +#endif +#ifdef HAVE_MASK + mask += mask_step; +#endif + dst_index += dst_step; + ++y; + } } }