diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 0b99872eb1..86b63db054 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -3428,11 +3428,60 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst, const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG", "OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" }; char cvt[40]; - ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, - format("-D %s -D dim=%d -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s", - ops[op], dim, cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), - ocl::convertTypeStr(sdepth, ddepth, 1, cvt), - doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + + const int min_opt_cols = 128; + if ((1 == dim) && (_src.cols() > min_opt_cols)) + { + int buf_cols = 32; + + cv::String build_opt_pre = format("-D BUF_COLS=%d -D %s -D dim=1 -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s", + buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), + ocl::convertTypeStr(sdepth, ddepth, 1, cvt), + doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre); + if (kpre.empty()) + return false; + + cv::String build_opt_main = format("-D %s -D dim=1 -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=noconvert%s", + ops[op], cn, ddepth, ocl::typeToStr(ddepth), ocl::typeToStr(ddepth), + doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt_main); + if (kmain.empty()) + return false; + + UMat src = _src.getUMat(); + Size dsize(1, src.rows); + _dst.create(dsize, dtype); + UMat dst = _dst.getUMat(), temp = dst; + + if (op0 == CV_REDUCE_AVG && sdepth < CV_32S && ddepth0 < CV_32S) + temp.create(dsize, CV_32SC(cn)); + + UMat buf(src.rows, buf_cols, temp.type()); + + size_t globalSize[2] = { buf_cols, src.rows }; + + kpre.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf)); + if (!kpre.run(2, globalSize, NULL, false)) + return false; + + globalSize[0] = src.rows; + kmain.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnlyNoSize(temp)); + if (!kmain.run(1, globalSize, NULL, false)) + return false; + + if (op0 == CV_REDUCE_AVG) + temp.convertTo(dst, ddepth0, 1. / (dim == 0 ? src.rows : src.cols)); + + return true; + } + + cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s", + ops[op], dim, cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), + ocl::convertTypeStr(sdepth, ddepth, 1, cvt), + doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + + ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt); if (k.empty()) return false; diff --git a/modules/core/src/opencl/reduce2.cl b/modules/core/src/opencl/reduce2.cl index ef6a860773..4910c6198a 100644 --- a/modules/core/src/opencl/reduce2.cl +++ b/modules/core/src/opencl/reduce2.cl @@ -91,6 +91,41 @@ #error "No operation is specified" #endif +#ifndef BUF_COLS +#define BUF_COLS 64 +#endif + +__kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols, + __global uchar * bufptr, int buf_step, int buf_offset) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if (x < BUF_COLS) + { + int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * cn, src_offset)); + int buf_index = mad24(y, buf_step, mad24(x, (int)sizeof(dstT) * cn, buf_offset)); + + __global const srcT * src = (__global const srcT *)(srcptr + src_index); + __global dstT * buf = (__global dstT *)(bufptr + buf_index); + dstT tmp[cn] = { INIT_VALUE }; + + int src_step_mul = BUF_COLS * cn; + for (int x = 0; x < cols; x += BUF_COLS, src += src_step_mul) + { + #pragma unroll + for (int c = 0; c < cn; ++c) + { + dstT value = convertToDT(src[c]); + PROCESS_ELEM(tmp[c], value); + } + } + + #pragma unroll + for (int c = 0; c < cn; ++c) + buf[c] = tmp[c]; + } +} + __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols, __global uchar * dstptr, int dst_step, int dst_offset) {