From 579499d900ec326a1869d3cf6caff7c673ad713a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 28 May 2014 19:23:13 +0400 Subject: [PATCH] optimized cv::sum (CV_8UC1) --- modules/core/src/opencl/reduce.cl | 69 +++++++++++++++++++++++++++++-- modules/core/src/stat.cpp | 35 +++++++++------- modules/core/src/umatrix.cpp | 10 +++-- 3 files changed, 92 insertions(+), 22 deletions(-) diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index e24c82a327..12e73b5b72 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -95,7 +95,11 @@ #if cn != 3 #define loadpix(addr) *(__global const srcT *)(addr) #define storepix(val, addr) *(__global dstT *)(addr) = val +#if kercn == 1 #define srcTSIZE (int)sizeof(srcT) +#else +#define srcTSIZE (int)sizeof(srcT1) +#endif #define dstTSIZE (int)sizeof(dstT) #else #define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) @@ -159,9 +163,53 @@ dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ FUNC(accumulator, temp, temp2) #else +#if kercn == 1 #define REDUCE_GLOBAL \ - dstT temp = convertToDT(loadpix(srcptr + src_index)); \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ FUNC(accumulator, temp) +#elif kercn == 2 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1) +#elif kercn == 4 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3) +#elif kercn == 8 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator, temp.s4); \ + FUNC(accumulator, temp.s5); \ + FUNC(accumulator, temp.s6); \ + FUNC(accumulator, temp.s7) +#elif kercn == 16 +#define REDUCE_GLOBAL \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ + FUNC(accumulator, temp.s0); \ + FUNC(accumulator, temp.s1); \ + FUNC(accumulator, temp.s2); \ + FUNC(accumulator, temp.s3); \ + FUNC(accumulator, temp.s4); \ + FUNC(accumulator, temp.s5); \ + FUNC(accumulator, temp.s6); \ + FUNC(accumulator, temp.s7); \ + FUNC(accumulator, temp.s8); \ + FUNC(accumulator, temp.s9); \ + FUNC(accumulator, temp.sA); \ + FUNC(accumulator, temp.sB); \ + FUNC(accumulator, temp.sC); \ + FUNC(accumulator, temp.sD); \ + FUNC(accumulator, temp.sE); \ + FUNC(accumulator, temp.sF) +#endif #endif #define SET_LOCAL_1 \ @@ -184,6 +232,11 @@ #if kercn == 1 #define REDUCE_GLOBAL \ accumulator += loadpix(srcptr + src_index) == zero ? zero : one +#elif kercn == 2 +#define REDUCE_GLOBAL \ + srcT value = loadpix(srcptr + src_index); \ + accumulator += value.s0 == zero ? zero : one; \ + accumulator += value.s1 == zero ? zero : one #elif kercn == 4 #define REDUCE_GLOBAL \ srcT value = loadpix(srcptr + src_index); \ @@ -191,6 +244,17 @@ accumulator += value.s1 == zero ? zero : one; \ accumulator += value.s2 == zero ? zero : one; \ accumulator += value.s3 == zero ? zero : one +#elif kercn == 8 +#define REDUCE_GLOBAL \ + srcT value = loadpix(srcptr + src_index); \ + accumulator += value.s0 == zero ? zero : one; \ + accumulator += value.s1 == zero ? zero : one; \ + accumulator += value.s2 == zero ? zero : one; \ + accumulator += value.s3 == zero ? zero : one; \ + accumulator += value.s4 == zero ? zero : one; \ + accumulator += value.s5 == zero ? zero : one; \ + accumulator += value.s6 == zero ? zero : one; \ + accumulator += value.s7 == zero ? zero : one #elif kercn == 16 #define REDUCE_GLOBAL \ srcT value = loadpix(srcptr + src_index); \ @@ -210,9 +274,8 @@ accumulator += value.sD == zero ? zero : one; \ accumulator += value.sE == zero ? zero : one; \ accumulator += value.sF == zero ? zero : one -#else -#error "kercn should be either 1, 4 or 16" #endif + #define SET_LOCAL_1 \ localmem[lid] = accumulator #define REDUCE_LOCAL_1 \ diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index e9fc538016..ae74acca26 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -473,8 +473,11 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask { CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR); - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, + haveMask = _mask.kind() != _InputArray::NONE; + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), + kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1, + mcn = std::max(cn, kercn); if ( (!doubleSupport && depth == CV_64F) || cn > 4 ) return false; @@ -484,7 +487,6 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth), dtype = CV_MAKE_TYPE(ddepth, cn); - bool haveMask = _mask.kind() != _InputArray::NONE; CV_Assert(!haveMask || _mask.type() == CV_8UC1); int wgs2_aligned = 1; @@ -494,17 +496,19 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" }; char cvt[40]; - ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d" - " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s", - ocl::typeToStr(type), ocl::typeToStr(depth), - ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn, - ocl::convertTypeStr(depth, ddepth, cn, cvt), + String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d" + " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d", + ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth), + ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)), + ocl::typeToStr(ddepth), ddepth, cn, + ocl::convertTypeStr(depth, ddepth, mcn, cvt), opMap[sum_op], (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", haveMask ? " -D HAVE_MASK" : "", _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", - _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "")); + _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn); + + ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts); if (k.empty()) return false; @@ -660,8 +664,8 @@ static bool ocl_countNonZero( InputArray _src, int & res ) wgs2_aligned >>= 1; ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO -D WGS=%d " - "-D kercn=%d -D WGS2_ALIGNED=%d%s%s", + format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO" + " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s", ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(depth), (int)wgs, kercn, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", @@ -1292,7 +1296,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) || (_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) ); - int type = _src.type(), depth = CV_MAT_DEPTH(type); + int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = 1; bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if (depth == CV_64F && !doubleSupport) @@ -1306,11 +1310,12 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* wgs2_aligned <<= 1; wgs2_aligned >>= 1; - String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s", + String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d" + " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d", depth, ocl::typeToStr(depth), _mask.empty() ? "" : "_MASK", (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", - _mask.isContinuous() ? " -D HAVE_MASK_CONT" : ""); + _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn); ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts); if (k.empty()) diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 07dd07fbe2..11148bcded 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -836,7 +836,7 @@ UMat UMat::mul(InputArray m, double scale) const static bool ocl_dot( InputArray _src1, InputArray _src2, double & res ) { - int type = _src1.type(), depth = CV_MAT_DEPTH(type); + int type = _src1.type(), depth = CV_MAT_DEPTH(type), kercn = 1; bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if ( !doubleSupport && depth == CV_64F ) @@ -853,11 +853,13 @@ static bool ocl_dot( InputArray _src1, InputArray _src2, double & res ) char cvt[40]; ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s", - ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth, ocl::convertTypeStr(depth, ddepth, 1, cvt), + format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT " + "-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d", + ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth, + ocl::convertTypeStr(depth, ddepth, 1, cvt), (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", _src1.isContinuous() ? " -D HAVE_SRC_CONT" : "", - _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "")); + _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn)); if (k.empty()) return false;