diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index 12e73b5b72..851d36eb4d 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -158,10 +158,59 @@ #define SRC2_INDEX int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)) #endif +#if kercn == 1 #define REDUCE_GLOBAL \ SRC2_INDEX; \ - dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ FUNC(accumulator, temp, temp2) +#elif kercn == 2 +#define REDUCE_GLOBAL \ + SRC2_INDEX; \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + FUNC(accumulator, temp.s0, temp2.s0); \ + FUNC(accumulator, temp.s1, temp2.s1) +#elif kercn == 4 +#define REDUCE_GLOBAL \ + SRC2_INDEX; \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + FUNC(accumulator, temp.s0, temp2.s0); \ + FUNC(accumulator, temp.s1, temp2.s1); \ + FUNC(accumulator, temp.s2, temp2.s2); \ + FUNC(accumulator, temp.s3, temp2.s3) +#elif kercn == 8 +#define REDUCE_GLOBAL \ + SRC2_INDEX; \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + FUNC(accumulator, temp.s0, temp2.s0); \ + FUNC(accumulator, temp.s1, temp2.s1); \ + FUNC(accumulator, temp.s2, temp2.s2); \ + FUNC(accumulator, temp.s3, temp2.s3); \ + FUNC(accumulator, temp.s4, temp2.s4); \ + FUNC(accumulator, temp.s5, temp2.s5); \ + FUNC(accumulator, temp.s6, temp2.s6); \ + FUNC(accumulator, temp.s7, temp2.s7) +#elif kercn == 16 +#define REDUCE_GLOBAL \ + SRC2_INDEX; \ + dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ + FUNC(accumulator, temp.s0, temp2.s0); \ + FUNC(accumulator, temp.s1, temp2.s1); \ + FUNC(accumulator, temp.s2, temp2.s2); \ + FUNC(accumulator, temp.s3, temp2.s3); \ + FUNC(accumulator, temp.s4, temp2.s4); \ + FUNC(accumulator, temp.s5, temp2.s5); \ + FUNC(accumulator, temp.s6, temp2.s6); \ + FUNC(accumulator, temp.s7, temp2.s7); \ + FUNC(accumulator, temp.s8, temp2.s8); \ + FUNC(accumulator, temp.s9, temp2.s9); \ + FUNC(accumulator, temp.sA, temp2.sA); \ + FUNC(accumulator, temp.sB, temp2.sB); \ + FUNC(accumulator, temp.sC, temp2.sC); \ + FUNC(accumulator, temp.sD, temp2.sD); \ + FUNC(accumulator, temp.sE, temp2.sE); \ + FUNC(accumulator, temp.sF, temp2.sF) +#endif + #else #if kercn == 1 #define REDUCE_GLOBAL \ diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 11148bcded..e494f72dc5 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -836,7 +836,10 @@ UMat UMat::mul(InputArray m, double scale) const static bool ocl_dot( InputArray _src1, InputArray _src2, double & res ) { - int type = _src1.type(), depth = CV_MAT_DEPTH(type), kercn = 1; + UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1); + + int type = src1.type(), depth = CV_MAT_DEPTH(type), + kercn = ocl::predictOptimalVectorWidth(src1, src2); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if ( !doubleSupport && depth == CV_64F ) @@ -853,17 +856,18 @@ static bool ocl_dot( InputArray _src1, InputArray _src2, double & res ) char cvt[40]; ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT " + format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT " "-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d", - ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth, - ocl::convertTypeStr(depth, ddepth, 1, cvt), + ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(depth), + ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), + ddepth, ocl::convertTypeStr(depth, ddepth, kercn, cvt), (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", _src1.isContinuous() ? " -D HAVE_SRC_CONT" : "", _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn)); if (k.empty()) return false; - UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1), db(1, dbsize, ddepth); + UMat db(1, dbsize, ddepth); ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1), src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),