diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index f7fae9e4e4..5db8ef7d88 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -169,8 +169,8 @@ public: VENDOR_NVIDIA=3 }; int vendorID() const; - inline bool isAMD() const { return vendorID() == VENDOR_AMD; }; - inline bool isIntel() const { return vendorID() == VENDOR_INTEL; }; + inline bool isAMD() const { return vendorID() == VENDOR_AMD; } + inline bool isIntel() const { return vendorID() == VENDOR_INTEL; } int maxClockFrequency() const; int maxComputeUnits() const; @@ -286,7 +286,7 @@ class CV_EXPORTS KernelArg { public: enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8, PTR_ONLY = 16, NO_SIZE=256 }; - KernelArg(int _flags, UMat* _m, int wscale=1, const void* _obj=0, size_t _sz=0); + KernelArg(int _flags, UMat* _m, int wscale=1, int iwscale=1, const void* _obj=0, size_t _sz=0); KernelArg(); static KernelArg Local() { return KernelArg(LOCAL, 0); } @@ -296,27 +296,27 @@ public: { return KernelArg(PTR_ONLY+READ_ONLY, (UMat*)&m); } static KernelArg PtrReadWrite(const UMat& m) { return KernelArg(PTR_ONLY+READ_WRITE, (UMat*)&m); } - static KernelArg ReadWrite(const UMat& m, int wscale=1) - { return KernelArg(READ_WRITE, (UMat*)&m, wscale); } - static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1) - { return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale); } - static KernelArg ReadOnly(const UMat& m, int wscale=1) - { return KernelArg(READ_ONLY, (UMat*)&m, wscale); } - static KernelArg WriteOnly(const UMat& m, int wscale=1) - { return KernelArg(WRITE_ONLY, (UMat*)&m, wscale); } - static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1) - { return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale); } - static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1) - { return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale); } + static KernelArg ReadWrite(const UMat& m, int wscale=1, int iwscale=1) + { return KernelArg(READ_WRITE, (UMat*)&m, wscale, iwscale); } + static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1, int iwscale=1) + { return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale, iwscale); } + static KernelArg ReadOnly(const UMat& m, int wscale=1, int iwscale=1) + { return KernelArg(READ_ONLY, (UMat*)&m, wscale, iwscale); } + static KernelArg WriteOnly(const UMat& m, int wscale=1, int iwscale=1) + { return KernelArg(WRITE_ONLY, (UMat*)&m, wscale, iwscale); } + static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1, int iwscale=1) + { return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale, iwscale); } + static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1, int iwscale=1) + { return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale, iwscale); } static KernelArg Constant(const Mat& m); template static KernelArg Constant(const _Tp* arr, size_t n) - { return KernelArg(CONSTANT, 0, 1, (void*)arr, n); } + { return KernelArg(CONSTANT, 0, 1, 1, (void*)arr, n); } int flags; UMat* m; const void* obj; size_t sz; - int wscale; + int wscale, iwscale; }; @@ -590,6 +590,9 @@ CV_EXPORTS const char* typeToStr(int t); CV_EXPORTS const char* memopTypeToStr(int t); CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1); CV_EXPORTS void getPlatfomsInfo(std::vector& platform_info); +CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(), + InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(), + InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray()); class CV_EXPORTS Image2D { diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 50703ba35d..2702c211ea 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -933,17 +933,16 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, int cn = CV_MAT_CN(srctype); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) || (!doubleSupport && srcdepth == CV_64F && !bitwise)) return false; char opts[1024]; - int kercn = haveMask || haveScalar ? cn : 1; + int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); int scalarcn = kercn == 3 ? 4 : kercn; sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d", - (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop], + haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop], bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) : @@ -953,16 +952,15 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, kercn); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); - if( k.empty() ) + if (k.empty()) return false; UMat src1 = _src1.getUMat(), src2; UMat dst = _dst.getUMat(), mask = _mask.getUMat(); - int cscale = cn/kercn; - ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale); - ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) : - ocl::KernelArg::WriteOnly(dst, cscale); + ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn); + ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) : + ocl::KernelArg::WriteOnly(dst, cn, kercn); ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); if( haveScalar ) @@ -976,7 +974,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1); } - ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz); + ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); if( !haveMask ) k.args(src1arg, dstarg, scalararg); @@ -986,7 +984,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, else { src2 = _src2.getUMat(); - ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale); + ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); if( !haveMask ) k.args(src1arg, src2arg, dstarg); @@ -994,7 +992,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, k.args(src1arg, src2arg, maskarg, dstarg); } - size_t globalsize[] = { src1.cols*(cn/kercn), src1.rows }; + size_t globalsize[] = { src1.cols * cn / kercn, src1.rows }; return k.run(2, globalsize, 0, false); } @@ -1313,7 +1311,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F)) return false; - int kercn = haveMask || haveScalar ? cn : 1; + int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); int scalarcn = kercn == 3 ? 4 : kercn; char cvtstr[4][32], opts[1024]; @@ -1355,11 +1353,9 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, UMat src1 = _src1.getUMat(), src2; UMat dst = _dst.getUMat(), mask = _mask.getUMat(); - int cscale = cn/kercn; - - ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale); - ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) : - ocl::KernelArg::WriteOnly(dst, cscale); + ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn); + ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) : + ocl::KernelArg::WriteOnly(dst, cn, kercn); ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); if( haveScalar ) @@ -1370,7 +1366,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, if( !src2sc.empty() ) convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1); - ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz); + ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); if( !haveMask ) { @@ -1378,7 +1374,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, k.args(src1arg, dstarg, scalararg); else if(n == 1) k.args(src1arg, dstarg, scalararg, - ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz)); + ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); else CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); } @@ -1388,7 +1384,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, else { src2 = _src2.getUMat(); - ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale); + ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); if( !haveMask ) { @@ -1396,12 +1392,12 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, k.args(src1arg, src2arg, dstarg); else if(n == 1) k.args(src1arg, src2arg, dstarg, - ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz)); + ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); else if(n == 3) k.args(src1arg, src2arg, dstarg, - ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz), - ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz), - ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz)); + ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz), + ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz), + ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz)); else CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); } @@ -1409,7 +1405,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, k.args(src1arg, src2arg, maskarg, dstarg); } - size_t globalsize[] = { src1.cols * cscale, src1.rows }; + size_t globalsize[] = { src1.cols * cn / kercn, src1.rows }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index c314823d12..af93cdbd2a 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1310,7 +1310,8 @@ static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth) static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta ) { - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), + kercn = cn > 4 || cn == 3 ? 1 : ocl::predictOptimalVectorWidth(_src, _dst); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if (!doubleSupport && depth == CV_64F) @@ -1319,27 +1320,31 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha char cvt[2][50]; int wdepth = std::max(depth, CV_32F); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=uchar -D srcT1=%s" - " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s%s", - ocl::typeToStr(depth), ocl::typeToStr(wdepth), wdepth, - ocl::convertTypeStr(depth, wdepth, 1, cvt[0]), - ocl::convertTypeStr(wdepth, CV_8U, 1, cvt[1]), + format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s" + " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s -D workT1=%s%s", + ocl::typeToStr(CV_8UC(kercn)), + ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), + ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth, + ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), + ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]), + ocl::typeToStr(wdepth), doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; - _dst.createSameSize(_src, CV_8UC(cn)); - UMat src = _src.getUMat(), dst = _dst.getUMat(); + UMat src = _src.getUMat(); + _dst.create(src.size(), CV_8UC(cn)); + UMat dst = _dst.getUMat(); ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), - dstarg = ocl::KernelArg::WriteOnly(dst, cn); + dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); if (wdepth == CV_32F) k.args(srcarg, dstarg, (float)alpha, (float)beta); else if (wdepth == CV_64F) k.args(srcarg, dstarg, alpha, beta); - size_t globalsize[2] = { src.cols * cn, src.rows }; + size_t globalsize[2] = { src.cols * cn / kercn, src.rows }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index d7fad62e6f..8cc61aba63 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -62,40 +62,35 @@ static const char* oclop2str[] = { "OP_LOG", "OP_EXP", "OP_MAG", "OP_PHASE_DEGRE static bool ocl_math_op(InputArray _src1, InputArray _src2, OutputArray _dst, int oclop) { - int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn1 = CV_MAT_CN(type1); - int type2 = _src2.type(), cn2 = CV_MAT_CN(type2); - - char opts[1024]; + int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + int kercn = cn == 3 || cn > 4 || oclop == OCL_OP_PHASE_DEGREES || + oclop == OCL_OP_PHASE_RADIANS ? 1 : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); - bool double_support = false; - if(ocl::Device::getDefault().doubleFPConfig() > 0) - double_support = true; - if(!double_support && depth1 == CV_64F) + bool double_support = ocl::Device::getDefault().doubleFPConfig() > 0; + if (!double_support && depth == CV_64F) return false; - sprintf(opts, "-D %s -D %s -D dstT=%s %s", _src2.empty()?"UNARY_OP":"BINARY_OP", - oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, 1) ), double_support ? "-D DOUBLE_SUPPORT" : "" ); - - ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); - if( k.empty() ) + ocl::Kernel k("KF", ocl::core::arithm_oclsrc, + format("-D %s -D %s -D dstT=%s%s", _src2.empty() ? "UNARY_OP" : "BINARY_OP", + oclop2str[oclop], ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), + double_support ? " -D DOUBLE_SUPPORT" : "")); + if (k.empty()) return false; - UMat src1 = _src1.getUMat(); - UMat src2 = _src2.getUMat(); - _dst.create(src1.size(), type1); + UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(); + _dst.create(src1.size(), type); UMat dst = _dst.getUMat(); - ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn1); - ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn2); - ocl::KernelArg dstarg = ocl::KernelArg::WriteOnly(dst, cn1); + ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1), + src2arg = ocl::KernelArg::ReadOnlyNoSize(src2), + dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); - if(_src2.empty()) + if (src2.empty()) k.args(src1arg, dstarg); else k.args(src1arg, src2arg, dstarg); - size_t globalsize[] = { src1.cols*cn1, src1.rows}; - + size_t globalsize[] = { src1.cols * cn / kercn, src1.rows }; return k.run(2, globalsize, 0, false); } diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index 022f88e5f4..409c30ffc1 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -2157,7 +2157,8 @@ typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, i static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type ) { - int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F); + int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F), + kercn = cn == 3 || cn > 4 ? 1 : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; Size size = _src1.size(); @@ -2166,27 +2167,31 @@ static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, Outp char cvt[2][50]; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D wdepth=%d -D convertToWT1=%s" - " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s%s", ocl::typeToStr(depth), - ocl::typeToStr(wdepth), wdepth, ocl::convertTypeStr(depth, wdepth, 1, cvt[0]), - ocl::convertTypeStr(wdepth, depth, 1, cvt[1]), + format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s" + " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s -D wdepth=%d%s", + ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), + ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), + ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]), + ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]), + ocl::typeToStr(wdepth), wdepth, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; + UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(); _dst.create(size, type); - UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat(); + UMat dst = _dst.getUMat(); ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1), src2arg = ocl::KernelArg::ReadOnlyNoSize(src2), - dstarg = ocl::KernelArg::WriteOnly(dst, cn); + dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn); if (wdepth == CV_32F) k.args(src1arg, src2arg, dstarg, (float)alpha); else k.args(src1arg, src2arg, dstarg, alpha); - size_t globalsize[2] = { dst.cols * cn, dst.rows }; + size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index c05df1ee43..27ff1c9c70 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -2640,19 +2640,19 @@ static cl_command_queue getQueue(const Queue& q) /////////////////////////////////////////// KernelArg ///////////////////////////////////////////// KernelArg::KernelArg() - : flags(0), m(0), obj(0), sz(0), wscale(1) + : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1) { } -KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, const void* _obj, size_t _sz) - : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale) +KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz) + : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale) { } KernelArg KernelArg::Constant(const Mat& m) { CV_Assert(m.isContinuous()); - return KernelArg(CONSTANT, 0, 1, m.data, m.total()*m.elemSize()); + return KernelArg(CONSTANT, 0, 0, 0, m.data, m.total()*m.elemSize()); } /////////////////////////////////////////// Kernel ///////////////////////////////////////////// @@ -2871,7 +2871,7 @@ int Kernel::set(int i, const KernelArg& arg) if( !(arg.flags & KernelArg::NO_SIZE) ) { - int cols = u2d.cols*arg.wscale; + int cols = u2d.cols*arg.wscale/arg.iwscale; CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS); CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS); i += 2; @@ -2887,7 +2887,7 @@ int Kernel::set(int i, const KernelArg& arg) i += 4; if( !(arg.flags & KernelArg::NO_SIZE) ) { - int cols = u3d.cols*arg.wscale; + int cols = u3d.cols*arg.wscale/arg.iwscale; CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS); CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS); CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS); @@ -2915,7 +2915,7 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], for (int i = 0; i < dims; i++) { size_t val = _localsize ? _localsize[i] : - dims == 1 ? 64 : dims == 2 ? (16>>i) : dims == 3 ? (8>>(int)(i>0)) : 1; + dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1; CV_Assert( val > 0 ); total *= _globalsize[i]; globalsize[i] = ((_globalsize[i] + val - 1)/val)*val; @@ -4219,34 +4219,34 @@ const char* typeToStr(int type) { static const char* tab[]= { - "uchar", "uchar2", "uchar3", "uchar4", - "char", "char2", "char3", "char4", - "ushort", "ushort2", "ushort3", "ushort4", - "short", "short2", "short3", "short4", - "int", "int2", "int3", "int4", - "float", "float2", "float3", "float4", - "double", "double2", "double3", "double4", - "?", "?", "?", "?" + "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16", + "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16", + "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16", + "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16", + "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16", + "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16", + "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16", + "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?" }; int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type); - return cn > 4 ? "?" : tab[depth*4 + cn-1]; + return cn > 16 ? "?" : tab[depth*16 + cn-1]; } const char* memopTypeToStr(int type) { static const char* tab[] = { - "uchar", "uchar2", "uchar3", "uchar4", - "uchar", "uchar2", "uchar3", "uchar4", - "ushort", "ushort2", "ushort3", "ushort4", - "ushort", "ushort2", "ushort3", "ushort4", - "int", "int2", "int3", "int4", - "int", "int2", "int3", "int4", - "ulong", "ulong2", "ulong3", "ulong4", - "?", "?", "?", "?" + "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16", + "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16", + "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16", + "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16", + "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16", + "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16", + "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16", + "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?" }; int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type); - return cn > 4 ? "?" : tab[depth*4 + cn-1]; + return cn > 16 ? "?" : tab[depth*16 + cn-1]; } const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf) @@ -4321,6 +4321,74 @@ String kernelToStr(InputArray _kernel, int ddepth) return cv::format(" -D COEFF=%s", func(kernel).c_str()); } +#define PROCESS_SRC(src) \ + do \ + { \ + if (!src.empty()) \ + { \ + CV_Assert(src.isMat() || src.isUMat()); \ + int ctype = src.type(), ccn = CV_MAT_CN(ctype); \ + Size csize = src.size(); \ + cols.push_back(ccn * src.size().width); \ + if (ctype != type || csize != ssize) \ + return 1; \ + offsets.push_back(src.offset()); \ + steps.push_back(src.step()); \ + } \ + } \ + while ((void)0, 0) + +int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3, + InputArray src4, InputArray src5, InputArray src6, + InputArray src7, InputArray src8, InputArray src9) +{ + int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + Size ssize = src1.size(); + const ocl::Device & d = ocl::Device::getDefault(); + + int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(), + d.preferredVectorWidthShort(), d.preferredVectorWidthShort(), + d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(), + d.preferredVectorWidthDouble(), -1 }, width = vectorWidths[depth]; + CV_Assert(width >= 0); + + if (ssize.width * cn < width) + return 1; + + std::vector offsets, steps, cols; + PROCESS_SRC(src1); + PROCESS_SRC(src2); + PROCESS_SRC(src3); + PROCESS_SRC(src4); + PROCESS_SRC(src5); + PROCESS_SRC(src6); + PROCESS_SRC(src7); + PROCESS_SRC(src8); + PROCESS_SRC(src9); + + size_t size = offsets.size(); + std::vector dividers(size, width); + + for (size_t i = 0; i < size; ++i) + while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % dividers[i] != 0) + dividers[i] >>= 1; + + // default strategy + for (size_t i = 0; i < size; ++i) + if (dividers[i] != width) + { + width = 1; + break; + } + + // another strategy +// width = *std::min_element(dividers.begin(), dividers.end()); + + return width; +} + +#undef PROCESS_SRC + /////////////////////////////////////////// Image2D //////////////////////////////////////////////////// struct Image2D::Impl diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl index bac72d306a..f7278e1f47 100644 --- a/modules/core/src/opencl/arithm.cl +++ b/modules/core/src/opencl/arithm.cl @@ -286,24 +286,24 @@ #elif defined OP_CONVERT_SCALE_ABS #undef EXTRA_PARAMS -#define EXTRA_PARAMS , workT alpha, workT beta +#define EXTRA_PARAMS , workT1 alpha, workT1 beta #if wdepth <= 4 #define PROCESS_ELEM \ - workT value = mad24(srcelem1, alpha, beta); \ + workT value = mad24(srcelem1, (workT)(alpha), (workT)(beta)); \ storedst(convertToDT(value >= 0 ? value : -value)) #else #define PROCESS_ELEM \ - workT value = mad(srcelem1, alpha, beta); \ + workT value = mad(srcelem1, (workT)(alpha), (workT)(beta)); \ storedst(convertToDT(value >= 0 ? value : -value)) #endif #elif defined OP_SCALE_ADD #undef EXTRA_PARAMS -#define EXTRA_PARAMS , workT alpha +#define EXTRA_PARAMS , workT1 alpha #if wdepth <= 4 -#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, srcelem2))) +#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, (workT)(alpha), srcelem2))) #else -#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, alpha, srcelem2))) +#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, (workT)(alpha), srcelem2))) #endif #elif defined OP_CTP_AD || defined OP_CTP_AR diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 7ace7514be..44cb3f44b5 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -773,7 +773,7 @@ UMat& UMat::setTo(InputArray _value, InputArray _mask) ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts); if( !setK.empty() ) { - ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE1(tp)*scalarcn); + ocl::KernelArg scalararg(0, 0, 0, 0, buf, CV_ELEM_SIZE1(tp)*scalarcn); UMat mask; if( haveMask ) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index affe5c3fdc..51ee5bc0d9 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -4077,7 +4077,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0, matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F); k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0), - ocl::KernelArg(0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype))); + ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype))); size_t globalThreads[2] = { dst.cols, dst.rows }; return k.run(2, globalThreads, NULL, false); diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index fc0f6f9e95..cb280eced7 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -710,7 +710,9 @@ private: static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, double maxval, int thresh_type ) { - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), ktype = CV_MAKE_TYPE(depth, 1); + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), + kercn = cn <= 4 && cn != 3 ? cn : ocl::predictOptimalVectorWidth(_src, _dst), + ktype = CV_MAKE_TYPE(depth, kercn); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if ( !(thresh_type == THRESH_BINARY || thresh_type == THRESH_BINARY_INV || thresh_type == THRESH_TRUNC || @@ -733,11 +735,11 @@ static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, d if (depth <= CV_32S) thresh = cvFloor(thresh); - k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), - ocl::KernelArg::Constant(Mat(1, 1, ktype, thresh)), - ocl::KernelArg::Constant(Mat(1, 1, ktype, maxval))); + k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn, kercn), + ocl::KernelArg::Constant(Mat(1, 1, ktype, Scalar::all(thresh))), + ocl::KernelArg::Constant(Mat(1, 1, ktype, Scalar::all(maxval)))); - size_t globalsize[2] = { dst.cols * cn, dst.rows }; + size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows }; return k.run(2, globalsize, NULL, false); }