Merge pull request #24918 from opencv-pushbot:gitee/alalek/core_convertfp16_replacement

core(OpenCL): optimize convertTo() with CV_16F (convertFp16() replacement) #24918

relates #24909
relates #24917
relates #24892

Performance changes:

- [x] 12700K (1 thread) + Intel iGPU

|Name of Test|noOCL|convertFp16|convertTo BASE|convertTo PATCH|
|---|:-:|:-:|:-:|:-:|
|ConvertFP16FP32MatMat::OCL_Core|3.130|3.152|3.127|3.136|
|ConvertFP16FP32MatUMat::OCL_Core|3.030|3.996|3.007|2.671|
|ConvertFP16FP32UMatMat::OCL_Core|3.010|3.101|3.056|2.854|
|ConvertFP16FP32UMatUMat::OCL_Core|3.016|3.298|2.072|2.061|
|ConvertFP32FP16MatMat::OCL_Core|2.697|2.652|2.723|2.721|
|ConvertFP32FP16MatUMat::OCL_Core|2.752|4.268|2.662|2.947|
|ConvertFP32FP16UMatMat::OCL_Core|2.706|2.601|2.603|2.528|
|ConvertFP32FP16UMatUMat::OCL_Core|2.704|3.215|1.999|1.988|

Patched version is not worse than convertFp16 and the convertTo baseline (except MatUMat 32->16, where the baseline uses CPU code plus a dst buffer map).
There are still gaps against noOpenCL(CPU only) mode due to T-API implementation issues (unnecessary synchronization).


- [x] 12700K + AMD dGPU

|Name of Test|noOCL|convertFp16 dGPU|convertTo BASE dGPU|convertTo PATCH dGPU|
|---|:-:|:-:|:-:|:-:|
|ConvertFP16FP32MatMat::OCL_Core|3.130|3.133|3.172|3.087|
|ConvertFP16FP32MatUMat::OCL_Core|3.030|1.713|9.559|1.729|
|ConvertFP16FP32UMatMat::OCL_Core|3.010|6.515|6.309|4.452|
|ConvertFP16FP32UMatUMat::OCL_Core|3.016|0.242|23.597|0.170|
|ConvertFP32FP16MatMat::OCL_Core|2.697|2.641|2.713|2.689|
|ConvertFP32FP16MatUMat::OCL_Core|2.752|4.076|6.483|4.191|
|ConvertFP32FP16UMatMat::OCL_Core|2.706|9.042|16.481|1.834|
|ConvertFP32FP16UMatUMat::OCL_Core|2.704|0.229|15.730|0.176|

The convertTo baseline could not compile the OpenCL kernel for FP16 properly — FIXED by this patch.
dGPU has much more power, so results are 16-17x better than a single CPU core.
Patched version is not worse than convertFp16 and convertTo baseline.
There are still gaps against noOpenCL(CPU only) mode due to T-API implementation issues (unnecessary synchronization) and required memory transfers.

Co-authored-by: Alexander Alekhin <alexander.a.alekhin@gmail.com>
pull/24892/head
Alexander Alekhin 10 months ago committed by GitHub
parent ae21368eb9
commit 40533dbf69
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 2
      modules/core/include/opencv2/core.hpp
  2. 5
      modules/core/include/opencv2/core/ocl.hpp
  3. 8
      modules/core/include/opencv2/core/opencl/opencl_info.hpp
  4. 181
      modules/core/perf/opencl/perf_matop.cpp
  5. 154
      modules/core/src/convert.dispatch.cpp
  6. 10
      modules/core/src/ocl.cpp
  7. 3
      modules/core/src/ocl_disabled.impl.hpp
  8. 13
      modules/core/src/opencl/convert.cl
  9. 66
      modules/core/src/umatrix.cpp

@ -556,6 +556,8 @@ The format of half precision floating point is defined in IEEE 754-2008.
@param src input array.
@param dst output array.
@deprecated Use Mat::convertTo with CV_16F instead.
*/
CV_EXPORTS_W void convertFp16(InputArray src, OutputArray dst);

@ -127,6 +127,11 @@ public:
CV_WRAP int singleFPConfig() const;
CV_WRAP int halfFPConfig() const;
/// true if 'cl_khr_fp64' extension is available
CV_WRAP bool hasFP64() const;
/// true if 'cl_khr_fp16' extension is available
CV_WRAP bool hasFP16() const;
CV_WRAP bool endianLittle() const;
CV_WRAP bool errorCorrectionSupport() const;

@ -141,13 +141,13 @@ static void dumpOpenCLInformation()
DUMP_MESSAGE_STDOUT(" Max memory allocation size = " << maxMemAllocSizeStr);
DUMP_CONFIG_PROPERTY("cv_ocl_current_maxMemAllocSize", device.maxMemAllocSize());
const char* doubleSupportStr = device.doubleFPConfig() > 0 ? "Yes" : "No";
const char* doubleSupportStr = device.hasFP64() ? "Yes" : "No";
DUMP_MESSAGE_STDOUT(" Double support = " << doubleSupportStr);
DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.doubleFPConfig() > 0);
DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.hasFP64());
const char* halfSupportStr = device.halfFPConfig() > 0 ? "Yes" : "No";
const char* halfSupportStr = device.hasFP16() ? "Yes" : "No";
DUMP_MESSAGE_STDOUT(" Half support = " << halfSupportStr);
DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.halfFPConfig() > 0);
DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.hasFP16());
const char* isUnifiedMemoryStr = device.hostUnifiedMemory() ? "Yes" : "No";
DUMP_MESSAGE_STDOUT(" Host unified memory = " << isUnifiedMemoryStr);

@ -80,6 +80,187 @@ OCL_PERF_TEST_P(ConvertToFixture, ConvertTo,
SANITY_CHECK(dst);
}
// Perf comparison harness: by default these tests benchmark convertTo() with CV_16F;
// define RUN_CONVERTFP16 to benchmark the deprecated convertFp16() path instead.
//#define RUN_CONVERTFP16
// Common 4000x4000 input size shared by all ConvertFP16/FP32 perf tests below.
static Size convertFP16_srcSize(4000, 4000);
// FP32 -> FP16 conversion, Mat source and Mat destination.
OCL_PERF_TEST(Core, ConvertFP32FP16MatMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_32F;
const int dtype = CV_16F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
Mat src(srcSize, type);
Mat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP32 -> FP16 conversion, Mat source and UMat destination.
OCL_PERF_TEST(Core, ConvertFP32FP16MatUMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_32F;
const int dtype = CV_16F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
Mat src(srcSize, type);
UMat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP32 -> FP16 conversion, UMat source and Mat destination.
OCL_PERF_TEST(Core, ConvertFP32FP16UMatMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_32F;
const int dtype = CV_16F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
UMat src(srcSize, type);
Mat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP32 -> FP16 conversion, UMat source and UMat destination (pure OpenCL path).
OCL_PERF_TEST(Core, ConvertFP32FP16UMatUMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_32F;
const int dtype = CV_16F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
UMat src(srcSize, type);
UMat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP16 -> FP32 conversion, Mat source and Mat destination.
OCL_PERF_TEST(Core, ConvertFP16FP32MatMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_16F;
const int dtype = CV_32F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
Mat src(srcSize, type);
Mat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP16 -> FP32 conversion, Mat source and UMat destination.
OCL_PERF_TEST(Core, ConvertFP16FP32MatUMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_16F;
const int dtype = CV_32F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
Mat src(srcSize, type);
UMat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP16 -> FP32 conversion, UMat source and Mat destination.
OCL_PERF_TEST(Core, ConvertFP16FP32UMatMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_16F;
const int dtype = CV_32F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
UMat src(srcSize, type);
Mat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
// FP16 -> FP32 conversion, UMat source and UMat destination (pure OpenCL path).
OCL_PERF_TEST(Core, ConvertFP16FP32UMatUMat)
{
const Size srcSize = convertFP16_srcSize;
const int type = CV_16F;
const int dtype = CV_32F;
// Skip on devices that cannot allocate buffers of this size.
checkDeviceMaxMemoryAllocSize(srcSize, type);
checkDeviceMaxMemoryAllocSize(srcSize, dtype);
UMat src(srcSize, type);
UMat dst(srcSize, dtype);
declare.in(src, WARMUP_RNG).out(dst);
#ifdef RUN_CONVERTFP16
OCL_TEST_CYCLE() convertFp16(src, dst);
#else
OCL_TEST_CYCLE() src.convertTo(dst, dtype);
#endif
// Performance-only test: no output validation.
SANITY_CHECK_NOTHING();
}
///////////// CopyTo ////////////////////////
typedef Size_MatType CopyToFixture;

@ -169,52 +169,130 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
size_t globalsize[2] = { (size_t)src.cols * cn / kercn, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
// OpenCL implementation of convertTo() (dst = src*alpha + beta), running the
// "convertTo" kernel from opencl/convert.cl.
// Returns false when the device lacks required FP64/FP16 extension support or
// the kernel cannot be built/run, so the caller can fall back to the CPU path.
// @param src_    input array (any depth/channels)
// @param dst_    output array, created here with the same size and 'ddepth'
// @param ddepth  resolved destination depth (must be >= 0)
// @param noScale true when alpha==1 and beta==0, i.e. pure type conversion
static bool ocl_convertTo(InputArray src_, OutputArray dst_, int ddepth, bool noScale, double alpha, double beta)
{
CV_INSTRUMENT_REGION();
CV_Assert(ddepth >= 0);  // caller must resolve the destination depth first
int stype = src_.type();
int sdepth = CV_MAT_DEPTH(stype);
int cn = CV_MAT_CN(stype);
int dtype = CV_MAKETYPE(ddepth, cn);
// Working (intermediate) depth: compute in double only for CV_64F sources,
// otherwise in float.
int wdepth = (sdepth == CV_64F) ? CV_64F : CV_32F;
bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
bool doubleCheck = true;
if (needDouble)
{
// double requires the 'cl_khr_fp64' extension on the device
doubleCheck = ocl::Device::getDefault().hasFP64();
}
bool halfCheck = true;
bool needHalf = sdepth == CV_16F || ddepth == CV_16F;
if (needHalf)
{
// half requires the 'cl_khr_fp16' extension on the device
halfCheck = ocl::Device::getDefault().hasFP16();
}
if (!doubleCheck)
return false;  // fall back to CPU implementation
if (!halfCheck)
return false;  // fall back to CPU implementation
const int rowsPerWI = 4;  // rows processed by each work-item
char cvt[2][50];
// Build options select src/working/dst element types and conversions;
// DOUBLE_SUPPORT/HALF_SUPPORT enable the corresponding extensions inside the
// kernel source; NO_SCALE compiles out the alpha/beta kernel parameters.
ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s -D rowsPerWI=%d%s%s%s",
ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth),
ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0], sizeof(cvt[0])),
ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1], sizeof(cvt[1])),
rowsPerWI,
needDouble ? " -D DOUBLE_SUPPORT" : "",
needHalf ? " -D HALF_SUPPORT" : "",
noScale ? " -D NO_SCALE" : ""
)
);
if (k.empty())
return false;
UMat src = src_.getUMat();
dst_.createSameSize(src_, dtype);
UMat dst = dst_.getUMat();
float alphaf = (float)alpha, betaf = (float)beta;
// The argument list must match the kernel signature, which changes with
// NO_SCALE; scale/shift are passed at the working depth (float or double).
if (noScale)
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn));
else if (wdepth == CV_32F)
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alphaf, betaf);
else
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alpha, beta);
size_t globalsize[2] = {
(size_t)dst.cols * cn,        // one work-item per element column
divUp((size_t)dst.rows, rowsPerWI)  // rows are batched per work-item
};
if (!k.run(2, globalsize, NULL, false))
return false;
CV_IMPL_ADD(CV_IMPL_OCL);
return true;
}
#endif
void Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
void Mat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
{
CV_INSTRUMENT_REGION();
if( empty() )
if (empty())
{
_dst.release();
dst.release();
return;
}
bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;
int stype = type();
int sdepth = CV_MAT_DEPTH(stype);
if( _type < 0 )
_type = _dst.fixedType() ? _dst.type() : type();
int ddepth = sdepth;
if (type_ >= 0)
ddepth = CV_MAT_DEPTH(type_);
else
_type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels());
ddepth = dst.fixedType() ? dst.depth() : sdepth;
int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type);
if( sdepth == ddepth && noScale )
bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
if (sdepth == ddepth && noScale)
{
copyTo(_dst);
copyTo(dst);
return;
}
CV_OCL_RUN(dims <= 2 && dst.isUMat(),
ocl_convertTo(*this, dst, ddepth, noScale, alpha, beta))
int cn = channels();
int dtype = CV_MAKETYPE(ddepth, cn);
Mat src = *this;
if( dims <= 2 )
_dst.create( size(), _type );
else
_dst.create( dims, size, _type );
Mat dst = _dst.getMat();
dst.create(dims, size, dtype);
Mat dstMat = dst.getMat();
BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
double scale[] = {alpha, beta};
int cn = channels();
CV_Assert( func != 0 );
if( dims <= 2 )
{
Size sz = getContinuousSize2D(src, dst, cn);
func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
Size sz = getContinuousSize2D(src, dstMat, cn);
func(src.data, src.step, 0, 0, dstMat.data, dstMat.step, sz, scale);
}
else
{
const Mat* arrays[] = {&src, &dst, 0};
const Mat* arrays[] = {&src, &dstMat, 0};
uchar* ptrs[2] = {};
NAryMatIterator it(arrays, ptrs);
Size sz((int)(it.size*cn), 1);
@ -224,6 +302,44 @@ void Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) cons
}
}
// Converts the array to another datatype with optional scaling
// (dst = *this * alpha + beta). Tries the OpenCL kernel first for 2D arrays;
// otherwise maps the data to host memory and delegates to Mat::convertTo().
void UMat::convertTo(OutputArray dst, int type_, double alpha, double beta) const
{
CV_INSTRUMENT_REGION();
if (empty())
{
dst.release();
return;
}
#ifdef HAVE_OPENCL
int stype = type();
int sdepth = CV_MAT_DEPTH(stype);
int ddepth = sdepth;
if (type_ >= 0)
ddepth = CV_MAT_DEPTH(type_);
else
// Negative requested type means "keep the source depth" unless the
// destination already has a fixed type.
ddepth = dst.fixedType() ? dst.depth() : sdepth;
bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
if (sdepth == ddepth && noScale)
{
// Identity conversion: a plain copy is sufficient.
copyTo(dst);
return;
}
// CV_OCL_RUN returns from this function when the OpenCL path succeeds.
CV_OCL_RUN(dims <= 2,
ocl_convertTo(*this, dst, ddepth, noScale, alpha, beta))
#endif // HAVE_OPENCL
UMat src = *this; // Fake reference to itself.
// Resolves issue 8693 in case of src == dst.
Mat m = getMat(ACCESS_READ);
m.convertTo(dst, type_, alpha, beta);
(void)src;
}
//==================================================================================================
void convertFp16(InputArray _src, OutputArray _dst)

@ -1604,6 +1604,9 @@ struct Device::Impl
pos = pos2 + 1;
}
khr_fp64_support_ = isExtensionSupported("cl_khr_fp64");
khr_fp16_support_ = isExtensionSupported("cl_khr_fp16");
intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
vendorName_ = getStrProp(CL_DEVICE_VENDOR);
@ -1692,7 +1695,9 @@ struct Device::Impl
String version_;
std::string extensions_;
int doubleFPConfig_;
bool khr_fp64_support_;
int halfFPConfig_;
bool khr_fp16_support_;
bool hostUnifiedMemory_;
int maxComputeUnits_;
size_t maxWorkGroupSize_;
@ -1844,6 +1849,11 @@ int Device::singleFPConfig() const
// CL_DEVICE_HALF_FP_CONFIG capability bits; 0 when no device is selected.
int Device::halfFPConfig() const
{ return p ? p->halfFPConfig_ : 0; }
// true if the 'cl_khr_fp64' extension is available; false when no device.
bool Device::hasFP64() const
{ return p ? p->khr_fp64_support_ : false; }
// true if the 'cl_khr_fp16' extension is available; false when no device.
bool Device::hasFP16() const
{ return p ? p->khr_fp16_support_ : false; }
// CL_DEVICE_ENDIAN_LITTLE property; false when no device is selected.
bool Device::endianLittle() const
{ return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }

@ -67,6 +67,9 @@ int Device::doubleFPConfig() const { OCL_NOT_AVAILABLE(); }
// Stubs for builds without OpenCL support: every query raises the
// "OpenCL is not available" error instead of returning a value.
int Device::singleFPConfig() const { OCL_NOT_AVAILABLE(); }
int Device::halfFPConfig() const { OCL_NOT_AVAILABLE(); }
bool Device::hasFP64() const { OCL_NOT_AVAILABLE(); }
bool Device::hasFP16() const { OCL_NOT_AVAILABLE(); }
bool Device::endianLittle() const { OCL_NOT_AVAILABLE(); }
bool Device::errorCorrectionSupport() const { OCL_NOT_AVAILABLE(); }

@ -49,14 +49,21 @@
#endif
#endif
#ifdef HALF_SUPPORT
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16:enable
#endif
#endif
#define noconvert
__kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
#ifndef NO_SCALE
WT alpha, WT beta,
, WT alpha, WT beta
#endif
int rowsPerWI)
)
{
int x = get_global_id(0);
int y0 = get_global_id(1) * rowsPerWI;

@ -1233,70 +1233,10 @@ void UMat::copyTo(OutputArray _dst, InputArray _mask) const
src.copyTo(_dst, _mask);
}
// Legacy implementation of UMat::convertTo() (dst = *this * alpha + beta).
// NOTE(review): this copy is the one removed from umatrix.cpp by this commit;
// the implementation now lives in convert.dispatch.cpp.
void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
{
CV_INSTRUMENT_REGION();
bool noScale = std::fabs(alpha - 1) < DBL_EPSILON && std::fabs(beta) < DBL_EPSILON;
int stype = type(), cn = CV_MAT_CN(stype);
if( _type < 0 )
_type = _dst.fixedType() ? _dst.type() : stype;
else
// Keep the source channel count; only the depth may change.
_type = CV_MAKETYPE(CV_MAT_DEPTH(_type), cn);
int sdepth = CV_MAT_DEPTH(stype), ddepth = CV_MAT_DEPTH(_type);
if( sdepth == ddepth && noScale )
{
// Identity conversion: a plain copy is sufficient.
copyTo(_dst);
return;
}
#ifdef HAVE_OPENCL
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
// OpenCL path is taken only for 2D arrays with a UMat destination, and only
// when any CV_64F depth is backed by device double support.
if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() &&
((needDouble && doubleSupport) || !needDouble) )
{
// Working depth is at least float; each work-item handles rowsPerWI rows.
int wdepth = std::max(CV_32F, sdepth), rowsPerWI = 4;
char cvt[2][50];
// Build options select src/working/dst types; NO_SCALE compiles out the
// alpha/beta parameters. No HALF_SUPPORT flag here — CV_16F kernels could
// not be compiled properly by this version (fixed in the replacement).
ocl::Kernel k("convertTo", ocl::core::convert_oclsrc,
format("-D srcT=%s -D WT=%s -D dstT=%s -D convertToWT=%s -D convertToDT=%s%s%s",
ocl::typeToStr(sdepth), ocl::typeToStr(wdepth), ocl::typeToStr(ddepth),
ocl::convertTypeStr(sdepth, wdepth, 1, cvt[0], sizeof(cvt[0])),
ocl::convertTypeStr(wdepth, ddepth, 1, cvt[1], sizeof(cvt[1])),
doubleSupport ? " -D DOUBLE_SUPPORT" : "", noScale ? " -D NO_SCALE" : ""));
if (!k.empty())
{
UMat src = *this;
_dst.create( size(), _type );
UMat dst = _dst.getUMat();
float alphaf = (float)alpha, betaf = (float)beta;
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
dstarg = ocl::KernelArg::WriteOnly(dst, cn);
// Argument list depends on NO_SCALE; scale/shift use the working depth.
if (noScale)
k.args(srcarg, dstarg, rowsPerWI);
else if (wdepth == CV_32F)
k.args(srcarg, dstarg, alphaf, betaf, rowsPerWI);
else
k.args(srcarg, dstarg, alpha, beta, rowsPerWI);
size_t globalsize[2] = { (size_t)dst.cols * cn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
if (k.run(2, globalsize, NULL, false))
{
CV_IMPL_ADD(CV_IMPL_OCL);
return;
}
}
}
#endif
UMat src = *this; // Fake reference to itself.
// Resolves issue 8693 in case of src == dst.
Mat m = getMat(ACCESS_READ);
m.convertTo(_dst, _type, alpha, beta);
}
//
// void UMat::convertTo moved to convert.dispatch.cpp
//
UMat& UMat::setTo(InputArray _value, InputArray _mask)
{

Loading…
Cancel
Save