Merge remote-tracking branch 'upstream/3.4' into merge-3.4

4 years ago · c3ac834526
parent c7e0888982 f605373a2b
commit c3ac834526
35 changed files with 1014 additions and 444 deletions
--- a/cmake/OpenCVDetectInferenceEngine.cmake
+++ b/cmake/OpenCVDetectInferenceEngine.cmake
@ -141,8 +141,8 @@ if(INF_ENGINE_TARGET)
    endif()
  endif()
  if(NOT INF_ENGINE_RELEASE AND NOT INF_ENGINE_RELEASE_INIT)
-    message(WARNING "InferenceEngine version has not been set, 2021.4 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
-    set(INF_ENGINE_RELEASE_INIT "2021040000")
+    message(WARNING "InferenceEngine version has not been set, 2021.4.1 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
+    set(INF_ENGINE_RELEASE_INIT "2021040100")
  elseif(DEFINED INF_ENGINE_RELEASE)
    set(INF_ENGINE_RELEASE_INIT "${INF_ENGINE_RELEASE}")
  endif()
--- a/doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown
+++ b/doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown
@ -98,7 +98,7 @@ import numpy as np
 import cv2 as cv
 from matplotlib import pyplot as plt

-img = cv.imread('simple.jpg',0)
+img = cv.imread('blox.jpg',0) # `<opencv_root>/samples/data/blox.jpg`

 # Initiate FAST object with default values
 fast = cv.FastFeatureDetector_create()
@ -113,17 +113,17 @@ print( "nonmaxSuppression:{}".format(fast.getNonmaxSuppression()) )
 print( "neighborhood: {}".format(fast.getType()) )
 print( "Total Keypoints with nonmaxSuppression: {}".format(len(kp)) )

-cv.imwrite('fast_true.png',img2)
+cv.imwrite('fast_true.png', img2)

 # Disable nonmaxSuppression
 fast.setNonmaxSuppression(0)
-kp = fast.detect(img,None)
+kp = fast.detect(img, None)

 print( "Total Keypoints without nonmaxSuppression: {}".format(len(kp)) )

 img3 = cv.drawKeypoints(img, kp, None, color=(255,0,0))

-cv.imwrite('fast_false.png',img3)
+cv.imwrite('fast_false.png', img3)
@endcode
 See the results. The first image shows FAST with nonmaxSuppression and the second one shows it without
 nonmaxSuppression:
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@ -499,8 +499,8 @@ public:
    template<typename... _Tps> inline
    Kernel& args(const _Tps&... kernel_args) { set_args_(0, kernel_args...); return *this; }

+    /** @brief Run the OpenCL kernel (globalsize value may be adjusted)

-    /** @brief Run the OpenCL kernel.
    @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
    @param globalsize work items for each dimension. It is not the final globalsize passed to
      OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
@ -509,12 +509,26 @@ public:
    @param localsize work-group size for each dimension.
    @param sync specify whether to wait for OpenCL computation to finish before return.
    @param q command queue
+
+    @note Use run_() if your kernel code doesn't support adjusted globalsize.
    */
    bool run(int dims, size_t globalsize[],
             size_t localsize[], bool sync, const Queue& q=Queue());
+
+    /** @brief Run the OpenCL kernel with globalsize passed through unchanged
+     *
+     * Counterpart of run(): run() may adjust the globalsize value, while this variant hands
+     * globalsize to OpenCL exactly as given. Use it when the kernel code doesn't support an
+     * adjusted globalsize.
+     *
+     * @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
+     * @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
+     * @param localsize work-group size for each dimension.
+     * @param sync specify whether to wait for OpenCL computation to finish before return.
+     * @param q command queue
+     * @return status flag of the launch. NOTE(review): presumably true on success and false on
+     *         failure -- confirm against the implementation.
+     */
+    bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
+
    bool runTask(bool sync, const Queue& q=Queue());

-    /** @brief Similar to synchronized run() call with returning of kernel execution time
+    /** @brief Similar to a synchronized run_() call, but returns the kernel execution time
+     *
     * Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
     * @return Execution time in nanoseconds or negative number on error
     */
--- a/modules/core/src/intel_gpu_gemm.inl.hpp
+++ b/modules/core/src/intel_gpu_gemm.inl.hpp
@ -24,11 +24,6 @@

 #ifdef HAVE_OPENCL

-#include <sstream>
-#include "opencl_kernels_core.hpp"
-#include "opencv2/core/opencl/runtime/opencl_clblas.hpp"
-#include "opencv2/core/opencl/runtime/opencl_core.hpp"
-
 namespace cv
 {

@ -37,52 +32,75 @@ static bool intel_gpu_gemm(
    UMat B, Size sizeB,
    UMat D, Size sizeD,
    double alpha, double beta,
-    bool atrans, bool btrans)
+    bool atrans, bool btrans,
+    bool& isPropagatedC2D
+)
 {
    CV_UNUSED(sizeB);

    int M = sizeD.height, N = sizeD.width, K = ((atrans)? sizeA.height : sizeA.width);

+    if (M < 4 || N < 4 || K < 4)  // vload4
+        return false;
+
+    CV_LOG_VERBOSE(NULL, 0, "M=" << M << " N=" << N << " K=" << K);
+
    std::string kernelName;
-    bool ret = true;

-    size_t lx = 8, ly = 4;
-    size_t dx = 4, dy = 8;
+    unsigned int lx = 8, ly = 4;
+    unsigned int dx = 4, dy = 8;

    if(!atrans && !btrans)
    {
-
        if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0)
        {
            kernelName = "intelblas_gemm_buffer_NN_sp";
        }
        else
        {
+            if (M % 2 != 0)
+                return false;
+            // vload4(0, dst_write0) - 4 cols
+            // multiply by lx: 8
+            if (N % (4*8) != 0)
+                return false;
            kernelName = "intelblas_gemm_buffer_NN";
        }
    }
    else if(atrans && !btrans)
    {
+        if (M % 32 != 0)
+            return false;
+        if (N % 32 != 0)
+            return false;
        kernelName = "intelblas_gemm_buffer_TN";
    }
    else if(!atrans && btrans)
    {
+        if (K % 4 != 0)
+            return false;
        kernelName = "intelblas_gemm_buffer_NT";
        ly = 16;
        dx = 1;
    }
    else
    {
+        if (M % 32 != 0)
+            return false;
+        if (N % 32 != 0)
+            return false;
+        if (K % 16 != 0)
+            return false;
        kernelName = "intelblas_gemm_buffer_TT";
    }

-    const size_t gx = (size_t)(N + dx - 1) / dx;
-    const size_t gy = (size_t)(M + dy - 1) / dy;
+    CV_LOG_DEBUG(NULL, "kernel: " << kernelName << "  (M=" << M << " N=" << N << " K=" << K << ")");

-    size_t local[] = {lx, ly, 1};
-    size_t global[] = {(gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1};
+    const size_t gx = divUp((size_t)N, dx);
+    const size_t gy = divUp((size_t)M, dy);

-    int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
+    size_t local[] = {lx, ly, 1};
+    size_t global[] = {roundUp(gx, lx), roundUp(gy, ly), 1};

    ocl::Queue q;
    String errmsg;
@ -110,10 +128,13 @@ static bool intel_gpu_gemm(
               (int)(D.step / sizeof(float))
        );

-        ret = k.run(2, global, local, false, q);
+        bool ret = k.run(2, global, local, false, q);
+        return ret;
    }
    else
    {
+        int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
+
        for(int start_index = 0; start_index < K; start_index += stride)
        {
             ocl::Kernel k(kernelName.c_str(), program);
@ -132,12 +153,16 @@ static bool intel_gpu_gemm(
                    (int) start_index,                          // 14 start_index
                    stride);

-            ret = k.run(2, global, local, false, q);
-            if (!ret) return ret;
+            bool ret = k.run(2, global, local, false, q);
+            if (!ret)
+            {
+                if (start_index != 0)
+                    isPropagatedC2D = false;  // D array content is changed, need to rewrite
+                return false;
+            }
        }
+        return true;
    }
-
-    return ret;
 }

 } // namespace cv
--- a/modules/core/src/matmul.dispatch.cpp
+++ b/modules/core/src/matmul.dispatch.cpp
@ -42,6 +42,8 @@
 //M*/

 #include "precomp.hpp"
+#include <opencv2/core/utils/logger.hpp>
+
 #include "opencl_kernels_core.hpp"
 #include "opencv2/core/opencl/runtime/opencl_clblas.hpp"
 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
@ -155,10 +157,12 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
 static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
                      InputArray matC, double beta, OutputArray matD, int flags )
 {
-    int depth = matA.depth(), cn = matA.channels();
-    int type = CV_MAKETYPE(depth, cn);
+    int type = matA.type();
+    int depth = CV_MAT_DEPTH(type);
+    int cn = CV_MAT_CN(type);

-    CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
+    CV_CheckTypeEQ(type, matB.type(), "");
+    CV_CheckType(type, type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2, "");

    const ocl::Device & dev = ocl::Device::getDefault();
    bool doubleSupport = dev.doubleFPConfig() > 0;
@ -170,88 +174,103 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
    Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
    bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;

-    CV_Assert( !haveC || matC.type() == type );
-
-    Size sizeD(((btrans)? sizeB.height : sizeB.width),
-               ((atrans)? sizeA.width : sizeA.height));
-    matD.create(sizeD, type);
-
-    UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
-
-
-    if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
-    {
-        String opts;
-
-        if (atrans)
-            sizeA = Size(sizeA.height, sizeA.width);
-        if (btrans)
-            sizeB = Size(sizeB.height, sizeB.width);
-        if (haveC && ctrans)
-            sizeC = Size(sizeC.height, sizeC.width);
-
-        CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
-
-        int max_wg_size = (int)dev.maxWorkGroupSize();
-        int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
-
-        if (atrans)
-            A = A.t();
-
-        if (btrans)
-            B = B.t();
+    if (haveC)
+        CV_CheckTypeEQ(type, matC.type(), "");

-        if (haveC)
-            ctrans ? transpose(matC, D) : matC.copyTo(D);
+    Size sizeD(((btrans) ? sizeB.height : sizeB.width),
+               ((atrans) ? sizeA.width : sizeA.height));

-        int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
-        int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
+    if (atrans)
+        sizeA = Size(sizeA.height, sizeA.width);
+    if (btrans)
+        sizeB = Size(sizeB.height, sizeB.width);
+    if (haveC && ctrans)
+        sizeC = Size(sizeC.height, sizeC.width);

-        opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
-                          ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
-                          cn, kercn, block_size,
-                          (sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
-                          haveC ? " -D HAVE_C" : "",
-                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+    CV_CheckEQ(sizeA.width, sizeB.height, "");
+    if (haveC)
+        CV_CheckEQ(sizeC, sizeD, "");

-        ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
-        if (k.empty())
-            return false;
+    UMat A = matA.getUMat();
+    UMat B = matB.getUMat();

-        if (depth == CV_64F)
-            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
-                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
-                   ocl::KernelArg::ReadWrite(D, cn, kercn),
-                   sizeA.width, alpha, beta);
-        else
-            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
-                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
-                   ocl::KernelArg::ReadWrite(D, cn, kercn),
-                   sizeA.width, (float)alpha, (float)beta);
+    matD.create(sizeD, type);
+    UMat D = matD.getUMat();

-        size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
-        size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
+    bool isPropagatedC2D = false;  // D content is updated with C / C.t()

-        return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
-    }
-    else
+    if (dev.intelSubgroupsSupport() && (depth == CV_32F) && cn == 1)
    {
        if (haveC && beta != 0.0)
        {
            ctrans ? transpose(matC, D) : matC.copyTo(D);
+            isPropagatedC2D = true;
        }
        else
        {
            beta = 0.0;
        }

-        return intel_gpu_gemm(A, sizeA,
-                              B, sizeB,
-                              D, sizeD,
-                              alpha,
-                              beta,
-                              atrans, btrans);
+        bool res = intel_gpu_gemm(A, matA.size(),
+                                  B, matB.size(),
+                                  D, sizeD,
+                                  alpha,
+                                  beta,
+                                  atrans, btrans,
+                                  isPropagatedC2D);
+        if (res)
+            return true;
+        // fallback on generic OpenCL code
    }
+
+    if (sizeD.width < 8 || sizeD.height < 8)
+        return false;
+
+    String opts;
+
+    int wg_size = (int)dev.maxWorkGroupSize();
+    int sizeDmin = std::min(sizeD.width, sizeD.height);
+    wg_size = std::min(wg_size, sizeDmin * sizeDmin);
+    int block_size = (wg_size / (32*cn) < 32) ? (wg_size / (16*cn) < 16) ? (wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
+
+    if (atrans)
+        A = A.t();
+
+    if (btrans)
+        B = B.t();
+
+    if (haveC && !isPropagatedC2D)
+        ctrans ? transpose(matC, D) : matC.copyTo(D);
+
+    int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
+    int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
+
+    opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
+                      ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
+                      cn, kercn, block_size,
+                      (sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
+                      haveC ? " -D HAVE_C" : "",
+                      doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+
+    ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
+    if (k.empty())
+        return false;
+
+    if (depth == CV_64F)
+        k.args(ocl::KernelArg::ReadOnlyNoSize(A),
+               ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
+               ocl::KernelArg::ReadWrite(D, cn, kercn),
+               sizeA.width, alpha, beta);
+    else
+        k.args(ocl::KernelArg::ReadOnlyNoSize(A),
+               ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
+               ocl::KernelArg::ReadWrite(D, cn, kercn),
+               sizeA.width, (float)alpha, (float)beta);
+
+    size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
+    size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
+
+    return k.run(2, globalsize, block_size !=1 ? localsize : NULL, false);
 }
 #endif

--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -76,8 +76,11 @@
 #undef CV__ALLOCATOR_STATS_LOG

 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
+#define CV_OPENCL_SHOW_BUILD_OPTIONS             0  // non-zero: log (CV_LOG_INFO) the build flags of each OpenCL program just before it is compiled
+#define CV_OPENCL_SHOW_BUILD_KERNELS             0  // non-zero: after compiling, query CL_PROGRAM_KERNEL_NAMES and log the kernel names (or an error if they can't be retrieved)

 #define CV_OPENCL_SHOW_RUN_KERNELS               0
+#define CV_OPENCL_SYNC_RUN_KERNELS               0  // non-zero: Kernel::Impl::run() forces sync = true for every kernel launch
 #define CV_OPENCL_TRACE_CHECK                    0

 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
@ -2155,20 +2158,22 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
        platforms.resize(numPlatforms);
    }

-    int selectedPlatform = -1;
    if (platform.length() > 0)
    {
-        for (size_t i = 0; i < platforms.size(); i++)
+        for (std::vector<cl_platform_id>::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end();)
        {
            std::string name;
-            CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
+            CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, *currentPlatform, CL_PLATFORM_NAME, name));
            if (name.find(platform) != std::string::npos)
            {
-                selectedPlatform = (int)i;
-                break;
+                ++currentPlatform;
+            }
+            else
+            {
+                currentPlatform = platforms.erase(currentPlatform);
            }
        }
-        if (selectedPlatform == -1)
+        if (platforms.size() == 0)
        {
            CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
            goto not_found;
@ -2205,13 +2210,11 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
            goto not_found;
        }

-        std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
-        for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
-                (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
-                i++)
+        std::vector<cl_device_id> devices;
+        for (std::vector<cl_platform_id>::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end(); ++currentPlatform)
        {
            cl_uint count = 0;
-            cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
+            cl_int status = clGetDeviceIDs(*currentPlatform, deviceType, 0, NULL, &count);
            if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
            {
                CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
@ -2220,7 +2223,7 @@ static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
                continue;
            size_t base = devices.size();
            devices.resize(base + count);
-            status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
+            status = clGetDeviceIDs(*currentPlatform, deviceType, count, &devices[base], &count);
            if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
            {
                CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
@ -3679,6 +3682,8 @@ static cv::String dumpValue(size_t sz, const void* p)
 {
    if (!p)
        return "NULL";
+    if (sz == 2)
+        return cv::format("%d / %uu / 0x%04x", *(short*)p, *(unsigned short*)p, *(short*)p);
    if (sz == 4)
        return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p);
    if (sz == 8)
@ -3851,6 +3856,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
 }


+bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[],
+                  bool sync, const Queue& q)
+{  // Thin forwarder: hands the caller's global/local sizes to the implementation object without modifying them here.
+    CV_Assert(p);  // the kernel must have an implementation object before it can be run
+    return p->run(dims, _globalsize, _localsize, sync, NULL, q);  // NOTE(review): meaning of the NULL argument is not visible here -- confirm against Kernel::Impl::run
+}
+
+
 static bool isRaiseErrorOnReuseAsyncKernel()
 {
    static bool initialized = false;
@ -3891,6 +3904,10 @@ bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
        return false;  // OpenCV 5.0: raise error
    }

+#if CV_OPENCL_SYNC_RUN_KERNELS
+    sync = true;
+#endif
+
    cl_command_queue qq = getQueue(q);
    if (haveTempDstUMats)
        sync = true;
@ -4338,7 +4355,28 @@ struct Program::Impl
            if (!param_buildExtraOptions.empty())
                buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
        }
+#if CV_OPENCL_SHOW_BUILD_OPTIONS
+        CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' options:" << buildflags);
+#endif
        compile(ctx, src_, errmsg);
+#if CV_OPENCL_SHOW_BUILD_KERNELS
+        if (handle)
+        {
+            size_t retsz = 0;
+            char kernels_buffer[4096] = {0};
+            cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
+            CV_OCL_DBG_CHECK_RESULT(result, cv::format("clGetProgramInfo(CL_PROGRAM_KERNEL_NAMES: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
+            if (result == CL_SUCCESS && retsz < sizeof(kernels_buffer))
+            {
+                kernels_buffer[retsz] = 0;
+                CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' kernels: '" << kernels_buffer << "'");
+            }
+            else
+            {
+                CV_LOG_ERROR(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' can't retrieve kernel names!");
+            }
+        }
+#endif
    }

    bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
@ -4570,7 +4608,6 @@ struct Program::Impl
                CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
            }
 #endif
-
        }
        return handle != NULL;
    }
--- a/modules/core/src/opencl/intel_gemm.cl
+++ b/modules/core/src/opencl/intel_gemm.cl
@ -392,6 +392,15 @@ __kernel void intelblas_gemm_buffer_NN(
 #define TILE_N          8
 #define SLM_BLOCK       512

+/*
+   A     K         B.t()   K            D     N
+    -----------       -----------        -----------
+    |         |       |         |        |         |
+  M |         |  x  N |         |  =>  M |         |
+    |         |       |         |        |         |
+    -----------       -----------        -----------
+*/
+
 __attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1)))
 __kernel void intelblas_gemm_buffer_NT(
    const __global float *src0, int off0,
@ -422,59 +431,79 @@ __kernel void intelblas_gemm_buffer_NT(
    float8 dot06 = 0.f;
    float8 dot07 = 0.f;

-    float4 brow0;
-    float4 brow1;
-    float4 brow2;
-    float4 brow3;
-    float4 brow4;
-    float4 brow5;
-    float4 brow6;
-    float4 brow7;
-
-    __global float *dst_write0 = dst + local_x * VEC_SIZE + ( group_x * TILE_N ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * ldC + offd;
+    const int dst_row = (global_y * TILE_M);
+    __global float *dst_write0 = dst + global_x + dst_row * ldC + offd;

-    const __global float *src0_read = src0 + local_x * ( TILE_K / 8 ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M ) * ldA + off0;
+    const __global float *src0_read00 = src0 + off0;
+    const int a_row_base = global_y * TILE_M;
+    const int a_col_base = local_x * (TILE_K / 8);  // <= TILE_K - 4

-    const __global float *src1_read0 = src1 + ( group_x * TILE_N ) * ldB + off1;
+    const __global float *src1_read00 = src1 + off1;
+    const int b_row_base = (group_x * TILE_N);
+    //const int b_col_base = 0;

    __local float slm_brow[8 * SLM_BLOCK];
-    __local float* slm_brow0;

    int local_index = mad24(local_y, 8, local_x) * 4;
-    int w;
-    for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {
+    int w = 0;
+    for (int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK)
+    {
+#define UPDATE_BROW(_row) \
+        { \
+            float4 brow; \
+            int b_row = b_row_base + _row; \
+            int b_col = b_tile + local_index; \
+            if (b_row < N && b_col <= K - 4 /*vload4*/) \
+                brow = vload4(0, src1_read00 + mad24(b_row, ldB, b_col)); \
+            else \
+                brow = (float4)0; \
+            vstore4(brow, 0, slm_brow + mad24(_row, SLM_BLOCK, local_index)); \
+        }
+
        barrier(CLK_LOCAL_MEM_FENCE);
-        vstore4(vload4(0, src1_read0 + mad24(0, ldB, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(1, ldB, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(2, ldB, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(3, ldB, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(4, ldB, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(5, ldB, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(6, ldB, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(7, ldB, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index));
+        UPDATE_BROW(0);
+        UPDATE_BROW(1);
+        UPDATE_BROW(2);
+        UPDATE_BROW(3);
+        UPDATE_BROW(4);
+        UPDATE_BROW(5);
+        UPDATE_BROW(6);
+        UPDATE_BROW(7);
        barrier(CLK_LOCAL_MEM_FENCE);
-
-        slm_brow0 = slm_brow + local_x * (TILE_K / 8);
-        w = b_tile;
-        int end_w = min(b_tile + SLM_BLOCK, K);
-        while( w + TILE_K <= end_w ) {
-            float4 arow;
-
-            brow0 = vload4(0, slm_brow0 + 0 * SLM_BLOCK);
-            brow1 = vload4(0, slm_brow0 + 1 * SLM_BLOCK);
-            brow2 = vload4(0, slm_brow0 + 2 * SLM_BLOCK);
-            brow3 = vload4(0, slm_brow0 + 3 * SLM_BLOCK);
-            brow4 = vload4(0, slm_brow0 + 4 * SLM_BLOCK);
-            brow5 = vload4(0, slm_brow0 + 5 * SLM_BLOCK);
-            brow6 = vload4(0, slm_brow0 + 6 * SLM_BLOCK);
-            brow7 = vload4(0, slm_brow0 + 7 * SLM_BLOCK);
-
-#define MM_DOT_PRODUCT(_row,_dot)   \
-            arow = vload4(0, src0_read + _row * ldA);                           \
-            _dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
-            _dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
-            _dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
-            _dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );
+#undef UPDATE_BROW
+
+        for (int k_tile_offset = 0; k_tile_offset < SLM_BLOCK; k_tile_offset += TILE_K)
+        {
+            int a_col = a_col_base + b_tile + k_tile_offset;
+
+            if (a_col > K - 4 /*vload4*/)
+                break;
+
+            int slm_brow_col = a_col_base + k_tile_offset;  // <= SLM_BLOCK - 4
+#define READ_SLM_BROW(_row) \
+            float4 brow##_row = vload4(0, slm_brow + mad24(_row, SLM_BLOCK, slm_brow_col));
+
+            READ_SLM_BROW(0);
+            READ_SLM_BROW(1);
+            READ_SLM_BROW(2);
+            READ_SLM_BROW(3);
+            READ_SLM_BROW(4);
+            READ_SLM_BROW(5);
+            READ_SLM_BROW(6);
+            READ_SLM_BROW(7);
+#undef READ_SLM_BROW
+
+#define MM_DOT_PRODUCT(_row,_dot) \
+            { \
+                int a_row = a_row_base + _row; \
+                if (a_row < M) { \
+                    float4 arow = vload4(0, src0_read00 + mad24(a_row, ldA, a_col)); \
+                    _dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
+                    _dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
+                    _dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
+                    _dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot ); \
+                } \
+            }

            MM_DOT_PRODUCT(0,dot00);
            MM_DOT_PRODUCT(1,dot01);
@ -485,53 +514,7 @@ __kernel void intelblas_gemm_buffer_NT(
            MM_DOT_PRODUCT(6,dot06);
            MM_DOT_PRODUCT(7,dot07);
 #undef MM_DOT_PRODUCT
-
-            src0_read += TILE_K;
-            slm_brow0 += TILE_K;
-            w += TILE_K;
        }
-        src1_read0 += SLM_BLOCK;
-    }
-
-    if(w < K) {
-        float4 arow;
-
-#define READ_BROW(_brow,_row) \
-        _brow = vload4(0, slm_brow0 + _row * SLM_BLOCK); \
-        _brow.x = (mad24(local_x, 4, w) < K) ? _brow.x : 0.0f; \
-        _brow.y = (mad24(local_x, 4, w + 1) < K) ? _brow.y : 0.0f; \
-        _brow.z = (mad24(local_x, 4, w + 2) < K) ? _brow.z : 0.0f; \
-        _brow.w = (mad24(local_x, 4, w + 3) < K) ? _brow.w : 0.0f;
-
-        READ_BROW(brow0,0);
-        READ_BROW(brow1,1);
-        READ_BROW(brow2,2);
-        READ_BROW(brow3,3);
-        READ_BROW(brow4,4);
-        READ_BROW(brow5,5);
-        READ_BROW(brow6,6);
-        READ_BROW(brow7,7);
-
-#define MM_DOT_PRODUCT(_row,_dot)   \
-        arow = vload4(0, src0_read + _row * ldA);  \
-        arow.x = (mad24(local_x, 4, w) < K) ? arow.x : 0.0f; \
-        arow.y = (mad24(local_x, 4, w + 1) < K) ? arow.y : 0.0f; \
-        arow.z = (mad24(local_x, 4, w + 2) < K) ? arow.z : 0.0f; \
-        arow.w = (mad24(local_x, 4, w + 3) < K) ? arow.w : 0.0f; \
-        _dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
-        _dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
-        _dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
-        _dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );
-
-        MM_DOT_PRODUCT(0,dot00);
-        MM_DOT_PRODUCT(1,dot01);
-        MM_DOT_PRODUCT(2,dot02);
-        MM_DOT_PRODUCT(3,dot03);
-        MM_DOT_PRODUCT(4,dot04);
-        MM_DOT_PRODUCT(5,dot05);
-        MM_DOT_PRODUCT(6,dot06);
-        MM_DOT_PRODUCT(7,dot07);
-#undef MM_DOT_PRODUCT
    }

 #define REDUCE(_dot) \
@ -572,21 +555,22 @@ __kernel void intelblas_gemm_buffer_NT(
    output = (local_x == 5) ? _dot.s5 : output; \
    output = (local_x == 6) ? _dot.s6 : output; \
    output = (local_x == 7) ? _dot.s7 : output; \
-    if (beta != 0.0) \
+    if (beta != 0.0f) \
        dst_write0[0] = mad(output, (float)alpha, ((float)beta * dst_write0[0])); \
    else \
        dst_write0[0] = output * (float)alpha; \
    dst_write0 += ldC;

-    if(global_x < N && global_y * 8 < M) {
-        OUTPUT(dot00);
-        if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); }
-        if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); }
-        if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); }
-        if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); }
-        if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); }
-        if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); }
-        if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); }
+    if (global_x < N && dst_row < M)
+    {
+        /*if (dst_row + 0 < M)*/ { OUTPUT(dot00); }
+        if (dst_row + 1 < M) { OUTPUT(dot01); }
+        if (dst_row + 2 < M) { OUTPUT(dot02); }
+        if (dst_row + 3 < M) { OUTPUT(dot03); }
+        if (dst_row + 4 < M) { OUTPUT(dot04); }
+        if (dst_row + 5 < M) { OUTPUT(dot05); }
+        if (dst_row + 6 < M) { OUTPUT(dot06); }
+        if (dst_row + 7 < M) { OUTPUT(dot07); }
    }
 #undef OUTPUT
 }
--- a/modules/core/test/ocl/test_gemm.cpp
+++ b/modules/core/test/ocl/test_gemm.cpp
@ -67,6 +67,8 @@ PARAM_TEST_CASE(Gemm,

    double alpha, beta;

+    int M, N, K;
+
    TEST_DECLARE_INPUT_PARAMETER(A);
    TEST_DECLARE_INPUT_PARAMETER(B);
    TEST_DECLARE_INPUT_PARAMETER(C);
@ -90,30 +92,27 @@ PARAM_TEST_CASE(Gemm,

    void generateTestData()
    {
-        // set minimum size to 20, since testing less sizes doesn't make sense
-        Size ARoiSize = randomSize(20, MAX_VALUE);
-        Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11);
+        M = (int)randomDoubleLog(1, 100);
+        N = (int)randomDoubleLog(1, 100);
+        K = (int)randomDoubleLog(1, 1200);

-        if (atrans)
-            ARoiSize = Size(ARoiSize.height, ARoiSize.width);
+        M = roundUp(M, 1);
+        N = roundUp(N, 1);
+        K = roundUp(K, 1);

-        Size BRoiSize = randomSize(20, MAX_VALUE);
-        if (btrans)
-            BRoiSize.width = ARoiSize.width;
-        else
-            BRoiSize.height = ARoiSize.width;
+        Size ARoiSize = (atrans) ? Size(M, K) : Size(K, M);
+        Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11);

+        Size BRoiSize = (btrans) ? Size(K, N) : Size(N, K);
        Border BBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(B, B_roi, BRoiSize, BBorder, type, -11, 11);

-        if (btrans)
-            BRoiSize = Size(BRoiSize.height, BRoiSize.width);
-
-        Size DRoiSize = Size(BRoiSize.width, ARoiSize.height), CRoiSizeT(DRoiSize.height, DRoiSize.width);
+        Size CRoiSize = (ctrans) ? Size(M, N) : Size(N, M);
        Border CBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(C, C_roi, ctrans ? CRoiSizeT : DRoiSize, CBorder, type, -11, 11);
+        randomSubMat(C, C_roi, CRoiSize, CBorder, type, -11, 11);

+        Size DRoiSize = Size(N, M);
        Border DBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(D, D_roi, DRoiSize, DBorder, type, -11, 11);

@ -132,11 +131,12 @@ OCL_TEST_P(Gemm, Accuracy)
    for (int i = 0; i < test_loop_times; ++i)
    {
        generateTestData();
+        SCOPED_TRACE(cv::format("i=%d: M=%d N=%d K=%d", i, M, N, K));

        OCL_OFF(cv::gemm(A_roi, B_roi, alpha, C_roi, beta, D_roi, flags));
        OCL_ON(cv::gemm(uA_roi, uB_roi, alpha, uC_roi, beta, uD_roi, flags));

-        double eps = D_roi.size().area() * 1e-4;
+        double eps = D_roi.size().area() * (1e-5 * K);
        OCL_EXPECT_MATS_NEAR(D, eps);
    }
 }
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@ -238,7 +238,7 @@ public:
                kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
                kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
                kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
-                bool ret = kernel.run(2, global, NULL, false);
+                bool ret = kernel.run_(2, global, NULL, false);
                if (!ret)
                    return false;
            }
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@ -1951,7 +1951,7 @@ Ptr<Layer> ChannelsPReLULayer::create(const LayerParams& params)
    if (params.blobs[0].total() == 1)
    {
        LayerParams reluParams = params;
-        reluParams.set("negative_slope", params.blobs[0].at<float>(0));
+        reluParams.set("negative_slope", *params.blobs[0].ptr<float>());
        return ReLULayer::create(reluParams);
    }
    Ptr<ChannelsPReLULayer> l(new ElementWiseLayer<ChannelsPReLUFunctor>(ChannelsPReLUFunctor(params.blobs[0])));
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@ -200,7 +200,7 @@ public:
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
            k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
-            ret = k1.run(1, globalsize, localsize, false);
+            ret = k1.run_(1, globalsize, localsize, false);
            if (!ret)
                return false;
        }
--- a/modules/dnn/src/layers/resize_layer.cpp
+++ b/modules/dnn/src/layers/resize_layer.cpp
@ -120,7 +120,14 @@ public:
        internals_arr.getMatVector(internals);

        if (outHeight == inputs[0].size[2] && outWidth == inputs[0].size[3])
+        {
+            // outputs[0] = inputs[0] doesn't work due to BlobManager optimizations
+            if (inputs[0].data != outputs[0].data)
+            {
+                inputs[0].copyTo(outputs[0]);
+            }
            return;
+        }

        Mat& inp = inputs[0];
        Mat& out = outputs[0];
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@ -64,6 +64,31 @@ namespace cv
 namespace dnn
 {

+void sliceRangesFromShape(const MatShape& inpShape, int& axis, std::vector<std::vector<cv::Range> >& sliceRanges)  // Resolves a negative axis and negative range starts against inpShape; axis and sliceRanges are modified in place.
+{
+    CV_Assert(inpShape.size() > 0);  // an empty shape has no axis to resolve
+    bool axisNeg = (axis < 0);  // remember the sign before axis is overwritten below
+    axis = (axis + static_cast<int>(inpShape.size())) % inpShape.size();  // adds the rank, so axis in [-rank, rank) becomes the equivalent index in [0, rank); NOTE(review): smaller values are not handled - confirm callers
+    int n = inpShape[axis];  // extent of the input along the resolved axis
+
+    for (size_t i = 0; i < sliceRanges.size(); ++i){
+        std::vector<Range>& ranges = sliceRanges[i];
+        if (axisNeg)
+        {
+            ranges.insert(ranges.begin(), axis, Range::all());  // prepend axis full ranges; presumably so the last entry lines up with the resolved axis - TODO confirm
+        }
+        Range& range = ranges.back();  // only the last range of each slice is adjusted; NOTE(review): assumes ranges is non-empty - confirm
+
+        if (range.start >= 0)
+        {
+            continue;  // a non-negative start needs no conversion
+        }
+
+        CV_Assert(n != 0);  // guards the modulo below against division by zero
+        range.start = (n + range.start) % n;  // a negative start counts back from the end of the axis
+    }
+}
+
 class SliceLayerImpl : public SliceLayer
 {
 public:
@ -75,20 +100,22 @@ public:
        num_split = params.get<int>("num_split", 0);
        hasDynamicShapes = params.get<bool>("has_dynamic_shapes", false);
        shapesInitialized = !hasDynamicShapes;
+
        if (params.has("slice_point"))
        {
            CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end"));
            const DictValue &indicesValue = params.get("slice_point");
+            int size = axis > 0 ? axis + 1 : 1;
            sliceRanges.resize(indicesValue.size() + 1,
-                               std::vector<Range>(std::max(axis,0) + 1, Range::all()));
+                               std::vector<Range>(size, Range::all()));
            int prevSlice = 0;
            for (int i = 0; i < indicesValue.size(); ++i)
            {
-                sliceRanges[i][axis].start = prevSlice;
-                sliceRanges[i][axis].end = indicesValue.get<int>(i);
-                prevSlice = sliceRanges[i][axis].end;
+                sliceRanges[i][size - 1].start = prevSlice;
+                sliceRanges[i][size - 1].end = indicesValue.get<int>(i);
+                prevSlice = sliceRanges[i][size - 1].end;
            }
-            sliceRanges.back()[axis].start = prevSlice;
+            sliceRanges.back()[size - 1].start = prevSlice;
        }
        else if (params.has("begin"))
        {
@ -103,7 +130,6 @@ public:
            {
                int start = begins.get<int>(i);
                int sizeOrEnd = sizesOrEnds.get<int>(i);  // It may be negative to reverse indexation.
-                CV_Assert(start >= 0);

                sliceRanges[0][i].start = start;
                if (params.has("size"))
@ -164,16 +190,20 @@ public:
        CV_Assert(inputs.size() == 1);
        MatShape inpShape = inputs[0];

-        if (!sliceRanges.empty())
+        int axis_rw = axis;
+        std::vector<std::vector<cv::Range> > sliceRanges_rw = sliceRanges;
+        sliceRangesFromShape(inpShape, axis_rw, sliceRanges_rw);
+
+        if (!sliceRanges_rw.empty())
        {
-            outputs.resize(sliceRanges.size(), inpShape);
+            outputs.resize(sliceRanges_rw.size(), inpShape);
            for (int i = 0; i < outputs.size(); ++i)
            {
-                CV_Assert(sliceRanges[i].size() <= inpShape.size());
-                for (int j = 0; j < sliceRanges[i].size(); ++j)
+                CV_Assert(sliceRanges_rw[i].size() <= inpShape.size());
+                for (int j = 0; j < sliceRanges_rw[i].size(); ++j)
                {
                    if (shapesInitialized || inpShape[j] > 0)
-                        outputs[i][j] = normalize_axis_range(sliceRanges[i][j], inpShape[j]).size();
+                        outputs[i][j] = normalize_axis_range(sliceRanges_rw[i][j], inpShape[j]).size();

                    if (!sliceSteps.empty() && (i < sliceSteps.size()) && (j < sliceSteps[i].size()) && (sliceSteps[i][j] > 1))
                        outputs[i][j] = (outputs[i][j] + sliceSteps[i][j] - 1) / sliceSteps[i][j];
@ -182,10 +212,10 @@ public:
        }
        else  // Divide input blob on equal parts by axis.
        {
-            CV_Assert(0 <= axis && axis < inpShape.size());
+            CV_Assert(0 <= axis_rw && axis_rw < inpShape.size());
            int splits = num_split ? num_split : requiredOutputs;
-            CV_Assert(splits > 0 && inpShape[axis] % splits == 0);
-            inpShape[axis] /= splits;
+            CV_Assert(splits > 0 && inpShape[axis_rw] % splits == 0);
+            inpShape[axis_rw] /= splits;
            outputs.resize(splits, inpShape);
        }
        return false;
@ -210,6 +240,7 @@ public:
        CV_Assert(inputs.size() == 1);
        const MatSize& inpShape = inputs[0].size;

+        sliceRangesFromShape(shape(inputs[0]), axis, sliceRanges);
        finalSliceRanges = sliceRanges;

        if (sliceRanges.empty())
@ -492,7 +523,7 @@ public:
                    ocl::KernelArg::PtrReadOnly(input),
                    ocl::KernelArg::PtrWriteOnly(output)
                )
-                .run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
+                .run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
            if (!ret)
                return false;
        }  // for outputs.size()
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@ -269,7 +269,7 @@ class OCL4DNNConvSpatial
        void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
                                      int blockM, int blockK, int simd_size);
        void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
-        void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
+        void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx);

        int32_t group_;
        bool bias_term_;
--- a/modules/dnn/src/ocl4dnn/src/math_functions.cpp
+++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp
@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
                .args(
                    ocl::KernelArg::PtrReadOnly(buffer),
                    image, offset,
+                    padded_width, padded_height,
                    width, height,
                    ld)
                .run(2, global_copy, NULL, false);
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@ -270,17 +270,21 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
 }

 template<typename Dtype>
-void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
+void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx)
 {
    if (fused_eltwise)
-        kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
+    {
+        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_));
+        if (fused_eltwise_offset >= 0)
+            kernel.set(argIdx++, fused_eltwise_offset);
+    }

    switch (fused_activ) {
        case OCL4DNN_CONV_FUSED_ACTIV_RELU:
            kernel.set(argIdx++, (float)negative_slope_);
            break;
        case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
-            kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_));
            break;
        case OCL4DNN_CONV_FUSED_ACTIV_POWER:
            kernel.set(argIdx++, (float)power_);
@ -765,12 +769,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
            swizzled_factor
        );

-        size_t global_work_size_copy[3] = {
-            (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
+        size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) };

-        if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
+        if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false))
        {
-            std::cout << "Swizzle kernel run failed." << std::endl;
+            CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed");
            return false;
        }
    } else {
@ -895,10 +898,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
    if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
        if (!swizzleWeight(weight, config->workItem_output[2], false))
            return false;
+#if 0
        size_t total_bottom_size = bottom_dim_ * numImages;
        size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
        size_t total_bias_size = M_ * group_;
        size_t total_top_size = top_dim_ * numImages;
+#endif
        for (int32_t g = 0; g < group_; ++g) {
            bias_offset = M_ * g;
            int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
@ -910,78 +915,28 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                return false;

            cl_uint argIdx = 0;
-            setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
-
-            UMat img_buffer;
-            if (image_offset)
-            {
-                CreateSubBuffer(bottom, img_buffer, image_offset,
-                                total_bottom_size - image_offset, false);
-                if (img_buffer.empty())
-                    return false;
-
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            }
+            setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);

-            UMat kernel_buffer;
-            if (kernel_offset)
-            {
-                CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
-                                total_kernel_size - kernel_offset, false);
-                if (kernel_buffer.empty())
-                    return false;
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+            kernel.set(argIdx++, image_offset);

-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
-            }
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
+            kernel.set(argIdx++, kernel_offset);

-            UMat bias_buffer;
            if (bias_term_)
            {
-                if (bias_offset)
-                {
-                    CreateSubBuffer(bias, bias_buffer, bias_offset,
-                                    total_bias_size - bias_offset, false);
-                    if (bias_buffer.empty())
-                        return false;
-
-                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
-                }
-                else
-                {
-                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
-                }
+                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
+                kernel.set(argIdx++, bias_offset);
            }

-            UMat out_buffer;
-            if (output_image_offset)
-            {
-                CreateSubBuffer(top, out_buffer, output_image_offset,
-                                total_top_size - output_image_offset, true);
-                if (out_buffer.empty())
-                    return false;
-
-                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
-                kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
-                kernel.set(argIdx++, (int)(top.offset / element_size));
-            }
+            kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);

            kernel.set(argIdx++, (uint16_t)width_);
            kernel.set(argIdx++, (uint16_t)height_);
            kernel.set(argIdx++, (uint16_t)output_w_);
            kernel.set(argIdx++, (uint16_t)output_h_);
-            if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
+            if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
            {
                std::cout << "IDLF kernel run failed." << std::endl;
                return false;
@ -990,9 +945,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
    } else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) {
        if (!swizzleWeight(weight, config->workItem_output[1], true))
            return false;
+#if 0
        size_t total_bottom_size = bottom_dim_ * numImages;
        size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
        size_t total_bias_size = M_ * group_;
+#endif
        size_t total_top_size = top_dim_ * numImages;
        for (int32_t g = 0; g < group_; ++g) {
            bias_offset = M_ * g;
@ -1005,72 +962,25 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                return false;

            cl_uint argIdx = 0;
-            setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+            setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);

-            UMat img_buffer;
-            if (image_offset)
-            {
-                CreateSubBuffer(bottom, img_buffer, image_offset,
-                                total_bottom_size - image_offset, false);
-                if (img_buffer.empty())
-                    return false;
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+            kernel.set(argIdx++, (int)image_offset);
+            kernel.set(argIdx++, (int)(bottom.total() - image_offset));

-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            }
-
-            UMat kernel_buffer;
-            if (kernel_offset)
-            {
-                CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
-                                total_kernel_size - kernel_offset, false);
-                if (kernel_buffer.empty())
-                    return false;
-
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
-            }
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
+            kernel.set(argIdx++, (int)kernel_offset);
+            kernel.set(argIdx++, (int)(swizzled_weights_umat.total() - kernel_offset));

-            UMat bias_buffer;
            if (bias_term_)
            {
-                if (bias_offset)
-                {
-                    CreateSubBuffer(bias, bias_buffer, bias_offset,
-                                    total_bias_size - bias_offset, false);
-                    if (bias_buffer.empty())
-                        return false;
-
-                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
-                }
-                else
-                {
-                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
-                }
+                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
+                kernel.set(argIdx++, (int)bias_offset);
            }

-            UMat out_buffer;
-            if (output_image_offset)
-            {
-                CreateSubBuffer(top, out_buffer, output_image_offset,
-                                total_top_size - output_image_offset, true);
-                if (out_buffer.empty())
-                    return false;
-
-                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
-                kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
-                kernel.set(argIdx++, (int)(top.offset / element_size));
-            }
+            kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
+            kernel.set(argIdx++, (int)total_top_size - (int)(top.offset / element_size));

            kernel.set(argIdx++, (uint16_t)width_);
            kernel.set(argIdx++, (uint16_t)height_);
@ -1100,7 +1010,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
            gy = alignSize(gy, blockK);
            size_t global_size[3] = { gx, gy, config->global_work_size[2] };

-            if (!kernel.run(3, global_size, config->local_work_size, false))
+            if (!kernel.run_(3, global_size, config->local_work_size, false))
            {
                std::cout << "GEMM like kernel run failed." << std::endl;
                return false;
@ -1112,7 +1022,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
            return false;

        cl_uint argIdx = 0;
-        setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+        setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
        if (bias_term_)
@ -1129,9 +1039,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
        global_size[1] = output_h_;
        global_size[2] = num_output_ * num_;

-        if (!kernel.run(3, global_size, NULL, false))
+        if (!kernel.run_(3, global_size, NULL, false))
        {
-            std::cout << "DWCONV kernel run failed." << std::endl;
+            CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed");
            return false;
        }
    } else {
@ -1152,7 +1062,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                    return false;

                cl_uint argIdx = 0;
-                setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+                setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
                kernel.set(argIdx++, image_offset);
                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
@ -1171,11 +1081,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                kernel.set(argIdx++, (uint16_t)output_h_);
                kernel.set(argIdx++, (uint16_t)pad_w_);
                kernel.set(argIdx++, (uint16_t)pad_h_);
-                if (!kernel.run(3, config->global_work_size,
+                if (!kernel.run_(3, config->global_work_size,
                                (config->use_null_local) ? NULL : config->local_work_size,
                                false))
                {
-                    std::cout << "Basic kernel run failed." << std::endl;
+                    CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed");
                    return false;
                }
            }
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
@ -127,7 +127,7 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
            oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
            oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
        }
-        ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false);
+        ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false);
    }
    return ret;
 }
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@ -234,6 +234,27 @@ public:
    }
 };

+class NormalizeSubgraph2_2 : public NormalizeSubgraphBase  // Matches Div(x, Expand(Clip(ReduceL2(x), min, max), shape)) and fuses it into a Normalize node.
+{
+public:
+    NormalizeSubgraph2_2()
+    {
+        int input = addNodeToMatch("");  // empty op name: presumably a placeholder for an arbitrary producer - confirm against addNodeToMatch
+        int norm = addNodeToMatch("ReduceL2", input);  // L2 reduction of the input
+
+        int min = addNodeToMatch("");  // extra Clip input, matched as a placeholder
+        int max = addNodeToMatch("");  // extra Clip input, matched as a placeholder
+        int clip = addNodeToMatch("Clip", norm, min, max);  // Clip takes the norm plus the two placeholder inputs
+
+        int shape = addNodeToMatch("");  // extra Expand input, matched as a placeholder
+        int expand = addNodeToMatch("Expand", clip, shape);  // Expand takes the clipped norm plus the placeholder shape
+
+        addNodeToMatch("Div", input, expand);  // final node: the original input divided by the expanded norm
+
+        setFusedNode("Normalize", input);  // fused node is named Normalize and is given input; exact semantics live in the base class
+    }
+};
+
 class NormalizeSubgraph3 : public NormalizeSubgraphBase
 {
 public:
@ -558,6 +579,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
    subgraphs.push_back(makePtr<SoftMaxSubgraph>());
    subgraphs.push_back(makePtr<NormalizeSubgraph1>());
    subgraphs.push_back(makePtr<NormalizeSubgraph2>());
+    subgraphs.push_back(makePtr<NormalizeSubgraph2_2>());
    subgraphs.push_back(makePtr<NormalizeSubgraph3>());
    subgraphs.push_back(makePtr<BatchNormalizationSubgraph1>());
    subgraphs.push_back(makePtr<BatchNormalizationSubgraph2>());
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@ -64,6 +64,8 @@ class ONNXImporter
    void addLayer(LayerParams& layerParams,
                  const opencv_onnx::NodeProto& node_proto);

+    void expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
+                   const std::string& input, size_t n);
 public:
    ONNXImporter(Net& net, const char *onnxFile);
    ONNXImporter(Net& net, const char* buffer, size_t sizeBuffer);
@ -486,6 +488,37 @@ void ONNXImporter::addLayer(LayerParams& layerParams,
    }
 }

+/** @brief Add n Identity layers that each read input, then make them the only inputs of node_proto.
+ * @param prefix prefix of the new layer names; copy j is named prefix/copy_j
+ * @param node_proto node that is rewritten so its inputs are the n copies, in order (used as scratch while adding them)
+ * @param input name of the node to copy
+ * @param n number of copies to create
+ */
+void ONNXImporter::expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
+                             const std::string& input, size_t n)
+{
+    std::vector<std::string> input_names;
+    input_names.reserve(n);  // one generated name per copy
+    for (size_t j = 0; j < n; j++)
+    {
+        LayerParams copyLP;
+        copyLP.name = format("%s/copy_%zu", prefix.c_str(), j);
+        copyLP.type = "Identity";
+        CV_Assert((layer_id.find(copyLP.name) == layer_id.end()) &&
+            "Couldn't copy the node: generated name already exists in the graph.");  // the literal is always true; it only adds a readable message to the asserted expression
+        input_names.push_back(copyLP.name);
+
+        node_proto.set_input(0, input);  // temporarily point the node at the source ...
+        node_proto.set_output(0, copyLP.name);  // ... and at this copy as its output, so addLayer wires the Identity layer
+        addLayer(copyLP, node_proto);
+    }
+    node_proto.clear_input();  // drop the scratch inputs; output 0 still names the last copy, and the visible callers overwrite it
+    for (size_t i = 0; i < input_names.size(); i++)
+    {
+        node_proto.add_input(input_names[i]);  // copies become the inputs in creation order
+    }
+}
+
 void ONNXImporter::addConstant(const std::string& name, const Mat& blob)
 {
    constBlobs.insert(std::make_pair(name, blob));
@ -1422,6 +1455,38 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
    addLayer(layerParams, node_proto);
 }

+void findBroadAxis(const MatShape& broadShape, const MatShape& outShape, size_t& axis, int& broadAxis)  // Outputs: axis = outShape index of the first non-1 dim of broadShape; broadAxis = outShape index of a size-1 dim between the non-1 dims that must be broadcast, or -1 if none.
+{
+    const size_t diff = outShape.size() - broadShape.size();  // rank difference used to right-align broadShape with outShape; NOTE(review): assumes outShape has at least as many dims as broadShape - confirm
+
+    // find the first non-one element of the broadcasting shape
+    axis = 0;
+    for (; axis < broadShape.size() && broadShape[axis] == 1; ++axis) {}  // empty body: the loop only advances axis
+
+    // find the last non-one element of the broadcasting shape
+    size_t endAxis = broadShape.size();
+    for (; endAxis > axis && broadShape[endAxis - 1] == 1; --endAxis) {}  // empty body: the loop only moves endAxis left, never past axis
+
+    // find one between axis and endAxis - as it needs to be broadcasted,
+    // dimensions from the left of axis and from the right of endAxis will be handled by Scale layer
+    broadAxis = -1;
+    for (size_t i = axis; i < endAxis; ++i)
+    {
+        size_t outAxis = i + diff;  // the same dimension expressed as an outShape index
+        if (outShape[outAxis] == broadShape[i])
+        {
+            continue;  // sizes already agree, nothing to broadcast here
+        }
+
+        // ensure we need to broadcast only 1 dimension in the middle
+        CV_Assert(broadShape[i] == 1 && broadAxis == -1);  // a mismatch must be a size-1 dim, and at most one such dim is accepted
+        broadAxis = static_cast<int>(outAxis);
+    }
+
+    axis += diff;  // convert axis from a broadShape index to an outShape index
+}
+
+// "Mul" "Div"
 void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
 {
    opencv_onnx::NodeProto node_proto = node_proto_;
@ -1543,13 +1608,31 @@ void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodePro
        }

        const MatShape& broadShape = outShapes[node_proto.input(1)];
-        const size_t outShapeSize = outShapes[node_proto.input(0)].size();
-        const size_t diff = outShapeSize - broadShape.size();
+        const MatShape& outShape = outShapes[node_proto.input(0)];
+
+        size_t axis = 0;
+        int broadAxis = -1;
+        findBroadAxis(broadShape, outShape, axis, broadAxis);
+
+        // if there is a one dimension in the middle that should be broadcasted, broadcast it
+        if (broadAxis != -1)
+        {
+            opencv_onnx::NodeProto concat_node_proto = node_proto;
+            const std::string& input1 = concat_node_proto.input(1);
+
+            expandMid(layerParams.name, concat_node_proto, input1, outShape[broadAxis]);

-        size_t axis;
-        for (axis = diff; axis < broadShape.size() && broadShape[axis - diff] == 1; ++axis) {}
+            LayerParams concatLP;
+            concatLP.name = layerParams.name + "/concat";
+            concatLP.set("axis", broadAxis);
+            concatLP.type = "Concat";
+            concat_node_proto.set_output(0, concatLP.name);

-        CV_Assert(axis != outShapeSize);
+            addLayer(concatLP, concat_node_proto);
+            node_proto.set_input(1, concatLP.name);
+        }
+
+        CV_Assert(axis != outShape.size());
        layerParams.set("axis", static_cast<int>(axis));
        layerParams.type = "Scale";
    }
@ -1818,12 +1901,11 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
    // Unsqueeze and repeat along new axis
    if (targetShape.size() == inpShape.size() + 1)
    {
+        inpShape.insert(inpShape.begin(), targetShape.size() - inpShape.size(), 1);
        for (int i = 0; i < targetShape.size(); i++)
        {
-            if (targetShape[i] == -1 && i < inpShape.size())
+            if (abs(targetShape[i]) == 1)
                targetShape[i] = inpShape[i];
-            else if (i < inpShape.size() && targetShape[i] != inpShape[i])
-                inpShape.insert(inpShape.begin() + i, 1);
        }
        if (haveVariables)
        {
@ -1843,14 +1925,19 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
    CV_CheckEQ(inpShape.size(), targetShape.size(), "Unsupported Expand op with different dims");

    std::vector<int> broadcast_axes;
+    // shapes aren't right-aligned here because targetShape.size() == inpShape.size()
    for (int i = 0; i < targetShape.size(); i++)
    {
        if (targetShape[i] != inpShape[i])
        {
            if (inpShape[i] == 1)
+            {
                broadcast_axes.push_back(i);
-            else
+            }
+            else if (targetShape[i] != 1)
+            {
                CV_Error(Error::StsError, format("Could not be broadcast by axis: %d", i));
+            }
        }
    }

@ -1889,31 +1976,16 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
    }
    else if (broadcast_axes.size() == 1 && broadcast_axes[0] <= 1)
    {
-        String base_name = layerParams.name + "/copy_";
-        std::vector<std::string> input_names;
-        for (int j = 0; j < targetShape[broadcast_axes[0]]; j++)
-        {
-            std::ostringstream ss;
-            ss << j;
-            LayerParams copyLP;
-            copyLP.name = base_name + ss.str();
-            copyLP.type = "Identity";
-            CV_Assert(layer_id.find(copyLP.name) == layer_id.end());
-            input_names.push_back(copyLP.name);
-
-            node_proto.set_input(0, srcName);
-            node_proto.set_output(0, copyLP.name);
-            addLayer(copyLP, node_proto);
-        }
-        node_proto.clear_input();
-        for (int i = 0; i < input_names.size(); i++)
-        {
-            node_proto.add_input(input_names[i]);
-        }
+        expandMid(layerParams.name, node_proto, srcName, targetShape[broadcast_axes[0]]);
+
        layerParams.set("axis", broadcast_axes[0]);
        layerParams.type = "Concat";
        node_proto.set_output(0, layerParams.name);
    }
+    else if (broadcast_axes.empty())
+    {
+        layerParams.type = "Identity";
+    }
    else
        CV_Error(Error::StsNotImplemented, "Unsupported Expand op");
    addLayer(layerParams, node_proto);
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@ -74,18 +74,22 @@
    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
 } while(0)
 #define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
+#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,
 #else
 #define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
    const Dtype _x_ = (_data_); \
    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
 } while(0)
 #define ELTWISE_DATA_ARG
+#define ELTWISE_DATA_ARG_WITH_OFFSET
 #endif

 #if APPLY_BIAS
 #define BIAS_KERNEL_ARG __global Dtype * biases_base,
+#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,
 #else
 #define BIAS_KERNEL_ARG
+#define BIAS_KERNEL_ARG_WITH_OFFSET
 #endif

 #define __CAT(x, y) x##y
@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
 __kernel void
 convolve_simd(
-    ELTWISE_DATA_ARG
+    ELTWISE_DATA_ARG_WITH_OFFSET
    FUSED_ARG
-    __global Dtype* inputs,
-    __global Dtype* weights,
-    BIAS_KERNEL_ARG
-    __global Dtype* outputs_base,
-    const int outputs_offset,
+    __global Dtype* inputs_ptr, const int inputs_offset,
+    __global Dtype* weights_ptr, const int weights_offset,
+    BIAS_KERNEL_ARG_WITH_OFFSET
+    __global Dtype* outputs_base, const int outputs_offset,
    const ushort input_width,
    const ushort input_height,
    const ushort output_width,
    const ushort output_height)
 {
+  __global Dtype* inputs = inputs_ptr + inputs_offset;
+  __global Dtype* weights = weights_ptr + weights_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
  __global Dtype* outputs = outputs_base + outputs_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
  unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
  unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
  unsigned int fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth
@ -388,13 +401,12 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
 #define ROW_PITCH input_width

 #define GEMM_LIKE_KERNEL_ARGS     \
-    ELTWISE_DATA_ARG              \
+    ELTWISE_DATA_ARG_WITH_OFFSET  \
    FUSED_ARG                     \
-    const __global Dtype *src0,   \
-    const __global Dtype *src1,   \
-    BIAS_KERNEL_ARG               \
-    __global Dtype *dst_base,     \
-    const int dst_offset,         \
+    const __global Dtype *src0_ptr, const unsigned int src0_offset, const unsigned int src0_limit, \
+    const __global Dtype *src1_ptr, const unsigned int src1_offset, const unsigned int src1_limit, \
+    BIAS_KERNEL_ARG_WITH_OFFSET   \
+    __global Dtype *dst_base, const unsigned int dst_offset, const unsigned int dst_limit, \
    const ushort input_width,     \
    const ushort input_height,    \
    const ushort output_width,    \
@ -424,7 +436,17 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
 __attribute__((intel_reqd_sub_group_size(8)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
    __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
@ -447,6 +469,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
    }
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

+// U_GEMM_LIKE_CONV_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32_5_1_8_32_SIMD8 doesn't run properly (src0_read out of bounds)
+// Test: DNNTestNetwork.AlexNet/0 (to run all kernels use OPENCV_OCL4DNN_FORCE_AUTO_TUNING=1)
+#if 0 // INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
+  #define OPTIMIZE_READ 1
+#else
+  #define OPTIMIZE_READ 0
+#endif
+
    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
@ -463,7 +493,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
        int saved_y = curr_y;
 #endif
        const __global Dtype *src0_read = src0
@ -483,7 +513,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
        do
        {
            int patch_row = 0;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
            curr_y = saved_y;
 #endif

@ -501,11 +531,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;

-#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
+#if OPTIMIZE_READ
  #if KERNEL_WIDTH == 3
                Dtype_t blockA00 = vload3(0, src0_read);
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
  #else
+    #if 0 // debug
+                if ((int)(src0_read - src0) >= src0_limit - KERNEL_WIDTH)
+                {
+                    printf("CATCH: src0_read-src0: %d   limit=%d   curr_y,curr_x=%d,%d\n", (int)(src0_read - src0), src0_limit, curr_y, curr_x);
+                }
+    #endif
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
  #endif
@ -626,7 +662,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
        int saved_y = curr_y;
 #endif
        const __global Dtype *src0_read = src0
@ -646,14 +682,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
        do
        {
            int patch_row = 0;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
            curr_y = saved_y;
 #endif
            do
            {
                // Load atile and interleaved btile.
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
-#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
+#if OPTIMIZE_READ
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
 #else
@ -790,7 +826,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
            }
        }
    }
-#endif
+#endif  // TILE_N_LAST > 0
 }
 #endif
 #ifdef GEMM_LIKE_CONV_32_2
@ -813,7 +849,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(8)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
    __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
@ -1375,7 +1421,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(16)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
    __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
@ -1561,7 +1617,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(16)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
    __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
--- a/modules/dnn/src/opencl/conv_spatial_helper.cl
+++ b/modules/dnn/src/opencl/conv_spatial_helper.cl
@ -62,8 +62,8 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
  //Original location

  //Output location
-  int outputSublayer = channels / swizzleFactor;
-  int outputSublayerIndex = channels % swizzleFactor;
+  //int outputSublayer = channels / swizzleFactor;
+  //int outputSublayerIndex = channels % swizzleFactor;

  int filter = sX / (kernel_w*kernel_h*channels);
  int kernel_X = sX % kernel_w;
@ -73,6 +73,10 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
  int FP = filter / swizzleFactor;
  int F1 = filter % swizzleFactor;

-  weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1]
-  = weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X];
+  int idxOut = FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1;
+  int idxIn = filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X;
+
+  // idxIn is not valid if (filter >= outputs) - no data for these elements. Output alignment gaps are filled by zeros
+  Dtype v = (filter < outputs) ? weightIn[idxIn] : (Dtype)0;
+  weightOut[idxOut] = v;
 }
--- a/modules/dnn/src/opencl/gemm_image.cl
+++ b/modules/dnn/src/opencl/gemm_image.cl
@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
 {
    const int gidx = get_global_id(0);
    const int gidy = get_global_id(1);
+
+    if (gidx >= width || gidy >= height)
+        return;
+
    int2 coord_dst = (int2)(gidx, gidy);
    __global Dtype* A_off = A + offA;
    Dtype srcA = A_off[gidy * ldA + gidx];
@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
    __global Dtype* A,
    __write_only image2d_t ImA,
    int offA,
+    int padded_width,
+    int padded_height,
    int width,
    int height,
    int ldA)
 {
    const int gidx = get_global_id(0);
    const int gidy = get_global_id(1);
+
+    if (gidx >= padded_width || gidy >= padded_height)
+        return;
+
    int2 coord_dst = (int2)(gidx, gidy);
 #if TYPE == TYPE_HALF
    if (gidx >= width || gidy >= height) {
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@ -933,7 +933,12 @@ void TFImporter::parseBias(tensorflow::GraphDef& net, const tensorflow::NodeDef&
        layer_id[name] = id;

        // one input only
-        connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+        Pin inp0 = parsePin(layer.input(0));
+        if (layer_id.find(inp0.name) != layer_id.end())
+            // First operand is an already registered layer: connect it as the single input.
+            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+        else  // first operand is not a registered layer (presumably a constant - TODO confirm): use the second operand
+            connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0);
    }
    else
    {
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@ -267,6 +267,11 @@ TEST_P(Test_ONNX_layers, ReLU)
    testONNXModels("ReLU");
 }

+TEST_P(Test_ONNX_layers, PReLU)
+{
+    testONNXModels("PReLU_slope");
+}
+
 TEST_P(Test_ONNX_layers, Clip)
 {
    testONNXModels("clip", npy);
@ -302,6 +307,7 @@ TEST_P(Test_ONNX_layers, Scale)
        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
    testONNXModels("scale");
    testONNXModels("scale_broadcast", npy, 0, 0, false, true, 3);
+    testONNXModels("scale_broadcast_mid", npy, 0, 0, false, true, 2);
 }

 TEST_P(Test_ONNX_layers, ReduceMean3D)
@ -505,6 +511,8 @@ TEST_P(Test_ONNX_layers, MatMulAdd)

 TEST_P(Test_ONNX_layers, Expand)
 {
+    testONNXModels("expand");
+    testONNXModels("expand_identity");
    testONNXModels("expand_batch");
    testONNXModels("expand_channels");
    testONNXModels("expand_neg_batch");
@ -646,6 +654,7 @@ TEST_P(Test_ONNX_layers, ReduceL2)
    testONNXModels("reduceL2");
    testONNXModels("reduceL2_subgraph");
    testONNXModels("reduceL2_subgraph_2");
+    testONNXModels("reduceL2_subgraph2_2");
 }

 TEST_P(Test_ONNX_layers, Split)
@ -659,6 +668,7 @@ TEST_P(Test_ONNX_layers, Split)
    testONNXModels("split_3");
    testONNXModels("split_4");
    testONNXModels("split_sizes");
+    testONNXModels("split_neg_axis");
 }

 TEST_P(Test_ONNX_layers, Slice)
@ -667,6 +677,7 @@ TEST_P(Test_ONNX_layers, Slice)
    testONNXModels("slice", npy, 0, 0, false, false);
 #else
    testONNXModels("slice");
+    testONNXModels("slice_neg_starts");
    testONNXModels("slice_opset_11");
 #endif
 }
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@ -581,6 +581,18 @@ TEST_P(Test_TensorFlow_layers, l2_normalize)
    runTensorFlowNet("l2_normalize");
 }

+TEST_P(Test_TensorFlow_layers, BiasAdd)
+{
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
+            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
+    )
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+    runTensorFlowNet("bias_add_1");
+}
+
 // TODO: fix it and add to l2_normalize
 TEST_P(Test_TensorFlow_layers, l2_normalize_3d)
 {
@ -1242,6 +1254,11 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_down)
    runTensorFlowNet("resize_bilinear_down");
 }

+TEST_P(Test_TensorFlow_layers, resize_concat_optimization)
+{
+    runTensorFlowNet("resize_concat_optimization");
+}
+
 TEST_P(Test_TensorFlow_layers, tf2_dense)
 {
    runTensorFlowNet("tf2_dense");
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@ -1105,7 +1105,7 @@ public:
    that is, copies both parameters and train data. If emptyTrainData is true, the method creates an
    object copy with the current parameters but with empty train data.
     */
-    CV_WRAP virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0;
+    CV_WRAP CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0;

    /** @brief Creates a descriptor matcher of a given type with the default parameters (using default
    constructor).
@ -1165,7 +1165,7 @@ protected:
    static bool isPossibleMatch( InputArray mask, int queryIdx, int trainIdx );
    static bool isMaskedOut( InputArrayOfArrays masks, int queryIdx );

-    static Mat clone_op( Mat m ) { return m.clone(); }
+    CV_NODISCARD_STD static Mat clone_op( Mat m ) { return m.clone(); }
    void checkMasks( InputArrayOfArrays masks, int queryDescriptorsCount ) const;

    //! Collection of descriptors from train images.
@ -1206,7 +1206,7 @@ public:
     */
    CV_WRAP static Ptr<BFMatcher> create( int normType=NORM_L2, bool crossCheck=false ) ;

-    virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
+    CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
 protected:
    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
        InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
@ -1245,7 +1245,7 @@ public:

    CV_WRAP static Ptr<FlannBasedMatcher> create();

-    virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
+    CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
 protected:
    static void convertToDMatches( const DescriptorCollection& descriptors,
                                   const Mat& indices, const Mat& distances,
--- a/modules/python/src2/hdr_parser.py
+++ b/modules/python/src2/hdr_parser.py
@ -437,11 +437,18 @@ class CppHeaderParser(object):
        # filter off some common prefixes, which are meaningless for Python wrappers.
        # note that we do not strip "static" prefix, which does matter;
        # it means class methods, not instance methods
-        decl_str = self.batch_replace(decl_str, [("static inline", ""), ("inline", ""), ("explicit ", ""),
-                                                 ("CV_EXPORTS_W", ""), ("CV_EXPORTS", ""), ("CV_CDECL", ""),
-                                                 ("CV_WRAP ", " "), ("CV_INLINE", ""),
-                                                 ("CV_DEPRECATED", ""), ("CV_DEPRECATED_EXTERNAL", "")]).strip()
-
+        decl_str = self.batch_replace(decl_str, [("static inline", ""),
+                                                 ("inline", ""),
+                                                 ("explicit ", ""),
+                                                 ("CV_EXPORTS_W", ""),
+                                                 ("CV_EXPORTS", ""),
+                                                 ("CV_CDECL", ""),
+                                                 ("CV_WRAP ", " "),
+                                                 ("CV_INLINE", ""),
+                                                 ("CV_DEPRECATED_EXTERNAL", ""),  # longer macro before its prefix, as for CV_EXPORTS_W / CV_NODISCARD_STD
+                                                 ("CV_DEPRECATED", ""),
+                                                 ("CV_NODISCARD_STD", ""),
+                                                 ("CV_NODISCARD", "")]).strip()

        if decl_str.strip().startswith('virtual'):
            virtual_method = True
--- a/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-multidevice-autoplugin.patch
+++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-multidevice-autoplugin.patch
@ -0,0 +1,16 @@
+diff --git a/inference-engine/src/CMakeLists.txt b/inference-engine/src/CMakeLists.txt
+index 0ba0dd78..7d34e7cb 100644
+--- a/inference-engine/src/CMakeLists.txt
+++ b/inference-engine/src/CMakeLists.txt
+@@ -26,9 +26,9 @@ endif()
+ 
+ add_subdirectory(hetero_plugin)
+ 
+-add_subdirectory(auto_plugin)
+#add_subdirectory(auto_plugin)
+ 
+-add_subdirectory(multi_device)
+#add_subdirectory(multi_device)
+ 
+ add_subdirectory(transformations)
+ 
--- a/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-unused-targets.patch
+++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-unused-targets.patch
@ -0,0 +1,219 @@
+diff --git a/cmake/developer_package/add_ie_target.cmake b/cmake/developer_package/add_ie_target.cmake
+index d49f16a4d..2726ca787 100644
+--- a/cmake/developer_package/add_ie_target.cmake
+++ b/cmake/developer_package/add_ie_target.cmake
+@@ -92,7 +92,7 @@ function(addIeTarget)
+     if (ARG_TYPE STREQUAL EXECUTABLE)
+         add_executable(${ARG_NAME} ${all_sources})
+     elseif(ARG_TYPE STREQUAL STATIC OR ARG_TYPE STREQUAL SHARED)
+-        add_library(${ARG_NAME} ${ARG_TYPE} ${all_sources})
+        add_library(${ARG_NAME} ${ARG_TYPE} EXCLUDE_FROM_ALL ${all_sources})
+     else()
+         message(SEND_ERROR "Invalid target type ${ARG_TYPE} specified for target name ${ARG_NAME}")
+     endif()
+diff --git a/inference-engine/CMakeLists.txt b/inference-engine/CMakeLists.txt
+index 1ac7fd8bf..df7091e51 100644
+--- a/inference-engine/CMakeLists.txt
+++ b/inference-engine/CMakeLists.txt
+@@ -39,7 +39,7 @@ if(ENABLE_TESTS)
+     add_subdirectory(tests)
+ endif()
+ 
+-add_subdirectory(tools)
+#add_subdirectory(tools)
+ 
+ function(ie_build_samples)
+     # samples should be build with the same flags as from OpenVINO package,
+@@ -58,7 +58,7 @@ endfunction()
+ 
+ # gflags and format_reader targets are kept inside of samples directory and
+ # they must be built even if samples build is disabled (required for tests and tools).
+-ie_build_samples()
+#ie_build_samples()
+ 
+ if(ENABLE_PYTHON)
+     add_subdirectory(ie_bridges/python)
+@@ -142,7 +142,7 @@ endif()
+ # Developer package
+ #
+ 
+-openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
+#openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
+ 
+ # for Template plugin
+ if(NGRAPH_INTERPRETER_ENABLE)
+@@ -166,7 +166,7 @@ function(ie_generate_dev_package_config)
+                 @ONLY)
+ endfunction()
+ 
+-ie_generate_dev_package_config()
+#ie_generate_dev_package_config()
+ 
+ #
+ # Coverage
+diff --git a/inference-engine/src/inference_engine/CMakeLists.txt b/inference-engine/src/inference_engine/CMakeLists.txt
+index e8ed1a5c4..1fc9fc3ff 100644
+--- a/inference-engine/src/inference_engine/CMakeLists.txt
+++ b/inference-engine/src/inference_engine/CMakeLists.txt
+@@ -110,7 +110,7 @@ add_cpplint_target(${TARGET_NAME}_plugin_api_cpplint FOR_SOURCES ${plugin_api_sr
+ 
+ # Create object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${LIBRARY_HEADERS}
+             ${PUBLIC_HEADERS})
+@@ -181,7 +181,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+ 
+ # Static library used for unit tests which are always built
+ 
+-add_library(${TARGET_NAME}_s STATIC
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
+             $<TARGET_OBJECTS:${TARGET_NAME}_legacy_obj>
+             $<TARGET_OBJECTS:${TARGET_NAME}_obj>
+             ${IE_STATIC_DEPENDENT_FILES})
+diff --git a/inference-engine/src/legacy_api/CMakeLists.txt b/inference-engine/src/legacy_api/CMakeLists.txt
+index 8eae82bd2..e0e6745b1 100644
+--- a/inference-engine/src/legacy_api/CMakeLists.txt
+++ b/inference-engine/src/legacy_api/CMakeLists.txt
+@@ -26,7 +26,7 @@ endif()
+ 
+ file(TOUCH ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${PUBLIC_HEADERS})
+ 
+diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+index fe57b29dd..07831e2fb 100644
+--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+@@ -67,7 +67,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+ 
+ #  add test object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS})
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL ${SOURCES} ${HEADERS})
+ target_link_libraries(${TARGET_NAME}_obj PUBLIC mkldnn)
+ 
+ target_include_directories(${TARGET_NAME}_obj PRIVATE $<TARGET_PROPERTY:inference_engine_preproc_s,INTERFACE_INCLUDE_DIRECTORIES>
+diff --git a/inference-engine/src/preprocessing/CMakeLists.txt b/inference-engine/src/preprocessing/CMakeLists.txt
+index f9548339d..ef962145a 100644
+--- a/inference-engine/src/preprocessing/CMakeLists.txt
+++ b/inference-engine/src/preprocessing/CMakeLists.txt
+@@ -101,7 +101,7 @@ endif()
+ 
+ # Create object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${LIBRARY_HEADERS})
+ 
+@@ -153,7 +153,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+ 
+ # Static library used for unit tests which are always built
+ 
+-add_library(${TARGET_NAME}_s STATIC
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
+             $<TARGET_OBJECTS:${TARGET_NAME}_obj>)
+ 
+ set_ie_threading_interface_for(${TARGET_NAME}_s)
+diff --git a/inference-engine/src/vpu/common/CMakeLists.txt b/inference-engine/src/vpu/common/CMakeLists.txt
+index 249e47c28..4ddf63049 100644
+--- a/inference-engine/src/vpu/common/CMakeLists.txt
+++ b/inference-engine/src/vpu/common/CMakeLists.txt
+@@ -5,7 +5,7 @@
+ file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h)
+ 
+ function(add_common_target TARGET_NAME STATIC_IE)
+-    add_library(${TARGET_NAME} STATIC ${SOURCES})
+    add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
+ 
+     ie_faster_build(${TARGET_NAME}
+         UNITY
+@@ -60,7 +60,7 @@ add_common_target("vpu_common_lib" FALSE)
+ 
+ # Unit tests support for graph transformer
+ if(WIN32)
+-    add_common_target("vpu_common_lib_test_static" TRUE)
+    #add_common_target("vpu_common_lib_test_static" TRUE)
+ else()
+     add_library("vpu_common_lib_test_static" ALIAS "vpu_common_lib")
+ endif()
+diff --git a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+index bc73ab5b1..b4c1547fc 100644
+--- a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+++ b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+@@ -5,7 +5,7 @@
+ file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h *.inc)
+ 
+ function(add_graph_transformer_target TARGET_NAME STATIC_IE)
+-    add_library(${TARGET_NAME} STATIC ${SOURCES})
+    add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
+ 
+     set_ie_threading_interface_for(${TARGET_NAME})
+ 
+@@ -70,7 +70,7 @@ add_graph_transformer_target("vpu_graph_transformer" FALSE)
+ 
+ # Unit tests support for graph transformer
+ if(WIN32)
+-    add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
+    #add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
+ else()
+     add_library("vpu_graph_transformer_test_static" ALIAS "vpu_graph_transformer")
+ endif()
+diff --git a/inference-engine/thirdparty/pugixml/CMakeLists.txt b/inference-engine/thirdparty/pugixml/CMakeLists.txt
+index 8bcb2801a..f7e031c01 100644
+--- a/inference-engine/thirdparty/pugixml/CMakeLists.txt
+++ b/inference-engine/thirdparty/pugixml/CMakeLists.txt
+@@ -41,7 +41,7 @@ if(BUILD_SHARED_LIBS)
+ else()
+ 	add_library(pugixml STATIC ${SOURCES})
+ 	if (MSVC)
+-		add_library(pugixml_mt STATIC ${SOURCES})
+                #add_library(pugixml_mt STATIC ${SOURCES})
+ 		#if (WIN32)
+ 		#	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
+ 		#	set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
+diff --git a/ngraph/core/builder/CMakeLists.txt b/ngraph/core/builder/CMakeLists.txt
+index ff5c381e7..2797ec9ab 100644
+--- a/ngraph/core/builder/CMakeLists.txt
+++ b/ngraph/core/builder/CMakeLists.txt
+@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
+ source_group("include" FILES ${PUBLIC_HEADERS})
+ 
+ # Create shared library
+-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+ 
+ if(COMMAND ie_faster_build)
+     ie_faster_build(${TARGET_NAME}
+diff --git a/ngraph/core/reference/CMakeLists.txt b/ngraph/core/reference/CMakeLists.txt
+index ef4a764ab..f6d3172e2 100644
+--- a/ngraph/core/reference/CMakeLists.txt
+++ b/ngraph/core/reference/CMakeLists.txt
+@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
+ source_group("include" FILES ${PUBLIC_HEADERS})
+ 
+ # Create shared library
+-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+ 
+ if(COMMAND ie_faster_build)
+     ie_faster_build(${TARGET_NAME}
+diff --git a/openvino/itt/CMakeLists.txt b/openvino/itt/CMakeLists.txt
+index e9f880b8c..c63f4df63 100644
+--- a/openvino/itt/CMakeLists.txt
+++ b/openvino/itt/CMakeLists.txt
+@@ -6,7 +6,7 @@ set(TARGET_NAME itt)
+ 
+ file(GLOB_RECURSE SOURCES "src/*.cpp" "src/*.hpp")
+ 
+-add_library(${TARGET_NAME} STATIC ${SOURCES})
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
+ 
+ add_library(openvino::itt ALIAS ${TARGET_NAME})
+ 
--- a/platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch
+++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch
@ -0,0 +1,15 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index e0706a72e..9a053b1e4 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -6,6 +6,10 @@ cmake_minimum_required(VERSION 3.13)
+ 
+ project(OpenVINO)
+ 
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi /FS") # emit debug info (PDB) in Release builds
+set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
+set(CMAKE_MODULE_LINKER_FLAGS_RELEASE "${CMAKE_MODULE_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF") # extend the module flags themselves, not the shared-library ones
+
+ set(OpenVINO_MAIN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+ set(IE_MAIN_SOURCE_DIR ${OpenVINO_MAIN_SOURCE_DIR}/inference-engine)
+ 
--- a/platforms/winpack_dldt/2021.4.1/20210630-dldt-vs-version.patch
+++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-vs-version.patch
@ -0,0 +1,16 @@
+diff --git a/cmake/developer_package/vs_version/vs_version.cmake b/cmake/developer_package/vs_version/vs_version.cmake
+index 14d4c0e1e..6a44f73b9 100644
+--- a/cmake/developer_package/vs_version/vs_version.cmake
+++ b/cmake/developer_package/vs_version/vs_version.cmake
+@@ -8,9 +8,9 @@ set(IE_VS_VER_FILEVERSION_STR "${IE_VERSION_MAJOR}.${IE_VERSION_MINOR}.${IE_VERS
+ 
+ set(IE_VS_VER_COMPANY_NAME_STR "Intel Corporation")
+ set(IE_VS_VER_PRODUCTVERSION_STR "${CI_BUILD_NUMBER}")
+-set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit")
++set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit (for OpenCV Windows package)")
+ set(IE_VS_VER_COPYRIGHT_STR "Copyright (C) 2018-2021, Intel Corporation")
+-set(IE_VS_VER_COMMENTS_STR "https://docs.openvinotoolkit.org/")
++set(IE_VS_VER_COMMENTS_STR "https://github.com/opencv/opencv/wiki/Intel%27s-Deep-Learning-Inference-Engine-backend")
+ 
+ #
+ # ie_add_vs_version_file(NAME <name>
--- a/platforms/winpack_dldt/2021.4.1/build.config.py
+++ b/platforms/winpack_dldt/2021.4.1/build.config.py
@ -0,0 +1 @@
+os.environ['CI_BUILD_NUMBER'] = '2021.4.1-opencv_winpack_dldt'
--- a/platforms/winpack_dldt/2021.4.1/patch.config.py
+++ b/platforms/winpack_dldt/2021.4.1/patch.config.py
@ -0,0 +1,4 @@
+applyPatch('20210630-dldt-disable-unused-targets.patch')
+applyPatch('20210630-dldt-pdb.patch')
+applyPatch('20210630-dldt-disable-multidevice-autoplugin.patch')
+applyPatch('20210630-dldt-vs-version.patch')
--- a/platforms/winpack_dldt/2021.4.1/sysroot.config.py
+++ b/platforms/winpack_dldt/2021.4.1/sysroot.config.py
@ -0,0 +1,56 @@
+sysroot_bin_dir = prepare_dir(self.sysrootdir / 'bin')
+copytree(self.build_dir / 'install', self.sysrootdir / 'ngraph')
+#rm_one(self.sysrootdir / 'ngraph' / 'lib' / 'ngraph.dll')
+
+build_config = 'Release' if not self.config.build_debug else 'Debug'
+build_bin_dir = self.build_dir / 'bin' / 'intel64' / build_config
+
+def copy_bin(name):
+    global build_bin_dir, sysroot_bin_dir
+    copytree(build_bin_dir / name, sysroot_bin_dir / name)
+
+dll_suffix = 'd' if self.config.build_debug else ''
+def copy_dll(name):
+    global copy_bin, dll_suffix
+    copy_bin(name + dll_suffix + '.dll')
+    copy_bin(name + dll_suffix + '.pdb')
+
+copy_bin('cache.json')
+copy_dll('clDNNPlugin')
+copy_dll('HeteroPlugin')
+copy_dll('inference_engine')
+copy_dll('inference_engine_ir_reader')
+#copy_dll('inference_engine_ir_v7_reader')
+copy_dll('inference_engine_legacy')
+copy_dll('inference_engine_transformations')  # runtime
+copy_dll('inference_engine_lp_transformations')  # runtime
+#copy_dll('inference_engine_preproc')  # runtime
+copy_dll('MKLDNNPlugin')  # runtime
+copy_dll('myriadPlugin')  # runtime
+#copy_dll('MultiDevicePlugin')  # runtime, not used
+copy_dll('ngraph')
+copy_bin('plugins.xml')
+copy_bin('pcie-ma2x8x.elf')
+copy_bin('usb-ma2x8x.mvcmd')
+
+copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb' / 'bin', sysroot_bin_dir)
+copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb', self.sysrootdir / 'tbb')
+
+sysroot_ie_dir = prepare_dir(self.sysrootdir / 'deployment_tools' / 'inference_engine')
+sysroot_ie_lib_dir = prepare_dir(sysroot_ie_dir / 'lib' / 'intel64')
+
+copytree(self.srcdir / 'inference-engine' / 'include', sysroot_ie_dir / 'include')
+if not self.config.build_debug:
+    copytree(build_bin_dir / 'ngraph.lib', sysroot_ie_lib_dir / 'ngraph.lib')
+    copytree(build_bin_dir / 'inference_engine.lib', sysroot_ie_lib_dir / 'inference_engine.lib')
+    copytree(build_bin_dir / 'inference_engine_ir_reader.lib', sysroot_ie_lib_dir / 'inference_engine_ir_reader.lib')
+    copytree(build_bin_dir / 'inference_engine_legacy.lib', sysroot_ie_lib_dir / 'inference_engine_legacy.lib')
+else:
+    copytree(build_bin_dir / 'ngraphd.lib', sysroot_ie_lib_dir / 'ngraphd.lib')
+    copytree(build_bin_dir / 'inference_engined.lib', sysroot_ie_lib_dir / 'inference_engined.lib')
+    copytree(build_bin_dir / 'inference_engine_ir_readerd.lib', sysroot_ie_lib_dir / 'inference_engine_ir_readerd.lib')
+    copytree(build_bin_dir / 'inference_engine_legacyd.lib', sysroot_ie_lib_dir / 'inference_engine_legacyd.lib')
+
+sysroot_license_dir = prepare_dir(self.sysrootdir / 'etc' / 'licenses')
+copytree(self.srcdir / 'LICENSE', sysroot_license_dir / 'dldt-LICENSE')
+copytree(self.sysrootdir / 'tbb/LICENSE', sysroot_license_dir / 'tbb-LICENSE')
--- a/platforms/winpack_dldt/build_package.py
+++ b/platforms/winpack_dldt/build_package.py
@ -471,7 +471,8 @@ class Builder:
 def main():

    dldt_src_url = 'https://github.com/openvinotoolkit/openvino'
-    dldt_src_commit = '2021.4'
+    dldt_src_commit = '2021.4.1'
+    dldt_config = None
    dldt_release = None

    build_cache_dir_default = os.environ.get('BUILD_CACHE_DIR', '.build_cache')
@ -505,7 +506,7 @@ def main():
    parser.add_argument('--dldt_reference_dir', help='DLDT reference git repository (optional)')
    parser.add_argument('--dldt_src_dir', help='DLDT custom source repository (skip git checkout and patching, use for TESTING only)')

-    parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')
+    parser.add_argument('--dldt_config', default=dldt_config, help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')

    parser.add_argument('--override_patch_hashsum', default='', help='(script debug mode)')
				`@ -0,0 +1 @@`
				`os.environ['CI_BUILD_NUMBER'] = '2021.4.1-opencv_winpack_dldt'`