diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index 2cd5cac030..415ec2be79 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -665,6 +665,7 @@ CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf) CV_EXPORTS const char* typeToStr(int t); CV_EXPORTS const char* memopTypeToStr(int t); CV_EXPORTS const char* vecopTypeToStr(int t); +CV_EXPORTS const char* getOpenCLErrorString(int errorCode); CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL); CV_EXPORTS void getPlatfomsInfo(std::vector& platform_info); @@ -731,6 +732,21 @@ protected: Impl* p; }; +class CV_EXPORTS Timer +{ +public: + Timer(const Queue& q); + ~Timer(); + void start(); + void stop(); + float milliSeconds(); + float microSeconds(); + float seconds(); + +protected: + struct Impl; + Impl* p; +}; CV_EXPORTS MatAllocator* getOpenCLAllocator(); diff --git a/modules/core/include/opencv2/core/utils/configuration.private.hpp b/modules/core/include/opencv2/core/utils/configuration.private.hpp new file mode 100644 index 0000000000..fa1b045178 --- /dev/null +++ b/modules/core/include/opencv2/core/utils/configuration.private.hpp @@ -0,0 +1,16 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_CONFIGURATION_PRIVATE_HPP +#define OPENCV_CONFIGURATION_PRIVATE_HPP + +namespace cv { namespace utils { + +CV_EXPORTS bool getConfigurationParameterBool(const char* name, bool defaultValue); +CV_EXPORTS size_t getConfigurationParameterSizeT(const char* name, size_t defaultValue); +CV_EXPORTS cv::String getConfigurationParameterString(const char* name, const char* defaultValue); + +}} // namespace + +#endif // OPENCV_CONFIGURATION_PRIVATE_HPP diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 8c24a04aea..60dca8bead 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -51,7 +51,10 @@ #include #endif +#include + #include "opencv2/core/ocl_genbase.hpp" +#include "opencl_kernels_core.hpp" #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0 #define CV_OPENCL_SHOW_RUN_ERRORS 0 @@ -4718,6 +4721,102 @@ const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf) return buf; } +const char* getOpenCLErrorString(int errorCode) +{ + switch (errorCode) + { + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + case -30: return "CL_INVALID_VALUE"; + 
case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + case -69: return "CL_INVALID_PIPE_SIZE"; + case -70: return "CL_INVALID_DEVICE_QUEUE"; + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: 
return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + case -1024: return "clBLAS: Functionality is not implemented"; + case -1023: return "clBLAS: Library is not initialized yet"; + case -1022: return "clBLAS: Matrix A is not a valid memory object"; + case -1021: return "clBLAS: Matrix B is not a valid memory object"; + case -1020: return "clBLAS: Matrix C is not a valid memory object"; + case -1019: return "clBLAS: Vector X is not a valid memory object"; + case -1018: return "clBLAS: Vector Y is not a valid memory object"; + case -1017: return "clBLAS: An input dimension (M:N:K) is invalid"; + case -1016: return "clBLAS: Leading dimension A must not be less than the " + "size of the first dimension"; + case -1015: return "clBLAS: Leading dimension B must not be less than the " + "size of the second dimension"; + case -1014: return "clBLAS: Leading dimension C must not be less than the " + "size of the third dimension"; + case -1013: return "clBLAS: The increment for a vector X must not be 0"; + case -1012: return "clBLAS: The increment for a vector Y must not be 0"; + case -1011: return "clBLAS: The memory object for Matrix A is too small"; + case -1010: return "clBLAS: The memory object for Matrix B is too small"; + case -1009: return "clBLAS: The memory object for Matrix C is too small"; + case -1008: return "clBLAS: The memory object for Vector X is too small"; + case -1007: return "clBLAS: The memory object for Vector Y is too small"; + default: return "Unknown OpenCL error"; + } +} + template static std::string kerToStr(const Mat & k) { @@ -5134,4 +5233,175 @@ bool internal::isCLBuffer(UMat& u) return true; } +struct Timer::Impl +{ + const Queue queue; + + Impl(const Queue& q) + : queue(q) + , initted_(false) + , running_(false) + , has_run_at_least_once_(false) + { + init(); + } + + ~Impl() + { + clWaitForEvents(1, &start_gpu_cl_); + 
clWaitForEvents(1, &stop_gpu_cl_); + clReleaseEvent(start_gpu_cl_); + clReleaseEvent(stop_gpu_cl_); + } + + void start() + { +#ifdef HAVE_OPENCL + if (!running()) + { + clWaitForEvents(1, &start_gpu_cl_); + clReleaseEvent(start_gpu_cl_); + ocl::Kernel kernel("null_kernel_float", ocl::core::benchmark_oclsrc); + float arg = 0; + clSetKernelArg((cl_kernel)kernel.ptr(), 0, sizeof(arg), &arg); + clEnqueueTask((cl_command_queue)queue.ptr(), (cl_kernel)kernel.ptr(), 0, + NULL, &start_gpu_cl_); + clFinish((cl_command_queue)queue.ptr()); + running_ = true; + has_run_at_least_once_ = true; + } +#endif + } + + void stop() + { +#ifdef HAVE_OPENCL + if (running()) + { + clWaitForEvents(1, &stop_gpu_cl_); + clReleaseEvent(stop_gpu_cl_); + ocl::Kernel kernel("null_kernel_float", ocl::core::benchmark_oclsrc); + float arg = 0; + clSetKernelArg((cl_kernel)kernel.ptr(), 0, sizeof(arg), &arg); + clEnqueueTask((cl_command_queue)queue.ptr(), (cl_kernel)kernel.ptr(), 0, + NULL, &stop_gpu_cl_); + clFinish((cl_command_queue)queue.ptr()); + running_ = false; + } +#endif + } + + float microSeconds() + { +#ifdef HAVE_OPENCL + if (!has_run_at_least_once()) + { + return 0; + } + if (running()) + { + stop(); + } + cl_ulong startTime, stopTime; + clWaitForEvents(1, &stop_gpu_cl_); + clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END, + sizeof startTime, &startTime, NULL); + clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START, + sizeof stopTime, &stopTime, NULL); + double us = static_cast(stopTime - startTime) / 1000.0; + elapsed_microseconds_ = static_cast(us); + return elapsed_microseconds_; +#else + return 0; +#endif + } + + float milliSeconds() + { +#ifdef HAVE_OPENCL + if (!has_run_at_least_once()) + { + return 0; + } + if (running()) + { + stop(); + } + cl_ulong startTime = 0, stopTime = 0; + clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END, + sizeof startTime, &startTime, NULL); + clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START, + 
sizeof stopTime, &stopTime, NULL); + double ms = static_cast(stopTime - startTime) / 1000000.0; + elapsed_milliseconds_ = static_cast(ms); + return elapsed_milliseconds_; +#else + return 0; +#endif + } + + float seconds() + { + return milliSeconds() / 1000.f; + } + + void init() + { + CV_Assert(queue.getImpl() && queue.getImpl()->isProfilingQueue_); + if (!initted()) + { + start_gpu_cl_ = 0; + stop_gpu_cl_ = 0; + initted_ = true; + } + } + + inline bool initted() { return initted_; } + inline bool running() { return running_; } + inline bool has_run_at_least_once() { return has_run_at_least_once_; } + + bool initted_; + bool running_; + bool has_run_at_least_once_; + float elapsed_milliseconds_; + float elapsed_microseconds_; + cl_event start_gpu_cl_; + cl_event stop_gpu_cl_; +}; + +Timer::Timer(const Queue& q) +{ + p = new Impl(q); +} + +Timer::~Timer() +{ + if(p) + { + delete p; + p = 0; + } +} + +void Timer::start() +{ + if(p) + p->start(); +} + +void Timer::stop() +{ + if(p) + p->stop(); +} + +float Timer::microSeconds() +{ return p ? p->microSeconds() : 0; } + +float Timer::milliSeconds() +{ return p ? p->milliSeconds() : 0; } + +float Timer::seconds() +{ return p ? p->seconds() : 0; } + }} diff --git a/modules/core/src/opencl/benchmark.cl b/modules/core/src/opencl/benchmark.cl new file mode 100644 index 0000000000..22acb93afd --- /dev/null +++ b/modules/core/src/opencl/benchmark.cl @@ -0,0 +1,45 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. 
+// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +__kernel void null_kernel_float(float arg) { + float out = arg; +} diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index d38e20d34c..bb4ae6cf18 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -297,12 +297,6 @@ TLSData& getCoreTlsData(); #define CL_RUNTIME_EXPORT #endif -namespace utils { -bool getConfigurationParameterBool(const char* name, bool defaultValue); -size_t getConfigurationParameterSizeT(const char* name, size_t defaultValue); -cv::String getConfigurationParameterString(const char* name, const char* defaultValue); -} - extern bool __termination; // skip some cleanups, because process is terminating // (for example, if ExitProcess() was already called) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 2ec150ecfd..0a4110ec12 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -44,6 +44,7 @@ #include "precomp.hpp" #include +#include #include namespace cv { diff --git a/modules/core/src/trace.cpp b/modules/core/src/trace.cpp index d9153642d7..230510625e 100644 --- a/modules/core/src/trace.cpp +++ b/modules/core/src/trace.cpp @@ -6,6 +6,7 @@ #include #include +#include #include // va_start diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 1be5fc6123..84cebdba09 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -267,19 +267,22 @@ UMat Mat::getUMat(int accessFlags, UMatUsageFlags usageFlags) const UMat hdr; if(!data) return hdr; - Size wholeSize; - Point ofs; - locateROI(wholeSize, ofs); - Size sz(cols, rows); - if (ofs.x != 0 || ofs.y != 0) + if (data != datastart) { - Mat src = *this; - int dtop = ofs.y; - int dbottom = wholeSize.height - src.rows - ofs.y; - int dleft = ofs.x; - int dright = wholeSize.width - src.cols - ofs.x; - src.adjustROI(dtop, dbottom, dleft, dright); - return src.getUMat(accessFlags, usageFlags)(cv::Rect(ofs.x, ofs.y, sz.width, sz.height)); + Size 
wholeSize; + Point ofs; + locateROI(wholeSize, ofs); + Size sz(cols, rows); + if (ofs.x != 0 || ofs.y != 0) + { + Mat src = *this; + int dtop = ofs.y; + int dbottom = wholeSize.height - src.rows - ofs.y; + int dleft = ofs.x; + int dright = wholeSize.width - src.cols - ofs.x; + src.adjustROI(dtop, dbottom, dleft, dright); + return src.getUMat(accessFlags, usageFlags)(cv::Rect(ofs.x, ofs.y, sz.width, sz.height)); + } } CV_Assert(data == datastart); diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index d0bc3324d2..866f544e8c 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -21,6 +21,8 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-shadow -Wno-parentheses -Wmaybe-uninit ) ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4701 /wd4100) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/ocl4dnn/include ${OPENCL_INCLUDE_DIRS}) + if(MSVC) add_definitions( -D_CRT_SECURE_NO_WARNINGS=1 ) ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244 /wd4267 /wd4018 /wd4355 /wd4800 /wd4251 /wd4996 /wd4146 diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index bd796691cc..cb015d4eba 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -297,6 +297,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN CV_PROP String name; //!< Name of the layer instance, can be used for logging or other internal purposes. CV_PROP String type; //!< Type name which was used for creating layer by layer factory. + CV_PROP int preferableTarget; //!< prefer target for layer forwarding Layer(); explicit Layer(const LayerParams ¶ms); //!< Initializes only #name, #type and #blobs fields. 
diff --git a/modules/dnn/perf/opencl/perf_convolution.cpp b/modules/dnn/perf/opencl/perf_convolution.cpp new file mode 100644 index 0000000000..362057919a --- /dev/null +++ b/modules/dnn/perf/opencl/perf_convolution.cpp @@ -0,0 +1,118 @@ +#include "../perf_precomp.hpp" +#include "opencv2/ts/ocl_perf.hpp" +#include + +#ifdef HAVE_OPENCL + +namespace cvtest +{ +namespace ocl +{ + +using std::tr1::tuple; +using std::tr1::get; +using std::tr1::make_tuple; +using std::make_pair; +using namespace perf; +using namespace testing; +using namespace cv; +using namespace cv::dnn; + +enum {STRIDE_OFF = 1, STRIDE_ON = 2}; +CV_ENUM(StrideSize, STRIDE_OFF, STRIDE_ON); + +enum {GROUP_OFF = 1, GROUP_2 = 2}; +CV_ENUM(GroupSize, GROUP_OFF, GROUP_2); + +//Squared Size +#define SSZ(n) cv::Size(n, n) + +typedef std::pair InpShapeNumOut; +typedef tuple ConvParam; //kernel_size, inp shape, groups, stride +typedef TestBaseWithParam ConvolutionPerfTest; + +static inline MatShape blobShape(int count, int nplanes, int height, int width) +{ + int data[] = {count, nplanes, height, width}; + return MatShape(data, data+4); +} + +OCL_PERF_TEST_P( ConvolutionPerfTest, perf, Combine( + Values(Size(1, 1), Size(3, 3), Size(5, 5), Size(11, 11)), + Values(make_pair(blobShape(1, 4, 224, 224), 64), + make_pair(blobShape(1, 64, 112, 122), 128), + make_pair(blobShape(1, 256, 28, 28), 512)), + GroupSize::all(), + StrideSize::all()) +) +{ + RNG rng(0); + + ConvParam params = GetParam(); + int ksz = get<0>(params).width; + MatShape inpShape = get<1>(params).first; + int outCn = get<1>(params).second; + int groups = get<2>(params); + int stride = (ksz >= 11) ? 
4 : (int)get<3>(params); + + int inpCn = inpShape[1]; + int wgtSize[] = { outCn, inpCn/groups, ksz, ksz }; + int biasSize[] = { outCn, 1, 1, 1 }; + const int wtype = CV_32F; + Mat wgtBlob(4, wgtSize, wtype), biasBlob(4, biasSize, wtype); + Mat inpBlob(4, &inpShape[0], wtype); + rng.fill(biasBlob, RNG::UNIFORM, -1, +1); + rng.fill(wgtBlob, RNG::UNIFORM, -1, +1); + rng.fill(inpBlob, RNG::UNIFORM, -1, +1); + + LayerParams lp; + lp.set("num_output", outCn); + lp.set("group", groups); + lp.set("stride", stride); + lp.set("kernel_size", ksz); + lp.blobs.reserve(2); + lp.blobs.push_back(wgtBlob); + lp.blobs.push_back(biasBlob); + + std::vector inpBlobs(1, &inpBlob); + std::vector outBlobs, internalBlobs; + + cv::setNumThreads(cv::getNumberOfCPUs()); + + Ptr layer = cv::dnn::LayerFactory::createLayerInstance("Convolution", lp); + std::vector inputShapes(1, shape(inpBlob)), outShapes, internals; + layer->getMemoryShapes(inputShapes, 0, outShapes, internals); + for (int i = 0; i < outShapes.size(); i++) + { + outBlobs.push_back(Mat(outShapes[i], CV_32F)); + } + for (int i = 0; i < internals.size(); i++) + { + internalBlobs.push_back(Mat()); + if (total(internals[i])) + internalBlobs.back().create(internals[i], CV_32F); + } + + layer->finalize(inpBlobs, outBlobs); + layer->preferableTarget = DNN_TARGET_OPENCL; + + Mat inpBlob2D = inpBlob.reshape(1, outCn); + Mat wgtBlob2D = wgtBlob.reshape(1, outCn*(inpCn/groups)); + Mat outBlob2D = outBlobs[0].reshape(1, outBlobs[0].size[0]); + declare.in(inpBlob2D, wgtBlob2D, WARMUP_RNG).out(outBlob2D).tbb_threads(cv::getNumThreads()); + + // warmup + layer->forward(inpBlobs, outBlobs, internalBlobs); + + TEST_CYCLE() + { + layer->forward(inpBlobs, outBlobs, internalBlobs); + } + + SANITY_CHECK_NOTHING(); +} + +} +} + +#endif diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index 55f5ce69e6..990470f655 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -40,7 +40,7 @@ public: if 
(backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL) { -#if 0 //defined(HAVE_OPENCL) +#if defined(HAVE_OPENCL) if (!cv::ocl::useOpenCL()) #endif { diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 64bb85f042..424e8425a4 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -875,7 +875,7 @@ struct Net::Impl if (preferableBackend == DNN_BACKEND_DEFAULT) { - CV_Assert(preferableTarget == DNN_TARGET_CPU); + CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL); return; } @@ -1000,6 +1000,7 @@ struct Net::Impl Ptr layerPtr = ld.getLayerInstance(); { layerPtr->finalize(ld.inputBlobs, ld.outputBlobs); + layerPtr->preferableTarget = preferableTarget; #if 0 std::cout << "\toutputs:"; size_t noutputs = ld.outputBlobs.size(); @@ -1026,7 +1027,7 @@ struct Net::Impl void fuseLayers(const std::vector& blobsToKeep_) { - if( !fusion || preferableBackend == DNN_BACKEND_HALIDE ) + if( !fusion || !(preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_CPU)) return; CV_TRACE_FUNCTION(); @@ -1236,7 +1237,6 @@ struct Net::Impl } layersTimings.resize(lastLayerId + 1, 0); - fuseLayers(blobsToKeep_); } @@ -1402,7 +1402,7 @@ struct Net::Impl } else { - CV_Assert(preferableTarget == DNN_TARGET_CPU); + CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL); } return ld.outputBlobs[pin.oid]; } @@ -1963,12 +1963,12 @@ int64 Net::getPerfProfile(std::vector& timings) Importer::~Importer() {} -Layer::Layer() {} +Layer::Layer() { preferableTarget = DNN_TARGET_CPU; } Layer::Layer(const LayerParams ¶ms) : blobs(params.blobs), name(params.name), type(params.type) { - + preferableTarget = DNN_TARGET_CPU; } void Layer::setParamsFrom(const LayerParams ¶ms) diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index 67d82c2eb0..6833b0468b 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp 
@@ -43,6 +43,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "op_halide.hpp" +#include "opencl_kernels_dnn.hpp" namespace cv { @@ -174,11 +175,62 @@ public: } }; +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + int cAxis = clamp(axis, inputs[0]->dims); + if (!(cAxis == 1 && outputs[0].dims == 4 && !padding)) + return false; + + int bottom_concat_axis; + int concat_size = inputs[0]->size[2] * inputs[0]->size[3]; + int top_concat_axis = outputs[0].size[1]; + int offset_concat_axis = 0; + UMat inpMat, outMat; + outMat = outputs[0].getUMat(ACCESS_WRITE); + + ocl::Kernel kernel; + String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0]->type()) + String(" "); + if (!kernel.create("concat", ocl::dnn::concat_oclsrc, buildopt)) + return false; + + for (size_t i = 0; i < inputs.size(); i++) + { + inpMat = inputs[i]->getUMat(ACCESS_READ); + bottom_concat_axis = inputs[i]->size[1]; + size_t nthreads = inputs[i]->total(); + + kernel.set(0, (int)nthreads); + kernel.set(1, ocl::KernelArg::PtrReadOnly(inpMat)); + kernel.set(2, (int)inputs[i]->size[0]); + kernel.set(3, (int)concat_size); + kernel.set(4, (int)top_concat_axis); + kernel.set(5, (int)bottom_concat_axis); + kernel.set(6, (int)offset_concat_axis); + kernel.set(7, ocl::KernelArg::PtrWriteOnly(outMat)); + + if (!kernel.run(1, &nthreads, NULL, false)) + return false; + + offset_concat_axis += bottom_concat_axis; + } + + return true; + } +#endif + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs, outputs, internals)) + int cAxis = clamp(axis, inputs[0]->dims); Mat& outMat = outputs[0]; diff --git 
a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 8440662367..129b874ea0 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -47,6 +47,10 @@ #include "opencv2/core/hal/intrin.hpp" #include +#ifdef HAVE_OPENCL +using namespace cv::dnn::ocl4dnn; +#endif + namespace cv { namespace dnn @@ -150,6 +154,11 @@ public: Ptr bnorm; Ptr scaleLayer; +#ifdef HAVE_OPENCL + Ptr > convolutionOp; + std::vector umat_blobs; +#endif + MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const { Size out(outShape[3], outShape[2]); @@ -636,6 +645,42 @@ public: } }; +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + int group = inputs[0]->size[1] / umat_blobs[0].size[1]; + + if (convolutionOp.empty()) + { + OCL4DNNConvConfig config; + config.in_shape = shape(*inputs[0]); + config.out_shape = shape(outputs[0]); + config.kernel = kernel; + config.pad = pad; + config.stride = stride; + config.dilation = dilation; + config.group = group; + config.bias_term = (hasBias()) ? true : false; + + convolutionOp = Ptr >(new OCL4DNNConvSpatial(config)); + } + + for (size_t ii = 0; ii < outputs.size(); ii++) + { + UMat inpMat, outMat; + inpMat = inputs[ii]->getUMat(ACCESS_READ); + outMat = outputs[ii].getUMat(ACCESS_WRITE); + + int batch_size = inpMat.size[0]; + + if (!convolutionOp->Forward(inpMat, umat_blobs[0], hasBias() ? 
umat_blobs[1] : UMat(), + outMat, batch_size)) + return false; + } + return true; + } +#endif + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); @@ -649,6 +694,10 @@ public: int ngroups = inputs[0]->size[1]/blobs[0].size[1]; CV_Assert(outputs[0].size[1] % ngroups == 0); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs, outputs, internals)) + int k, outCn = blobs[0].size[0]; if( weightsMat.empty() ) @@ -1203,8 +1252,17 @@ static void initConvDeconvLayerFromCaffe(Ptr l, const Laye Ptr ConvolutionLayer::create(const LayerParams ¶ms) { - Ptr l(new ConvolutionLayerImpl); + ConvolutionLayerImpl* conv_ptr = new ConvolutionLayerImpl; + Ptr l(conv_ptr); initConvDeconvLayerFromCaffe(l, params); + +#ifdef HAVE_OPENCL + size_t n = params.blobs.size(); + conv_ptr->umat_blobs.resize(n); + for (int i = 0; i < n; i++) + conv_ptr->umat_blobs[i] = params.blobs[i].getUMat(ACCESS_READ); +#endif + return l; } diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index dee3fbb825..027eda4cc2 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -41,9 +41,12 @@ //M*/ #include "../precomp.hpp" +#include "layers_common.hpp" #include "op_halide.hpp" #include "opencv2/imgproc.hpp" #include +#include "opencl_kernels_dnn.hpp" +#include namespace cv { @@ -158,6 +161,10 @@ public: { CV_TRACE_FUNCTION(); + CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + func.applyOCL(inputs, outputs, internals)) + for (size_t i = 0; i < inputs.size(); i++) { const Mat &src = *inputs[i]; @@ -191,6 +198,13 @@ public: bool run_parallel; }; +#ifdef HAVE_OPENCL +static String oclGetTMacro(const UMat &m) +{ + return String("-DT=") + ocl::typeToStr(m.type()) + String(" "); +} +#endif + struct 
ReLUFunctor { typedef ReLULayer Layer; @@ -230,6 +244,46 @@ struct ReLUFunctor } } +#ifdef HAVE_OPENCL + bool initKernel(ocl::Kernel &ker, const UMat &src) const + { + const char *buildoptSlope = (slope == 0) ? "-DRELU_NO_SLOPE" : ""; + String buildopt = oclGetTMacro(src) + buildoptSlope; + + if (!ker.create("ReLUForward", ocl::dnn::activations_oclsrc, buildopt)) + return false; + + if (slope != 0) + ker.set(3, (float)slope); + + return true; + } + + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize(); + + for (size_t i = 0; i < inputs.size(); i++) + { + UMat src, dst; + inputs[i]->copyTo(src); + dst = outputs[i].getUMat(ACCESS_WRITE); + CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset); + + ocl::Kernel ker; + CV_Assert(initKernel(ker, src)); + ker.set(0, (int)src.total()); + ker.set(1, ocl::KernelArg::PtrReadOnly(src)); + ker.set(2, ocl::KernelArg::PtrWriteOnly(dst)); + + size_t gSize = src.total(); + CV_Assert(ker.run(1, &gSize, &wgSize, false)); + } + + return true; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -293,6 +347,14 @@ struct ReLU6Functor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -320,6 +382,14 @@ struct TanHFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -347,6 +417,14 @@ struct SigmoidFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + 
return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -376,6 +454,14 @@ struct ELUFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -403,6 +489,14 @@ struct AbsValFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -430,6 +524,14 @@ struct BNLLFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -479,6 +581,14 @@ struct PowerFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { @@ -524,18 +634,18 @@ struct ChannelsPReLUFunctor v_float32x4 s4 = v_setall_f32(s), z = v_setzero_f32(); for( ; i <= len - 16; i += 16 ) { - v_float32x4 x0 = v_load(ptr + i); - v_float32x4 x1 = v_load(ptr + i + 4); - v_float32x4 x2 = v_load(ptr + i + 8); - v_float32x4 x3 = v_load(ptr + i + 12); + v_float32x4 x0 = v_load(srcptr + i); + v_float32x4 x1 = v_load(srcptr + i + 4); + v_float32x4 x2 = v_load(srcptr + i + 8); + v_float32x4 x3 = v_load(srcptr + i + 12); x0 = v_select(x0 >= z, x0, x0*s4); x1 = v_select(x1 >= z, x1, x1*s4); x2 = v_select(x2 >= z, x2, x2*s4); x3 = v_select(x3 >= z, x3, x3*s4); - v_store(ptr + i, x0); - v_store(ptr + i + 4, x1); - v_store(ptr + i + 8, x2); - 
v_store(ptr + i + 12, x3); + v_store(dstptr + i, x0); + v_store(dstptr + i + 4, x1); + v_store(dstptr + i + 8, x2); + v_store(dstptr + i + 12, x3); } #endif for( ; i < len; i++ ) @@ -546,6 +656,14 @@ struct ChannelsPReLUFunctor } } +#ifdef HAVE_OPENCL + bool applyOCL(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + // TODO: implement OCL version + return false; + } +#endif + #ifdef HAVE_HALIDE void attachHalide(const Halide::Expr& input, Halide::Func& top) { diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 9bec3b086f..7893a2f83a 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -43,8 +43,13 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "op_halide.hpp" +#include "opencl_kernels_dnn.hpp" #include +#ifdef HAVE_OPENCL +using namespace cv::dnn::ocl4dnn; +#endif + namespace cv { namespace dnn @@ -55,6 +60,11 @@ class FullyConnectedLayerImpl : public InnerProductLayer public: enum { VEC_ALIGN = 8 }; +#ifdef HAVE_OPENCL + Ptr > innerProductOp; + std::vector umat_blobs; +#endif + FullyConnectedLayerImpl(const LayerParams& params) { setParamsFrom(params); @@ -84,6 +94,12 @@ public: biasMat = blobs[1] = blobs[1].reshape(1, 1); else biasMat = Mat::zeros(1, numOutput, weightsMat.type()); + +#ifdef HAVE_OPENCL + size_t n = blobs.size(); + umat_blobs.resize(n); + for (int i = 0; i < n; i++) umat_blobs[i] = blobs[i].getUMat(ACCESS_READ); +#endif } bool getMemoryShapes(const std::vector &inputs, @@ -238,11 +254,78 @@ public: bool useAVX2; }; +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &input, std::vector &output) + { + int axisCan = clamp(axis, input[0]->dims); + int numOutput = blobs[0].size[0]; + int innerSize = blobs[0].size[1]; + int outerSize = input[0]->total(0, axisCan); + bool ret = true; + + if (innerProductOp.empty()) + { + OCL4DNNInnerProductConfig config; + config.num_output = 
numOutput; + config.bias_term = bias; + config.M = outerSize; + config.K = innerSize; + + innerProductOp = Ptr >(new OCL4DNNInnerProduct(config)); + } + + UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type()); + for (size_t i = 0; i < input.size(); i++) + { + UMat srcMat, dstMat; + srcMat = input[i]->getUMat(ACCESS_READ); + dstMat = output[i].getUMat(ACCESS_WRITE); + dstMat.setTo(0.0f); + + if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat)) + { + ret = false; + break; + } + + if (bias && (outerSize > 1)) + { + UMat& biases = umat_blobs[1]; + cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); + } + } + + if (ret) return true; + + UMat& weights = umat_blobs[0]; + for (size_t i = 0; i < input.size(); i++) + { + UMat srcMat, dstMat; + srcMat = input[i]->reshape(1, outerSize).getUMat(ACCESS_READ); + dstMat = output[i].reshape(1, outerSize).getUMat(ACCESS_WRITE); + + cv::gemm(srcMat, weights, 1, noArray(), 0, dstMat, GEMM_2_T); + + if (bias) + { + UMat& biases = umat_blobs[1]; + cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0); + } + } + + return true; + } +#endif + void forward(std::vector &input, std::vector &output, std::vector &) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(input, output)) + int axisCan = clamp(axis, input[0]->dims); int outerSize = input[0]->total(0, axisCan); diff --git a/modules/dnn/src/layers/layers_common.hpp b/modules/dnn/src/layers/layers_common.hpp index 46170e9109..ed8add94ff 100644 --- a/modules/dnn/src/layers/layers_common.hpp +++ b/modules/dnn/src/layers/layers_common.hpp @@ -51,6 +51,10 @@ #include "layers/layers_common.simd_declarations.hpp" #undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +#ifdef HAVE_OPENCL +#include "ocl4dnn.hpp" +#endif + namespace cv { namespace dnn diff --git a/modules/dnn/src/layers/lrn_layer.cpp 
b/modules/dnn/src/layers/lrn_layer.cpp index aa7a7cbf4d..62dde95e90 100644 --- a/modules/dnn/src/layers/lrn_layer.cpp +++ b/modules/dnn/src/layers/lrn_layer.cpp @@ -46,8 +46,13 @@ #include "opencv2/imgproc.hpp" #include "opencv2/dnn/shape_utils.hpp" #include "opencv2/core/hal/hal.hpp" +#include "opencl_kernels_dnn.hpp" #include +#ifdef HAVE_OPENCL +using namespace cv::dnn::ocl4dnn; +#endif + namespace cv { namespace dnn @@ -78,18 +83,64 @@ public: normBySize = params.get("norm_by_size", true); } +#ifdef HAVE_OPENCL + Ptr > lrnOp; +#endif + virtual bool supportBackend(int backendId) { return backendId == DNN_BACKEND_DEFAULT || backendId == DNN_BACKEND_HALIDE && haveHalide(); } +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + if (lrnOp.empty()) + { + OCL4DNNLRNConfig config; + config.lrn_type = type == CHANNEL_NRM ? + LRNParameter_NormRegion_ACROSS_CHANNELS : + LRNParameter_NormRegion_WITHIN_CHANNEL; + + CHECK_EQ(size % 2, 1)<< "LRN only supports odd values for local_size"; + config.local_size = size; + config.alpha = alpha; + config.beta = beta; + config.k = bias; + CHECK_EQ(4, inputs[0]->dims) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, width)"; + config.batch_size = inputs[0]->size[0]; + config.channels = inputs[0]->size[1]; + config.height = inputs[0]->size[2]; + config.width = inputs[0]->size[3]; + config.norm_by_size = normBySize; + + lrnOp = Ptr >(new OCL4DNNLRN(config)); + } + + UMat inpMat, outMat; + inpMat = inputs[0]->getUMat(ACCESS_READ); + outMat = outputs[0].getUMat(ACCESS_WRITE); + + if (!lrnOp->Forward(inpMat, outMat)) + return false; + + return true; + } +#endif + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_Assert(inputs.size() == outputs.size()); + + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + 
OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs, outputs, internals)) + for (int i = 0; i < inputs.size(); i++) { CV_Assert(inputs[i]->dims == 4); diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index b54b52d7fc..c27315ba26 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -44,10 +44,14 @@ #include "layers_common.hpp" #include "opencv2/core/hal/intrin.hpp" #include "op_halide.hpp" +#include "opencl_kernels_dnn.hpp" #include #include using std::max; using std::min; +#ifdef HAVE_OPENCL +using namespace cv::dnn::ocl4dnn; +#endif namespace cv { @@ -81,6 +85,10 @@ public: ceilMode = params.get("ceil_mode", true); } +#ifdef HAVE_OPENCL + Ptr > poolOp; +#endif + void finalize(const std::vector &inputs, std::vector &outputs) { CV_Assert(inputs.size() == 1); @@ -104,11 +112,59 @@ public: type == PoolingLayer::AVE && !pad.width && !pad.height); } +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + if (poolOp.empty()) + { + OCL4DNNPoolConfig config; + + config.in_shape = shape(*inputs[0]); + config.out_shape = shape(outputs[0]); + config.kernel = kernel; + config.pad = pad; + config.stride = stride; + config.channels = inputs[0]->size[1]; + config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX : + (type == AVE ? 
LIBDNN_POOLING_METHOD_AVE : + LIBDNN_POOLING_METHOD_STO); + poolOp = Ptr >(new OCL4DNNPool(config)); + } + + for (size_t ii = 0; ii < inputs.size(); ii++) + { + UMat inpMat, outMat, maskMat; + + inpMat = inputs[ii]->getUMat(ACCESS_READ); + + if (type == MAX) + { + outMat = outputs[2 * ii].getUMat(ACCESS_WRITE); + maskMat = outputs[2 * ii + 1].getUMat(ACCESS_WRITE); + } else { + outMat = outputs[ii].getUMat(ACCESS_WRITE); + maskMat = UMat(); + } + + CV_Assert(inpMat.offset == 0 && outMat.offset == 0); + + if (!poolOp->Forward(inpMat, outMat, maskMat)) + return false; + } + + return true; + } +#endif + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs, outputs, internals)) + for (size_t ii = 0; ii < inputs.size(); ii++) { switch (type) diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 828557da49..fd14e29a05 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -43,9 +43,13 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "op_halide.hpp" +#include "opencl_kernels_dnn.hpp" #include #include using std::max; +#ifdef HAVE_OPENCL +using namespace cv::dnn::ocl4dnn; +#endif namespace cv { @@ -63,6 +67,10 @@ public: setParamsFrom(params); } +#ifdef HAVE_OPENCL + Ptr > softmaxOp; +#endif + bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, @@ -82,11 +90,91 @@ public: backendId == DNN_BACKEND_HALIDE && haveHalide() && axisRaw == 1; } +#ifdef HAVE_OPENCL + bool forward_ocl(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + if (softmaxOp.empty()) + { + OCL4DNNSoftmaxConfig config; + + config.in_shape = shape(*inputs[0]); + config.axis = axisRaw; + 
config.channels = inputs[0]->size[axisRaw]; + + softmaxOp = Ptr >(new OCL4DNNSoftmax(config)); + } + + UMat srcMat, dstMat; + srcMat = inputs[0]->getUMat(ACCESS_READ); + dstMat = outputs[0].getUMat(ACCESS_WRITE); + + if (!logSoftMax && softmaxOp->Forward(srcMat, dstMat)) + return true; + + const Mat &src = *inputs[0]; + UMat bufMat = internals[0].getUMat(ACCESS_WRITE); + srcMat.copyTo(dstMat); + + int axis = clamp(axisRaw, src.dims); + size_t outerSize = src.total(0, axis); + size_t channels = src.size[axis]; + size_t innerSize = src.total(axis + 1); + + String buildOpts = String("-DT=") + ocl::typeToStr(src.type()); + ocl::Kernel kmax, ksub, ksum, kdiv; + + if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts)) + return false; + + if (!ksub.create("kernel_channel_subtract", ocl::dnn::softmax_oclsrc, buildOpts)) + return false; + + if (!ksum.create("kernel_channel_sum", ocl::dnn::softmax_oclsrc, buildOpts)) + return false; + + if (logSoftMax) buildOpts += " -DLOG_SOFTMAX "; + if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts)) + return false; + + size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize(); + size_t bufSize = internals[0].total(); + size_t totalSize = src.total(); + + kmax.args((int)outerSize, (int)channels, (int)innerSize, + ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); + if (!kmax.run(1, &bufSize, &wgSize, false)) + return false; + + ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, + ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); + if (!ksub.run(1, &totalSize, &wgSize, false)) + return false; + + cv::exp(dstMat, dstMat); + + ksum.args((int)outerSize, (int)channels, (int)innerSize, + ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); + if (!ksum.run(1, &bufSize, &wgSize, false)) + return false; + + kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, + 
ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); + if (!kdiv.run(1, &totalSize, &wgSize, false)) + return false; + + return true; + } +#endif + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs, outputs, internals)) + const Mat &src = *inputs[0]; Mat &dst = outputs[0]; diff --git a/modules/dnn/src/ocl4dnn/include/common.hpp b/modules/dnn/src/ocl4dnn/include/common.hpp new file mode 100644 index 0000000000..41466429b0 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/include/common.hpp @@ -0,0 +1,62 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef _OPENCV_LIBDNN_COMMON_HPP_ +#define _OPENCV_LIBDNN_COMMON_HPP_ +#include "../../precomp.hpp" +#include "../../caffe/glog_emulator.hpp" +#include + +#ifdef HAVE_OPENCL + +// Macro to select the single (_float) or double (_double) precision kernel +#define CL_KERNEL_SELECT(kernel) kernel "_float" + +#define OCL_CHECK(condition) \ + do { \ + cl_int error = (condition); \ + CHECK_EQ(error, CL_SUCCESS) << " " << cv::ocl::getOpenCLErrorString(error); \ + } while (0) + +bool clOptionSupport(cv::String option); + +#endif // HAVE_OPENCL +#endif diff --git a/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp b/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp new file mode 100644 index 0000000000..df3e321e31 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp @@ -0,0 +1,854 @@ +#ifndef _OPENCV_OCL4DNN_DEFAULT_KERNEL_CONFIG_HPP_ +#define _OPENCV_OCL4DNN_DEFAULT_KERNEL_CONFIG_HPP_ +const char *default_kernel_config_intel[] = { +// Below is the information for OpenCL 
based on which these configurations tuned +/******************************************************************************* +Number of platforms 1 + Platform Name Intel(R) OpenCL + Platform Vendor Intel(R) Corporation + Platform Version OpenCL 2.0 + Platform Profile FULL_PROFILE + Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions function suffix INTEL + + Platform Name Intel(R) OpenCL +Number of devices 1 + Device Name Intel(R) HD Graphics + Device Vendor Intel(R) Corporation + Device Vendor ID 0x8086 + Device Version OpenCL 2.0 + Driver Version r4.1.61547 + Device OpenCL C Version OpenCL C 2.0 + Device Type GPU + Device Profile FULL_PROFILE + Max compute units 72 + Max clock frequency 950MHz + Device Partition (core) + Max number of sub-devices 0 + Supported partition types by (0x7FE000000000) + Max work item dimensions 3 + Max work item sizes 256x256x256 + Max work group size 256 + Preferred work group size multiple 32 + Preferred / native vector sizes + char 16 / 16 + short 8 / 8 + int 4 / 4 + long 1 / 1 + half 8 / 8 (cl_khr_fp16) + float 1 / 1 + double 1 / 1 (cl_khr_fp64) + Half-precision Floating-point support (cl_khr_fp16) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + 
Correctly-rounded divide and sqrt operations No + Single-precision Floating-point support (core) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations Yes + Double-precision Floating-point support (cl_khr_fp64) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Address bits 64, Little-Endian + Global memory size 26887677543 (25.04GiB) + Error Correction support No + Max memory allocation 4294959103 (4GiB) + Unified memory for Host and Device Yes + Shared Virtual Memory (SVM) capabilities (core) + Coarse-grained buffer sharing Yes + Fine-grained buffer sharing No + Fine-grained system sharing No + Atomics No + Minimum alignment for any data type 128 bytes + Alignment of base address 1024 bits (128 bytes) + Preferred alignment for atomics + SVM 64 bytes + Global 64 bytes + Local 64 bytes + Max size for global variable 65536 (64KiB) + Preferred total size of global vars 4294959103 (4GiB) + Global Memory cache type Read/Write + Global Memory cache size 1572864 + Global Memory cache line 64 bytes + Image support Yes + Max number of samplers per kernel 16 + Max size for 1D images from buffer 268434943 pixels + Max 1D or 2D image array size 2048 images + Base address alignment for 2D image buffers 4 bytes + Pitch alignment for 2D image buffers 4 bytes + Max 2D image size 16384x16384 pixels + Max 3D image size 16384x16384x2048 pixels + Max number of read image args 128 + Max number of write image args 128 + Max number of read/write image args 128 + Max number of pipe args 16 + Max active pipe reservations 1 + Max pipe packet size 1024 + Local memory type Local + Local memory size 65536 (64KiB) + Max constant buffer size 
4294959103 (4GiB) + Max number of constant args 8 + Max size of kernel argument 1024 + Queue properties (on host) + Out-of-order execution Yes + Profiling Yes + Queue properties (on device) + Out-of-order execution Yes + Profiling Yes + Preferred size 131072 (128KiB) + Max size 67108864 (64MiB) + Max queues on device 1 + Max events on device 1024 + Prefer user sync for interop Yes + Profiling timer resolution 83ns + Execution capabilities + Run OpenCL kernels Yes + Run native kernels No + SPIR versions 1.2 + printf() buffer size 4194304 (4MiB) + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Motion Estimation accelerator version (Intel) 2 + Device Available Yes + Compiler Available Yes + Linker Available Yes + Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + +NULL platform behavior + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform + clCreateContext(NULL, ...) [default] No platform + clCreateContext(NULL, ...) 
[other] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform +********************************************************************************/ +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","12 2 8 2 1 1 8 1 0 ", +"EU72_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU72_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num2_M192","2 7 16 2 1 1 16 1 0 ", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48","4 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ", +"EU72_k11x7_cn3_g1_s3x4_d1x1_b1_in64x64_p3x2_num1_M64","4 1 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","4 6 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn4_g1_s1x1_d1x1_b1_in256x256_p1x1_num1_M4","14 1 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M4","4 4 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208","2 6 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", 
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M384","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","1 8 32 5 1 8 1 1 0 ", +"EU72_k5x1_cn32_g1_s1x1_d1x1_b0_in64x64_p2x0_num1_M32","4 6 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn16_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M4","12 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M16","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn32_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ", +"EU72_k3x3_cn32_g1_s1x1_d2x2_b1_in64x64_p2x2_num1_M32","3 6 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn32_g1_s1x1_d16x16_b1_in64x64_p16x16_num1_M32","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M512","2 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M384","2 7 16 2 1 1 16 1 0 ", +"EU72_k5x4_cn6_g3_s3x2_d1x1_b1_in128x80_p1x0_num2_M4","1 1 1 4 1 1 1 0 1 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M96","4 5 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192","10 2 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192","6 4 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn4_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96","8 3 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32","8 1 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384","4 7 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256","2 6 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ", +"EU72_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 4 
16 2 1 1 16 1 0 ", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M48","4 3 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M5","2 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M24","8 2 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b0_in32x32_p1x1_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M128","2 7 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M32","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112","8 2 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num2_M64","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M144","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","8 2 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn16_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU72_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M224","2 7 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","4 6 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96","4 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M192","10 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","12 2 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M48","1 8 32 5 1 8 1 1 0 ", 
+"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn1024_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M96","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M1024","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn2048_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M512","4 6 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn512_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M512","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16","8 2 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","8 3 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M288","2 7 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn16_g1_s1x1_d1x1_b1_in128x128_p1x1_num1_M16","2 5 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn32_g1_s1x1_d8x8_b1_in64x64_p8x8_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M4","8 3 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M256","2 7 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn256_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M256","2 5 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k2x2_cn16_g1_s2x2_d1x1_b0_in256x256_p0x0_num1_M16","6 4 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","4 6 8 2 1 1 8 1 0 ", 
+"EU72_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192","2 5 16 2 1 1 16 1 0 ", +"EU72_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 2 8 2 1 1 8 1 0 ", +"EU72_k2x2_cn64_g1_s2x2_d1x1_b0_in128x128_p0x0_num1_M32","8 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M256","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","12 2 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M32","4 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16","12 1 8 2 1 1 8 1 0 ", +"EU72_k11x11_cn3_g1_s4x4_d1x1_b1_in224x224_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","4 7 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 5 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M16","12 1 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M512","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M96","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","12 1 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","12 2 8 2 1 1 8 1 0 ", +"EU72_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ", +"EU72_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ", 
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24","12 1 8 2 1 1 8 1 0 ", +"EU72_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 2 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn32_g1_s1x1_d4x4_b1_in64x64_p4x4_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M13","1 1 1 4 1 1 1 0 1 ", +"EU72_k3x3_cn32_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M32","6 4 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn64_g1_s1x1_d1x1_b0_in64x64_p1x1_num1_M64","2 7 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M1024","2 8 32 5 1 8 1 1 0 ", +"EU72_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M320","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x5_cn32_g1_s1x1_d1x1_b1_in64x64_p0x2_num1_M32","4 6 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","4 6 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","12 2 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M128","2 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", +"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M112","1 8 32 5 1 8 1 1 0 ", +"EU72_k4x4_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M2","1 3 16 2 1 1 16 1 0 ", +"EU72_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M2048","1 8 32 5 1 8 1 1 0 ", +"EU72_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU72_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num2_M64","1 8 32 5 1 8 1 
1 0 ", +"EU72_k1x1_cn512_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M2048","1 8 32 5 1 8 1 1 0 ", +"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","8 1 16 2 1 1 16 1 0 ", +"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M208","2 7 16 2 1 1 16 1 0 ", +// Below is the information for OpenCL based on which these configurations tuned +/******************************************************************************* +Number of platforms 1 + Platform Name Intel(R) OpenCL + Platform Vendor Intel(R) Corporation + Platform Version OpenCL 2.0 + Platform Profile FULL_PROFILE + Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_driver_diagnostics cl_intel_motion_estimation cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions function suffix INTEL + + Platform Name Intel(R) OpenCL +Number of devices 1 + Device Name Intel(R) HD Graphics + Device Vendor Intel(R) Corporation + Device Vendor ID 0x8086 + Device Version OpenCL 2.0 + Driver Version 16.5.56875 + Device OpenCL C Version OpenCL C 2.0 ( using IGC ) + Device Type GPU + Device Profile FULL_PROFILE + Max compute units 48 + Max clock frequency 950MHz + Device Partition (core) + Max number of sub-devices 0 + Supported partition types by (0x7F4B00000000) + Max work item dimensions 3 + Max work item sizes 256x256x256 + Max work group size 256 + Preferred work group size multiple 32 + Preferred / native vector sizes + char 16 / 16 + short 8 / 8 + int 4 / 4 + long 1 / 1 + half 8 / 8 (cl_khr_fp16) + float 1 / 1 + double 1 / 1 (cl_khr_fp64) + Half-precision 
Floating-point support (cl_khr_fp16) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Single-precision Floating-point support (core) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations Yes + Double-precision Floating-point support (cl_khr_fp64) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Address bits 64, Little-Endian + Global memory size 13361912218 (12.44GiB) + Error Correction support No + Max memory allocation 4294959103 (4GiB) + Unified memory for Host and Device Yes + Shared Virtual Memory (SVM) capabilities (core) + Coarse-grained buffer sharing Yes + Fine-grained buffer sharing No + Fine-grained system sharing No + Atomics No + Minimum alignment for any data type 128 bytes + Alignment of base address 1024 bits (128 bytes) + Preferred alignment for atomics + SVM 64 bytes + Global 64 bytes + Local 64 bytes + Max size for global variable 65536 (64KiB) + Preferred total size of global vars 4294959103 (4GiB) + Global Memory cache type Read/Write + Global Memory cache size 1048576 + Global Memory cache line 64 bytes + Image support Yes + Max number of samplers per kernel 16 + Max size for 1D images from buffer 268434943 pixels + Max 1D or 2D image array size 2048 images + Base address alignment for 2D image buffers 4 bytes + Pitch alignment for 2D image buffers 4 bytes + Max 2D image size 16384x16384 pixels + Max 3D image size 16384x16384x2048 pixels + Max number of read image args 128 + Max number of write image args 128 
+ Max number of read/write image args 128 + Max number of pipe args 16 + Max active pipe reservations 1 + Max pipe packet size 1024 + Local memory type Local + Local memory size 65536 (64KiB) + Max constant buffer size 4294959103 (4GiB) + Max number of constant args 8 + Max size of kernel argument 1024 + Queue properties (on host) + Out-of-order execution Yes + Profiling Yes + Queue properties (on device) + Out-of-order execution Yes + Profiling Yes + Preferred size 131072 (128KiB) + Max size 67108864 (64MiB) + Max queues on device 1 + Max events on device 1024 + Prefer user sync for interop Yes + Profiling timer resolution 83ns + Execution capabilities + Run OpenCL kernels Yes + Run native kernels No + SPIR versions 1.2 + printf() buffer size 4194304 (4MiB) + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Motion Estimation accelerator version (Intel) 2 + Device Available Yes + Compiler Available Yes + Linker Available Yes + Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_driver_diagnostics cl_intel_motion_estimation cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + +NULL platform behavior + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform + clCreateContext(NULL, ...) [default] No platform + clCreateContext(NULL, ...) 
[other] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform +********************************************************************************/ +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn32_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32","8 1 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96","1 16 32 5 1 16 1 1 0 ", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b0_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128","2 8 32 5 1 8 1 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","8 1 16 2 1 1 16 1 0 ", +"EU48_k2x2_cn16_g1_s2x2_d1x1_b0_in256x256_p0x0_num1_M16","2 7 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn4_g1_s1x1_d1x1_b1_in256x256_p1x1_num1_M4","6 4 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M512","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112","8 3 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn512_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M512","2 7 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M16","8 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M96","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ", 
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","4 7 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M320","2 7 16 2 1 1 16 1 0 ", +"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48","4 2 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","2 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192","2 8 16 2 1 1 16 1 0 ", +"EU48_k11x11_cn3_g1_s4x4_d1x1_b1_in224x224_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M112","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","12 1 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","12 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","8 2 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M192","2 7 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256","2 5 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn16_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M4","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x5_cn32_g1_s1x1_d1x1_b1_in64x64_p0x2_num1_M32","4 7 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","4 7 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M13","1 1 1 4 1 1 1 0 1 ", +"EU48_k11x7_cn3_g1_s3x4_d1x1_b1_in64x64_p3x2_num1_M64","4 1 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M16","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn32_g1_s1x1_d2x2_b1_in64x64_p2x2_num1_M32","3 3 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn32_g1_s1x1_d8x8_b1_in64x64_p8x8_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M96","2 8 32 5 1 8 
1 1 0 ", +"EU48_k2x2_cn64_g1_s2x2_d1x1_b0_in128x128_p0x0_num1_M32","4 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","4 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M128","2 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn16_g1_s1x1_d1x1_b1_in128x128_p1x1_num1_M16","2 7 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn4_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128","6 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M4","4 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M144","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M384","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M2048","1 16 32 5 1 16 1 1 0 ", +"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M384","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn16_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","4 7 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192","2 5 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","12 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn2048_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M512","4 7 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","12 2 8 2 1 1 8 1 0 ", 
+"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn32_g1_s1x1_d4x4_b1_in64x64_p4x4_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 4 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M288","2 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M48","4 6 8 2 1 1 8 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","8 1 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","12 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","4 6 8 2 1 1 8 1 0 ", +"EU48_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128","4 5 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn256_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M256","2 6 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 3 8 2 1 1 8 1 0 ", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M48","4 2 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn64_g1_s1x1_d1x1_b0_in64x64_p1x1_num1_M64","10 2 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","4 5 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208","2 5 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M2048","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M256","1 16 32 5 1 16 1 1 0 ", +"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M224","2 7 16 
2 1 1 16 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ", +"EU48_k5x1_cn32_g1_s1x1_d1x1_b0_in64x64_p2x0_num1_M32","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288","2 7 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192","2 7 16 2 1 1 16 1 0 ", +"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M32","4 3 16 2 1 1 16 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M96","4 2 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M208","2 5 16 2 1 1 16 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96","4 2 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24","12 1 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M16","4 7 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M512","2 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn1024_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320","2 8 16 2 1 1 16 1 0 ", +"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num2_M192","6 4 16 2 1 1 16 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 3 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU48_k3x3_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M5","2 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","8 3 8 2 1 1 8 1 0 ", 
+"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num2_M64","1 16 32 5 1 16 1 1 0 ", +"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","8 2 16 2 1 1 16 1 0 ", +"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","4 6 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M32","1 16 32 5 1 16 1 1 0 ", +"EU48_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ", +"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","12 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ", +"EU48_k5x4_cn6_g3_s3x2_d1x1_b1_in128x80_p1x0_num2_M4","1 1 1 4 1 1 1 0 1 ", +"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M256","2 7 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M24","8 2 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16","12 1 8 2 1 1 8 1 0 ", +"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M128","10 2 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU48_k3x3_cn32_g1_s1x1_d16x16_b1_in64x64_p16x16_num1_M32","1 16 32 5 1 16 1 1 0 ", +"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","4 7 8 2 1 1 8 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16","12 2 8 2 1 1 8 1 0 ", +"EU48_k4x4_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M2","1 4 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M4","8 2 8 2 1 1 8 1 0 ", +"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ", +"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M16","1 8 32 5 1 8 1 1 0 ", +"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 2 16 2 1 
1 16 1 0 ", +"EU48_k3x3_cn32_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M32","2 8 16 2 1 1 16 1 0 ", +"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 2 16 2 1 1 16 1 0 ", +// Below is the information for OpenCL based on which these configurations tuned +/******************************************************************************* +Number of platforms 1 + Platform Name Intel(R) OpenCL + Platform Vendor Intel(R) Corporation + Platform Version OpenCL 2.0 + Platform Profile FULL_PROFILE + Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + Platform Extensions function suffix INTEL + + Platform Name Intel(R) OpenCL +Number of devices 1 + Device Name Intel(R) HD Graphics + Device Vendor Intel(R) Corporation + Device Vendor ID 0x8086 + Device Version OpenCL 2.0 + Driver Version 16.5.59288 + Device OpenCL C Version OpenCL C 2.0 + Device Type GPU + Device Profile FULL_PROFILE + Max compute units 24 + Max clock frequency 1050MHz + Device Partition (core) + Max number of sub-devices 0 + Supported partition types by (0x7F5100000000) + Max work item dimensions 3 + Max work item sizes 256x256x256 + Max work group size 256 + Preferred work group size multiple 32 + Preferred / native vector sizes + char 16 / 16 + short 8 / 8 + int 4 / 4 + long 1 / 1 + half 8 / 8 (cl_khr_fp16) + float 1 / 1 + double 1 / 1 (cl_khr_fp64) + Half-precision 
Floating-point support (cl_khr_fp16) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Single-precision Floating-point support (core) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations Yes + Double-precision Floating-point support (cl_khr_fp64) + Denormals Yes + Infinity and NANs Yes + Round to nearest Yes + Round to zero Yes + Round to infinity Yes + IEEE754-2008 fused multiply-add Yes + Support is emulated in software No + Correctly-rounded divide and sqrt operations No + Address bits 64, Little-Endian + Global memory size 6588802663 (6.136GiB) + Error Correction support No + Max memory allocation 3294401331 (3.068GiB) + Unified memory for Host and Device Yes + Shared Virtual Memory (SVM) capabilities (core) + Coarse-grained buffer sharing Yes + Fine-grained buffer sharing No + Fine-grained system sharing No + Atomics No + Minimum alignment for any data type 128 bytes + Alignment of base address 1024 bits (128 bytes) + Preferred alignment for atomics + SVM 64 bytes + Global 64 bytes + Local 64 bytes + Max size for global variable 65536 (64KiB) + Preferred total size of global vars 3294401331 (3.068GiB) + Global Memory cache type Read/Write + Global Memory cache size 524288 + Global Memory cache line 64 bytes + Image support Yes + Max number of samplers per kernel 16 + Max size for 1D images from buffer 205900083 pixels + Max 1D or 2D image array size 2048 images + Base address alignment for 2D image buffers 4 bytes + Pitch alignment for 2D image buffers 4 bytes + Max 2D image size 16384x16384 pixels + Max 3D image size 16384x16384x2048 pixels + Max number of read image args 128 + Max number of write image 
args 128 + Max number of read/write image args 128 + Max number of pipe args 16 + Max active pipe reservations 1 + Max pipe packet size 1024 + Local memory type Local + Local memory size 65536 (64KiB) + Max constant buffer size 3294401331 (3.068GiB) + Max number of constant args 8 + Max size of kernel argument 1024 + Queue properties (on host) + Out-of-order execution Yes + Profiling Yes + Queue properties (on device) + Out-of-order execution Yes + Profiling Yes + Preferred size 131072 (128KiB) + Max size 67108864 (64MiB) + Max queues on device 1 + Max events on device 1024 + Prefer user sync for interop Yes + Profiling timer resolution 83ns + Execution capabilities + Run OpenCL kernels Yes + Run native kernels No + SPIR versions 1.2 + printf() buffer size 4194304 (4MiB) + Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel + Motion Estimation accelerator version (Intel) 2 + Device Available Yes + Compiler Available Yes + Linker Available Yes + Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups + +NULL platform behavior + clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform + clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform + clCreateContext(NULL, ...) [default] No platform + clCreateContext(NULL, ...) 
[other] Success [INTEL] + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform + clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform +********************************************************************************/ +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ", +"EU24_k5x1_cn32_g1_s1x1_d1x1_b0_in64x64_p2x0_num1_M32","4 6 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 2 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224","2 5 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU24_k2x2_cn16_g1_s2x2_d1x1_b0_in256x256_p0x0_num1_M16","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M384","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn2048_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M16","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M224","2 7 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn32_g1_s1x1_d8x8_b1_in64x64_p8x8_num1_M32","1 8 32 5 1 8 1 1 0 ", 
+"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208","2 7 16 2 1 1 16 1 0 ", +"EU24_k11x11_cn3_g1_s4x4_d1x1_b1_in224x224_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","1 8 32 5 1 8 1 1 0 ", +"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn32_g1_s1x1_d2x2_b1_in64x64_p2x2_num1_M32","3 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24","8 3 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b0_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn1024_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M208","2 7 16 2 1 1 16 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M48","4 2 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M2048","4 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192","6 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn32_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ", +"EU24_k1x1_cn4_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M16","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","4 6 8 2 1 1 8 1 0 ", 
+"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 2 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M2048","1 16 32 5 1 16 1 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M384","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x4_cn6_g3_s3x2_d1x1_b1_in128x80_p1x0_num2_M4","1 1 1 4 1 1 1 0 1 ", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M192","6 4 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn256_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M256","2 7 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320","2 8 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256","2 5 16 2 1 1 16 1 0 ", +"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M16","8 3 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M112","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M96","1 8 
32 5 1 8 1 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M32","4 2 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96","8 3 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn16_g1_s1x1_d1x1_b1_in128x128_p1x1_num1_M16","6 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M96","4 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 2 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M288","2 8 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288","2 7 16 2 1 1 16 1 0 ", +"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn4_g1_s1x1_d1x1_b1_in256x256_p1x1_num1_M4","10 2 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn32_g1_s1x1_d16x16_b1_in64x64_p16x16_num1_M32","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16","8 2 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU24_k1x5_cn32_g1_s1x1_d1x1_b1_in64x64_p0x2_num1_M32","4 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","4 6 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","4 6 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn32_g1_s1x1_d4x4_b1_in64x64_p4x4_num1_M32","1 8 32 5 1 8 1 1 0 ", 
+"EU24_k2x2_cn64_g1_s2x2_d1x1_b0_in128x128_p0x0_num1_M32","2 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48","8 1 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn16_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M4","8 3 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M144","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192","2 7 16 2 1 1 16 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ", +"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96","4 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","4 6 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","2 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn32_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M32","2 8 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M128","10 2 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M320","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b0_in64x64_p1x1_num1_M64","2 8 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M5","2 3 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn16_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M48","4 6 8 2 1 1 8 1 0 ", +"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M4","8 2 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 2 8 2 1 1 8 
1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M96","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48","4 6 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M256","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M4","4 4 16 2 1 1 16 1 0 ", +"EU24_k4x4_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M2","1 3 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ", +"EU24_k3x3_cn512_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M512","2 7 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k11x7_cn3_g1_s3x4_d1x1_b1_in64x64_p3x2_num1_M64","4 1 16 2 1 1 16 1 0 ", +"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num2_M192","6 4 16 2 1 1 16 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ", +"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16","8 3 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384","4 7 8 2 1 1 8 1 0 ", +"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","1 8 32 5 1 8 1 1 0 ", +"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ", +"EU24_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M13","1 1 1 4 1 1 1 0 1 ", +"EU24_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M512","2 8 32 
5 1 8 1 1 0 ", +"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M24","8 3 8 2 1 1 8 1 0 ", +"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32","4 3 16 2 1 1 16 1 0 ", +}; +#endif diff --git a/modules/dnn/src/ocl4dnn/include/math_functions.hpp b/modules/dnn/src/ocl4dnn/include/math_functions.hpp new file mode 100644 index 0000000000..cac860490f --- /dev/null +++ b/modules/dnn/src/ocl4dnn/include/math_functions.hpp @@ -0,0 +1,90 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef _OPENCV_GREENTEA_MATH_FUNCTIONS_HPP_ +#define _OPENCV_GREENTEA_MATH_FUNCTIONS_HPP_ +#include "../../precomp.hpp" +#include "common.hpp" + +namespace cv +{ +namespace dnn +{ +namespace ocl4dnn +{ + +#ifdef HAVE_OPENCL +enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; + +template +bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB, + const int32_t M, const int32_t N, const int32_t K, + const UMat A, const UMat B, + const UMat B_image, UMat C, + const size_t max_image_size); + +template +ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset, + bool is_matrix_a, bool transpose, + bool padding, int padded_height, + int padded_width, int height, + int width, int ld); + +template +bool ocl4dnnGEMV(const CBLAS_TRANSPOSE TransA, + const int32_t M, const int32_t N, const Dtype alpha, + const UMat A, const int32_t offA, const UMat x, + const int32_t offx, const Dtype beta, UMat y, + const int32_t offy); + +template +bool ocl4dnnAXPY(const int32_t N, const Dtype alpha, + const UMat x, const int32_t offx, UMat y, + const int32_t offy); + +#endif // HAVE_OPENCL + +} // namespace ocl4dnn +} // namespace dnn +} // namespce cv + 
+#endif diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp new file mode 100644 index 0000000000..603c0aade0 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp @@ -0,0 +1,473 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef _OPENCV_LIBDNN_HPP_ +#define _OPENCV_LIBDNN_HPP_ +#include "../../precomp.hpp" +#include +#include +#include +#include +#include +#include "common.hpp" + +namespace cv { namespace dnn { namespace ocl4dnn { +#ifdef HAVE_OPENCL + +struct OCL4DNNConvConfig +{ + OCL4DNNConvConfig() : + kernel(1, 1), + pad(0, 0), + stride(1, 1), + dilation(1, 1), + group(1), + bias_term(false) + {} + MatShape in_shape; + MatShape out_shape; + Size kernel; + Size pad; + Size stride; + Size dilation; + int group; // = 1; + bool bias_term; // = false; +}; + + +template +class OCL4DNNConvSpatial +{ + public: + explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config); + ~OCL4DNNConvSpatial(); + bool Forward(const UMat& bottom_data, const UMat& weight, + const UMat& bias, + UMat& top_data, int32_t batch_size); + + private: + struct kernelConfig + { + std::string kernelName; + float executionTime; + size_t local_work_size[3]; + size_t global_work_size[3]; + int32_t workItem_output[3]; + bool verified; + bool tested; + bool swizzle_weights; + bool use_null_local; + int32_t kernelType; + + kernelConfig() + {} + + kernelConfig(const std::string& name, const size_t* global_size, const size_t* local_size, + const int32_t* workItem, + bool swizzle, + int32_t type = 0) + : executionTime(0) + { + kernelName = name; + for (int32_t x = 0; x < 3; x++) + { + local_work_size[x] = local_size ? 
local_size[x] : 1; + global_work_size[x] = global_size[x]; + workItem_output[x] = workItem[x]; + } + swizzle_weights = swizzle; + use_null_local = local_size == NULL; + verified = false; + tested = false; + kernelType = type; + } + }; + + struct tunerParam + { + int kernelType; + int blockWidth; + int blockHeight; + int blockDepth; + + tunerParam(int type, int w, int h, int d) + { + kernelType = type; + blockWidth = w; + blockHeight= h; + blockDepth = d; + } + }; + + inline void addDef(const char* name) + { + options_ << " -D " << name; + } + + inline void addDef(const char* name, const int value) + { + options_ << " -D " << name << "=" << value; + } + + inline void addDef(const char* name, const float value) + { + options_ << " -D " << name << "=(float)" << value; + } + + inline void addDef(const char* name, const double value) + { + options_ << " -D " << name << "=(double)" << value; + } + + inline void addDef(const char* name, const char* value) + { + options_ << " -D " << name << "=" << value; + } + + void useFirstAvailable(const UMat &bottom, + UMat &top, + const UMat &weight, + const UMat &bias, + int32_t numImages, + UMat &verifyTop); + void setupKernel(); + void collectCommonInformation(); + void setupKernelDetails(int32_t kernelType, + int32_t blockM, + int32_t blockK, + int32_t blockN); + + ocl::Program compileKernel(); + typedef std::map phash_t; + phash_t phash; + void calculateBenchmark(const UMat &bottom, UMat &verifyTop, + const UMat &weight, const UMat &bias, + int32_t numImages); + + + void setupConvolution(const UMat &bottom, + UMat &top, + const UMat &weight, + const UMat &bias, + int32_t numImags, + UMat &verifyTop); + bool createConvolutionKernel(int32_t kernelType, + int32_t blockWidth, + int32_t blockHeight, + int32_t blockDepth); + bool setupIDLF(int32_t blockWidth, + int32_t blockHeight, + int32_t blockDepth); + bool createBasicKernel(int32_t blockWidth, + int32_t blockHeight, + int32_t blockDepth); + bool createGEMMLikeConvKernel(int32_t 
blockWidth, + int32_t blockHeight, + int32_t blockDepth); + void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer, + int32_t offset, int32_t size, bool write_only); + bool convolve(const UMat &bottom, UMat &top, + const UMat &weight, const UMat &bias, + int32_t numImages, + kernelConfig* config, + const cv::ocl::Queue& queue); + float timedConvolve(const UMat &bottom, UMat &top, + const UMat &weight, const UMat &bias, + int32_t numImages, kernelConfig* config); + + bool verifyResult(const UMat &bottom, + UMat &top, + const UMat &weight, + const UMat &bias, + int32_t numImages, + kernelConfig* config, + UMat &verifyTop); + + bool swizzleWeight(const UMat &weight, + int32_t swizzled_factor, + bool interleave = false); + + void generateKey(); + std::string generateSpecificKey(int32_t type, int32_t blockWidth, + int32_t blockHeight, + int32_t blockDepth); + void cacheTunedConfig(); + bool loadTunedConfig(); + + void saveTunedConfig(); + bool loadCachedConfig(); + + void unloadProgram(const std::string& kernelName); + void prepareKernel(const UMat &bottom, UMat &top, + const UMat &weight, const UMat &bias, + int32_t numImages); + bool setupKernelByConfig(int x, int y, int z, int type, + int lx, int ly, int lz, + bool swizzle, bool nullLocal); + void generateTunerItems(std::vector< cv::Ptr > &tunerItems); + + int32_t group_; + bool bias_term_; + UMat swizzled_weights_umat; + + int32_t bottom_index_; + int32_t output_h_; + int32_t output_w_; + int32_t kernel_h_; + int32_t kernel_w_; + int32_t height_; + int32_t width_; + int32_t pad_h_; + int32_t pad_w_; + int32_t stride_h_; + int32_t stride_w_; + int32_t dilation_h_; + int32_t dilation_w_; + + /// M_ is the channel dimension of the output for a single group, which is the + /// leading dimension of the filter matrix. 
+ int32_t M_; + + bool tuned_; + std::string key_, key_sanitized_; + std::string short_key_; + std::string kernel_name_; + std::string cache_path_; + bool use_cache_path_; // true if cache_path_ directory exists + bool force_auto_tuning_; + int32_t kernel_index_; + std::vector< cv::Ptr > kernelQueue; + cv::Ptr bestKernelConfig; + + int32_t bottom_dim_; + int32_t top_dim_; + int32_t num_; + int32_t channels_; + int32_t num_output_; + + int32_t kernelType_; + int32_t blockM_; + int32_t blockK_; + int32_t blockN_; + std::stringstream options_; + cv::ocl::ProgramSource src_; + int32_t prev_kernel_type_; +}; + +typedef enum { + LIBDNN_POOLING_METHOD_MAX = 0, + LIBDNN_POOLING_METHOD_AVE = 1, + LIBDNN_POOLING_METHOD_STO = 2 +} ocl4dnnPoolingMethod_t; + +struct OCL4DNNPoolConfig +{ + OCL4DNNPoolConfig() : + kernel(1, 1), + pad(0, 0), + stride(1, 1), + dilation(1, 1), + channels(0), + pool_method(LIBDNN_POOLING_METHOD_MAX), + global_pooling(false) + {} + MatShape in_shape; + MatShape out_shape; + Size kernel; + Size pad; + Size stride; + Size dilation; + + int channels; + ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX; + bool global_pooling; // = false; +}; + +template +class OCL4DNNPool +{ + public: + explicit OCL4DNNPool(OCL4DNNPoolConfig config); + ~OCL4DNNPool(); + bool Forward(const UMat& bottom_data, + UMat& top_data, + UMat& top_mask); + private: + UMat mask_idx_; + + // Pooling parameters + std::vector pad_; + std::vector stride_; + std::vector kernel_shape_; + std::vector im_in_shape_; + std::vector im_out_shape_; + + ocl4dnnPoolingMethod_t pool_method_; + int32_t count_; + int32_t batch_size_; + int32_t channels_; + int32_t kernel_h_; + int32_t kernel_w_; + int32_t stride_h_; + int32_t stride_w_; + int32_t pad_h_; + int32_t pad_w_; + int32_t height_; + int32_t width_; + int32_t pooled_height_; + int32_t pooled_width_; +}; + +struct OCL4DNNInnerProductConfig +{ + OCL4DNNInnerProductConfig() : + num_output(0), M(0), K(0), + bias_term(false), 
transpose(false), phase_test(true) + {} + int num_output; + int M; + int K; + bool bias_term; + bool transpose; // = false; + bool phase_test; // = true; +}; + +template +class OCL4DNNInnerProduct +{ + public: + explicit OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config); + ~OCL4DNNInnerProduct(); + bool Forward(const UMat& bottom_data, + const UMat& weight, + const UMat& bias, + UMat& top_data); + private: + OCL4DNNInnerProductConfig config_; + int32_t axis_; + int32_t num_output_; + int32_t M_; + int32_t N_; + int32_t K_; + bool bias_term_; + bool transpose_; + bool image_copied_; + bool phase_test_; +}; + +typedef enum { + LRNParameter_NormRegion_ACROSS_CHANNELS = 0, + LRNParameter_NormRegion_WITHIN_CHANNEL = 1 +} LRNParameter_NormRegion_WITHIN_CHANNEL_t; + +struct OCL4DNNLRNConfig +{ + OCL4DNNLRNConfig() : + phase_test(true) + {} + MatShape in_shape; + LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type; + bool phase_test; // = true; + int local_size; + float alpha; + float beta; + float k; + bool norm_by_size; + int32_t batch_size; + int32_t channels; + int32_t height; + int32_t width; +}; + +template +class OCL4DNNLRN +{ + public: + explicit OCL4DNNLRN(OCL4DNNLRNConfig config); + bool Forward(const UMat& bottom_data, UMat& top_data); + + private: + bool crossChannelForward(const UMat& bottom_data, UMat& top_data); + LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type_; + bool phase_test_; + int32_t size_; + Dtype alpha_; + Dtype beta_; + Dtype k_; + int32_t num_; + int32_t channels_; + int32_t height_; + int32_t width_; + bool norm_by_size_; +}; + +struct OCL4DNNSoftmaxConfig +{ + OCL4DNNSoftmaxConfig() + {} + MatShape in_shape; + int axis; + int channels; +}; + +template +class OCL4DNNSoftmax +{ + public: + explicit OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config); + ~OCL4DNNSoftmax(); + bool Forward(const UMat& bottom_data, UMat& top_data); + + private: + int32_t softmax_axis_; + int32_t inner_num_; + int32_t outer_num_; + int32_t channels_; + int32_t count_; + 
bool use_slm_; + UMat scale_data_; +}; +#endif // HAVE_OPENCL +} // namespace ocl4dnn +} // namespace dnn +} // namespce cv +#endif diff --git a/modules/dnn/src/ocl4dnn/src/common.cpp b/modules/dnn/src/ocl4dnn/src/common.cpp new file mode 100644 index 0000000000..5a18c41110 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/src/common.cpp @@ -0,0 +1,57 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "../../precomp.hpp" +#include "common.hpp" +#include "opencl_kernels_dnn.hpp" + +using namespace cv; + +#ifdef HAVE_OPENCL +bool clOptionSupport(cv::String option) +{ + cv::String errmsg; + ocl::Program program = ocl::Context::getDefault().getProg(ocl::dnn::dummy_oclsrc, option, errmsg); + return program.ptr() ? true : false; +} + +#endif // HAVE_OPENCL diff --git a/modules/dnn/src/ocl4dnn/src/math_functions.cpp b/modules/dnn/src/ocl4dnn/src/math_functions.cpp new file mode 100644 index 0000000000..42b35572aa --- /dev/null +++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp @@ -0,0 +1,538 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "../../precomp.hpp" +#include "common.hpp" +#include "math_functions.hpp" +#include +#include "opencl_kernels_dnn.hpp" + +namespace cv +{ +namespace dnn +{ +namespace ocl4dnn +{ + +#ifdef HAVE_OPENCL +// Create and copy buffer to image for GEMM's matrix A and B. +// Will return image to caller if the input image is NULL. Otherwise, +// will use the image directly. It's caller's responsibility to +// release the created image. 
+template +ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset, + bool is_matrix_a, bool transpose, + bool padding, int padded_height, + int padded_width, int height, + int width, int ld) +{ + ocl::Context ctx = ocl::Context::getDefault(); + ocl::Queue queue = ocl::Queue::getDefault(); + ocl::Image2D image; + + if (!is_matrix_a && transpose) + { + if (ld == width) + { + image = ocl::Image2D(buffer); + } else { + // For matrix B with transpose, we need to handle them differently. + // As we can't use the sub group block read to get a row easily, + // we have to use CL_FLOAT type with read_imagef to get the row. + UMat mat(height, width, CV_32FC1); + image = ocl::Image2D(mat); + + ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float", ocl::dnn::gemm_image_oclsrc); + + size_t global_copy[2]; + global_copy[0] = width; + global_copy[1] = height; + oclk_gemm_copy.set(0, ocl::KernelArg::PtrReadOnly(buffer)); + oclk_gemm_copy.set(1, image); + oclk_gemm_copy.set(2, offset); + oclk_gemm_copy.set(3, width); + oclk_gemm_copy.set(4, height); + oclk_gemm_copy.set(5, ld); + oclk_gemm_copy.run(2, global_copy, NULL, false); + } + } else { + if (!padding) + { + // copy without padding. 
+ image = ocl::Image2D(buffer); + } else { + UMat mat(padded_height, padded_width, CV_8UC4); + image = ocl::Image2D(mat); + + ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_no_transpose_float", + ocl::dnn::gemm_image_oclsrc); + + size_t global_copy[2]; + global_copy[0] = padded_width; + global_copy[1] = padded_height; + + oclk_gemm_copy.set(0, ocl::KernelArg::PtrReadOnly(buffer)); + oclk_gemm_copy.set(1, image); + oclk_gemm_copy.set(2, offset); + oclk_gemm_copy.set(3, width); + oclk_gemm_copy.set(4, height); + oclk_gemm_copy.set(5, ld); + + oclk_gemm_copy.run(2, global_copy, NULL, false); + } + } + + return image; +} + +template +ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset, + bool is_matrix_a, bool transpose, + bool padding, int padded_height, + int padded_width, int height, + int width, int ld); + +enum gemm_type_t +{ + GEMM_TYPE_NONE = 0, + GEMM_TYPE_FAST_IMAGE_32_1, + GEMM_TYPE_FAST_IMAGE_32_2, + GEMM_TYPE_FAST_IMAGE_B_IMAGE, + GEMM_TYPE_MAX +}; + +template +static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int32_t M, + const int32_t N, const int32_t K, const Dtype alpha, + const UMat A, const int32_t offA, const UMat B, + const int32_t offB, const Dtype beta, UMat C, + const int32_t offC, bool is_image_a, bool is_image_b, + enum gemm_type_t gemm_type, + const size_t max_image_size) +{ + CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 || + gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl; + + if (is_image_a) + { + CHECK_EQ(offA, 0) << "Invalid input image offset." << std::endl; + return false; + } + + if (is_image_b) + { + CHECK_EQ(offB, 0) << "Invalid input image offset." << std::endl; + return false; + } + + int widthA = (TransA == CblasNoTrans) ? K : M; + int heightA = (TransA == CblasNoTrans) ? M : K; + int widthB = (TransB == CblasNoTrans) ? N : K; + int heightB = (TransB == CblasNoTrans) ? 
K : N; + + int ldA = widthA; + int ldB = widthB; + int ldC = N; + + int A_start_x = 0, A_start_y = 0, B_start_x = 0; + int B_start_y = 0, C_start_x = 0, C_start_y = 0; + int blocksize = 1024; + if (gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE) + blocksize = max_image_size; + int blockA_width = blocksize; + int blockA_height = blocksize; + int blockB_width = blocksize; + int blockB_height = blocksize; + int blockC_width = blocksize; + int blockC_height = blocksize; + + int use_buffer_indicator = 8; + // To fix the edge problem casued by the sub group block read. + // we have to pad the image if it's not multiple of tile. + // just padding one line is enough as the sub group block read + // will clamp to edge according to the spec. + + ocl::Context ctx = ocl::Context::getDefault(); + ocl::Queue queue = ocl::Queue::getDefault(); + + ocl::Image2D ImA; + ocl::Image2D ImB; + + std::string kernel_name("gemm_"); + if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE) + kernel_name += "32_1_"; + else + kernel_name += "32_2_"; + + if (TransA == CblasNoTrans) + kernel_name += "N"; + else + kernel_name += "T"; + + if (TransB == CblasNoTrans) + { + kernel_name += "N_"; + } else { + kernel_name += "T_"; + if (is_image_b || (K % use_buffer_indicator != 0)) + { + kernel_name += "SCALAR_"; + } else { + kernel_name += "BUFFER_"; + } + } + + if (alpha == 1) + kernel_name += "1_"; + else + kernel_name += "0_"; + + if (beta == 0) + kernel_name += "0"; + else + kernel_name += "1"; + + kernel_name += "_float"; + + ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc); + if (oclk_gemm_float.empty()) + return false; + + while (C_start_y < M) + { + blockC_width = std::min(static_cast(N) - C_start_x, blocksize); + blockC_height = std::min(static_cast(M) - C_start_y, blocksize); + + int isFirstColBlock = 1; + for (int k = 0; k < K; k += blocksize) + { + blockA_width = std::min(widthA - A_start_x, blocksize); + blockA_height = 
std::min(heightA - A_start_y, blocksize); + blockB_width = std::min(widthB - B_start_x, blocksize); + blockB_height = std::min(heightB - B_start_y, blocksize); + int block_Ksize = std::min(static_cast(K) - k, blocksize); + + int padded_k = block_Ksize + ((block_Ksize & 7) ? (8 - (block_Ksize & 7)) : 0); + int imageA_w = (TransA == CblasNoTrans) ? padded_k : blockA_width; + int imageA_h = (TransA == CblasNoTrans) ? blockA_height : padded_k; + int imageB_w = (TransB == CblasNoTrans) ? blockB_width : padded_k; + int imageB_h = (TransB == CblasNoTrans) ? padded_k : blockB_height; + + int blockA_offset = offA + A_start_y * ldA + A_start_x; + int blockB_offset = offB + B_start_y * ldB + B_start_x; + int blockC_offset = offC + C_start_y * ldC + C_start_x; + if (TransB == CblasNoTrans) + { + bool padding_A = false; + bool padding_B = false; + + if (!is_image_a && !is_image_b) + { + if (M * K < N * K) + padding_B = true; + else + padding_A = true; + } + + if (!is_image_a) + { + ImA = ocl4dnnGEMMCopyBufferToImage(A, blockA_offset, + true, TransA != CblasNoTrans, + padding_A, imageA_h, imageA_w, + blockA_height, blockA_width, ldA); + } + if (!is_image_b) + { + ImB = ocl4dnnGEMMCopyBufferToImage(B, blockB_offset, + false, false, + padding_B, imageB_h, imageB_w, + blockB_height, blockB_width, ldB); + } + } else { + // We will use normal read_imagef to read image B when B has transpose. + // thus we don't need to pad image A at all. 
+ if (!is_image_a) + { + bool padding; + padding = !is_image_b; + ImA = ocl4dnnGEMMCopyBufferToImage(A, blockA_offset, + true, TransA != CblasNoTrans, + padding, imageA_h, imageA_w, + blockA_height, blockA_width, ldA); + } + + if (!is_image_b && (K % use_buffer_indicator != 0)) + { + ImB = ocl4dnnGEMMCopyBufferToImage(B, blockB_offset, + false, true, false, imageB_h, imageB_w, + blockB_height, blockB_width, ldB); + } + } + + size_t global[2]; + if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE) + { + global[0] = (size_t)( blockC_width + 7 ) & ~7; + } else { + global[0] = (size_t)( (blockC_width / 2 ) + 7 ) ^ ~7; + } + global[1] = (size_t)(blockC_height + 31) / 32; + + size_t local[2]; + local[0] = 8; + local[1] = 1; + + cl_uint arg_idx = 0; + if (is_image_a) + oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(A)); + else + oclk_gemm_float.set(arg_idx++, ImA); + + if (TransB == CblasNoTrans || is_image_b || (K % use_buffer_indicator != 0)) + { + if (is_image_b) + oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B)); + else + oclk_gemm_float.set(arg_idx++, ImB); + } else { + oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B)); + oclk_gemm_float.set(arg_idx++, blockB_offset); + oclk_gemm_float.set(arg_idx++, ldB); + } + oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrWriteOnly(C)); + oclk_gemm_float.set(arg_idx++, blockC_offset); + oclk_gemm_float.set(arg_idx++, blockC_height); + oclk_gemm_float.set(arg_idx++, blockC_width); + oclk_gemm_float.set(arg_idx++, ldC); + oclk_gemm_float.set(arg_idx++, alpha); + oclk_gemm_float.set(arg_idx++, beta); + oclk_gemm_float.set(arg_idx++, padded_k); + if (TransB != CblasNoTrans) + oclk_gemm_float.set(arg_idx++, block_Ksize); + oclk_gemm_float.set(arg_idx++, isFirstColBlock); + + if (!oclk_gemm_float.run(2, global, local, false)) + return false; + + if (TransA == CblasNoTrans) + A_start_x += blockA_width; + else + A_start_y += blockA_height; + + if (TransB == 
CblasNoTrans) + B_start_y += blockB_height; + else + B_start_x += blockB_width; + + isFirstColBlock = 0; + } + + C_start_x += blockC_width; + if (TransA == CblasNoTrans) + A_start_x = 0; + else + A_start_y = 0; + if (TransB == CblasNoTrans) + { + B_start_x += blockB_width; + B_start_y = 0; + } else { + B_start_y += blockB_height; + B_start_x = 0; + } + if (C_start_x >= N) + { + C_start_x = 0; + B_start_x = 0; + B_start_y = 0; + C_start_y += blockC_height; + if (TransA == CblasNoTrans) + A_start_y += blockA_height; + else + A_start_x += blockA_width; + } + } + + return true; +} + +template +bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB, + const int32_t M, const int32_t N, const int32_t K, + const UMat A, const UMat B, + const UMat B_image, UMat C, + const size_t max_image_size) +{ + gemm_type_t gemm_type = GEMM_TYPE_FAST_IMAGE_32_1; + + if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || + gemm_type == GEMM_TYPE_FAST_IMAGE_32_2) + { + return ocl4dnnFastImageGEMM(CblasNoTrans, TransB, M, N, K, + (Dtype)1., A, 0, B, 0, (Dtype)0., C, + 0, false, false, gemm_type, max_image_size); + } + else if (gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE) + { + return ocl4dnnFastImageGEMM(CblasNoTrans, TransB, M, N, K, + (Dtype)1., A, 0, B_image, 0, (Dtype)0., C, + 0, false, true, + GEMM_TYPE_FAST_IMAGE_B_IMAGE, + max_image_size); + } + return false; +} + +template bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB, + const int32_t M, const int32_t N, const int32_t K, + const UMat A, const UMat B, + const UMat B_image, UMat C, + const size_t max_image_size); + +template +bool ocl4dnnGEMV(const CBLAS_TRANSPOSE TransA, + const int32_t M, const int32_t N, const Dtype alpha, + const UMat A, const int32_t offA, const UMat x, + const int32_t offx, const Dtype beta, UMat y, + const int32_t offy) +{ + return false; +} + +template<> +bool ocl4dnnGEMV(const CBLAS_TRANSPOSE TransA, + const int32_t M, const int32_t N, const float alpha, + const UMat A, const int32_t offA, const UMat x, + const 
int32_t offx, const float beta, UMat y, + const int32_t offy) +{ + ocl::Queue queue = ocl::Queue::getDefault(); + bool ret = false; + + if (TransA == CblasNoTrans) + { + ocl::Kernel k(CL_KERNEL_SELECT("matvec_mul4"), cv::ocl::dnn::matvec_mul_oclsrc); + if (k.empty()) + return false; + + uint row_size = M; + uint col_size = N; + size_t localsize[] = { 128 }; + size_t globalsize[] = { row_size / 4 * localsize[0] }; + + uint argId = 0; + k.set(argId++, ocl::KernelArg::PtrReadOnly(A)); + k.set(argId++, offA); + k.set(argId++, cl_uint(col_size)); + k.set(argId++, cl_uint(col_size%4)); + k.set(argId++, ocl::KernelArg::PtrReadOnly(x)); + k.set(argId++, offx); + k.set(argId++, alpha); + k.set(argId++, beta); + k.set(argId++, ocl::KernelArg::PtrWriteOnly(y)); + k.set(argId++, offy); + k.set(argId++, NULL, localsize[0] * sizeof(cl_float4)); + + ret = k.run(1, globalsize, localsize, false); + + if ((row_size % 4) != 0 && ret) + { + ocl::Kernel k_1(CL_KERNEL_SELECT("matvec_mul1"), cv::ocl::dnn::matvec_mul_oclsrc); + size_t localsize[] = { 128 }; + size_t globalsize[] = { row_size % 4 * localsize[0] }; + uint row_offset = row_size - (row_size % 4); + + uint argId = 0; + k_1.set(argId++, ocl::KernelArg::PtrReadOnly(A)); + k_1.set(argId++, offA); + k_1.set(argId++, cl_uint(col_size)); + k_1.set(argId++, cl_uint(row_offset)); + k_1.set(argId++, cl_uint(col_size%4)); + k_1.set(argId++, ocl::KernelArg::PtrReadOnly(x)); + k_1.set(argId++, offx); + k_1.set(argId++, alpha); + k_1.set(argId++, beta); + k_1.set(argId++, ocl::KernelArg::PtrWriteOnly(y)); + k_1.set(argId++, offy); + k_1.set(argId++, NULL, localsize[0] * sizeof(cl_float)); + + ret = k_1.run(1, globalsize, localsize, false); + } + } + return ret; +} + +template +bool ocl4dnnAXPY(const int32_t N, const Dtype alpha, + const UMat X, const int32_t offX, UMat Y, + const int32_t offY) +{ + ocl::Context ctx = ocl::Context::getDefault(); + + ocl::Kernel oclk_axpy(CL_KERNEL_SELECT("axpy"), cv::ocl::dnn::math_oclsrc); + if 
(oclk_axpy.empty()) + return false; + + size_t global[] = { 128 * 128 }; + size_t local[] = { 128 }; + + cl_uint argIdx = 0; + oclk_axpy.set(argIdx++, N); + oclk_axpy.set(argIdx++, alpha); + oclk_axpy.set(argIdx++, ocl::KernelArg::PtrReadOnly(X)); + oclk_axpy.set(argIdx++, offX); + oclk_axpy.set(argIdx++, ocl::KernelArg::PtrWriteOnly(Y)); + oclk_axpy.set(argIdx++, offY); + + return oclk_axpy.run(1, global, local, false); +} + +template bool ocl4dnnAXPY(const int32_t N, const float alpha, + const UMat X, const int32_t offX, + UMat Y, const int32_t offY); + +#endif // HAVE_OPENCL + +} // namespace ocl4dnn +} // namespace dnn +} // namespce cv diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp new file mode 100644 index 0000000000..13d5afb165 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -0,0 +1,1568 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "../../precomp.hpp" + +#include + +#include +#include +#include +#include +#include +#include "common.hpp" +#include "ocl4dnn.hpp" +#include "opencl_kernels_dnn.hpp" +#include "math_functions.hpp" +#include "default_kernel_config.hpp" + +#if defined WIN32 || defined _WIN32 +#include +#include +#endif + +#ifdef HAVE_OPENCL +namespace cv { namespace dnn { namespace ocl4dnn { +static cv::Mutex kernelConfigMutex; +typedef std::map kernel_hash_t; +static kernel_hash_t kernelConfigMap; +static bool defaultConfigLoaded = false; + +template +OCL4DNNConvSpatial::OCL4DNNConvSpatial(OCL4DNNConvConfig config) +{ + bias_term_ = config.bias_term; + int dims = config.in_shape.size(); + int spatial_dims = 2; + + channels_ = config.in_shape[dims - spatial_dims - 1]; + num_output_ = config.out_shape[dims - spatial_dims - 1]; + group_ = config.group; + + prev_kernel_type_ = -1; + tuned_ = false; + + // assumption: spatial dimension is 2. + kernel_h_ = config.kernel.height; + kernel_w_ = config.kernel.width; + pad_h_ = config.pad.height; + pad_w_ = config.pad.width; + stride_h_ = config.stride.height; + stride_w_ = config.stride.width; + dilation_h_ = config.dilation.height; + dilation_w_ = config.dilation.width; + M_ = num_output_ / group_; + height_ = config.in_shape[dims - spatial_dims + 0]; + width_ = config.in_shape[dims - spatial_dims + 1]; + output_h_ = config.out_shape[dims - spatial_dims + 0]; + output_w_ = config.out_shape[dims - spatial_dims + 1]; + bottom_dim_ = channels_ * width_ * height_; + top_dim_ = num_output_ * output_w_ * output_h_; + + cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", ""); + + use_cache_path_ = false; + if (!cache_path_.empty()) + { +#if defined _WIN32 + struct _stat file_stat; + use_cache_path_ = _stat(cache_path_.c_str(), &file_stat) == 0 && + ((_S_IFDIR & file_stat.st_mode) != 0); +#else + struct stat file_stat; + use_cache_path_ = stat(cache_path_.c_str(), &file_stat) == 0 && + 
S_ISDIR(file_stat.st_mode); +#endif + if (!use_cache_path_) + { + static int warn_ = 0; + if (!warn_) + { + std::cerr + << "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_ << std::endl + << std::endl; + warn_ = true; + } + } + } + + force_auto_tuning_ = + (use_cache_path_ && !utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DISABLE_AUTO_TUNING", false)) + || utils::getConfigurationParameterBool("OPENCV_OCL4DNN_FORCE_AUTO_TUNING", false); +} + +template +OCL4DNNConvSpatial::~OCL4DNNConvSpatial() +{ + if (!swizzled_weights_umat.empty()) { + swizzled_weights_umat.release(); + } +} + +template +void OCL4DNNConvSpatial::collectCommonInformation() +{ + addDef("Dtype", "float"); + addDef("Dtype2", "float2"); + addDef("Dtype4", "float4"); + addDef("Dtype8", "float8"); + addDef("Dtype16", "float16"); + addDef("as_Dtype", "as_float"); + addDef("as_Dtype2", "as_float2"); + addDef("as_Dtype4", "as_float4"); + addDef("as_Dtype8", "as_float8"); + addDef("Dtype_ID", (int)CV_32F); + addDef("Dtype_SIZE", (int)sizeof(Dtype)); +} + +typedef enum { + KERNEL_TYPE_INTEL_IDLF = 2, + KERNEL_TYPE_BASIC = 4, + KERNEL_TYPE_GEMM_LIKE = 5 +} ocl4dnnConvSpatialKernelType_t; + +template +void OCL4DNNConvSpatial::setupKernelDetails(int32_t kernelType, + int32_t blockM, + int32_t blockK, + int32_t blockN) +{ + std::string kernelUKey; + int32_t simd_size; + + if (kernelType == KERNEL_TYPE_INTEL_IDLF) { + simd_size = blockN; + kernelUKey = generateSpecificKey(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, 1); + + // kernel name + kernel_name_ = "IDLF_"; + kernel_name_ += kernelUKey; + if (simd_size == 16) + kernel_name_ += "_SIMD16"; + else + kernel_name_ += "_SIMD8"; + + // options + options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_; + if (clOptionSupport("-cl-no-subgroup-ifp")) + options_ << " -cl-no-subgroup-ifp "; + + // defs + int32_t output_width = output_w_; + int32_t output_height = output_h_; + int32_t output_block_width 
= blockM; + int32_t output_block_height = blockK; + const int32_t last_block_width = (output_width % output_block_width == 0) ? + output_block_width : output_width % output_block_width; + const int32_t last_block_height = (output_height % output_block_height == 0) ? + output_block_height : output_height % output_block_height; + int tile_x = alignSize((output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_, 4); + int tile_y = (output_block_height -1) * stride_h_ + kernel_h_ * dilation_h_; + int tile_y_stride = (4 * simd_size) / tile_x; + int invec_size = divUp(tile_y, tile_y_stride); + + addDef("SIMD_SIZE", simd_size); + addDef("filter_qualifier", "__global"); + addDef("OUT_BLOCK_WIDTH", output_block_width); + addDef("OUT_BLOCK_HEIGHT", output_block_height); + addDef("LAST_BLOCK_WIDTH", last_block_width); + addDef("LAST_BLOCK_HEIGHT", last_block_height); + addDef("INPUT_DEPTH", channels_ / group_); + addDef("TOTAL_INPUT_DEPTH_SIZE", channels_); + addDef("TOTAL_OUTPUT_DEPTH", num_output_); + addDef("INPUT_START_X", 0); + addDef("INPUT_START_Y", 0); + addDef("INPUT_START_Z", 0); + addDef("NUM_FILTERS", M_); + addDef("OUT_BUFF_OFFSET", 0); + addDef("TILE_X", tile_x); + addDef("TILE_Y", tile_y); + addDef("TILE_Y_STRIDE", tile_y_stride); + addDef("INVEC_SIZE", invec_size); + addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size)); + addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height)); + addDef("APPLY_BIAS", bias_term_); + + src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc; + } + else if (kernelType == KERNEL_TYPE_BASIC) + { + addDef("KERNEL_BASIC"); + + kernelUKey = generateSpecificKey(KERNEL_TYPE_BASIC, blockM, blockK, blockN); + kernel_name_ = "BASIC_"; + kernel_name_ += kernelUKey; + + // opts + options_ << " -cl-fast-relaxed-math -D ConvolveBasic=" << kernel_name_; + if (clOptionSupport("-cl-no-subgroup-ifp")) + options_ << " -cl-no-subgroup-ifp "; + + // defs + addDef("CHANNELS", channels_ / group_); + addDef("APPLY_BIAS", bias_term_); + 
addDef("OUTPUT_Z", M_); + addDef("ZPAR", 1); + + src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc; + } + else if (kernelType == KERNEL_TYPE_GEMM_LIKE) + { + simd_size = blockK; + kernelUKey = generateSpecificKey(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN); + + kernel_name_ = "U_GEMM_LIKE_CONV_"; + kernel_name_ += kernelUKey.c_str(); + kernel_name_ += (blockK == 8) ? "_SIMD8" : "_SIMD16"; + std::stringstream kernelDef; + kernelDef << "GEMM_LIKE_CONV_" << blockN << "_" << blockM; + if (blockK == 16) + kernelDef << "_SIMD16"; + + // Build list of options and defines + options_ << " -cl-fast-relaxed-math " << " -D " << kernelDef.str() + << " -D Conv_Interleaved=" << kernel_name_.c_str(); + options_ << " -cl-mad-enable"; + if (clOptionSupport("-cl-no-subgroup-ifp")) + options_ << " -cl-no-subgroup-ifp "; + + addDef("INPUT_DEPTH", channels_); + addDef("WIDTH1", M_); + addDef("OUT_PADDING_LEFT", 0); + addDef("OUT_PADDING_HEIGHT", 0); + addDef("OUT_DEPTH", M_); + addDef("NUM_BATCHES", num_); + addDef("DY", blockM); + addDef("DX", blockN); + addDef("KERNEL_WIDTH_DIV2", kernel_w_ / 2); + addDef("KERNEL_SLICE_DIV2", (kernel_w_ * kernel_h_) / 2); + addDef("TILE_N_LAST", M_ % 32); + addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8); + addDef("APPLY_BIAS", bias_term_); + src_ = ocl::dnn::conv_layer_spatial_oclsrc; + } +} + +template +void OCL4DNNConvSpatial::setupKernel() +{ + collectCommonInformation(); + + addDef("KERNEL_WIDTH", kernel_w_); + addDef("KERNEL_HEIGHT" , kernel_h_); + addDef("STRIDE_X", stride_w_); + addDef("STRIDE_Y", stride_h_); + addDef("DILATION_X", dilation_w_); + addDef("DILATION_Y", dilation_h_); + if (kernelType_ != KERNEL_TYPE_BASIC) + { + addDef("INPUT_PAD_W", pad_w_); + addDef("INPUT_PAD_H", pad_h_); + } + + setupKernelDetails(kernelType_, blockM_, blockK_, blockN_); +} + +template +bool OCL4DNNConvSpatial::Forward(const UMat& bottom, + const UMat& weight, + const UMat& bias, + UMat& top, + int32_t numImages) +{ + num_ = numImages; + + prepareKernel(bottom, 
top, weight, bias, numImages); + return convolve(bottom, top, weight, bias, numImages, bestKernelConfig, cv::ocl::Queue::getDefault()); +} + +template +void OCL4DNNConvSpatial::calculateBenchmark(const UMat &bottom, UMat &verifyTop, + const UMat &weight, const UMat &bias, + int32_t numImages) +{ + options_.str(""); options_.clear(); // clear contents and state flags + createBasicKernel(1, 1, 1); + kernel_index_ = kernelQueue.size() - 1; + convolve(bottom, verifyTop, weight, bias, numImages, kernelQueue[kernel_index_], cv::ocl::Queue::getDefault()); + CV_Assert(phash.find(kernelQueue[kernel_index_]->kernelName) != phash.end()); + //unloadProgram(kernelQueue[kernel_index_]->kernelName); + kernelQueue.pop_back(); + return; +} + +#define dbg +#ifdef dbg +#define dbgPrint(x) (x) +#else +#define dbgPrint(x) +#endif + +// For large enough input size, we do not need to tune kernels for different +// size. The reason is with large input size, there will be enough work items +// to feed al the EUs. +// FIXME for the gemm like convolution, switch back to eaxct image size. + +#define TUNING_SIZE(x) ((x) > 256 ? 256 : (alignSize(x, 16))) + +template +void OCL4DNNConvSpatial::generateKey() +{ + std::stringstream keyBuilder; + // FIXME: to support fuse? 
+ keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_" + << "cn" << channels_ << "_" + << "g" << group_ << "_" + << "s" << stride_w_ << "x" << stride_h_ << "_" + << "d" << dilation_w_ << "x" << dilation_h_ << "_" + << "b" << bias_term_ << "_" + << "in" << TUNING_SIZE(width_) << "x" << TUNING_SIZE(height_) << "_" + << "p" << pad_w_ << "x" << pad_h_ << "_" + << "num" << num_ << "_" + << "M" << M_; + + key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str(); + key_sanitized_ = key_; + for (size_t i = 0; i < key_sanitized_.size(); i++) + { + char c = key_sanitized_[i]; + if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_')) + { + key_sanitized_[i] = '_'; + } + } + // TODO add hash? + // key_sanitized_ = key_sanitized_ + cv::format("_%08llx", crc64((uchar*)key_.c_str(), key_.size())); + short_key_ = keyBuilder.str(); +} + +template +std::string OCL4DNNConvSpatial::generateSpecificKey(int32_t type, int32_t blockWidth, + int32_t blockHeight, int32_t blockDepth) +{ + std::stringstream keyBuilder; + keyBuilder << short_key_ + << "_" << type + << "_" << blockWidth + << "_" << blockHeight + << "_" << blockDepth; + return keyBuilder.str(); +} + +template +void interleaveMatrix(Dtype* mem_dst, const Dtype *mem, + int r, int c, int interleavedRows, int nonInterleavedRows, + int blockWidth, int rowAlignment ) +{ + CHECK_EQ(interleavedRows % 2, 0) << + "interleaveMatrix only supports even values for interleavedRows."; + + size_t memSize = r * c * sizeof(float); + size_t dstSize = memSize * + (interleavedRows + nonInterleavedRows * 2) / + (interleavedRows + nonInterleavedRows); + memset(mem_dst, 0, dstSize); // NOLINT + + const int xStride = blockWidth; + const int yStride = c * 2; + const Dtype *pSrc = mem; + Dtype* pDst = mem_dst; + for (int y = 0; y < r;) { + for (int rows = 0; rows < interleavedRows; rows += 2) { + if ( y >= r ) break; + if ((c % 
xStride) == 0) { + for (int x = 0; x < c / xStride; x++) { + memcpy(pDst + x * xStride * 2, // NOLINT + pSrc + x * xStride, xStride * sizeof(Dtype)); + memcpy(pDst + x * xStride * 2 + xStride, // NOLINT + pSrc + x * xStride + c, xStride * sizeof(Dtype)); + } + } else { + const int count = c / xStride; + int x = 0; + for (; x < count - 1; x++) { + memcpy(pDst + x * xStride * 2, // NOLINT + pSrc + x * xStride, xStride * sizeof(Dtype)); + memcpy(pDst + x * xStride * 2 + xStride, // NOLINT + pSrc + x * xStride + c, xStride * sizeof(Dtype)); + } + memcpy(pDst + x * xStride * 2, // NOLINT + pSrc + x * xStride, xStride * sizeof(Dtype)); + } + pSrc += yStride; + pDst += yStride; + y += 2; + } + + for (int rows = 0; rows < nonInterleavedRows; rows++) { + if (y >= r) break; + const int stride = rowAlignment; + int remaining = c; + for (int x = 0; x < c; x += stride) { + if (remaining >= stride) { + memcpy(pDst + x * 2, pSrc + x, stride * sizeof(Dtype)); // NOLINT + remaining -=stride; + } else { + memcpy(pDst + x * 2, pSrc + x, remaining * sizeof(Dtype)); // NOLINT + } + } + pSrc += yStride / 2; + pDst += yStride; + y++; + } + } +} + +template +bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, + int32_t swizzled_factor, + bool interleave) +{ + // Simply skip the weight swizzle if we already got a swizzled_weights_ + // in test phase and not in auto tuning + // This requires we always call convolve again with the winner configuration + // during the auto tuning stage. 
+ if (tuned_ && !swizzled_weights_umat.empty()) + return true; + + if (swizzled_weights_umat.empty()) + swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ * + kernel_h_ * (int)alignSize(kernel_w_, 2), CV_32FC1); + + ocl::Queue queue = ocl::Queue::getDefault(); + if (!interleave) { + cl_uint argIdx = 0; + int32_t channels = channels_ / group_; + + ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"), + cv::ocl::dnn::conv_spatial_helper_oclsrc); + if (oclk_copy_weight.empty()) + return false; + + oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); + oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat)); + oclk_copy_weight.set(argIdx++, kernel_w_); + oclk_copy_weight.set(argIdx++, kernel_h_); + oclk_copy_weight.set(argIdx++, channels); + oclk_copy_weight.set(argIdx++, num_output_); + oclk_copy_weight.set(argIdx++, swizzled_factor); + + size_t global_work_size_copy[3] = { + (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 }; + + if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false)) + { + std::cout << "Swizzle kernel run failed." << std::endl; + return false; + } + } else { + // assumption: kernel dimesion is 2 + Mat weightMat = weight.getMat(ACCESS_READ); + Dtype* cpu_weight = (Dtype *)weightMat.ptr(); + Mat swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE); + Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr(); + + int interleavedRows = (kernel_w_ / 2) * 2; + int nonInterleavedRows = kernel_w_ % 2; + int blockWidth = swizzled_factor; // should equal to simd size. 
+ int rowAlignment = 32; + size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype); + Dtype * tmpSwizzledWeight = reinterpret_cast(malloc(interleaved_filter_size)); + CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight"; + for (int od = 0; od < M_; od++) + for (int id = 0; id < channels_; id++) + for (int r = 0; r < kernel_h_; r++) + for (int c = 0; c < kernel_w_; c++) + tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] = + cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c]; + interleaveMatrix(cpu_swizzled_weight, + tmpSwizzledWeight, + kernel_w_ * kernel_h_ * channels_, M_, + interleavedRows, + nonInterleavedRows, + blockWidth, + rowAlignment); + free(tmpSwizzledWeight); + } + return true; +} + +template<> +bool OCL4DNNConvSpatial::createBasicKernel(int32_t blockWidth, + int32_t blockHeight, int32_t blockDepth) +{ + kernelType_ = KERNEL_TYPE_BASIC; + blockM_ = blockWidth; + blockK_ = blockHeight; + blockN_ = blockDepth; + setupKernel(); + + ocl::Program program = compileKernel(); + if (program.ptr()) + { + int32_t workItemOutput[3] = { 1, 1, 1 }; + size_t globalSize[3] = { (size_t)output_w_, (size_t)output_h_, (size_t)M_ }; + kernelQueue.push_back(makePtr(kernel_name_, &globalSize[0], (const size_t*)NULL, &workItemOutput[0], + false, KERNEL_TYPE_BASIC)); + return true; + } + else + return false; +} + +template<> +void OCL4DNNConvSpatial::CreateSubBuffer(const UMat& buffer, UMat& sub_buffer, + int32_t offset, int32_t size, bool write_only) +{ + cl_mem sub_mem; + cl_buffer_region region; + cl_int err; + + region.origin = offset * sizeof(float); + region.size = size * sizeof(float); + sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ), + write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + if (err) + { + std::cout << "Failed to create sub buffer." 
<< std::endl; + return; + } + + int step = sizeof(float), rows = size, cols = 1; + ocl::convertFromBuffer(sub_mem, step, rows, cols, CV_32FC1, sub_buffer); + + //decrease ocl mem refcount + clReleaseMemObject(sub_mem); +} + +template<> +bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, + const UMat &weight, const UMat &bias, + int32_t numImages, kernelConfig* config, + const cv::ocl::Queue& queue) +{ + ocl::Program program; + phash_t::iterator it = phash.find(config->kernelName); + if (it != phash.end()) + program = it->second; + else + return false; + + int32_t bias_offset; + + if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) { + if (!swizzleWeight(weight, config->workItem_output[2], false)) + return false; + size_t total_bottom_size = bottom_dim_ * numImages; + size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_; + size_t total_bias_size = M_ * group_; + size_t total_top_size = top_dim_ * numImages; + for (int32_t g = 0; g < group_; ++g) { + bias_offset = M_ * g; + int32_t image_offset = width_ * height_ * (channels_ / group_) * g; + int32_t output_image_offset = output_w_ * output_h_ * M_ * g; + int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g; + + ocl::Kernel kernel(config->kernelName.c_str(), program); + if (kernel.empty()) + return false; + + cl_uint argIdx = 0; + + UMat img_buffer; + if (image_offset) + { + CreateSubBuffer(bottom, img_buffer, image_offset, + total_bottom_size - image_offset, false); + if (img_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + } + + UMat kernel_buffer; + if (kernel_offset) + { + CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset, + total_kernel_size - kernel_offset, false); + if (kernel_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer)); + } + else + { + kernel.set(argIdx++, 
ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); + } + + UMat bias_buffer; + if (bias_term_) + { + if (bias_offset) + { + CreateSubBuffer(bias, bias_buffer, bias_offset, + total_bias_size - bias_offset, false); + if (bias_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); + } + } + + UMat out_buffer; + if (output_image_offset) + { + CreateSubBuffer(top, out_buffer, output_image_offset, + total_top_size - output_image_offset, true); + if (out_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + } + + kernel.set(argIdx++, (uint16_t)width_); + kernel.set(argIdx++, (uint16_t)height_); + kernel.set(argIdx++, (uint16_t)output_w_); + kernel.set(argIdx++, (uint16_t)output_h_); + if (!kernel.run(3, config->global_work_size, config->local_work_size, false)) + { + std::cout << "IDLF kernel run failed." 
<< std::endl; + return false; + } + } + } else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) { + if (!swizzleWeight(weight, config->workItem_output[1], true)) + return false; + size_t total_bottom_size = bottom_dim_ * numImages; + size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_; + size_t total_bias_size = M_ * group_; + size_t total_top_size = top_dim_ * numImages; + for (int32_t g = 0; g < group_; ++g) { + bias_offset = M_ * g; + int32_t image_offset = width_ * height_ * (channels_ / group_) * g; + int32_t output_image_offset = output_w_ * output_h_ * M_ * g; + int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g; + + ocl::Kernel kernel(config->kernelName.c_str(), program); + if (kernel.empty()) + return false; + + cl_uint argIdx = 0; + + UMat img_buffer; + if (image_offset) + { + CreateSubBuffer(bottom, img_buffer, image_offset, + total_bottom_size - image_offset, false); + if (img_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + } + + UMat kernel_buffer; + if (kernel_offset) + { + CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset, + total_kernel_size - kernel_offset, false); + if (kernel_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); + } + + UMat bias_buffer; + if (bias_term_) + { + if (bias_offset) + { + CreateSubBuffer(bias, bias_buffer, bias_offset, + total_bias_size - bias_offset, false); + if (bias_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); + } + } + + UMat out_buffer; + if (output_image_offset) + { + CreateSubBuffer(top, out_buffer, output_image_offset, + total_top_size - output_image_offset, 
true); + if (out_buffer.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); + } + else + { + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + } + + kernel.set(argIdx++, (uint16_t)width_); + kernel.set(argIdx++, (uint16_t)height_); + kernel.set(argIdx++, (uint16_t)output_w_); + kernel.set(argIdx++, (uint16_t)output_h_); + + int out_pitch_y = output_w_ * output_h_; + int out_pitch_z = out_pitch_y * M_; + int aligned_input_size = height_ * width_ * channels_ / group_; + int slice_pitch = width_ * height_; + kernel.set(argIdx++, (uint32_t)out_pitch_y); + kernel.set(argIdx++, (uint32_t)out_pitch_z); + kernel.set(argIdx++, (uint32_t)aligned_input_size); + kernel.set(argIdx++, (uint32_t)slice_pitch); + + int blockM = config->workItem_output[0]; + int blockK = config->workItem_output[1]; + int blockN = config->workItem_output[2]; + int alignedFilterWidth = alignSize(M_, blockN); + int alignedExpandHeight = alignSize(output_w_ * output_h_, blockM); + int globalWorkSizeDX = blockN; + int globalWorkSizeDY = blockM; + size_t sgemm_m = alignedExpandHeight; + size_t sgemm_n = alignedFilterWidth; + size_t gx = divUp(sgemm_n, globalWorkSizeDX); + size_t gy = divUp(sgemm_m, globalWorkSizeDY); + gy = alignSize(gy, blockK); + size_t global_size[3] = { gx, gy, config->global_work_size[2] }; + + if (!kernel.run(3, global_size, config->local_work_size, false)) + { + std::cout << "GEMM like kernel run failed." 
<< std::endl; + return false; + } + } + } else { + for (int32_t n = 0; n < numImages; ++n) { + for (int32_t g = 0; g < group_; ++g) { + bias_offset = M_ * g; + int32_t image_offset = n * bottom_dim_ + + width_ * height_ * (channels_ / group_) * g; + int32_t output_image_offset = n * top_dim_ + + output_w_ * output_h_ * M_ * g; + + cl_uint argIdx = 0; + int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g; + + ocl::Kernel kernel(config->kernelName.c_str(), program); + if (kernel.empty()) + return false; + + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + kernel.set(argIdx++, image_offset); + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); + kernel.set(argIdx++, kernel_offset); + if (bias_term_) + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); + else + kernel.set(argIdx++, (void *)NULL); + kernel.set(argIdx++, bias_offset); + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, output_image_offset); + kernel.set(argIdx++, (uint16_t)width_); + kernel.set(argIdx++, (uint16_t)height_); + kernel.set(argIdx++, (uint16_t)output_w_); + kernel.set(argIdx++, (uint16_t)output_h_); + kernel.set(argIdx++, (uint16_t)pad_w_); + kernel.set(argIdx++, (uint16_t)pad_h_); + if (!kernel.run(3, config->global_work_size, + (config->use_null_local) ? NULL : config->local_work_size, + false)) + { + std::cout << "Basic kernel run failed." << std::endl; + return false; + } + } + } + } + + return true; +} + +template<> +float OCL4DNNConvSpatial::timedConvolve(const UMat &bottom, UMat &top, + const UMat &weight, const UMat &bias, + int32_t numImages, kernelConfig* config) +{ + cv::ocl::Queue profilingQueue; + try + { + profilingQueue = cv::ocl::Queue::getDefault().getProfilingQueue(); + } + catch (const cv::Exception&) + { + static int warn_ = 0; + if (!warn_) + { + std::cout << "OpenCV(ocl4dnn): Can't create OpenCL profiling queue for auto-tuning." 
<< std::endl; + warn_ = true; + } + return 1e6; + } + + // warm up. + bool saved_tuned = tuned_; + tuned_ = false; + convolve(bottom, top, weight, bias, numImages, config, profilingQueue); + + cv::ocl::Timer timer(profilingQueue); + timer.start(); + bool res = true;; + dbgPrint(std::cout << "Benchmarking kernel: " << config->kernelName << std::endl); + tuned_ = true; + int loop_cnt = 4; + for (int i = 0; i < loop_cnt; i++) { + res = convolve(bottom, top, weight, bias, numImages, config, profilingQueue); + if (!res) + break; + } + tuned_ = saved_tuned; + timer.stop(); + if (!res) { + config->tested = true; + config->verified = false; + return 1e5; + } + + float elapsedTime = timer.milliSeconds() / loop_cnt; + #ifdef dbg + double out_w = output_w_; + double out_h = output_h_; + double out_z = M_; + double k_w = kernel_w_; + double k_h = kernel_h_; + double k_z = channels_; + double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_; + std::cout << "\tEstimated Gflops:" << ((totalFlops/1000)/1000)/1000 + << std::endl; + std::cout << "\tEstimated GFLOPS/S: " << (((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime) + << std::endl; + #if 0 + std::cout << "Estimated utilization: " << + ((((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime))/880.0 + << std::endl; + #endif + #endif + return elapsedTime; +} + +template<> +bool OCL4DNNConvSpatial::verifyResult(const UMat &bottom, + UMat &top, + const UMat &weight, + const UMat &bias, + int32_t numImages, + kernelConfig* config, + UMat &verifyTop) +{ + + uint32_t verificationFail = 0; + + if (config->verified) + return true; + else if (config->tested) + return false; + + int32_t sz[4] = {numImages, num_output_, output_h_, output_w_}; + top.zeros(4, sz, CV_32FC1); + bool saved_tuned = tuned_; + tuned_ = false; + convolve(bottom, top, weight, bias, numImages, config, cv::ocl::Queue::getDefault()); + tuned_ = saved_tuned; + + float *data = (float *)top.getMat(ACCESS_READ).ptr(); + float *verify_data = (float 
*)verifyTop.getMat(ACCESS_READ).ptr(); + + for (int32_t n = 0; n < num_; ++n) { + for (int32_t g = 0; g < group_; ++g) { + int32_t output_image_offset = n * top_dim_ + output_w_ * output_h_ * M_ * g; + for (int out_ch = 0; out_ch < M_ && !verificationFail; out_ch++) + for (int h = 0; h < output_h_ && !verificationFail; h++) + for (int w = 0; w < output_w_; w++) { + size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w; + if (fabs(data[offset] - verify_data[offset]) > 0.1 * fabs(verify_data[offset]) && + !(fabs(verify_data[offset]) < 1.e-3 && + fabs(data[offset] - verify_data[offset]) < 1.e-4)) + { + dbgPrint(printf("test verification failed @ image %d group %d" + "out_ch %d h %d w %d got %G expected %G\n", + n, g, out_ch, h, w, data[offset], verify_data[offset])); + verificationFail = 1; + goto out; + } + } + } + } +out: + if (verificationFail == 1) + return false; + else + return true; +} + +template +void OCL4DNNConvSpatial::unloadProgram(const std::string& kernelName) +{ + ocl::Program program; + phash_t::iterator it = phash.find(kernelName); + if (it != phash.end()) + { + program = it->second; + it->second = ocl::Program(); + } + else + return; + + ocl::Context ctx = ocl::Context::getDefault(); + ctx.unloadProg(program); +} + +template +ocl::Program OCL4DNNConvSpatial::compileKernel() +{ + phash_t::iterator it = phash.find(kernel_name_); + if (it != phash.end()) + { + return it->second; + } + + String errmsg; + ocl::Context ctx = ocl::Context::getDefault(); + std::string options = options_.str(); + CV_Assert(options.size() != 0); + ocl::Program program = ctx.getProg(src_, options, errmsg); + + phash.insert(std::pair(kernel_name_, program)); + if (!program.ptr()) + { + std::cout << "Failed to compile kernel: " << kernel_name_ + << ", buildflags: " << options + << ", errmsg: " << errmsg << std::endl; + } + return program; +} + +template<> +bool OCL4DNNConvSpatial::createGEMMLikeConvKernel(int32_t blockM, + int32_t blockK, + 
+                                                             int32_t blockN)
+{
+    int32_t simd_size = blockK;
+
+    int workItemOutput[3] = { blockM, blockK, blockN };
+    size_t gx = (size_t)divUp(M_, blockN);
+    size_t gy = (size_t)divUp(output_w_ * output_h_, blockM);
+    gy = alignSize(gy, simd_size);
+    size_t gz = num_;
+    size_t global_size[3] = { gx, gy, gz };
+    size_t local_size[3] = { 1, static_cast<size_t>(simd_size), 1 };
+
+    kernelType_ = KERNEL_TYPE_GEMM_LIKE;
+    blockM_ = blockM;
+    blockK_ = blockK;
+    blockN_ = blockN;
+    setupKernel();
+
+    ocl::Program program = compileKernel();
+    if (program.ptr())
+    {
+        size_t workgroupSize_used;
+        ocl::Kernel kernel(kernel_name_.c_str(), program);
+        if (kernel.empty())
+            return false;
+
+        // The GEMM-like kernel source is generated for a fixed SIMD width;
+        // reject the build if the compiler selected a different one.
+        workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
+        if (workgroupSize_used != simd_size)
+        {
+            std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
+            std::cerr << "                 does not equal the size (" << simd_size << ") kernel source required." << std::endl;
+            std::cerr << "                 Skip this kernel " << kernel_name_ << std::endl;
+            unloadProgram(kernel_name_);
+            return false;
+        }
+        else
+        {
+            kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
+                                                        true, KERNEL_TYPE_GEMM_LIKE));
+            return true;
+        }
+    }
+    else
+        return false;
+}
+
+// Build and validate an IDLF (Intel subgroup) convolution kernel for the
+// given output block shape and SIMD width.
+template<>
+bool OCL4DNNConvSpatial<float>::setupIDLF(int32_t blockWidth,
+                                          int32_t blockHeight,
+                                          int32_t simd_size)
+{
+    int32_t workItemOutput[3] = { blockWidth, blockHeight, simd_size };
+    const int32_t num_output_maps = M_;
+    int32_t output_width = output_w_;
+    int32_t output_height = output_h_;
+    int32_t output_block_width = blockWidth;
+    int32_t output_block_height = blockHeight;
+    int32_t num_batches = num_;
+
+    size_t global_size[3] = {
+        (size_t)divUp(output_width, output_block_width),
+        (size_t)divUp(output_height, output_block_height),
+        (size_t)num_batches * alignSize(num_output_maps, simd_size) };
+    size_t local_size[3] = { 1, 1, static_cast<size_t>(simd_size) };
+
+    kernelType_ = KERNEL_TYPE_INTEL_IDLF;
+    blockM_ = blockWidth;
+    blockK_ = blockHeight;
+    blockN_ = simd_size;
+
+    setupKernel();
+
+    ocl::Program program = compileKernel();
+    if (program.ptr())
+    {
+        size_t workgroupSize_used;
+        ocl::Kernel kernel(kernel_name_.c_str(), program);
+        if (kernel.empty())
+            return false;
+
+        workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
+        if (workgroupSize_used != simd_size)
+        {
+            std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
+            std::cerr << "                 does not equal the size (" << simd_size << ") kernel source required." << std::endl;
+            std::cerr << "                 Skip this kernel " << kernel_name_ << std::endl;
+            unloadProgram(kernel_name_);
+            return false;
+        }
+        else
+        {
+            kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
+                                                        true, KERNEL_TYPE_INTEL_IDLF));
+            return true;
+        }
+    }
+    else
+        return false;
+}
+
+// Dispatch kernel creation by tuner kernel type (basic / IDLF / GEMM-like).
+template<>
+bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
+                                                        int32_t blockWidth,
+                                                        int32_t blockHeight,
+                                                        int32_t blockDepth)
+{
+    kernelType_ = kernelType;
+    options_.str(""); options_.clear(); // clear contents and state flags
+    src_ = ocl::ProgramSource();
+
+    if (kernelType == KERNEL_TYPE_INTEL_IDLF)
+        return setupIDLF(blockWidth, blockHeight, blockDepth);
+    else if (kernelType == KERNEL_TYPE_BASIC)
+        return createBasicKernel(blockWidth, blockHeight, blockDepth);
+    else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
+        return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
+    else
+        CV_Assert(0 && "Internal error");
+    return false;
+}
+
+// Enumerate candidate kernel configurations (block shapes / SIMD widths)
+// for the auto-tuner, pruned by the current convolution geometry.
+template<>
+void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
+{
+    if (ocl::Device::getDefault().intelSubgroupsSupport()) {
+        /* IDLF kernels are using Intel specific extension which make
+           them intel only. */
+        // Generates static key_
+        int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
+        int kernelCnt = 0;
+        if (group_ == 1 && ((M_ % 8 == 0) && (M_ % 32 != 24))) {
+            tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 8, 32));
+            tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 2, 8, 32));
+
+            if (kernel_w_ < 4 && M_ % 32 == 0)
+                tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 16, 32));
+        }
+
+        for (int simd_size = 8; simd_size <= 16; simd_size += 8) {
+            if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
+                continue;
+            if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
+                continue;
+            const int width_max = 14, height_max = 8, block_size_max = 32;
+            for (uint32_t width = width_max; width > 0; width--) {
+                int candidate = 0;
+                if (width > output_w_)
+                    continue;
+                for (uint32_t height = height_max; height > 0; height--) {
+                    if (width * height > block_size_max || height > output_h_)
+                        continue;
+                    // Only when the work items count is less than the device
+                    // max work items or the M_ is less than 16, we will tune
+                    // for simd 8.
+                    if (simd_size == 8 &&
+                        M_ >= 16 &&
+                        ((num_ * M_ * output_w_ * output_h_ / static_cast<float>(width * height)) >=
+                        max_compute_units * 7 * 16))
+                        continue;
+                    int actual_tile_x = kernel_w_ * dilation_w_ + (width - 1) * stride_w_;
+                    int tile_x = alignSize(actual_tile_x, 4);
+                    int tile_y = kernel_h_ * dilation_h_ + (height - 1) * stride_h_;
+                    if (tile_x > (4 * simd_size))
+                        continue;
+                    // If actual_tile_x is multiple of 4, we may waste some IO bandwidth.
+                    // This could reduce 75% tuning candidates. It has slightly performance
+                    // impact for the final tuning result, less than 2% for most cases.
+                    if (actual_tile_x % 4 != 0)
+                        continue;
+                    if ((width * height + divUp(tile_x * tile_y, simd_size)) > block_size_max)
+                        continue;
+                    int tile_y_stride = (4 * simd_size) / tile_x;
+
+                    if (divUp(tile_y, tile_y_stride) < 4) {
+                        tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, width, height, simd_size));
+                        candidate++;
+                    }
+                    if (candidate >= 4 && height == 2)
+                        break;
+                }
+                kernelCnt += candidate;
+                if (kernelCnt >= 12 && width == 2)
+                    break;
+            }
+        }
+    }
+}
+
+// Pick the first candidate kernel that passes result verification,
+// release the rest, and mark the layer as tuned.
+template<>
+void OCL4DNNConvSpatial<float>::useFirstAvailable(const UMat &bottom,
+                                                  UMat &top,
+                                                  const UMat &weight,
+                                                  const UMat &bias,
+                                                  int32_t numImages,
+                                                  UMat &verifyTop)
+{
+    std::vector< cv::Ptr<tunerParam> > tunerItems;
+    generateTunerItems(tunerItems);
+    tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_BASIC, 1, 1, 1));
+
+    for (int i = 0; i < tunerItems.size(); i++) {
+        if (createConvolutionKernel(tunerItems[i]->kernelType,
+                                    tunerItems[i]->blockWidth,
+                                    tunerItems[i]->blockHeight,
+                                    tunerItems[i]->blockDepth)) {
+            int kernelIdx = kernelQueue.size() - 1;
+            if (verifyResult(bottom, top, weight, bias, numImages, kernelQueue[kernelIdx], verifyTop)) {
+                bestKernelConfig = kernelQueue[kernelIdx];
+                if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF &&
+                    bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
+                    if (!swizzled_weights_umat.empty())
+                        swizzled_weights_umat.release();
+
+                for (int32_t j = 0; j < kernelIdx; j++) {
+                    CV_Assert(phash.find(kernelQueue[j]->kernelName) != phash.end());
+                    unloadProgram(kernelQueue[j]->kernelName);
+                }
+                kernelQueue.clear();
+                tuned_ = true;
+                break;
+            }
+        }
+    }
+}
+
+// Serialize the winning configuration into the in-memory config map.
+template<>
+void OCL4DNNConvSpatial<float>::cacheTunedConfig()
+{
+    if (tuned_)
+    {
+        cv::AutoLock lock(kernelConfigMutex);
+        std::stringstream outputKernel;
+        outputKernel << bestKernelConfig->workItem_output[0] << " "
+                     << bestKernelConfig->workItem_output[1] << " "
+                     << bestKernelConfig->workItem_output[2] << " "
+                     << bestKernelConfig->kernelType << " "
+                     << bestKernelConfig->local_work_size[0] << " "
+                     <<
+                        bestKernelConfig->local_work_size[1] << " "
+                     << bestKernelConfig->local_work_size[2] << " "
+                     << bestKernelConfig->swizzle_weights << " "
+                     << bestKernelConfig->use_null_local << " ";
+        kernelConfigMap.insert(std::pair<std::string, std::string>(key_, outputKernel.str()));
+    }
+}
+
+// Full auto-tune: time every candidate kernel, then verify them from
+// fastest to slowest and keep the fastest one that produces correct output.
+template<>
+void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
+                                                 UMat &top,
+                                                 const UMat &weight,
+                                                 const UMat &bias,
+                                                 int32_t numImages,
+                                                 UMat &verifyTop)
+{
+    std::vector< cv::Ptr<tunerParam> > tunerItems;
+
+    generateTunerItems(tunerItems);
+    for (int i = 0; i < tunerItems.size(); i++)
+        createConvolutionKernel(tunerItems[i]->kernelType,
+                                tunerItems[i]->blockWidth,
+                                tunerItems[i]->blockHeight,
+                                tunerItems[i]->blockDepth);
+
+    for (int32_t x = 0; x < kernelQueue.size(); x++) {
+        kernelQueue[x]->executionTime = timedConvolve(bottom, top, weight, bias, numImages,
+                                                      kernelQueue[x]);
+        #ifdef TEST_ALL_KERNELS
+        if (kernelQueue[x]->tested == false) {
+            bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop);
+            if (verified == false) {
+                dbgPrint(std::cout << "Kernel "
+                         << kernelQueue[x]->kernelName
+                         << " failed verification" << std::endl);
+                dbgPrint(std::cout << "kernelQueue[x]->workItem_output[0]: "
+                         << kernelQueue[x]->workItem_output[0] << " "
+                         << "kernelQueue[x]->workItem_output[1]: "
+                         << kernelQueue[x]->workItem_output[1] << " "
+                         << "kernelQueue[x]->workItem_output[2]: "
+                         << kernelQueue[x]->workItem_output[2] << " "
+                         << "kernelQueue[x]->kernelType: "
+                         << kernelQueue[x]->kernelType << " "
+                         << "kernelQueue[x]->global_work_size[0]: "
+                         << kernelQueue[x]->global_work_size[0] << " "
+                         << "kernelQueue[x]->global_work_size[1]: "
+                         << kernelQueue[x]->global_work_size[1] << " "
+                         << "kernelQueue[x]->global_work_size[2]: "
+                         << kernelQueue[x]->global_work_size[2] << " "
+                         << "kernelQueue[x]->local_work_size[0]: "
+                         << kernelQueue[x]->local_work_size[0] << " "
+                         << "kernelQueue[x]->local_work_size[1]: "
+                         << kernelQueue[x]->local_work_size[1] << " "
+                         << "kernelQueue[x]->local_work_size[2]: "
+                         << kernelQueue[x]->local_work_size[2] << " "
+                         << kernelQueue[x]->swizzle_weights << " "
+                         << kernelQueue[x]->use_null_local << std::endl);
+            } else {
+                dbgPrint(std::cout << "Kernel "
+                         << kernelQueue[x]->kernelName
+                         << " pass verification" << std::endl);
+            }
+        }
+        #endif
+    }
+    int32_t failures = 0;
+    bool verification = false;
+    if (kernelQueue.size()) {
+        while (failures < kernelQueue.size()) {
+            int32_t fastestKernel = -1;
+            float fastestTime = std::numeric_limits<float>::infinity();
+
+            for (int32_t x = 0; x < kernelQueue.size(); x++) {
+                if (kernelQueue[x]->executionTime < fastestTime &&
+                    kernelQueue[x]->tested == false) {
+                    fastestKernel = x;
+                    fastestTime = kernelQueue[x]->executionTime;
+                }
+            }
+            if (fastestKernel < 0) break;
+            // Test fastest kernel
+            bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop);
+            if (verified == true) {
+                kernelQueue[fastestKernel]->verified = true;
+                kernel_index_ = fastestKernel;
+                verification = true;
+                break;
+            } else {
+                kernelQueue[fastestKernel]->tested = true;
+                dbgPrint(std::cout << "Kernel " <<
+                         kernelQueue[fastestKernel]->kernelName <<
+                         " failed verification" << std::endl);
+                failures++;
+            }
+        }
+    }
+    if (verification) {
+        dbgPrint(std::cout << "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
+                 "> passed verification" << std::endl);
+        dbgPrint(std::cout << "Convolution Time:" << kernelQueue[kernel_index_]->executionTime << std::endl);
+    } else {
+        dbgPrint(std::cout << "fallback to basic kernel" << std::endl);
+        options_.str(""); options_.clear(); // clear contents and state flags
+        createBasicKernel(1, 1, 1);
+        kernel_index_ = kernelQueue.size() - 1;
+    }
+    this->bestKernelConfig = kernelQueue[kernel_index_];
+
+
+    if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF && bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
+        if (!swizzled_weights_umat.empty())
+            swizzled_weights_umat.release();
+
+    for (int32_t x = 0; x < kernelQueue.size(); x++) {
+        if (x != kernel_index_) {
+            CV_Assert(phash.find(kernelQueue[x]->kernelName) != phash.end());
+            unloadProgram(kernelQueue[x]->kernelName);
+        }
+    }
+    kernelQueue.clear();
+    tuned_ = true;
+    saveTunedConfig();
+}
+
+// Persist the tuned configuration to the on-disk cache directory
+// (no-op when caching is disabled or no cache path is configured).
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::saveTunedConfig()
+{
+    CV_Assert(tuned_);
+    if (!use_cache_path_ || cache_path_.empty())
+        return;
+
+    std::string outputFile;
+    outputFile = cache_path_ + "/" + key_sanitized_;
+    std::ofstream outputKernel;
+    outputKernel.open(outputFile.c_str());
+    outputKernel << bestKernelConfig->workItem_output[0] << " "
+                 << bestKernelConfig->workItem_output[1] << " "
+                 << bestKernelConfig->workItem_output[2] << " "
+                 << bestKernelConfig->kernelType << " "
+                 << bestKernelConfig->local_work_size[0] << " "
+                 << bestKernelConfig->local_work_size[1] << " "
+                 << bestKernelConfig->local_work_size[2] << " "
+                 << bestKernelConfig->swizzle_weights << " "
+                 << bestKernelConfig->use_null_local << " ";
+    outputKernel.close();
+}
+
+// Select a kernel for the current shape key: reuse the current one,
+// else try the in-memory cache, then the on-disk cache, then tune.
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
+                                              const UMat &weight, const UMat &bias,
+                                              int32_t numImages)
+{
+    std::string previous_key = key_;
+
+    generateKey();
+    if (key_.compare(previous_key) == 0 && bestKernelConfig != NULL)
+        return;
+
+    if (bestKernelConfig)
+    {
+        prev_kernel_type_ = bestKernelConfig->kernelType;
+        CV_Assert(phash.find(bestKernelConfig->kernelName) != phash.end());
+        phash.erase(bestKernelConfig->kernelName);
+        bestKernelConfig.release();
+    }
+
+    if (loadCachedConfig()) // check in-memory cache
+        return;
+    if (loadTunedConfig()) // check external storage
+        return;
+
+    UMat benchData(1, numImages * top_dim_, CV_32FC1);
+    if (force_auto_tuning_)
+    {
+        calculateBenchmark(bottom, benchData, weight, bias, numImages);
+        setupConvolution(bottom, top, weight, bias, numImages, benchData);
+    }
+    else
+    {
+        calculateBenchmark(bottom, benchData, weight, bias, numImages);
+        useFirstAvailable(bottom, top, weight, bias, numImages,
+                          benchData);
+    }
+    cacheTunedConfig();
+}
+
+// Look the shape key up in the in-memory configuration map, seeding the
+// map once with the built-in Intel defaults.
+template<typename Dtype>
+bool OCL4DNNConvSpatial<Dtype>::loadCachedConfig()
+{
+    cv::AutoLock lock(kernelConfigMutex);
+    if (!defaultConfigLoaded)
+    {
+        const size_t numConfigs = sizeof(default_kernel_config_intel)/sizeof(default_kernel_config_intel[0])/2;
+        for (size_t i = 0; i < numConfigs; i++)
+        {
+            std::pair<std::string, std::string> entry(
+                std::string("Intel(R) Corporation_") + default_kernel_config_intel[2 * i],
+                default_kernel_config_intel[2 * i + 1]);
+            kernelConfigMap.insert(entry);
+        }
+        defaultConfigLoaded = true;
+    }
+
+    kernel_hash_t::iterator it = kernelConfigMap.find(key_);
+    if (it != kernelConfigMap.end())
+    {
+        int32_t x, y, z, type, lx, ly, lz;
+        bool swizzle, nullLocal;
+        std::stringstream cachedKernel(it->second);
+        if (cachedKernel)
+        {
+            cachedKernel >> x;
+            cachedKernel >> y;
+            cachedKernel >> z;
+            cachedKernel >> type;
+            cachedKernel >> lx;
+            cachedKernel >> ly;
+            cachedKernel >> lz;
+            cachedKernel >> swizzle;
+            cachedKernel >> nullLocal;
+            if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
+                tuned_ = true;
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+
+// Rebuild the kernel described by a cached configuration tuple.
+template<typename Dtype>
+bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int type,
+                                                    int lx, int ly, int lz,
+                                                    bool swizzle, bool nullLocal)
+{
+    if (type == KERNEL_TYPE_INTEL_IDLF)
+    {
+        if (z == 1)
+            z = 16;
+        CHECK_EQ(z == 16 || z == 8, true) << "invalid SIMD size" << std::endl;
+    }
+    kernelQueue.clear();
+    createConvolutionKernel(type, x, y, z);
+    if (kernelQueue.size() != 1) {
+        std::cerr << "Failed setup kernel by config:"
+                  << " x = " << x
+                  << " y = " << y
+                  << " z = " << z
+                  << " type = " << type
+                  << std::endl;
+        return false;
+    }
+    bestKernelConfig = kernelQueue[0];
+    kernelQueue.clear();
+    bestKernelConfig->local_work_size[0] = lx;
+    bestKernelConfig->local_work_size[1] = ly;
+    bestKernelConfig->local_work_size[2] = lz;
+    bestKernelConfig->swizzle_weights = swizzle;
+    bestKernelConfig->use_null_local = nullLocal;
+    // If kernel type changed to type 2 or 4, we need to reset the swizzled
+    // weights pointer to invalidate the previous swizzled weights data.
+    if (prev_kernel_type_ != bestKernelConfig->kernelType &&
+        (bestKernelConfig->kernelType == KERNEL_TYPE_INTEL_IDLF ||
+         bestKernelConfig->kernelType == KERNEL_TYPE_GEMM_LIKE))
+    {
+        if (!swizzled_weights_umat.empty())
+            swizzled_weights_umat.release();
+    }
+    return true;
+}
+
+// Load a tuned configuration for the current shape key from the on-disk
+// cache file, warning once when no cache directory is configured.
+template<typename Dtype>
+bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
+{
+    if (!use_cache_path_)
+    {
+        if (cache_path_.empty() && !force_auto_tuning_)
+        {
+            static int warn_ = 0;
+            if (!warn_)
+            {
+                std::cout << "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory " << std::endl
+                          << "                 via OPENCV_OCL4DNN_CONFIG_PATH parameter." << std::endl;
+                warn_ = true;
+            }
+        }
+        return false;
+    }
+
+    int32_t x, y, z, type, lx, ly, lz;
+    bool swizzle, nullLocal;
+
+    // Find cached kernel configuration from file
+    std::string cacheFile = cache_path_ + "/" + key_sanitized_;
+    std::ifstream cachedKernel(cacheFile.c_str());
+    if (cachedKernel)
+    {
+        cachedKernel >> x;
+        cachedKernel >> y;
+        cachedKernel >> z;
+        cachedKernel >> type;
+        cachedKernel >> lx;
+        cachedKernel >> ly;
+        cachedKernel >> lz;
+        cachedKernel >> swizzle;
+        cachedKernel >> nullLocal;
+        if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
+            tuned_ = true;
+            return true;
+        }
+    }
+    return false;
+}
+
+template class OCL4DNNConvSpatial<float>;
+} // namespace ocl4dnn
+}
+}
+#endif // HAVE_OPENCL
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp
new file mode 100644
index 0000000000..b6c1df9908
--- /dev/null
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp
@@ -0,0 +1,108 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "../../precomp.hpp" +#include "common.hpp" +#include "ocl4dnn.hpp" +#include "math_functions.hpp" + +#ifdef HAVE_OPENCL +namespace cv { namespace dnn { namespace ocl4dnn { +template +OCL4DNNInnerProduct::OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config) +{ + bias_term_ = config.bias_term; + transpose_ = config.transpose; + N_ = num_output_ = config.num_output; + M_ = config.M; + K_ = config.K; + phase_test_ = config.phase_test; + image_copied_ = false; +} + +template +OCL4DNNInnerProduct::~OCL4DNNInnerProduct() +{ +} + +template +bool OCL4DNNInnerProduct::Forward(const UMat& bottom, + const UMat& weight, + const UMat& bias, + UMat& top) +{ + bool ret; + + if (M_ == 1) + { + ret = ocl4dnnGEMV(CblasNoTrans, N_, K_, (Dtype) 1., + weight, 0, bottom, 0, (Dtype) 0., top, 0); + + if (bias_term_ && ret) + ret = ocl4dnnAXPY(N_, 1, bias, 0, top, 0); + + return ret; + } + else + { + ret = false; + size_t max_image_size = std::min(ocl::Device::getDefault().image2DMaxWidth(), + ocl::Device::getDefault().image2DMaxHeight()); + if (M_ <= max_image_size && + N_ <= max_image_size && + K_ <= max_image_size && + cv::traits::Depth::value == CV_32F && + ocl::Device::getDefault().intelSubgroupsSupport()) + { + ret = ocl4dnnGEMMCommon(transpose_ ? CblasNoTrans : CblasTrans, + M_, N_, K_, bottom, weight, UMat(), top, + max_image_size); + } + return ret; + } +} + +template class OCL4DNNInnerProduct; +} // namespace ocl4dnn +} +} +#endif // HAVE_OPENCL diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp new file mode 100644 index 0000000000..6cc65b7189 --- /dev/null +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_lrn.cpp @@ -0,0 +1,126 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. 
+// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+//
+//M*/
+
+#include "../../precomp.hpp"
+#include "common.hpp"
+#include "ocl4dnn.hpp"
+#include "opencl_kernels_dnn.hpp"
+
+#ifdef HAVE_OPENCL
+namespace cv { namespace dnn { namespace ocl4dnn {
+template<typename Dtype>
+OCL4DNNLRN<Dtype>::OCL4DNNLRN(OCL4DNNLRNConfig config)
+{
+    lrn_type_ = config.lrn_type;
+    phase_test_ = config.phase_test;
+    size_ = config.local_size;
+    CHECK_EQ(size_ % 2, 1)<< "LRN only supports odd values for local_size";
+    alpha_ = config.alpha;
+    beta_ = config.beta;
+    k_ = config.k;
+    norm_by_size_ = config.norm_by_size;
+    num_ = config.batch_size;
+    channels_ = config.channels;
+    height_ = config.height;
+    width_ = config.width;
+}
+
+// Dispatch by normalization region; only ACROSS_CHANNELS is implemented,
+// and only on devices with Intel subgroup support.
+template<typename Dtype>
+bool OCL4DNNLRN<Dtype>::Forward(const UMat& bottom, UMat& top)
+{
+    bool ret = true;
+
+    if (!ocl::Device::getDefault().intelSubgroupsSupport())
+        return false;
+
+    switch (lrn_type_)
+    {
+    case LRNParameter_NormRegion_ACROSS_CHANNELS:
+        ret = crossChannelForward(bottom, top);
+        break;
+    case LRNParameter_NormRegion_WITHIN_CHANNEL:
+        //TODO
+        //WithinChannelForward(bottom_data, top_data);
+        ret = false;
+        break;
+    default:
+        ret = false;
+        LOG(FATAL)<< "Unknown normalization region.";
+    }
+    return ret;
+}
+
+template<typename Dtype>
+bool OCL4DNNLRN<Dtype>::crossChannelForward(const UMat& bottom, UMat& top)
+{
+    ocl::Queue queue = ocl::Queue::getDefault();
+    CHECK_EQ(phase_test_, true) << "Only support forward inference.";
+
+    cl_uint argIdx = 0;
+    int32_t n_threads = num_ * height_ * width_;
+    size_t global_work_size_[1] = {(size_t)n_threads};
+    String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : "";
+    ocl::Kernel oclk_lrn_fill;
+    if (!oclk_lrn_fill.create(CL_KERNEL_SELECT("lrn_full_no_scale"), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
+        return false;
+
+    oclk_lrn_fill.set(argIdx++, n_threads);
+    oclk_lrn_fill.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+    oclk_lrn_fill.set(argIdx++, num_);
+    oclk_lrn_fill.set(argIdx++, channels_);
+    oclk_lrn_fill.set(argIdx++, height_);
+    oclk_lrn_fill.set(argIdx++, width_);
+    oclk_lrn_fill.set(argIdx++, size_);
+    int size_norm_factor = norm_by_size_ ? size_ : 1;
+    oclk_lrn_fill.set(argIdx++, alpha_ / size_norm_factor);
+    oclk_lrn_fill.set(argIdx++, k_);
+    oclk_lrn_fill.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+    oclk_lrn_fill.set(argIdx++, -beta_);
+
+    return oclk_lrn_fill.run(1, global_work_size_, NULL, false);
+}
+
+template class OCL4DNNLRN<float>;
+} // namespace ocl4dnn
+}
+}
+#endif // HAVE_OPENCL
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
new file mode 100644
index 0000000000..e0bdf71e67
--- /dev/null
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
@@ -0,0 +1,213 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../../precomp.hpp"
+#include <string>
+#include <vector>
+#include "common.hpp"
+#include "ocl4dnn.hpp"
+#include "opencl_kernels_dnn.hpp"
+
+#ifdef HAVE_OPENCL
+namespace cv { namespace dnn { namespace ocl4dnn {
+template<typename Dtype>
+OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
+{
+    int dims = config.in_shape.size();
+    int spatial_dims = 2;
+
+    batch_size_ = config.in_shape[0];
+    channels_ = config.channels;
+    pool_method_ = config.pool_method;
+
+    for (int i = 0; i < spatial_dims; ++i)
+    {
+        kernel_shape_.push_back(i == 0 ? config.kernel.height : config.kernel.width);
+        pad_.push_back(i == 0 ? config.pad.height : config.pad.width);
+        stride_.push_back(i == 0 ? config.stride.height : config.stride.width);
+        im_in_shape_.push_back(config.in_shape[dims - spatial_dims + i]);
+        im_out_shape_.push_back(config.out_shape[dims - spatial_dims + i]);
+    }
+
+    kernel_h_ = kernel_shape_[0];
+    kernel_w_ = kernel_shape_[1];
+    stride_h_ = stride_[0];
+    stride_w_ = stride_[1];
+    pad_h_ = pad_[0];
+    pad_w_ = pad_[1];
+    height_ = im_in_shape_[0];
+    width_ = im_in_shape_[1];
+    pooled_height_ = im_out_shape_[0];
+    pooled_width_ = im_out_shape_[1];
+
+    count_ = 1;
+    for (int i = 0; i < config.out_shape.size(); ++i)
+    {
+        count_ *= config.out_shape[i];
+    }
+}
+
+template<typename Dtype>
+OCL4DNNPool<Dtype>::~OCL4DNNPool()
+{
+    mask_idx_.release();
+}
+
+// Pooling forward: dispatch to the max / average / stochastic OpenCL kernel.
+template<typename Dtype>
+bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
+                                 UMat& top,
+                                 UMat& top_mask)
+{
+    bool ret = true;
+    ocl::Queue queue = ocl::Queue::getDefault();
+    size_t global[] = { 128 * 128 };
+    size_t local[] = { 128 };
+    cl_uint argIdx = 0;
+
+    // support 2D case
+    switch (pool_method_)
+    {
+    case LIBDNN_POOLING_METHOD_MAX:
+        {
+            if (top_mask.empty() && mask_idx_.empty())
+            {
+                mask_idx_.create(1, count_, CV_32FC1);
+            }
+            ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"),
+                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+
+            if (oclk_max_pool_forward.empty())
+                return false;
+
+            argIdx = 0;
+            oclk_max_pool_forward.set(argIdx++, count_);
+            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+            oclk_max_pool_forward.set(argIdx++, batch_size_);
+            oclk_max_pool_forward.set(argIdx++, channels_);
+            oclk_max_pool_forward.set(argIdx++, height_);
+            oclk_max_pool_forward.set(argIdx++, width_);
+            oclk_max_pool_forward.set(argIdx++, pooled_height_);
+            oclk_max_pool_forward.set(argIdx++, pooled_width_);
+            oclk_max_pool_forward.set(argIdx++, kernel_h_);
+            oclk_max_pool_forward.set(argIdx++, kernel_w_);
+            oclk_max_pool_forward.set(argIdx++, stride_h_);
+            oclk_max_pool_forward.set(argIdx++, stride_w_);
+            oclk_max_pool_forward.set(argIdx++, pad_h_);
+            oclk_max_pool_forward.set(argIdx++, pad_w_);
+            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1);
+            if (mask_idx_.empty())
+                oclk_max_pool_forward.set(argIdx++, (void *)NULL);
+            else
+                oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_));
+            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask));
+
+            ret = oclk_max_pool_forward.run(1, global, local, false);
+        }
+        break;
+    case LIBDNN_POOLING_METHOD_AVE:
+        {
+            ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
+                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+
+            if (oclk_ave_pool_forward.empty())
+                return false;
+
+            argIdx = 0;
+            oclk_ave_pool_forward.set(argIdx++, count_);
+            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+            oclk_ave_pool_forward.set(argIdx++, batch_size_);
+            oclk_ave_pool_forward.set(argIdx++, channels_);
+            oclk_ave_pool_forward.set(argIdx++, height_);
+            oclk_ave_pool_forward.set(argIdx++, width_);
+            oclk_ave_pool_forward.set(argIdx++, pooled_height_);
+            oclk_ave_pool_forward.set(argIdx++, pooled_width_);
+            oclk_ave_pool_forward.set(argIdx++, kernel_h_);
+            oclk_ave_pool_forward.set(argIdx++, kernel_w_);
+            oclk_ave_pool_forward.set(argIdx++, stride_h_);
+            oclk_ave_pool_forward.set(argIdx++, stride_w_);
+            oclk_ave_pool_forward.set(argIdx++, pad_h_);
+            oclk_ave_pool_forward.set(argIdx++, pad_w_);
+            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+
+            ret = oclk_ave_pool_forward.run(1, global, local, false);
+        }
+        break;
+    case LIBDNN_POOLING_METHOD_STO:
+        {
+            ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
+                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+
+            if (oclk_sto_pool_forward.empty())
+                return false;
+
+            argIdx = 0;
+            oclk_sto_pool_forward.set(argIdx++, count_);
+            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+            oclk_sto_pool_forward.set(argIdx++, batch_size_);
+            oclk_sto_pool_forward.set(argIdx++, channels_);
+            oclk_sto_pool_forward.set(argIdx++, height_);
+            oclk_sto_pool_forward.set(argIdx++, width_);
+            oclk_sto_pool_forward.set(argIdx++, pooled_height_);
+            oclk_sto_pool_forward.set(argIdx++, pooled_width_);
+            oclk_sto_pool_forward.set(argIdx++, kernel_h_);
+            oclk_sto_pool_forward.set(argIdx++, kernel_w_);
+            oclk_sto_pool_forward.set(argIdx++, stride_h_);
+            oclk_sto_pool_forward.set(argIdx++, stride_w_);
+            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+
+            ret = oclk_sto_pool_forward.run(1, global, local, false);
+        }
+        break;
+    default:
+        {
+            ret = false;
+            LOG(FATAL)<< "Unknown pooling method.";
+        }
+    }
+    return ret;
+}
+
+template class OCL4DNNPool<float>;
+} // namespace ocl4dnn
+}
+}
+#endif // HAVE_OPENCL
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
new file mode 100644
index 0000000000..e4802d2dff
--- /dev/null
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
@@ -0,0 +1,135 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "../../precomp.hpp" +#include +#include "common.hpp" +#include "ocl4dnn.hpp" +#include "opencl_kernels_dnn.hpp" + +#ifdef HAVE_OPENCL +namespace cv { namespace dnn { namespace ocl4dnn { +template +OCL4DNNSoftmax::OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config) +{ + softmax_axis_ = config.axis; + channels_ = config.channels; + + inner_num_ = 1; + outer_num_ = 1; + count_ = 1; + int32_t scale_sz = 1; + for (int32_t i = softmax_axis_ + 1; i < config.in_shape.size(); i++) + inner_num_ *= config.in_shape[i]; + use_slm_ = (config.in_shape[softmax_axis_] * inner_num_ + inner_num_ * 17) <= 8192; + for (int32_t i = 0; i < softmax_axis_; i++) + outer_num_ *= config.in_shape[i]; + count_ = inner_num_ + outer_num_; + + std::vector scale_dims = config.in_shape; + scale_dims[softmax_axis_] = use_slm_ ? 1 : 17; + for (int32_t i = 0; i < scale_dims.size(); i++) + scale_sz *= scale_dims[i]; + + scale_data_.create(1, scale_sz, CV_32FC1); +} + +template +OCL4DNNSoftmax::~OCL4DNNSoftmax() +{ + scale_data_.release(); +} + +template +bool OCL4DNNSoftmax::Forward(const UMat& bottom, UMat& top) +{ + bool ret = false; + ocl::Queue queue = ocl::Queue::getDefault(); + bool intel_subgroup = ocl::Device::getDefault().intelSubgroupsSupport(); + if (intel_subgroup && inner_num_ < 128) + { + String opts = clOptionSupport("-cl-no-subgroup-ifp") ? 
" -cl-no-subgroup-ifp " : ""; + String kname; + ocl::Kernel oclk_softmax_forward_kernel; + + if (use_slm_) + kname = CL_KERNEL_SELECT("softmax_forward_slm"); + else + kname = CL_KERNEL_SELECT("softmax_forward"); + + if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts)) + return false; + + size_t global_size[] = { 256, (size_t)outer_num_, 1 }; + size_t local_size[] = { 256, 1, 1 }; + cl_uint argIdx = 0; + + if (use_slm_) + { + oclk_softmax_forward_kernel.set(argIdx++, outer_num_); + oclk_softmax_forward_kernel.set(argIdx++, channels_); + oclk_softmax_forward_kernel.set(argIdx++, inner_num_); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(scale_data_)); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + oclk_softmax_forward_kernel.set(argIdx++, NULL, channels_ * inner_num_* sizeof(Dtype)); + oclk_softmax_forward_kernel.set(argIdx++, NULL, inner_num_* sizeof(Dtype)); + oclk_softmax_forward_kernel.set(argIdx++, NULL, 16 * inner_num_* sizeof(Dtype)); + } + else + { + oclk_softmax_forward_kernel.set(argIdx++, outer_num_); + oclk_softmax_forward_kernel.set(argIdx++, channels_); + oclk_softmax_forward_kernel.set(argIdx++, inner_num_); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(scale_data_)); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + } + ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false); + } + return ret; +} + +template class OCL4DNNSoftmax; +} // namespace ocl4dnn +} +} +#endif // HAVE_OPENCL diff --git a/modules/dnn/src/opencl/activations.cl b/modules/dnn/src/opencl/activations.cl index b98e52f674..0649f2e577 100644 --- a/modules/dnn/src/opencl/activations.cl +++ b/modules/dnn/src/opencl/activations.cl @@ -1,3 +1,45 @@ 
+/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + __kernel void ReLUForward(const int count, __global const T* in, __global T* out #ifndef RELU_NO_SLOPE , T negative_slope diff --git a/modules/dnn/src/opencl/batchnorm.cl b/modules/dnn/src/opencl/batchnorm.cl new file mode 100644 index 0000000000..3f9401c52e --- /dev/null +++ b/modules/dnn/src/opencl/batchnorm.cl @@ -0,0 +1,26 @@ + +__kernel void batchnorm(__global const T *src, int src_offset, + __global const float *meanMat, + float varMeanScale, + __global const float *invStdMat, + __global const float *weight, + __global const float *bias, + int hasWeight, int hasBias, + int width, int height, int channel, + __global T *dst, int dst_offset) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int c = get_global_id(2); + + if (x >= width || y >= height || c >= channel) + return; + + float mean = meanMat[c] * varMeanScale; + float invstd = invStdMat[c]; + float w = hasWeight ? weight[c] : 1; + float b = hasBias ? 
bias[c] : 0; + int index = y * width + x + c * width * height; + T val = (src[index + src_offset] - mean) * w * invstd + b; + dst[index + dst_offset] = val; +} diff --git a/modules/dnn/src/opencl/benchmark.cl b/modules/dnn/src/opencl/benchmark.cl new file mode 100644 index 0000000000..22acb93afd --- /dev/null +++ b/modules/dnn/src/opencl/benchmark.cl @@ -0,0 +1,45 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +__kernel void null_kernel_float(float arg) { + float out = arg; +} diff --git a/modules/dnn/src/opencl/concat.cl b/modules/dnn/src/opencl/concat.cl new file mode 100644 index 0000000000..041e6ac740 --- /dev/null +++ b/modules/dnn/src/opencl/concat.cl @@ -0,0 +1,60 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +__kernel void concat(const int nthreads, + __global const Dtype* in_data, + const int num_concats, + const int concat_size, + const int top_concat_axis, + const int bottom_concat_axis, + const int offset_concat_axis, + __global Dtype* out_data) { + + for (int index = get_global_id(0); index < nthreads; + index += get_global_size(0)) { + const int total_concat_size = concat_size * bottom_concat_axis; + const int concat_num = index / total_concat_size; + const int concat_index = index % total_concat_size; + const int top_index = concat_index + + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; + out_data[top_index] = in_data[index]; + } +} diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl new file mode 100644 index 0000000000..a7bca1d6f0 --- /dev/null +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -0,0 +1,1670 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ 
BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if APPLY_BIAS +#define BIAS_KERNEL_ARG __global Dtype * biases_base, +#else +#define BIAS_KERNEL_ARG +#endif + +#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_) do { (_dst_)[(_offset_)] = (_data_);} while(0) + +#define __CAT(x, y) x##y +#define CAT(x, y) __CAT(x, y) +#define LOOP0(VAR, STMT) +#define LOOP1(VAR, STMT) (STMT); (VAR)++; +#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++; +#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++; +#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++; +#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++; +#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++; +#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++; +#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++; +#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++; +#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++; +#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++; +#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++; +#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++; +#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++; +#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++; +#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++; +#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT)) + +#if defined(convolve_simd) || defined(Conv_Interleaved) +#if Dtype_SIZE == 4 +#define INT_TYPE uint +#define 
INT_TYPE2 uint2 +#define INT_TYPE4 uint4 +#define INT_TYPE8 uint8 +#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read2 +#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read4 +#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8 +#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read +#else +#error "Unsupported type" +#endif +#endif + +#ifdef KERNEL_BASIC + +__kernel void ConvolveBasic( + __global Dtype* image_data, + int image_offset, + __global Dtype* kernel_data, + int kernel_offset, + __global Dtype* bias, + const int bias_offset, + __global Dtype* convolved_image, + const int convolved_image_offset, + const ushort input_width, + const ushort input_height, + const ushort output_width, + const ushort output_height, + const ushort pad_w, + const ushort pad_h +) +{ + const int outputX = get_global_id(0); + const int outputY = get_global_id(1); + const int kernelNum = get_global_id(2) * ZPAR; + if (outputX < output_width && outputY < output_height) + { + Dtype sum[ZPAR]; + for (int kern = 0; kern < ZPAR; kern++) + { + sum[kern] = 0.0f; + } + const int org_y = outputY * STRIDE_Y - pad_h; + const int org_x = outputX * STRIDE_X - pad_w; + const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS; +#if APPLY_BIAS + const int biasIndex = bias_offset + kernelNum; +#endif + const int local_image_offset = org_y * input_width + org_x; + const int imageSize = input_width * input_height; + __global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset)); + __global Dtype* kernel_dataPtr = (kernel_data + (currentKernelOffset)); + for (int c = 0; c < CHANNELS; c++) + { + for (int y = 0; y < KERNEL_HEIGHT; y++) + { + for (int x = 0; x < KERNEL_WIDTH; x++) + { + int y_ = org_y + y * DILATION_Y; + int x_ = org_x + x * DILATION_X; + if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width)) + { + continue; + } + for (int kern = 0; kern < ZPAR; kern++) + { + sum[kern] += image_dataPtr[x * DILATION_X] * 
kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x]; + } + } + image_dataPtr += input_width * DILATION_Y; + kernel_dataPtr += KERNEL_WIDTH; + } + image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y; + } + + for (int kern = 0; kern < ZPAR; kern++) + { + if (kernelNum + kern < OUTPUT_Z) + { + int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX; +#if APPLY_BIAS + ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern]); +#else + ACTIVATION_FUNCTION(convolved_image, offset, sum[kern]); +#endif + } + } + } +} + +#elif defined KERNEL_IDLF + +#if TYPE == TYPE_HALF +#define VLOAD4(_v, _p) do { (_v).s0 = *(_p); (_v).s1 = *(_p + 1); (_v).s2 = *(_p + 2); (_v).s3 = *(_p + 3); } while(0) +#else +#define VLOAD4(_v, _p) do { _v = vload4(0, _p); } while(0) +#endif + +// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map. +// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the imput image. +// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH + +// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break. 
+#ifndef __BEIGNET__ +__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE))) +#endif +__kernel void +convolve_simd( + __global Dtype* inputs_base, + filter_qualifier Dtype* weights_base, + BIAS_KERNEL_ARG + __global Dtype* outputs_base, + const ushort input_width, + const ushort input_height, + const ushort output_width, + const ushort output_height) +{ + __global Dtype* outputs = outputs_base; + __global Dtype* inputs = inputs_base; + filter_qualifier Dtype* weights = weights_base; + unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column + unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT;// or = Output Row + unsigned int fm = get_global_id(2);// fm = Feature Map = od = Output Depth + unsigned int fmg = get_group_id(2); + unsigned int lid = get_local_id(2); + + Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT]; + + int in_addr; + + // find weights adress of given neuron (lid is index) + unsigned int weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE)) * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid; + + for(int i=0;i= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) { + if (curr_x < INPUT_PAD_W) { + in_buf.in_vec[reg].s0 = 0; + if (curr_x + 1 >= INPUT_PAD_W) + in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1); + else + in_buf.in_vec[reg].s1 = 0; + if (curr_x + 2 >= INPUT_PAD_W) + in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2); + else + in_buf.in_vec[reg].s2 = 0; + in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3); + } else { + VLOAD4(in_buf.in_vec[reg], inputs + in_offset); + if (curr_x + 1 >= input_width + INPUT_PAD_W) + in_buf.in_vec[reg].s1 = 0; + if (curr_x + 2 >= input_width + INPUT_PAD_W) + in_buf.in_vec[reg].s2 = 0; + if (curr_x + 3 >= input_width + INPUT_PAD_W) + in_buf.in_vec[reg].s3 = 0; + } + } else { + in_buf.in_vec[reg] = 0; + } + curr_y += TILE_Y_STRIDE; +#else + 
VLOAD4(in_buf.in_vec[reg], inputs + in_offset); +#endif + } + in_offset += input_width * TILE_Y_STRIDE; + }); + in_addr += input_height * input_width; +#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 + curr_y = saved_y; +#endif + +#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 +#define WEIGHT_PREF 8 +#else +#define WEIGHT_PREF 1 +#endif + union { + Dtype w[WEIGHT_PREF]; +#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 + INT_TYPE8 ui8; +#endif + } weight_buf; + int w_idx=0; + + unsigned int orig_weight_addr = weight_addr; +#if KERNEL_WIDTH * KERNEL_HEIGHT != 1 + weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); + weight_addr += SIMD_SIZE * WEIGHT_PREF; +#else + weight_buf.w[0] = as_Dtype(SUB_GROUP_BLOCK_READ((__global INT_TYPE *)&weights[weight_addr])); + weight_addr += SIMD_SIZE * 1; +#endif + +#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4)) + + int kr = 0; // kr = Kernel Row + LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop. + { + int kc = 0; // kc = Kernel Column + LOOP(KERNEL_WIDTH, kc, + { + for(int br=0; br < OUT_BLOCK_HEIGHT; br++) { + for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) { + Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y) * TILE_X + bc * STRIDE_X + kc * DILATION_X); + out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); + } + } +#if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF + // We assume KERNEL_W is equal to KERNEL_H here. + if ((w_idx + 1) % WEIGHT_PREF == 0 + #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0 + && ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)) + #endif + ) { + weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); + weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. 
+ } + #if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0 + // need to do nothing + #else + else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))) + #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1 + weight_buf.w[0] = weights[weight_addr]; + #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2 + weight_buf.ui8.s01 = SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)&weights[weight_addr]); + #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4 + weight_buf.ui8.s0123 = SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)&weights[weight_addr]); + #else + weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]); + #endif + #endif +#endif + ++w_idx; + }); + }); + weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE; + + } + // dead code to work around possible compiler bug. + if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) { + outputs[0] = BLOCK_IN(fm % SIMD_SIZE); + } + fm = fm % ALIGNED_NUM_FILTERS; + + if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) { + unsigned int out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height; + out_addr += or * output_width + oc; + // we need this address calculation for biases because we support views and batching +#if APPLY_BIAS + Dtype bias = biases_base[fm]; +#else + Dtype bias = 0; +#endif + for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) { + if (r + or >= output_height) break; + for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) { + if (c + oc >= output_width) break; + // this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer. 
+ ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c]); + + } + } + } +} + +#else // KERNEL_GEMM_LIKE + +#if APPLY_BIAS +// Dtype bias[4]; +#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i) +#else +#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0) +#endif + +#ifdef Conv_Interleaved +typedef struct float1 { float s0; } float1; +typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5; +typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6; +typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7; +typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9; +typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9;} float10; +typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9; float sa;} float11; +typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9; float sa; float sb; } float12; +typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9; float sa; float sb; float sc;} float13; +typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14; +typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5; + float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15; +typedef struct float0 { float s0; } float0; //never used but makes compiler happy. 
+ +#define OUT_PITCH_X output_width +#define ROW_PITCH input_width + +#define GEMM_LIKE_KERNEL_ARGS \ + const __global Dtype *src0, \ + const __global Dtype *src1, \ + BIAS_KERNEL_ARG \ + __global Dtype *dst, \ + const ushort input_width, \ + const ushort input_height, \ + const ushort output_width, \ + const ushort output_height, \ + const int out_pitch_y, \ + const int out_pitch_z, \ + const int aligned_input_size, \ + const int slice_pitch +#endif + +#ifdef GEMM_LIKE_CONV_32_1 +////////////////////////////////////////////////////////////////////////////// +// Conv_Interleaved_32_1_flex +// +// Convolution: each workitem computes 1 patch x 32 filters worth of output +// data. Kernel's inner loop works on a single tile consisting of one +// row from each patch and the filter data corresponding to that row. Filter +// matrix is interleaved to reduce GRF bank conflicts. Patches are walked +// by rows and then by slices. Relies on sub_group extension for block +// reads and SIMD broadcast. Allows flexible sizing of TILE width (TILE_N) +// by dynamically selecting one of two code paths: one uses TILE_N = 32 and +// the other uses TILE_N = 8, 16, or 24. 
#define TILE_M 1
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

// Beignet's compiler rejects the Intel required-subgroup-size attribute,
// so it is only emitted for other (Intel) OpenCL implementations.
#ifndef __BEIGNET__
__attribute__((intel_reqd_sub_group_size(8)))
#endif
// GEMM-like convolution, TILE_M = 1 variant: each workitem accumulates one
// output pixel for up to TILE_N (32) filters.  Results live in four Dtype8
// accumulators (blockC00..blockC30), i.e. 4 x 8 = 32 output channels per
// subgroup lane.  Filter data is read with Intel subgroup block reads and
// broadcast across the 8-lane subgroup inside DOT_PRODUCT_8.
// NOTE: GEMM_LIKE_KERNEL_ARGS, LOOP, CAT, SUB_GROUP_BLOCK_READ*, INT_TYPE,
// ACTIVATION_FUNCTION and SUBGROUP_GET_BIAS are macros defined earlier in
// this file (not visible in this chunk).
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

// Multiply one input value (_rowA) against the 8 filter values held by the
// 8 lanes of the subgroup (broadcast from colB) and accumulate into _result.
#define DOT_PRODUCT_8( _result, _rowA, colB ) \
    { \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
    }
    // Vector type wide enough for one kernel row, e.g. Dtype3 for KERNEL_WIDTH==3.
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
    {
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
        Dtype8 blockC00 = 0.f;
        Dtype8 blockC10 = 0.f;
        Dtype8 blockC20 = 0.f;
        Dtype8 blockC30 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        // Remember the starting row so it can be restored for each filter row pass.
        int saved_y = curr_y;
#endif
        const __global Dtype *src0_read = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y - INPUT_PAD_H) * ROW_PITCH       // y offset
         + (curr_x - INPUT_PAD_W);                  // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
            curr_y = saved_y;
#endif

            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non
                // interleaved row is padded with zero to ensure same size as interleaved rows. This
                // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ...  =>   (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;

#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
                // Fast path: no padding/dilation, one contiguous vector load per patch row.
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
                Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
                // Padded/dilated path: gather element-by-element, substituting 0
                // for out-of-bounds taps.
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y += DILATION_Y;
#endif
                src0_read += (ROW_PITCH * DILATION_Y);

                Dtype blockB00[KERNEL_WIDTH*4];
                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
                Dtype* pBlockB00 = (Dtype* )blockB00;

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE *)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    // Last (zero-padded, non-interleaved) filter row for odd widths.
                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE *)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );

            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
        int out_offset = global_z * out_pitch_z                                       // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                         // channel offset
         + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;               // x offset

        __global Dtype *out = dst + out_offset;
#if APPLY_BIAS
        // One bias value per 8-channel group; lanes read 4 values with a block read.
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
#endif
#ifdef FUSED_CONV_CHANNEL_RELU
        // Per-channel ReLU slope, fetched the same way as the bias.
        Dtype slope[4];
        Dtype4 *slope_vec;
        slope_vec = (Dtype4*)slope;
        *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
        Dtype negative_slope;
#endif
        if (global_y * TILE_M < output_width * output_height )
        {
            for (int i = 0; i < 8; i++)
            {
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                ACTIVATION_FUNCTION(dst, out_offset + ( 0 + i ) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i));

#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                ACTIVATION_FUNCTION(dst, out_offset + ( 8 + i ) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i));
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                ACTIVATION_FUNCTION(dst, out_offset + ( 16 + i ) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i));
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                ACTIVATION_FUNCTION(dst, out_offset + ( 24 + i ) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i));
            }
        }
    }
#if TILE_N_LAST > 0
    else
    {
        // Right-most column of workitems: handles the tail of WIDTH1 that is
        // not a multiple of TILE_N, using TILE_N_LAST_DIV8 accumulators.

        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
        int i = 0;
        Dtype8 blockC[TILE_N_LAST_DIV8];
        LOOP(TILE_N_LAST_DIV8, i,
        {
            blockC[i] = 0.f;
        } )

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        int saved_y = curr_y;
#endif
        const __global Dtype *src0_read = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y - INPUT_PAD_H) * ROW_PITCH       // y offset
         + (curr_x - INPUT_PAD_W);                  // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
            curr_y = saved_y;
#endif
            do
            {
                // Load atile and interleaved btile.
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
                Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y += DILATION_Y;
#endif
                src0_read += (ROW_PITCH * DILATION_Y);
                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype4* p4BlockB = (Dtype4* )blockB;
                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    //TODO: broken. No block_read6
                    Dtype6* p6BlockB = (Dtype6* )blockB;
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype* pBlockB = (Dtype* )blockB;
                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    Dtype3* p3BlockB = (Dtype3* )blockB;
                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                Dtype* pBlockB = (Dtype*)blockB;
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );

            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
        int out_offset = global_z * out_pitch_z                                       // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                         // channel offset
         + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;               // x offset
        __global Dtype *out = dst + out_offset;
#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
#endif

#ifdef FUSED_CONV_CHANNEL_RELU
        Dtype slope[4];
        Dtype4 *slope_vec;
        slope_vec = (Dtype4*)slope;
        *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
        Dtype negative_slope;
#endif

        if (global_y * TILE_M < output_width * output_height )
        {
            for (int i = 0; i < 8; i++)
            {
                // TILE_N_LAST_DIV8 is a compile-time constant, so the unused
                // branches below are eliminated by the OpenCL compiler.
                if ( TILE_N_LAST_DIV8 > 0 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                    ACTIVATION_FUNCTION(dst, out_offset + ( 0+i) * out_pitch_y, blockC[0][i] + SUBGROUP_GET_BIAS(0, i));
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                    ACTIVATION_FUNCTION(dst, out_offset + ( 8+i) * out_pitch_y, blockC[1][i] + SUBGROUP_GET_BIAS(1, i));
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                    ACTIVATION_FUNCTION(dst, out_offset + (16+i) * out_pitch_y, blockC[2][i] + SUBGROUP_GET_BIAS(2, i));
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {

#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                    ACTIVATION_FUNCTION(dst, out_offset + (24+i) * out_pitch_y, blockC[3][i] + SUBGROUP_GET_BIAS(3, i));
                }
            }
        }
    }
#endif
}
#endif
#ifdef GEMM_LIKE_CONV_32_2

//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_2_flex
//
// Convolution: each workitem computes 1 patch x 32 filters worth of output
// data.  Kernel's inner loop works on a single tile consisting of one
// row from each patch and the filter data corresponding to that row.  Filter
// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
// by rows and then by slices.  Relies on sub_group extension for block
// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
// the other uses TILE_N = 8, 16, or 24.
#define TILE_M 2
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

// Beignet's compiler rejects the Intel required-subgroup-size attribute.
#ifndef __BEIGNET__
__attribute__((intel_reqd_sub_group_size(8)))
#endif
// TILE_M = 2 variant: each workitem accumulates TWO output pixels
// (suffix 0 and 1 throughout: curr_x0/curr_x1, src0_read0/src0_read1,
// blockC*0/blockC*1) against the same 32 filters, amortizing the filter
// block reads across both pixels.
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

// Multiply one input value (_rowA) against the 8 filter values broadcast
// from the subgroup lanes (colB) and accumulate into _result.
#define DOT_PRODUCT_8( _result, _rowA, colB ) \
    { \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
    }
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
    {
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
        Dtype8 blockC00 = 0.f;
        Dtype8 blockC10 = 0.f;
        Dtype8 blockC20 = 0.f;
        Dtype8 blockC30 = 0.f;
        Dtype8 blockC01 = 0.f;
        Dtype8 blockC11 = 0.f;
        Dtype8 blockC21 = 0.f;
        Dtype8 blockC31 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
#endif
        const __global Dtype *src0_read0 = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH      // y offset
         + curr_x0 - INPUT_PAD_W;                   // x offset
        const __global Dtype *src0_read1 = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH      // y offset
         + curr_x1 - INPUT_PAD_W;                   // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non
                // interleaved row is padded with zero to ensure same size as interleaved rows. This
                // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ...  =>   (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
                // Fast path: one contiguous vector load per patch row, per pixel.
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                Dtype* pblockA01 = (Dtype*)(&blockA01);
#else
                // Padded/dilated path: gather taps for both pixels, zero-filling
                // anything that falls outside the (padded) input.
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read0[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype* pblockA01 = (Dtype*)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA01[pos] = src0_read1[pos * DILATION_X];
                    else
                        pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
                Dtype blockB00[KERNEL_WIDTH*4];
                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
                Dtype* pBlockB00 = (Dtype* )blockB00;

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }
                // Perform MADs
                // Each filter value is used twice (once per output pixel) before
                // kernel_idx advances.
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                if ( kernel_width_is_odd )
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
            // Restore starting rows for the next input slice.
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
#endif
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
        int out0_offset = global_z * out_pitch_z                                           // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
        int out1_offset = global_z * out_pitch_z                                           // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                // x offset

#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
#endif

#ifdef FUSED_CONV_CHANNEL_RELU
        Dtype slope[4];
        Dtype4 *slope_vec;
        slope_vec = (Dtype4*)slope;
        *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
        Dtype negative_slope;
#endif

        // Each of the two output pixels is bounds-checked separately since the
        // second one may fall past the end of the output plane.
        if( global_y * TILE_M < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i));
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i));
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i));
#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i));
            }
        }
        if( global_y * TILE_M + 1 < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {

#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC01[i] + SUBGROUP_GET_BIAS(0, i));

#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC11[i] + SUBGROUP_GET_BIAS(1, i));

#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC21[i] + SUBGROUP_GET_BIAS(2, i));

#ifdef FUSED_CONV_CHANNEL_RELU
                negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC31[i] + SUBGROUP_GET_BIAS(3, i));
            }
        }
    }
#if TILE_N_LAST > 0
    else
    {

        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
        int i = 0;
        Dtype8 blockC0[TILE_N_LAST_DIV8];
        Dtype8 blockC1[TILE_N_LAST_DIV8];
        LOOP(TILE_N_LAST_DIV8, i,
        {
            blockC0[i] = 0.f;
            blockC1[i] = 0.f;
        } )

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
#endif
        const __global Dtype *src0_read0 = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH      // y offset
         + curr_x0 - INPUT_PAD_W;                   // x offset
        const __global Dtype *src0_read1 = src0
         + aligned_input_size * global_z            // batch offset
         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH      // y offset
         + curr_x1 - INPUT_PAD_W;                   // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and interleaved btile.
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                Dtype* pblockA01 = (Dtype*)(&blockA01);
#else
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read0[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype* pblockA01 = (Dtype*)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA01[pos] = src0_read1[pos * DILATION_X];
                    else
                        pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype4* p4BlockB = (Dtype4* )blockB;
                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    //TODO: broken. No block_read6
                    Dtype6* p6BlockB = (Dtype6* )blockB;
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype* pBlockB = (Dtype* )blockB;
                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    Dtype3* p3BlockB = (Dtype3* )blockB;
                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
                    // NOTE(review): this reads at src1_read + 8, while the TILE_M=1
                    // variant's matching TILE_N_LAST_DIV8==3 path reads at
                    // src1_read + 2 * 8 -- confirm which offset is intended (this
                    // path is also flagged broken above).
                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                Dtype* pBlockB = (Dtype*)blockB;
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
#endif
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
        int out0_offset = global_z * out_pitch_z                                           // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
        int out1_offset = global_z * out_pitch_z                                           // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
        __global Dtype *out1 = dst + out1_offset;

#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
#endif
#ifdef FUSED_CONV_CHANNEL_RELU
        Dtype slope[4];
        Dtype4 *slope_vec;
        slope_vec = (Dtype4*)slope;
        *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
        Dtype negative_slope;
#endif
        if( global_y * TILE_M < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                if ( TILE_N_LAST_DIV8 > 0 )
                {

#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                    ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC0[0][i] + SUBGROUP_GET_BIAS(0, i));
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                    ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC0[1][i] + SUBGROUP_GET_BIAS(1, i));
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                    ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC0[2][i] + SUBGROUP_GET_BIAS(2, i));
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                    ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC0[3][i] + SUBGROUP_GET_BIAS(3, i));
                }
            }
        }
        if( global_y * TILE_M + 1 < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                if ( TILE_N_LAST_DIV8 > 0 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[0], i);
#endif
                    ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC1[0][i] + SUBGROUP_GET_BIAS(0, i));
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[1], i);
#endif
                    ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC1[1][i] + SUBGROUP_GET_BIAS(1, i));
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[2], i);
#endif
                    ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC1[2][i] + SUBGROUP_GET_BIAS(2, i));
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {
#ifdef FUSED_CONV_CHANNEL_RELU
                    negative_slope = intel_sub_group_shuffle(slope[3], i);
#endif
                    ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC1[3][i] + SUBGROUP_GET_BIAS(3, i));
                }
            }
        }
    }
#endif
}
#endif

#if defined(GEMM_LIKE_CONV_32_2_SIMD16) || defined(GEMM_LIKE_CONV_32_1_SIMD16)
#ifdef FUSED_CONV_CHANNEL_RELU
#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_,  _m_) do {\
        if (global_y * TILE_M < output_width * output_height ) \
        { \
            if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\
                for (int i = 0; i < 16; i++) \
                { \
                    negative_slope = intel_sub_group_shuffle(slope[0], i); \
                    ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \
                    negative_slope = intel_sub_group_shuffle(slope[1], i); \
                    ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \
                } \
            } \
            else if( ( OUT_DEPTH % 16 ) == 0 ) { \
                if ( ( global_x + 1 ) <
get_global_size(0) ) { \ + for ( int i = 0; i < 16; i++ ) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[0], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \ + negative_slope = intel_sub_group_shuffle(slope[1], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + for (int i = 0; i < 16; i++) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[0], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + } \ + } \ + else { \ + if ( ( global_x + 1 ) < get_global_size(0) ) \ + { \ + for ( int i = 0; i < 16; i++ ) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[0], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + negative_slope = intel_sub_group_shuffle(slope[1], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + if ( (OUT_DEPTH % TILE_N) > 16 ) { \ + for (int i = 0; i < 16 ; i++) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[0], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + for (int i = 0; i < OUT_DEPTH % 16 ; i++) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[1], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + for (int i = 0; i < OUT_DEPTH % 16 ; i++) \ + { \ + negative_slope = intel_sub_group_shuffle(slope[0], i); \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + } \ + } \ + } \ + } \ + }while(0) +#else +#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_, _m_) do {\ + if (global_y * TILE_M < output_width * 
output_height ) \ + { \ + if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\ + for (int i = 0; i < 16; i++) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else if( ( OUT_DEPTH % 16 ) == 0 ) { \ + if ( ( global_x + 1 ) < get_global_size(0) ) { \ + for ( int i = 0; i < 16; i++ ) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + for (int i = 0; i < 16; i++) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + } \ + } \ + else { \ + if ( ( global_x + 1 ) < get_global_size(0) ) \ + { \ + for ( int i = 0; i < 16; i++ ) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + if ( (OUT_DEPTH % TILE_N) > 16 ) { \ + for (int i = 0; i < 16 ; i++) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + for (int i = 0; i < OUT_DEPTH % 16 ; i++) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \ + } \ + } \ + else { \ + for (int i = 0; i < OUT_DEPTH % 16 ; i++) \ + { \ + ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \ + } \ + } \ + } \ + } \ + } \ + }while(0) +#endif +#endif + +#ifdef GEMM_LIKE_CONV_32_1_SIMD16 +#define TILE_M 1 +#define TILE_K KERNEL_WIDTH +#define TILE_N 32 + +#ifndef __BEIGNET__ +__attribute__((intel_reqd_sub_group_size(16))) 
+#endif +__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) +{ + const int group_x = get_group_id(0); + const int group_y = get_group_id(1); + const int global_x = get_global_id(0); + const int global_y = get_global_id(1); + const int global_z = get_global_id(2); + int interleaved_y; + int kernel_y; + int kernel_idx; + + // Result ctile (*dst) is M rows x N columns + // LWG size is 1x16. Thus each thread calculates 16*M rows x N cols of ctile. + Dtype16 blockC00 = 0.f; + Dtype16 blockC10 = 0.f; + + // Src0 (patch input) is directly used as atile. + // Each work item points to the start of a different patch. + // atile is M rows x K columns. + int curr_x = ( global_y % output_width ) * STRIDE_X; + int curr_y = ( global_y / output_width ) * STRIDE_Y; +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 + int saved_y = curr_y; +#endif + const __global Dtype *src0_read = src0 + + aligned_input_size * global_z // batch offset + + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset + + curr_x - INPUT_PAD_W; // x offset + const __global Dtype *src0_read_orig = src0_read; + + // Src1 (filter) is directly used as btile. + // It starts at the top of src1 and walks down. + // btile is K rows x N columns. 
+ const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 ); + +#define DOT_PRODUCT_16( _result, _rowA, colB ) \ + { \ + _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \ + _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \ + _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \ + _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \ + _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \ + _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \ + _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \ + _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \ + _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \ + _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \ + _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \ + _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \ + _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \ + _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \ + _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \ + _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \ + } + typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t; + // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. + // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch + // and KERNEL_WIDTH/2 rows of interleaved filter. + int patch_depth = 0; +#ifndef __BEIGNET__ + __attribute__((opencl_unroll_hint(1))) +#endif + do + { + int patch_row = 0; +#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 + curr_y = saved_y; +#endif +#ifndef __BEIGNET__ + __attribute__((opencl_unroll_hint(1))) +#endif + do + { + // Load atile and btile. + // Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype16 granularity. 
+ // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non + // interleaved row is padded with zero to ensure same size as interleaved rows. This + // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the + // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3. + // (0, 0) (16, 0) (32, 0) (48, 0) ... (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ... + // (0, 1) (16, 1) (32, 1) (48, 1) ... => (0, 2) (16, 2) (32, 2) (48, 2) ... + // (0, 2) (16, 2) (32, 2) (48, 2) ... ... + // ... + const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; + +#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 + Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; + Dtype* pblockA00 = (Dtype*)(&blockA00); +#else + Dtype_t blockA00; + Dtype* pblockA00 = (Dtype*)(&blockA00); + int pos = 0; + LOOP(KERNEL_WIDTH, pos, + { + if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) + pblockA00[pos] = src0_read[pos * DILATION_X]; + else + pblockA00[pos] = 0; + }) + curr_y += DILATION_Y; +#endif + src0_read += ROW_PITCH * DILATION_Y; + INT_TYPE blockB00[KERNEL_WIDTH * 2]; + INT_TYPE4* p4BlockB00 = (INT_TYPE4*)blockB00; + INT_TYPE2* p2BlockB00 = (INT_TYPE2*)blockB00; + Dtype* pBlockB00 = (Dtype*)blockB00; + interleaved_y = 0; + LOOP(KERNEL_WIDTH_DIV2, interleaved_y, + { + p4BlockB00[interleaved_y] = SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ); + src1_read += WIDTH1 * 2; + } ) + if ( kernel_width_is_odd ) + { + p2BlockB00[KERNEL_WIDTH - 1] = SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ); + src1_read += WIDTH1 * 2; + } + + // Perform MADs + kernel_idx = 0; + interleaved_y = 0; + LOOP(KERNEL_WIDTH_DIV2, interleaved_y, + { + kernel_y = interleaved_y * 2; + DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); 
kernel_idx++; + DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; + } ) + if ( kernel_width_is_odd ) + { + kernel_y = interleaved_y * 2; + DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; + DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; + } + } + + //while( ++patch_row < 1 ); //debug + while( ++patch_row < KERNEL_HEIGHT ); + + src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + } + //while ( ++patch_depth < 1 ); //debug + while ( ++patch_depth < INPUT_DEPTH ); + + // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: + // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
+ int out_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset + + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset + __global Dtype *out = dst + out_offset; + +#if APPLY_BIAS + Dtype bias[2]; + Dtype2 *bias_vec; + bias_vec = (Dtype2*)bias; + *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N)); +#endif +#ifdef FUSED_CONV_CHANNEL_RELU + Dtype slope[2]; + Dtype2 *slope_vec; + slope_vec = (Dtype2*)slope; + *slope_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)negative_slope_base + group_x * TILE_N)); + Dtype negative_slope; +#endif + + INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0); +} +#endif +#endif // KERNEL_BASIC/IDLF/GEMM_LIKE diff --git a/modules/dnn/src/opencl/conv_spatial_helper.cl b/modules/dnn/src/opencl/conv_spatial_helper.cl new file mode 100644 index 0000000000..9d5a89f7b1 --- /dev/null +++ b/modules/dnn/src/opencl/conv_spatial_helper.cl @@ -0,0 +1,73 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +__kernel void TEMPLATE(copyWeightsSwizzled, Dtype) + (__global Dtype* weightIn, + __global Dtype* weightOut, + const int kernel_w, + const int kernel_h, + const int channels, + const int outputs, + const int swizzleFactor) { + + unsigned int sX = get_global_id(0); + + //Original location + + //Output location + int outputSublayer = channels / swizzleFactor; + int outputSublayerIndex = channels % swizzleFactor; + + int filter = sX / (kernel_w*kernel_h*channels); + int kernel_X = sX % kernel_w; + int kernel_Y = (sX / kernel_w) % kernel_h; + int kernel_C = (sX / (kernel_w * kernel_h)) % channels; + + int FP = filter / swizzleFactor; + int F1 = filter % swizzleFactor; + + weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1] + = weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X]; +} diff --git a/modules/dnn/src/opencl/dummy.cl b/modules/dnn/src/opencl/dummy.cl new file mode 100644 index 0000000000..6a55938244 --- /dev/null +++ b/modules/dnn/src/opencl/dummy.cl @@ -0,0 +1,43 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +__kernel void dummy_kernel() +{ +} diff --git a/modules/dnn/src/opencl/gemm_image.cl b/modules/dnn/src/opencl/gemm_image.cl new file mode 100644 index 0000000000..37ae523a21 --- /dev/null +++ b/modules/dnn/src/opencl/gemm_image.cl @@ -0,0 +1,635 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) + +// Types used for parameters, offset computations and so on +#define int_tp int +#define uint_tp unsigned int + +#define Dtype float +#define Dtype2 float2 +#define Dtype4 float4 +#define Dtype8 float8 + +#define as_Dtype as_float +#define as_Dtype2 as_float2 +#define as_Dtype4 as_float4 +#define as_Dtype8 as_float8 + +#define KERNEL_ARG_DTYPE float + +#if defined(cl_intel_subgroups) +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#endif + +#define TILE_M 32 +#define TILE_K 8 + +// common block to calculate (alpha * AxB + beta * C) and output to destination image. + +#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord ) +#define SHUFFLE_TYPE2(val) val +#define SHUFFLE_TYPE8(val) val +#define READ_IMAGE(__image, __coord) read_imagef(__image, sampler, __coord) +#define SIZE_OF_ELEMENT sizeof(uint) +#define SIMD_SIZE_GEMM 8 +#define TILE_N 8 + +//#define USE_IMAGE_C +#ifdef USE_IMAGE_C +#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) ) +#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) ) +#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst +#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint)) +#else +#define BLOCKC_READ8( _C, _coordC ) \ + (Dtype8) ( (_coordC.x + get_local_id(0) < N && _coordC.y < M) ? _C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 1 < M) ? _C[ ( _coordC.y + 1 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 2 < M) ? _C[ ( _coordC.y + 2 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 3 < M) ? 
_C[ ( _coordC.y + 3 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 4 < M) ? _C[ ( _coordC.y + 4 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 5 < M) ? _C[ ( _coordC.y + 5 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 6 < M) ? _C[ ( _coordC.y + 6 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \ + (_coordC.x + get_local_id(0) < N && _coordC.y + 7 < M) ? _C[ ( _coordC.y + 7 ) * ldc + _coordC.x + get_local_id(0) ] : 0) + +#define BLOCKC_WRITE8( _C, _coordC, _val) do {\ + if (_coordC.x + get_local_id(0) < N) { \ + if (_coordC.y < M) \ + _C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] = _val.s0; \ + if (_coordC.y + 1 < M) \ + _C[ ( _coordC.y + 1 )* ldc + _coordC.x + get_local_id(0) ] = _val.s1; \ + if (_coordC.y + 2 < M) \ + _C[ ( _coordC.y + 2 )* ldc + _coordC.x + get_local_id(0) ] = _val.s2; \ + if (_coordC.y + 3 < M) \ + _C[ ( _coordC.y + 3 )* ldc + _coordC.x + get_local_id(0) ] = _val.s3; \ + if (_coordC.y + 4 < M) \ + _C[ ( _coordC.y + 4 )* ldc + _coordC.x + get_local_id(0) ] = _val.s4; \ + if (_coordC.y + 5 < M) \ + _C[ ( _coordC.y + 5 )* ldc + _coordC.x + get_local_id(0) ] = _val.s5; \ + if (_coordC.y + 6 < M) \ + _C[ ( _coordC.y + 6 )* ldc + _coordC.x + get_local_id(0) ] = _val.s6; \ + if (_coordC.y + 7 < M) \ + _C[ ( _coordC.y + 7 )* ldc + _coordC.x + get_local_id(0) ] = _val.s7; \ + }} while(0) +#define MATC_PARAMETER __global Dtype * C, const int offC, const int M, const int N, const int ldc +#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, (C + offC), (C + offC), 1) +#endif + +#define GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, _C, _dst, _C_step) \ + int2 coordDst = (int2)( ( group_x * TILE_N ) * _C_step, ( group_y * TILE_M ) ); \ + int2 coordC = coordDst; \ + Dtype8 blockC00; \ + Dtype8 blockC01; \ + Dtype8 blockC02; \ + Dtype8 blockC03; \ + if (BETA_NOT0) { \ + blockC00 = isFirstColBlock ? 
BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC03 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \ + if (!ALPHA1) { \ + blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \ + blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \ + blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \ + blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \ + } else { \ + blockC00 += blockAxB00; \ + blockC01 += blockAxB01; \ + blockC02 += blockAxB02; \ + blockC03 += blockAxB03; \ + } \ + } else { \ + blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \ + blockC03 = isFirstColBlock ? 
BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \ + if (!ALPHA1) { \ + blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \ + blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \ + blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \ + blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \ + } else { \ + blockC00 += blockAxB00; \ + blockC01 += blockAxB01; \ + blockC02 += blockAxB02; \ + blockC03 += blockAxB03; \ + } \ + } \ + BLOCKC_WRITE8( _dst, coordDst, blockC00 ); coordDst.y += 8; \ + BLOCKC_WRITE8( _dst, coordDst, blockC01 ); coordDst.y += 8; \ + BLOCKC_WRITE8( _dst, coordDst, blockC02 ); coordDst.y += 8; \ + BLOCKC_WRITE8( _dst, coordDst, blockC03 ); + +// Get the specified column of the block +#define TRANSPOSE_BLOCK_8( _block, _col ) \ + (Dtype8)( intel_sub_group_shuffle( _block.s0, _col ), \ + intel_sub_group_shuffle( _block.s1, _col ), \ + intel_sub_group_shuffle( _block.s2, _col ), \ + intel_sub_group_shuffle( _block.s3, _col ), \ + intel_sub_group_shuffle( _block.s4, _col ), \ + intel_sub_group_shuffle( _block.s5, _col ), \ + intel_sub_group_shuffle( _block.s6, _col ), \ + intel_sub_group_shuffle( _block.s7, _col ) ); + +// A's column block multiplied by B's row block. 
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \ + { \ + const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ + const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \ + const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \ + const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \ + const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \ + const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \ + const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \ + const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \ + _result = mad( (Dtype8)(_blockB.s0), acol0, _result ); \ + _result = mad( (Dtype8)(_blockB.s1), acol1, _result ); \ + _result = mad( (Dtype8)(_blockB.s2), acol2, _result ); \ + _result = mad( (Dtype8)(_blockB.s3), acol3, _result ); \ + _result = mad( (Dtype8)(_blockB.s4), acol4, _result ); \ + _result = mad( (Dtype8)(_blockB.s5), acol5, _result ); \ + _result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \ + _result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \ + } + +#define GEMM_NN(ALPHA1, BETA_NOT0) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \ + __read_only image2d_t A, \ + __read_only image2d_t B, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + KERNEL_ARG_DTYPE beta_in, \ + int width0, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; \ + const int group_x = get_group_id(0); \ + const int group_y = get_group_id(1); \ + Dtype8 blockAxB00 = 0.0f; \ + Dtype8 blockAxB01 = 0.0f; \ + Dtype8 blockAxB02 = 0.0f; \ + Dtype8 blockAxB03 = 0.0f; \ + int2 coordA = (int2)( 0, group_y * TILE_M ); \ + int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \ + do \ + { \ + int2 coordBTemp = coordB; \ + Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \ + int2 coordATemp = coordA; \ 
+ Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT; \ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \ + } \ + while( coordB.y < width0 ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ +} + +GEMM_NN(1, 0) // ALPHA == 1, BETA == 0 +GEMM_NN(1, 1) // ALPHA == 1, BETA != 0 +GEMM_NN(0, 0) // ALPHA != 1, BETA == 0 +GEMM_NN(0, 1) // ALPHA != 1, BETA != 0 + +#undef TRANSPOSE_BLOCK_8 +#undef MULTIPLY_BLOCKS_8x8 +#undef GEMM_NN + +// replicate the first row to column block. +#define TRANSPOSE_BLOCK_8(_vec, _col) \ + (Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \ + intel_sub_group_shuffle(_vec, _col + 1), \ + intel_sub_group_shuffle(_vec, _col + 2), \ + intel_sub_group_shuffle(_vec, _col + 3), \ + intel_sub_group_shuffle(_vec, _col + 4), \ + intel_sub_group_shuffle(_vec, _col + 5), \ + intel_sub_group_shuffle(_vec, _col + 6), \ + intel_sub_group_shuffle(_vec, _col + 7) ) + +#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col ) \ + { \ + _result = mad( (Dtype8)(_blockB.s0), TRANSPOSE_BLOCK_8(_blockA.s0, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s1), TRANSPOSE_BLOCK_8(_blockA.s1, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s2), TRANSPOSE_BLOCK_8(_blockA.s2, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s3), TRANSPOSE_BLOCK_8(_blockA.s3, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s4), TRANSPOSE_BLOCK_8(_blockA.s4, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s5), TRANSPOSE_BLOCK_8(_blockA.s5, _col), _result 
); \ + _result = mad( (Dtype8)(_blockB.s6), TRANSPOSE_BLOCK_8(_blockA.s6, _col), _result ); \ + _result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \ + } + +#define GEMM_TN(ALPHA1, BETA_NOT0) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \ + __read_only image2d_t A, \ + __read_only image2d_t B, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + KERNEL_ARG_DTYPE beta_in, \ + int width0, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; \ + const int group_x = get_group_id(0);\ + const int group_y = get_group_id(1);\ + Dtype8 blockAxB00 = 0.0f;\ + Dtype8 blockAxB01 = 0.0f;\ + Dtype8 blockAxB02 = 0.0f;\ + Dtype8 blockAxB03 = 0.0f;\ + int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\ + int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\ + do\ + {\ + int2 coordBTemp = coordB;\ + Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\ + int2 coordATemp = coordA;\ + Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\ + Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\ + Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, 0 ); \ + } \ + while( coordB.y < width0 ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ +} + +GEMM_TN(1, 0) // ALPHA == 1, BETA == 0 +GEMM_TN(1, 1) // ALPHA 
== 1, BETA != 0 +GEMM_TN(0, 0) // ALPHA != 1, BETA == 0 +GEMM_TN(0, 1) // ALPHA != 1, BETA != 0 + +#undef MULTIPLY_BLOCKS_8x8 +#undef TRANSPOSE_BLOCK_8 +#undef GEMM_TN + +// The same as GEMM_NN +#define TRANSPOSE_BLOCK_8( _block, _col ) \ + (Dtype8)( intel_sub_group_shuffle( _block.s0, _col), \ + intel_sub_group_shuffle( _block.s1, _col), \ + intel_sub_group_shuffle( _block.s2, _col), \ + intel_sub_group_shuffle( _block.s3, _col), \ + intel_sub_group_shuffle( _block.s4, _col), \ + intel_sub_group_shuffle( _block.s5, _col), \ + intel_sub_group_shuffle( _block.s6, _col), \ + intel_sub_group_shuffle( _block.s7, _col) ) + +#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \ + { \ + const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \ + const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \ + const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \ + const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \ + const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \ + const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \ + const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \ + const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \ + _result = mad( (Dtype8)_blockB.s0, acol0, _result ); \ + _result = mad( (Dtype8)_blockB.s1, acol1, _result ); \ + _result = mad( (Dtype8)_blockB.s2, acol2, _result ); \ + _result = mad( (Dtype8)_blockB.s3, acol3, _result ); \ + _result = mad( (Dtype8)_blockB.s4, acol4, _result ); \ + _result = mad( (Dtype8)_blockB.s5, acol5, _result ); \ + _result = mad( (Dtype8)_blockB.s6, acol6, _result ); \ + _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \ + } + +#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \ + __read_only image2d_t A, \ + MATB_PARAMETER, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + 
KERNEL_ARG_DTYPE beta_in, \ + int padded_k, \ + int k, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; \ + const int group_x = get_group_id(0); \ + const int group_y = get_group_id(1); \ + Dtype8 blockAxB00 = 0.0f; \ + Dtype8 blockAxB01 = 0.0f; \ + Dtype8 blockAxB02 = 0.0f; \ + Dtype8 blockAxB03 = 0.0f; \ + int2 coordA = (int2)( 0, group_y * TILE_M ); \ + int2 coordB = (int2)( 0, ( group_x * TILE_N )); \ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \ + do \ + { \ + Dtype8 blockB00; \ + BLOCKB_READ8(blockB00, B, coordB); \ + int2 coordATemp = coordA; \ + Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \ + Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT; \ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \ + } \ + while( coordB.x < padded_k / VECSIZE ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0); \ +} + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2; + +#define MATB_PARAMETER __read_only image2d_t B + +GEMM_NT(1, 0, VEC4, 4) // ALPHA == 1, BETA == 0 +GEMM_NT(1, 1, VEC4, 4) // ALPHA == 1, BETA != 0 +GEMM_NT(0, 0, VEC4, 4) // ALPHA != 1, BETA == 0 +GEMM_NT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = 
_coordB; \ + _coordBTemp.y += get_local_id(0); \ + const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \ + _blockb = vload8(0, B_read); \ + _coordB.x += TILE_K; + +#define MATB_PARAMETER __global Dtype *B, int offB, int ldb + +GEMM_NT(1, 0, BUFFER, 1) // ALPHA == 1, BETA == 0 +GEMM_NT(1, 1, BUFFER, 1) // ALPHA == 1, BETA != 0 +GEMM_NT(0, 0, BUFFER, 1) // ALPHA != 1, BETA == 0 +GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + Dtype4 temp; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s0 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s1 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s2 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s3 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s4 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s5 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s6 = temp.s0; \ + temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s7 = temp.s0; \ + _coordB.x += 8; + +#define MATB_PARAMETER __read_only image2d_t B + +GEMM_NT(1, 0, SCALAR, 1) // ALPHA == 1, BETA == 0 +GEMM_NT(1, 1, SCALAR, 1) // ALPHA == 1, BETA != 0 +GEMM_NT(0, 0, SCALAR, 1) // ALPHA != 1, BETA == 0 +GEMM_NT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#undef MULTIPLY_BLOCKS_8x8 +#undef TRANSPOSE_BLOCK_8 +#undef GEMM_NT + +//The same as GEMM_TN. 
+#define TRANSPOSE_BLOCK_8(_vec, _col) \ + (Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \ + intel_sub_group_shuffle(_vec, _col + 1), \ + intel_sub_group_shuffle(_vec, _col + 2), \ + intel_sub_group_shuffle(_vec, _col + 3), \ + intel_sub_group_shuffle(_vec, _col + 4), \ + intel_sub_group_shuffle(_vec, _col + 5), \ + intel_sub_group_shuffle(_vec, _col + 6), \ + intel_sub_group_shuffle(_vec, _col + 7) ); + +#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col ) \ + { \ + const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA.s0, _col ); \ + const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA.s1, _col ); \ + const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA.s2, _col ); \ + const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA.s3, _col ); \ + const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA.s4, _col ); \ + const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA.s5, _col ); \ + const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA.s6, _col ); \ + const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA.s7, _col ); \ + _result = mad( (Dtype8)_blockB.s0, acol0, _result ); \ + _result = mad( (Dtype8)_blockB.s1, acol1, _result ); \ + _result = mad( (Dtype8)_blockB.s2, acol2, _result ); \ + _result = mad( (Dtype8)_blockB.s3, acol3, _result ); \ + _result = mad( (Dtype8)_blockB.s4, acol4, _result ); \ + _result = mad( (Dtype8)_blockB.s5, acol5, _result ); \ + _result = mad( (Dtype8)_blockB.s6, acol6, _result ); \ + _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \ + } + +#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \ +__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \ +__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \ +__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \ + __read_only image2d_t A, \ + MATB_PARAMETER, \ + MATC_PARAMETER, \ + KERNEL_ARG_DTYPE alpha_in, \ + KERNEL_ARG_DTYPE beta_in, \ + int padded_k, \ + int k, \ + int isFirstColBlock) \ +{ \ + const Dtype alpha = (Dtype)alpha_in; \ + const Dtype beta = (Dtype)beta_in; 
\ + const int group_x = get_group_id(0); \ + const int group_y = get_group_id(1); \ + Dtype8 blockAxB00 = 0.0f; \ + Dtype8 blockAxB01 = 0.0f; \ + Dtype8 blockAxB02 = 0.0f; \ + Dtype8 blockAxB03 = 0.0f; \ + int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \ + int2 coordB = (int2)( 0, ( group_x * TILE_N )); \ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \ + do \ + { \ + Dtype8 blockB00; \ + BLOCKB_READ8(blockB00, B, coordB); \ + int2 coordATemp = coordA; \ + Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \ + Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \ + Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \ + Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K; \ + MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00 , blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01 , blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02 , blockB00, 0 ); \ + MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03 , blockB00, 0 ); \ + } \ + while( coordB.x < padded_k / VECSIZE ); \ + GEMM_OUTPUT(ALPHA1, BETA_NOT0);\ +} + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2; + +#define MATB_PARAMETER __read_only image2d_t B + +GEMM_TT(1, 0, VEC4, 4) // ALPHA == 1, BETA == 0 +GEMM_TT(1, 1, VEC4, 4) // ALPHA == 1, BETA != 0 +GEMM_TT(0, 0, VEC4, 4) // ALPHA != 1, BETA == 0 +GEMM_TT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + const __global Dtype *B_read = (__global 
Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \ + _blockb = vload8(0, B_read); \ + _coordB.x += TILE_K; + +#define MATB_PARAMETER __global Dtype *B, int offB, int ldb + +GEMM_TT(1, 0, BUFFER, 1) // ALPHA == 1, BETA == 0 +GEMM_TT(1, 1, BUFFER, 1) // ALPHA == 1, BETA != 0 +GEMM_TT(0, 0, BUFFER, 1) // ALPHA != 1, BETA == 0 +GEMM_TT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#define BLOCKB_READ8(_blockb, _B, _coordB) \ + int2 _coordBTemp = _coordB; \ + _coordBTemp.y += get_local_id(0); \ + Dtype4 temp; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s0 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s1 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s2 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s3 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s4 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s5 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s6 = temp.s0; \ + temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \ + _blockb.s7 = temp.s0; \ + _coordB.x += 8; + +#define MATB_PARAMETER __read_only image2d_t B + +GEMM_TT(1, 0, SCALAR, 1) // ALPHA == 1, BETA == 0 +GEMM_TT(1, 1, SCALAR, 1) // ALPHA == 1, BETA != 0 +GEMM_TT(0, 0, SCALAR, 1) // ALPHA != 1, BETA == 0 +GEMM_TT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0 +#undef BLOCKB_READ8 +#undef MATB_PARAMETER + +#undef MULTIPLY_BLOCKS_8x8 +#undef TRANSPOSE_BLOCK_8 +#undef GEMM_TT + +#undef TILE_M +#undef TILE_K +#undef TILE_N +#undef SUBGROUP_BLOCK_READ8 +#undef READ_IMAGE +#undef SIZE_OF_ELEMENT + +__kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)( + __global Dtype* A, + __write_only image2d_t ImA, + int offA, + int width, + int height, + int ldA) +{ + const int gidx = get_global_id(0); + const int gidy = get_global_id(1); 
+ int2 coord_dst = (int2)(gidx, gidy); + __global Dtype* A_off = A + offA; + Dtype srcA = A_off[gidy * ldA + gidx]; + write_imagef(ImA, coord_dst, (Dtype4)srcA); +} + +__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)( + __global Dtype* A, + __write_only image2d_t ImA, + int offA, + int width, + int height, + int ldA) +{ + const int gidx = get_global_id(0); + const int gidy = get_global_id(1); + int2 coord_dst = (int2)(gidx, gidy); + if (gidx >= width || gidy >= height) { + write_imageui(ImA, coord_dst, (uint4)0); + return; + } + __global Dtype* A_off = A + offA; + uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx])); + write_imageui(ImA, coord_dst, srcA); +} diff --git a/modules/dnn/src/opencl/math.cl b/modules/dnn/src/opencl/math.cl new file mode 100644 index 0000000000..b8f4eff010 --- /dev/null +++ b/modules/dnn/src/opencl/math.cl @@ -0,0 +1,55 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +__kernel void TEMPLATE(axpy,Dtype)(const int n, const Dtype alpha, __global const Dtype* x, + const int offx, __global Dtype* y, + const int offy) { + for (int index = get_global_id(0); index < n; index += get_global_size(0)) { + Dtype src = x[offx + index]; + Dtype dst = y[offy + index]; + y[offy + index] = alpha * src + dst; + } +} diff --git a/modules/dnn/src/opencl/matvec_mul.cl b/modules/dnn/src/opencl/matvec_mul.cl new file mode 100644 index 0000000000..0dabd62c54 --- /dev/null +++ b/modules/dnn/src/opencl/matvec_mul.cl @@ -0,0 +1,191 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +__kernel void TEMPLATE(matvec_mul4,Dtype)( + __global const float * A, + int offA, + unsigned int A_col_size, + unsigned int trail_item, + __global const float * v, + int offv, + float alpha, + float beta, + __global float4 * result, + int offr, + __local float4 * work) +{ + unsigned int row_gid = get_group_id(0); + unsigned int lid = get_local_id(0); + const __global float *src0_read = A + row_gid * 4 * A_col_size + offA; + const __global float *src1_read = v + offv; + result = (__global float4*)((__global float*)result + offr); + float4 dot0 = (float4)(0.f); + float4 dot1 = (float4)(0.f); + float4 dot2 = (float4)(0.f); + float4 dot3 = (float4)(0.f); + + unsigned int i = lid; + while( i < A_col_size / 4) { + const float4 a0 = vload4(i, src0_read); + const float4 a1 = vload4(i, src0_read + A_col_size); + const float4 a2 = vload4(i, src0_read + 2 * A_col_size); + const float4 a3 = vload4(i, src0_read + 3 * A_col_size); + + const float4 b0 = vload4(i, src1_read); + + dot0 += a0 * b0; + dot1 += a1 * b0; + dot2 += a2 * b0; + dot3 += a3 * b0; + + i += get_local_size(0); + } 
+ + work[lid].s0 = dot0.x + dot0.y + dot0.z + dot0.w; + work[lid].s1 = dot1.x + dot1.y + dot1.z + dot1.w; + work[lid].s2 = dot2.x + dot2.y + dot2.z + dot2.w; + work[lid].s3 = dot3.x + dot3.y + dot3.z + dot3.w; + + if(i == A_col_size / 4) + { + if(trail_item != 0) + { + const __global float *src0_trail = src0_read + i * 4; + const __global float *src1_trail = src1_read + i * 4; + for(unsigned int i = 0; i < trail_item; ++i) { + const float at0 = src0_trail[i]; + const float at1 = src0_trail[i + A_col_size]; + const float at2 = src0_trail[i + 2 * A_col_size]; + const float at3 = src0_trail[i + 3 * A_col_size]; + + const float bt = src1_trail[i]; + + work[lid].s0 += at0 * bt; + work[lid].s1 += at1 * bt; + work[lid].s2 += at2 * bt; + work[lid].s3 += at3 * bt; + } + } + + } + + for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) { + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < stride) + work[lid] += work[lid+stride]; + } + if(lid == 0) { + if(beta == (Dtype)0) + result[row_gid] = alpha * work[0]; + else + result[row_gid] = alpha * work[0] + beta * result[row_gid]; + } +} + +/* This kernel used for the trailing rows when row_of_A %4 !=0 */ +__kernel void TEMPLATE(matvec_mul1,Dtype)( + __global const float * A, + int offA, + unsigned int A_col_size, + unsigned int row_offset, + unsigned int trail_item, + __global const float * v, + int offv, + float alpha, + float beta, + __global float * result, + int offr, + __local float * work) +{ + unsigned int row_gid = get_group_id(0); + unsigned int lid = get_local_id(0); + + const __global float *src0_read = A + (row_offset + row_gid) * A_col_size + offA; + const __global float *src1_read = v + + offv; + result = result + offr; + float4 dot0 = (float4)(0.f); + + unsigned int i = lid; + while( i < A_col_size / 4) + { + const float4 a0 = vload4(i, src0_read); + const float4 b0 = vload4(i, src1_read); + + dot0 += a0 * b0; + i += get_local_size(0); + } + + work[lid] = dot0.x + dot0.y + dot0.z + dot0.w; + + if(i == 
A_col_size / 4) + { + if(trail_item != 0) + { + const __global float *src0_trail = src0_read + i * 4; + const __global float *src1_trail = src1_read + i * 4; + for(unsigned int i = 0; i < trail_item; ++i) { + const float at0 = src0_trail[i]; + const float bt = src1_trail[i]; + + work[lid] += at0 * bt; + } + } + + } + for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) { + barrier(CLK_LOCAL_MEM_FENCE); + if(lid < stride) + work[lid] += work[lid+stride]; + } + + if(lid == 0) { + if(beta == (Dtype)0) { + result[row_gid+row_offset] = alpha * work[0]; + } else { + result[row_gid+row_offset] *= beta; + result[row_gid+row_offset] += alpha * work[0]; + } + } +} diff --git a/modules/dnn/src/opencl/ocl4dnn_lrn.cl b/modules/dnn/src/opencl/ocl4dnn_lrn.cl new file mode 100644 index 0000000000..58477cef0c --- /dev/null +++ b/modules/dnn/src/opencl/ocl4dnn_lrn.cl @@ -0,0 +1,96 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in, + const int num, const int channels, + const int height, const int width, const int size, + const Dtype alpha_over_size, const Dtype k, + __global Dtype* const out, + const Dtype negative_beta) { + for (int index = get_global_id(0); index < nthreads; + index += get_global_size(0)) { + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + __global const Dtype* in_off = in + offset; + __global Dtype* out_off = out + offset; + Dtype scale_val; + int head = 0; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + Dtype accum_scale = 0; + // fill the scale at [n, :, h, w] + // accumulate values + while (head < post_pad && head < channels) { + accum_scale += in_off[head * step] * in_off[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_scale += in_off[head * step] * in_off[head * step]; + if (head - size >= 0) { + accum_scale -= in_off[(head - size) * step] + * in_off[(head - size) * step]; + } + scale_val = k + accum_scale * alpha_over_size; + out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_scale -= in_off[(head - size) * step] + * in_off[(head - size) * step]; + } + scale_val = k + accum_scale * alpha_over_size; + out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta); + ++head; + } + } +} diff --git a/modules/dnn/src/opencl/ocl4dnn_pooling.cl 
b/modules/dnn/src/opencl/ocl4dnn_pooling.cl new file mode 100644 index 0000000000..326d5bc0d6 --- /dev/null +++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl @@ -0,0 +1,177 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +void TEMPLATE(max_pool_forward_impl, Dtype)( + const int nthreads, __global const Dtype* bottom_data, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, + __global Dtype* top_data, + const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask) +{ + for (int index = get_global_id(0); index < nthreads; + index += get_global_size(0)) + { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, (int)0); + wstart = max(wstart, (int)0); + Dtype maxval = -FLT_MAX; + int maxidx = -1; + __global const Dtype* bottom_slice = bottom_data + + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (bottom_slice[h * width + w] > maxval) { + maxidx = h * width + w; + maxval = bottom_slice[maxidx]; + } 
+ } + } + top_data[index] = maxval; + if (!no_mask) { + if (use_mask == 1) { + mask[index] = maxidx; + } else { + top_mask[index] = maxidx; + } + } + } +} + +__kernel void TEMPLATE(max_pool_forward, Dtype)( + const int nthreads, __global const Dtype* bottom_data, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, + __global Dtype* top_data, + const int use_mask, __global int* mask, __global Dtype* top_mask) +{ + TEMPLATE(max_pool_forward_impl, Dtype)( + nthreads, bottom_data, num, channels, height, width, + pooled_height, pooled_width, kernel_h, + kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false + ); +} + +__kernel void TEMPLATE(ave_pool_forward, Dtype)( + const int nthreads, __global const Dtype* const bottom_data, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, __global Dtype* top_data) +{ + for (int index = get_global_id(0); index < nthreads; + index += get_global_size(0)) + { + { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, (int)0); + wstart = max(wstart, (int)0); + hend = min(hend, height); + wend = min(wend, width); + Dtype aveval = 0; + __global const Dtype* bottom_slice = bottom_data + + (n * channels + c) * 
height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_slice[h * width + w]; + } + } + top_data[index] = aveval / pool_size; + } + } +} + +__kernel void TEMPLATE(sto_pool_forward_test,Dtype)( + const int nthreads, __global const Dtype* const bottom_data, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + __global Dtype* top_data) +{ + for (int index = get_global_id(0); index < nthreads; + index += get_global_size(0)) + { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + // We set cumsum to be 0 to avoid divide-by-zero problems + Dtype cumsum = FLT_MIN; + Dtype cumvalues = 0.; + __global const Dtype* bottom_slice = bottom_data + + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_slice[h * width + w]; + cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; + } + } + top_data[index] = cumvalues / cumsum; + } +} diff --git a/modules/dnn/src/opencl/softmax.cl b/modules/dnn/src/opencl/softmax.cl index e9fcadce39..54cf489501 100644 --- a/modules/dnn/src/opencl/softmax.cl +++ b/modules/dnn/src/opencl/softmax.cl @@ -70,6 +70,10 @@ __kernel void kernel_channel_div(const int count, if(index < count) { int n = index / channels / spatial_dim; int s = index % spatial_dim; - data[index] /= channel_sum[n * spatial_dim + s]; + T v = data[index] / channel_sum[n * spatial_dim + s]; +#ifdef 
LOG_SOFTMAX + v = log(v); +#endif + data[index] = v; } -} \ No newline at end of file +} diff --git a/modules/dnn/src/opencl/softmax_loss.cl b/modules/dnn/src/opencl/softmax_loss.cl new file mode 100644 index 0000000000..d30b32bc69 --- /dev/null +++ b/modules/dnn/src/opencl/softmax_loss.cl @@ -0,0 +1,182 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#define CONCAT(A,B) A##_##B +#define TEMPLATE(name,type) CONCAT(name,type) +#define Dtype float + +#if defined(cl_intel_subgroups) +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#endif + +__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels, + const int spatial_dim, + __global Dtype* scale, + __global const Dtype* data, + __global Dtype* out, + __local Dtype *out_tmp, + __local Dtype *scale_tmp, + __local Dtype *group_tmp) { + + int n = get_global_id(1); + for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += + get_global_size(0), ++s) { + float maxval = -FLT_MAX; + for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { + Dtype tmp = data[(n * channels + c) * spatial_dim + s]; + maxval = max((Dtype)tmp, (Dtype)maxval); + } + maxval = sub_group_reduce_max(maxval * 100000); + //if (get_sub_group_local_id() == 0) + group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += + get_global_size(0)) { + int s = index / get_max_sub_group_size(); + Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); + //if (get_sub_group_local_id() == 0) + scale_tmp[s] = maxval / 100000; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int index = get_global_id(0); index < channels * 
spatial_dim; + index += get_global_size(0)) { + int s = index % spatial_dim; + out_tmp[index] = exp(data[n * channels * spatial_dim + index] - scale_tmp[s]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += + get_global_size(0), ++s) { + Dtype sum = 0; + for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { + sum += out_tmp[c * spatial_dim + s]; + } + sum = sub_group_reduce_add(sum * 100000); + group_tmp[get_sub_group_id() * spatial_dim + s] = sum; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += + get_global_size(0)) { + int s = index / get_max_sub_group_size(); + Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); + //if (get_sub_group_local_id() == 0) + scale_tmp[s] = sum / 100000; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int index = get_global_id(0); index < channels * spatial_dim; + index += get_global_size(0)) { + int s = index % spatial_dim; + out[n * channels * spatial_dim + index] = out_tmp[index] / scale_tmp[s]; + } +} + +__kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels, + const int spatial_dim, + __global Dtype* scale, + __global const Dtype* data, + __global Dtype* out) { + + int n = get_global_id(1); + __global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim; + for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += + get_global_size(0), ++s) { + float maxval = -FLT_MAX; + for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { + Dtype tmp = data[(n * channels + c) * spatial_dim + s]; + maxval = max((Dtype)tmp, (Dtype)maxval); + } + maxval = sub_group_reduce_max(maxval * 100000); + //if (get_sub_group_local_id() == 0) + group_tmp[get_sub_group_id() * spatial_dim + s] = maxval; + } + 
barrier(CLK_GLOBAL_MEM_FENCE); + + for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += + get_global_size(0)) { + int s = index / get_max_sub_group_size(); + Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]); + //if (get_sub_group_local_id() == 0) + scale[n * spatial_dim + s] = maxval / 100000; + } + + barrier(CLK_GLOBAL_MEM_FENCE); + + for (int index = get_global_id(0); index < channels * spatial_dim; + index += get_global_size(0)) { + int s = index % spatial_dim; + out[n * channels * spatial_dim + index] = exp(data[n * channels * spatial_dim + index] - scale[n * spatial_dim + s]); + } + barrier(CLK_GLOBAL_MEM_FENCE); + + for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index += + get_global_size(0), ++s) { + Dtype sum = 0; + for (int c = get_global_id(0); c < channels; c += get_global_size(0)) { + sum += out[n * channels * spatial_dim + c * spatial_dim + s]; + } + sum = sub_group_reduce_add(sum * 100000); + group_tmp[get_sub_group_id() * spatial_dim + s] = sum; + } + barrier(CLK_GLOBAL_MEM_FENCE); + + for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index += + get_global_size(0)) { + int s = index / get_max_sub_group_size(); + Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]); + //if (get_sub_group_local_id() == 0) + scale[n * spatial_dim + s] = sum / 100000; + } + barrier(CLK_GLOBAL_MEM_FENCE); + + for (int index = get_global_id(0); index < channels * spatial_dim; + index += get_global_size(0)) { + int s = index % spatial_dim; + out[n * channels * spatial_dim + index] /= scale[n * spatial_dim + s]; + } +} diff --git a/modules/dnn/src/precomp.hpp b/modules/dnn/src/precomp.hpp index 9383a08e33..e860598283 100644 --- a/modules/dnn/src/precomp.hpp +++ b/modules/dnn/src/precomp.hpp @@ -40,6 +40,8 @@ //M*/ #include +#include +#include #include #include // int32_t (MSVS 2010-2013) 
#include "cvconfig.h" diff --git a/modules/dnn/test/test_googlenet.cpp b/modules/dnn/test/test_googlenet.cpp index c83c1a063a..1bd3e51ef4 100644 --- a/modules/dnn/test/test_googlenet.cpp +++ b/modules/dnn/test/test_googlenet.cpp @@ -73,6 +73,26 @@ TEST(Reproducibility_GoogLeNet, Accuracy) normAssert(out, ref); } +OCL_TEST(Reproducibility_GoogLeNet, Accuracy) +{ + Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false), + findDataFile("dnn/bvlc_googlenet.caffemodel", false)); + + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(DNN_TARGET_OPENCL); + + std::vector inpMats; + inpMats.push_back( imread(_tf("googlenet_0.png")) ); + inpMats.push_back( imread(_tf("googlenet_1.png")) ); + ASSERT_TRUE(!inpMats[0].empty() && !inpMats[1].empty()); + + net.setInput(blobFromImages(inpMats, 1.0f, Size(), Scalar(), false), "data"); + Mat out = net.forward("prob"); + + Mat ref = blobFromNPY(_tf("googlenet_prob.npy")); + normAssert(out, ref); +} + TEST(IntermediateBlobs_GoogLeNet, Accuracy) { Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false), @@ -99,6 +119,35 @@ TEST(IntermediateBlobs_GoogLeNet, Accuracy) } } +OCL_TEST(IntermediateBlobs_GoogLeNet, Accuracy) +{ + Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false), + findDataFile("dnn/bvlc_googlenet.caffemodel", false)); + + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(DNN_TARGET_OPENCL); + + std::vector blobsNames; + blobsNames.push_back("conv1/7x7_s2"); + blobsNames.push_back("conv1/relu_7x7"); + blobsNames.push_back("inception_4c/1x1"); + blobsNames.push_back("inception_4c/relu_1x1"); + std::vector outs; + Mat in = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(), Scalar(), false); + net.setInput(in, "data"); + net.forward(outs, blobsNames); + CV_Assert(outs.size() == blobsNames.size()); + + for (int i = 0; i < blobsNames.size(); i++) + { + std::string filename = blobsNames[i]; + std::replace( 
filename.begin(), filename.end(), '/', '#'); + Mat ref = blobFromNPY(_tf("googlenet_" + filename + ".npy")); + + normAssert(outs[i], ref, "", 1E-4, 1E-2); + } +} + TEST(SeveralCalls_GoogLeNet, Accuracy) { Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false), @@ -128,4 +177,36 @@ TEST(SeveralCalls_GoogLeNet, Accuracy) normAssert(outs[0], ref, "", 1E-4, 1E-2); } +OCL_TEST(SeveralCalls_GoogLeNet, Accuracy) +{ + Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false), + findDataFile("dnn/bvlc_googlenet.caffemodel", false)); + + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(DNN_TARGET_OPENCL); + + std::vector inpMats; + inpMats.push_back( imread(_tf("googlenet_0.png")) ); + inpMats.push_back( imread(_tf("googlenet_1.png")) ); + ASSERT_TRUE(!inpMats[0].empty() && !inpMats[1].empty()); + + net.setInput(blobFromImages(inpMats, 1.0f, Size(), Scalar(), false), "data"); + Mat out = net.forward(); + + Mat ref = blobFromNPY(_tf("googlenet_prob.npy")); + normAssert(out, ref); + + std::vector blobsNames; + blobsNames.push_back("conv1/7x7_s2"); + std::vector outs; + Mat in = blobFromImage(inpMats[0], 1.0f, Size(), Scalar(), false); + net.setInput(in, "data"); + net.forward(outs, blobsNames); + CV_Assert(outs.size() == blobsNames.size()); + + ref = blobFromNPY(_tf("googlenet_conv1#7x7_s2.npy")); + + normAssert(outs[0], ref, "", 1E-4, 1E-2); +} + } diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index e3aeb7e9be..27c460c031 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -98,7 +98,8 @@ void runLayer(Ptr layer, std::vector &inpBlobs, std::vector &ou } -void testLayerUsingCaffeModels(String basename, bool useCaffeModel = false, bool useCommonInputBlob = true) +void testLayerUsingCaffeModels(String basename, int targetId = DNN_TARGET_CPU, + bool useCaffeModel = false, bool useCommonInputBlob = true) { String prototxt = _tf(basename + 
".prototxt"); String caffemodel = _tf(basename + ".caffemodel"); @@ -111,6 +112,9 @@ void testLayerUsingCaffeModels(String basename, bool useCaffeModel = false, bool Net net = readNetFromCaffe(prototxt, (useCaffeModel) ? caffemodel : String()); ASSERT_FALSE(net.empty()); + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(targetId); + Mat inp = blobFromNPY(inpfile); Mat ref = blobFromNPY(outfile); @@ -122,47 +126,82 @@ void testLayerUsingCaffeModels(String basename, bool useCaffeModel = false, bool TEST(Layer_Test_Softmax, Accuracy) { - testLayerUsingCaffeModels("layer_softmax"); + testLayerUsingCaffeModels("layer_softmax"); +} + +OCL_TEST(Layer_Test_Softmax, Accuracy) +{ + testLayerUsingCaffeModels("layer_softmax", DNN_TARGET_OPENCL); } TEST(Layer_Test_LRN_spatial, Accuracy) { - testLayerUsingCaffeModels("layer_lrn_spatial"); + testLayerUsingCaffeModels("layer_lrn_spatial"); +} + +OCL_TEST(Layer_Test_LRN_spatial, Accuracy) +{ + testLayerUsingCaffeModels("layer_lrn_spatial", DNN_TARGET_OPENCL); } TEST(Layer_Test_LRN_channels, Accuracy) { - testLayerUsingCaffeModels("layer_lrn_channels"); + testLayerUsingCaffeModels("layer_lrn_channels"); +} + +OCL_TEST(Layer_Test_LRN_channels, Accuracy) +{ + testLayerUsingCaffeModels("layer_lrn_channels", DNN_TARGET_OPENCL); } TEST(Layer_Test_Convolution, Accuracy) { - testLayerUsingCaffeModels("layer_convolution", true); + testLayerUsingCaffeModels("layer_convolution", DNN_TARGET_CPU, true); +} + +OCL_TEST(Layer_Test_Convolution, Accuracy) +{ + testLayerUsingCaffeModels("layer_convolution", DNN_TARGET_OPENCL, true); } TEST(Layer_Test_DeConvolution, Accuracy) { - testLayerUsingCaffeModels("layer_deconvolution", true, false); + testLayerUsingCaffeModels("layer_deconvolution", DNN_TARGET_CPU, true, false); } TEST(Layer_Test_InnerProduct, Accuracy) { - testLayerUsingCaffeModels("layer_inner_product", true); + testLayerUsingCaffeModels("layer_inner_product", DNN_TARGET_CPU, true); +} + 
+OCL_TEST(Layer_Test_InnerProduct, Accuracy) +{ + testLayerUsingCaffeModels("layer_inner_product", DNN_TARGET_OPENCL, true); } TEST(Layer_Test_Pooling_max, Accuracy) { - testLayerUsingCaffeModels("layer_pooling_max"); + testLayerUsingCaffeModels("layer_pooling_max"); +} + +OCL_TEST(Layer_Test_Pooling_max, Accuracy) +{ + testLayerUsingCaffeModels("layer_pooling_max", DNN_TARGET_OPENCL); } TEST(Layer_Test_Pooling_ave, Accuracy) { - testLayerUsingCaffeModels("layer_pooling_ave"); + testLayerUsingCaffeModels("layer_pooling_ave"); +} + +OCL_TEST(Layer_Test_Pooling_ave, Accuracy) +{ + testLayerUsingCaffeModels("layer_pooling_ave", DNN_TARGET_OPENCL); } TEST(Layer_Test_MVN, Accuracy) { - testLayerUsingCaffeModels("layer_mvn"); + testLayerUsingCaffeModels("layer_mvn"); } void testReshape(const MatShape& inputShape, const MatShape& targetShape, @@ -207,22 +246,32 @@ TEST(Layer_Test_Reshape, Accuracy) TEST(Layer_Test_BatchNorm, Accuracy) { - testLayerUsingCaffeModels("layer_batch_norm", true); + testLayerUsingCaffeModels("layer_batch_norm", DNN_TARGET_CPU, true); } TEST(Layer_Test_ReLU, Accuracy) { - testLayerUsingCaffeModels("layer_relu"); + testLayerUsingCaffeModels("layer_relu"); +} + +OCL_TEST(Layer_Test_ReLU, Accuracy) +{ + testLayerUsingCaffeModels("layer_relu", DNN_TARGET_OPENCL); } TEST(Layer_Test_Dropout, Accuracy) { - testLayerUsingCaffeModels("layer_dropout"); + testLayerUsingCaffeModels("layer_dropout"); } TEST(Layer_Test_Concat, Accuracy) { - testLayerUsingCaffeModels("layer_concat"); + testLayerUsingCaffeModels("layer_concat"); +} + +OCL_TEST(Layer_Test_Concat, Accuracy) +{ + testLayerUsingCaffeModels("layer_concat", DNN_TARGET_OPENCL); } //template diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index ec20ef077e..d83c203a28 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -44,6 +44,7 @@ #include "test_precomp.hpp" #include "npy_blob.hpp" #include +#include 
namespace cvtest { @@ -70,7 +71,7 @@ TEST(Torch_Importer, simple_read) ASSERT_FALSE(net.empty()); } -static void runTorchNet(String prefix, String outLayerName = "", +static void runTorchNet(String prefix, int targetId = DNN_TARGET_CPU, String outLayerName = "", bool check2ndBlob = false, bool isBinary = false) { String suffix = (isBinary) ? ".dat" : ".txt"; @@ -78,6 +79,9 @@ static void runTorchNet(String prefix, String outLayerName = "", Net net = readNetFromTorch(_tf(prefix + "_net" + suffix), isBinary); ASSERT_FALSE(net.empty()); + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(targetId); + Mat inp, outRef; ASSERT_NO_THROW( inp = readTorchBlob(_tf(prefix + "_input" + suffix), isBinary) ); ASSERT_NO_THROW( outRef = readTorchBlob(_tf(prefix + "_output" + suffix), isBinary) ); @@ -103,9 +107,19 @@ TEST(Torch_Importer, run_convolution) runTorchNet("net_conv"); } +OCL_TEST(Torch_Importer, run_convolution) +{ + runTorchNet("net_conv", DNN_TARGET_OPENCL); +} + TEST(Torch_Importer, run_pool_max) { - runTorchNet("net_pool_max", "", true); + runTorchNet("net_pool_max", DNN_TARGET_CPU, "", true); +} + +OCL_TEST(Torch_Importer, run_pool_max) +{ + runTorchNet("net_pool_max", DNN_TARGET_OPENCL, "", true); } TEST(Torch_Importer, run_pool_ave) @@ -113,12 +127,17 @@ TEST(Torch_Importer, run_pool_ave) runTorchNet("net_pool_ave"); } +OCL_TEST(Torch_Importer, run_pool_ave) +{ + runTorchNet("net_pool_ave", DNN_TARGET_OPENCL); +} + TEST(Torch_Importer, run_reshape) { runTorchNet("net_reshape"); runTorchNet("net_reshape_batch"); runTorchNet("net_reshape_single_sample"); - runTorchNet("net_reshape_channels", "", false, true); + runTorchNet("net_reshape_channels", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, run_linear) @@ -128,13 +147,19 @@ TEST(Torch_Importer, run_linear) TEST(Torch_Importer, run_paralel) { - runTorchNet("net_parallel", "l5_torchMerge"); + runTorchNet("net_parallel", DNN_TARGET_CPU, "l5_torchMerge"); } TEST(Torch_Importer, 
run_concat) { - runTorchNet("net_concat", "l5_torchMerge"); - runTorchNet("net_depth_concat", "", false, true); + runTorchNet("net_concat", DNN_TARGET_CPU, "l5_torchMerge"); + runTorchNet("net_depth_concat", DNN_TARGET_CPU, "", false, true); +} + +OCL_TEST(Torch_Importer, run_concat) +{ + runTorchNet("net_concat", DNN_TARGET_OPENCL, "l5_torchMerge"); + runTorchNet("net_depth_concat", DNN_TARGET_OPENCL, "", false, true); } TEST(Torch_Importer, run_deconv) @@ -163,37 +188,49 @@ TEST(Torch_Importer, net_softmax) runTorchNet("net_softmax_spatial"); } +OCL_TEST(Torch_Importer, net_softmax) +{ + runTorchNet("net_softmax", DNN_TARGET_OPENCL); + runTorchNet("net_softmax_spatial", DNN_TARGET_OPENCL); +} + TEST(Torch_Importer, net_logsoftmax) { runTorchNet("net_logsoftmax"); runTorchNet("net_logsoftmax_spatial"); } +OCL_TEST(Torch_Importer, net_logsoftmax) +{ + runTorchNet("net_logsoftmax", DNN_TARGET_OPENCL); + runTorchNet("net_logsoftmax_spatial", DNN_TARGET_OPENCL); +} + TEST(Torch_Importer, net_lp_pooling) { - runTorchNet("net_lp_pooling_square", "", false, true); - runTorchNet("net_lp_pooling_power", "", false, true); + runTorchNet("net_lp_pooling_square", DNN_TARGET_CPU, "", false, true); + runTorchNet("net_lp_pooling_power", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, net_conv_gemm_lrn) { - runTorchNet("net_conv_gemm_lrn", "", false, true); + runTorchNet("net_conv_gemm_lrn", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, net_inception_block) { - runTorchNet("net_inception_block", "", false, true); + runTorchNet("net_inception_block", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, net_normalize) { - runTorchNet("net_normalize", "", false, true); + runTorchNet("net_normalize", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, net_padding) { - runTorchNet("net_padding", "", false, true); - runTorchNet("net_spatial_zero_padding", "", false, true); + runTorchNet("net_padding", DNN_TARGET_CPU, "", false, true); + 
 runTorchNet("net_spatial_zero_padding", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, ENet_accuracy) @@ -245,6 +282,62 @@ TEST(Torch_Importer, OpenFace_accuracy) normAssert(out, outRef); } +OCL_TEST(Torch_Importer, OpenFace_accuracy) +{ + const string model = findDataFile("dnn/openface_nn4.small2.v1.t7", false); + Net net = readNetFromTorch(model); + + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(DNN_TARGET_OPENCL); + + Mat sample = imread(findDataFile("cv/shared/lena.png", false)); + Mat sampleF32(sample.size(), CV_32FC3); + sample.convertTo(sampleF32, sampleF32.type()); + sampleF32 /= 255; + resize(sampleF32, sampleF32, Size(96, 96), 0, 0, INTER_NEAREST); + + Mat inputBlob = blobFromImage(sampleF32); + + net.setInput(inputBlob); + Mat out = net.forward(); + + Mat outRef = readTorchBlob(_tf("net_openface_output.dat"), true); + normAssert(out, outRef); +} + +OCL_TEST(Torch_Importer, ENet_accuracy) +{ + Net net; + { + const string model = findDataFile("dnn/Enet-model-best.net", false); + Ptr importer = createTorchImporter(model, true); + ASSERT_TRUE(importer != NULL); + importer->populateNet(net); + } + + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(DNN_TARGET_OPENCL); + + Mat sample = imread(_tf("street.png", false)); + Mat inputBlob = blobFromImage(sample, 1./255); + + net.setInput(inputBlob, ""); + Mat out = net.forward(); + Mat ref = blobFromNPY(_tf("torch_enet_prob.npy", false)); + // Due to numerical instability in Pooling-Unpooling layers (indexes jittering) + // thresholds for ENet must be changed. Accuracy of results was checked on + // Cityscapes dataset and difference in mIOU with Torch is 10E-4% + normAssert(ref, out, "", 0.00044, 0.44); + + const int N = 3; + for (int i = 0; i < N; i++) + { + net.setInput(inputBlob, ""); + Mat out = net.forward(); + normAssert(ref, out, "", 0.00044, 0.44); + } +} + } #endif