diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index 3b73da801c..5eee1da4a0 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -945,9 +945,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
     } else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) {
         if (!swizzleWeight(weight, config->workItem_output[1], true))
             return false;
+#if 0
         size_t total_bottom_size = bottom_dim_ * numImages;
         size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
         size_t total_bias_size = M_ * group_;
+#endif
         size_t total_top_size = top_dim_ * numImages;
         for (int32_t g = 0; g < group_; ++g) {
             bias_offset = M_ * g;
@@ -960,72 +962,25 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                 return false;
 
             cl_uint argIdx = 0;
-            setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
-
-            UMat img_buffer;
-            if (image_offset)
-            {
-                CreateSubBuffer(bottom, img_buffer, image_offset,
-                                total_bottom_size - image_offset, false);
-                if (img_buffer.empty())
-                    return false;
-
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            }
+            setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
 
-            UMat kernel_buffer;
-            if (kernel_offset)
-            {
-                CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
-                                total_kernel_size - kernel_offset, false);
-                if (kernel_buffer.empty())
-                    return false;
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+            kernel.set(argIdx++, (int)image_offset);
+            kernel.set(argIdx++, (int)(bottom.total() - image_offset));
 
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
-            }
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
+            kernel.set(argIdx++, (int)kernel_offset);
+            kernel.set(argIdx++, (int)(swizzled_weights_umat.total() - kernel_offset));
 
-            UMat bias_buffer;
             if (bias_term_)
             {
-                if (bias_offset)
-                {
-                    CreateSubBuffer(bias, bias_buffer, bias_offset,
-                                    total_bias_size - bias_offset, false);
-                    if (bias_buffer.empty())
-                        return false;
-
-                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
-                }
-                else
-                {
-                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
-                }
+                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
+                kernel.set(argIdx++, (int)bias_offset);
             }
 
-            UMat out_buffer;
-            if (output_image_offset)
-            {
-                CreateSubBuffer(top, out_buffer, output_image_offset,
-                                total_top_size - output_image_offset, true);
-                if (out_buffer.empty())
-                    return false;
-
-                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
-                kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
-                kernel.set(argIdx++, (int)(top.offset / element_size));
-            }
+            kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
+            kernel.set(argIdx++, (int)total_top_size - (int)(top.offset / element_size));
 
             kernel.set(argIdx++, (uint16_t)width_);
             kernel.set(argIdx++, (uint16_t)height_);
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index 55015557a0..e7bbacd4c4 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -401,13 +401,12 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
 #define ROW_PITCH input_width
 
 #define GEMM_LIKE_KERNEL_ARGS     \
-    ELTWISE_DATA_ARG              \
+    ELTWISE_DATA_ARG_WITH_OFFSET  \
     FUSED_ARG                     \
-    const __global Dtype *src0,   \
-    const __global Dtype *src1,   \
-    BIAS_KERNEL_ARG               \
-    __global Dtype *dst_base,     \
-    const int dst_offset,         \
+    const __global Dtype *src0_ptr, const unsigned int src0_offset, const unsigned int src0_limit, \
+    const __global Dtype *src1_ptr, const unsigned int src1_offset, const unsigned int src1_limit, \
+    BIAS_KERNEL_ARG_WITH_OFFSET   \
+    __global Dtype *dst_base, const unsigned int dst_offset, const unsigned int dst_limit, \
     const ushort input_width,     \
     const ushort input_height,    \
     const ushort output_width,    \
@@ -437,7 +436,17 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
 __attribute__((intel_reqd_sub_group_size(8)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
     __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -460,6 +469,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
     }
     typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
 
+// U_GEMM_LIKE_CONV_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32_5_1_8_32_SIMD8 doesn't run properly (src0_read out of bounds)
+// Test: DNNTestNetwork.AlexNet/0 (to run all kernels use OPENCV_OCL4DNN_FORCE_AUTO_TUNING=1)
+#if 0 // INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
+  #define OPTIMIZE_READ 1
+#else
+  #define OPTIMIZE_READ 0
+#endif
+
     // True for all threads if filter_width is multiple of TILE_N
     // else, true for all but right-most column of threads.
     if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
@@ -476,7 +493,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         // atile is M rows x K columns.
         int curr_x = ( global_y % output_width ) * STRIDE_X;
         int curr_y = ( global_y / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
         int saved_y = curr_y;
 #endif
         const __global Dtype *src0_read = src0
@@ -496,7 +513,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         do
         {
             int patch_row = 0;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
             curr_y = saved_y;
 #endif
 
@@ -514,11 +531,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
                 // ...
                 const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
 
-#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
+#if OPTIMIZE_READ
   #if KERNEL_WIDTH == 3
                 Dtype_t blockA00 = vload3(0, src0_read);
                 Dtype*  pblockA00 = (Dtype*)(&blockA00);
   #else
+    #if 0 // debug
+                if ((int)(src0_read - src0) >= src0_limit - KERNEL_WIDTH)
+                {
+                    printf("CATCH: src0_read-src0: %d   limit=%d   curr_y,curr_x=%d,%d\n", (int)(src0_read - src0), src0_limit, curr_y, curr_x);
+                }
+    #endif
                 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
                 Dtype*  pblockA00 = (Dtype*)(&blockA00);
   #endif
@@ -639,7 +662,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         // atile is M rows x K columns.
         int curr_x = ( global_y % output_width ) * STRIDE_X;
         int curr_y = ( global_y / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
         int saved_y = curr_y;
 #endif
         const __global Dtype *src0_read = src0
@@ -659,14 +682,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         do
         {
             int patch_row = 0;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
             curr_y = saved_y;
 #endif
             do
             {
                 // Load atile and interleaved btile.
                 const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
-#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
+#if OPTIMIZE_READ
                 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
                 Dtype*  pblockA00 = (Dtype*)(&blockA00);
 #else
@@ -803,7 +826,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
             }
         }
     }
-#endif
+#endif  // TILE_N_LAST > 0
 }
 #endif
 #ifdef GEMM_LIKE_CONV_32_2
@@ -826,7 +849,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(8)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
     __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -1388,7 +1421,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(16)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
     __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -1574,7 +1617,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(16)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
     __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);