update convolution opencl kernels in dnn module (#11762)

* optimize ocl kernel enqueue in fc layer Signed-off-by: Li Peng <peng.li@intel.com> * use CV_LOG_INFO in convolution auto tuning Signed-off-by: Li Peng <peng.li@intel.com> * update convolution IDLF kernel extend parameter tuning range, also cleanup ocl kernel implementation Signed-off-by: Li Peng <peng.li@intel.com> * update in-memory convolution cache config fp16 and fp32 cache config are stored separately Signed-off-by: Li Peng <peng.li@intel.com>
7 years ago · ab8022f74e
parent a2bc075924
commit ab8022f74e
4 changed files with 985 additions and 586 deletions
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@ -310,7 +310,6 @@ public:
            innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
        }
        UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape inshape, outshape;
@ -320,7 +319,6 @@ public:
            UMat srcMat, dstMat;
            srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
            dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);
            dstMat.setTo(0.0f);
            if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
                                         (bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
@ -332,6 +330,7 @@ public:
            if (!use_half && bias && (outerSize > 1))
            {
                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
                UMat& biases = umat_blobs[1];
                cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
            }
@ -354,6 +353,7 @@ public:
            if (bias)
            {
                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
                UMat& biases = umat_blobs[1];
                cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
            }
--- a/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp
+++ b/modules/dnn/src/ocl4dnn/include/default_kernel_config.hpp
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@ -55,6 +55,7 @@
 #include "../include/math_functions.hpp"
 #include "../include/default_kernel_config.hpp"
 #include "opencv2/dnn/shape_utils.hpp"
 #include "opencv2/core/utils/logger.hpp"
 #if defined WIN32 || defined _WIN32
 #include <windows.h>
@ -87,10 +88,13 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
 {
    CV_Assert(defaultConfigLoaded == false);
    CV_Assert(kernelConfigMap.empty());
-    const size_t numConfigs = sizeof(default_kernel_config_intel)/sizeof(default_kernel_config_intel[0])/2;
+
    /* fp32 config */
    size_t numConfigs = sizeof(default_kernel_config_intel_fp32) /
                        sizeof(default_kernel_config_intel_fp32[0]) / 2;
    for (size_t i = 0; i < numConfigs; i++)
    {
-        std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel[2 * i];
+        std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp32[2 * i];
        if (!cache_path.empty())
        {
            std::string cacheFile = cache_path + sanitize(key);
@ -100,9 +104,29 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
        }
        std::pair<std::string, std::string> entry(
                key,
-                default_kernel_config_intel[2 * i + 1]);
+                default_kernel_config_intel_fp32[2 * i + 1]);
        kernelConfigMap.insert(entry);
    }
    /* fp16 config */
    numConfigs = sizeof(default_kernel_config_intel_fp16) /
                 sizeof(default_kernel_config_intel_fp16[0]) / 2;
    for (size_t i = 0; i < numConfigs; i++)
    {
        std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp16[2 * i];
        if (!cache_path.empty())
        {
            std::string cacheFile = cache_path + sanitize(key);
            std::ifstream cachedKernel(cacheFile.c_str());
            if (cachedKernel)
                continue;  // external configuration found, skip builtin
        }
        std::pair<std::string, std::string> entry(
                key,
                default_kernel_config_intel_fp16[2 * i + 1]);
        kernelConfigMap.insert(entry);
    }
    defaultConfigLoaded = true;
 }
@ -311,40 +335,38 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
        // options
        options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_;
        options_ << " -cl-mad-enable";
        if (clOptionSupport("-cl-no-subgroup-ifp"))
            options_ << " -cl-no-subgroup-ifp ";
        // defs
        int32_t output_width = output_w_;
        int32_t output_height = output_h_;
        int32_t output_block_width = blockM;
        int32_t output_block_height = blockK;
-        const int32_t last_block_width = (output_width % output_block_width == 0) ?
+        int tile_x = (output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_;
                                        output_block_width : output_width % output_block_width;
        const int32_t last_block_height = (output_height % output_block_height == 0) ?
                                         output_block_height : output_height % output_block_height;
        int tile_x = alignSize((output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_, 4);
        int tile_y = (output_block_height - 1) * stride_h_ + kernel_h_ * dilation_h_;
-        int tile_y_stride = (4 * simd_size) / tile_x;
+        int invec_size = tile_y;
        int invec_size = divUp(tile_y, tile_y_stride);
        addDef("SIMD_SIZE", simd_size);
        addDef("filter_qualifier", "__global");
        addDef("OUT_BLOCK_WIDTH", output_block_width);
        addDef("OUT_BLOCK_HEIGHT", output_block_height);
        addDef("LAST_BLOCK_WIDTH", last_block_width);
        addDef("LAST_BLOCK_HEIGHT", last_block_height);
        addDef("INPUT_DEPTH", channels_ / group_);
        addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
        addDef("TOTAL_OUTPUT_DEPTH", num_output_);
        addDef("NUM_FILTERS", M_);
        addDef("TILE_X", tile_x);
        addDef("TILE_Y", tile_y);
        addDef("TILE_Y_STRIDE", tile_y_stride);
        addDef("INVEC_SIZE", invec_size);
        addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
        addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
        addDef("APPLY_BIAS", bias_term_);
        addDef("WEIGHT_PREF", ((kernel_w_ * kernel_h_) == 1) ? 1 : 8);
        addDef("INPUT_PITCH", (width_ * height_));
        addDef("OUTPUT_PITCH", (output_w_ * output_h_));
        addDef("LEFT_FILTERS", ((int)alignSize(M_, simd_size) - M_));
        addDef("INPUT_WIDTH", width_);
        addDef("INPUT_HEIGHT", height_);
        addDef("FILTERS_IN_GROUP", ((int)alignSize(M_, simd_size) / simd_size));
        setFusionDefine(fused_activ_, fused_eltwise_);
        src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
@ -567,13 +589,6 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
    return;
 }
 #define dbg
 #ifdef dbg
 #define dbgPrint(x) (x)
 #else
 #define dbgPrint(x)
 #endif
 // For large enough input size, we do not need to tune kernels for different
 // size. The reason is with large input size, there will be enough work items
 // to feed al the EUs.
@ -584,6 +599,7 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
 template<typename Dtype>
 void OCL4DNNConvSpatial<Dtype>::generateKey()
 {
    std::string precision = (use_half_) ? "FP16" : "FP32";
    std::stringstream keyBuilder;
    // FIXME: to support fuse?
    keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_"
@ -597,7 +613,8 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
               << "num" << num_ << "_"
               << "M" << M_ << "_"
               << "activ" << fused_activ_ << "_"
-               << "eltwise" << fused_eltwise_;
+               << "eltwise" << fused_eltwise_ << "_"
               << precision;
    key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
@ -616,11 +633,6 @@ std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t
               << "_" << blockHeight
               << "_" << blockDepth;
    if (!use_half_)
        keyBuilder << "_float";
    else
        keyBuilder << "_half";
    return keyBuilder.str();
 }
@ -1164,7 +1176,7 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
    cv::ocl::Timer timer(queue);
    timer.start();
    bool res = true;;
-    dbgPrint(std::cout << "Benchmarking kernel: " << config->kernelName << std::endl);
+    CV_LOG_INFO(NULL, "Benchmarking kernel: " << config->kernelName);
    tuned_ = true;
    int loop_cnt = 4;
    for (int i = 0; i < loop_cnt; i++) {
@ -1181,7 +1193,6 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
    }
    float elapsedTime = timer.durationNS() * 1e-6 / loop_cnt;
    #ifdef dbg
    double out_w = output_w_;
    double out_h = output_h_;
    double out_z = M_;
@ -1189,16 +1200,8 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
    double k_h = kernel_h_;
    double k_z = channels_;
    double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
-    std::cout << "\tEstimated Gflops:" << (totalFlops * 1e-9)
+    CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
-              << std::endl;
+    CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
    std::cout << "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime))
              << std::endl;
    #if 0
    std::cout << "Estimated utilization: " <<
        ((((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime))/880.0
        << std::endl;
    #endif
    #endif
    return elapsedTime;
 }
@ -1254,18 +1257,18 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
                        if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
                            error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
                        {
-                            dbgPrint(printf("test verification failed @ image %d group %d"
+                            CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
-                                            "out_ch %d h %d w %d got %G expected %G\n",
+                                         << " out_ch " << out_ch << " h " << h << " w " << w
-                                            n, g, out_ch, h, w, data[offset], verify_data[offset]));
+                                         << " got " << data[offset] << " expected " << verify_data[offset]);
                            verificationFail = 1;
                            goto out;
                        }
                        else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
                                 !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
                        {
-                            dbgPrint(printf("test verification failed @ image %d group %d"
+                            CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
-                                            "out_ch %d h %d w %d got %G expected %G\n",
+                                         << " out_ch " << out_ch << " h " << h << " w " << w
-                                            n, g, out_ch, h, w, data[offset], verify_data[offset]));
+                                         << " got " << data[offset] << " expected " << verify_data[offset]);
                            verificationFail = 1;
                            goto out;
                        }
@ -1546,17 +1549,11 @@ void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tu
        return;
    int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
-    int tile_x = alignSize(actual_tile_x, 4);
+    int tile_x = alignSize(actual_tile_x, simd_size);
-    int tile_y = kernel_h_ * dilation_h_ + (blockK - 1) * stride_h_;
+    if (tile_x > simd_size)
    if (tile_x > (4 * simd_size))
        return;
    if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max)
        return;
-    int tile_y_stride = (4 * simd_size) / tile_x;
+    if (blockM * blockK > block_size_max)
    int invec_size = divUp(tile_y, tile_y_stride);
    if (invec_size > 4)
        return;
    tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
@ -1599,11 +1596,7 @@ void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerPar
                for (uint32_t height = height_max; height > 0; height--)
                {
                    generate_idlf_tuneritems(tunerItems, width, height, simd_size);
                    if (tunerItems.size() >= 8 && height == 2)
                        break;
                }
                if (tunerItems.size() >= 12 && width == 2)
                    break;
            }
        }
    }
@ -1690,10 +1683,8 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
        if (kernelQueue[x]->tested == false) {
            bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop);
            if (verified == false) {
-                dbgPrint(std::cout << "Kernel "
+                CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[x]->kernelName << " failed verification");
-                         << kernelQueue[x]->kernelName
+                CV_LOG_ERROR(NULL, "kernelQueue[x]->workItem_output[0]: "
                         << " failed verification" << std::endl);
                dbgPrint(std::cout << "kernelQueue[x]->workItem_output[0]: "
                             << kernelQueue[x]->workItem_output[0] << " "
                             << "kernelQueue[x]->workItem_output[1]: "
                             << kernelQueue[x]->workItem_output[1] << " "
@ -1714,11 +1705,9 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
                             << "kernelQueue[x]->local_work_size[2]: "
                             << kernelQueue[x]->local_work_size[2] << " "
                             << kernelQueue[x]->swizzle_weights << " "
-                         << kernelQueue[x]->use_null_local << std::endl);
+                             << kernelQueue[x]->use_null_local);
            } else {
-                dbgPrint(std::cout << "Kernel "
+                CV_LOG_INFO(NULL, "Kernel " << kernelQueue[x]->kernelName << " pass verification");
                         << kernelQueue[x]->kernelName
                         << " pass verification" << std::endl);
            }
        }
        #endif
@ -1747,19 +1736,28 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
                break;
            } else {
                kernelQueue[fastestKernel]->tested = true;
-                dbgPrint(std::cout << "Kernel " <<
+                CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName <<
-                         kernelQueue[fastestKernel]->kernelName <<
+                             " failed verification");
                         " failed verification" << std::endl);
                failures++;
            }
        }
    }
    if (verification) {
-        dbgPrint(std::cout << "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
+        CV_LOG_INFO(NULL, "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
-                 "> passed verification" << std::endl);
+                    "> passed verification");
-        dbgPrint(std::cout << "Convolution Time:" << kernelQueue[kernel_index_]->executionTime << std::endl);
+        CV_LOG_INFO(NULL, "Convolution Time:" << kernelQueue[kernel_index_]->executionTime);
        double out_w = output_w_;
        double out_h = output_h_;
        double out_z = M_;
        double k_w = kernel_w_;
        double k_h = kernel_h_;
        double k_z = channels_;
        float elapsedTime = kernelQueue[kernel_index_]->executionTime;
        double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
        CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
        CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
    } else {
-        dbgPrint(std::cout << "fallback to basic kernel" << std::endl);
+        CV_LOG_INFO(NULL, "fallback to basic kernel");
        options_.str(""); options_.clear(); // clear contents and state flags
        createBasicKernel(1, 1, 1);
        kernel_index_ = kernelQueue.size() - 1;
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@ -206,8 +206,6 @@ __kernel void ConvolveBasic(
 #elif defined KERNEL_IDLF
 #define VLOAD4(_v, _p) do { _v = vload4(0, _p); } while(0)
 // Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
 // Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.
 // NDRange:  (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
@ -219,124 +217,76 @@ __kernel void
 convolve_simd(
    ELTWISE_DATA_ARG
    FUSED_ARG
-    __global Dtype* inputs_base,
+    __global Dtype* inputs,
-    filter_qualifier Dtype* weights_base,
+    __global Dtype* weights,
    BIAS_KERNEL_ARG
-    __global Dtype* outputs_base,
+    __global Dtype* outputs,
    const ushort input_width,
    const ushort input_height,
    const ushort output_width,
    const ushort output_height)
 {
  __global Dtype* outputs = outputs_base;
  __global Dtype* inputs = inputs_base;
  filter_qualifier Dtype* weights = weights_base;
  unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
  unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
  unsigned int fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth
  unsigned int fmg = get_group_id(2);
  unsigned int lid = get_local_id(2);
-  Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT];
+  Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };
  int in_addr;
  // find weights address of given neuron (lid is index)
-  unsigned int weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE)) * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
+  unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *
                             INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
-  for(int i=0;i<OUT_BLOCK_SIZE;i++) {
+  unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;
    out[i]=0.0f;
  }
-  unsigned int num_in_batch = ( fm ) / ALIGNED_NUM_FILTERS;
+  unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;
-  unsigned int input_batch_offset = num_in_batch * input_height * input_width * TOTAL_INPUT_DEPTH_SIZE;
+  int curr_y = or * STRIDE_Y;
-
+  int curr_x = oc * STRIDE_X + lid;
  int curr_local_y = ( lid / ( TILE_X / 4 ) );
  int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4;
  int curr_y = or * STRIDE_Y + curr_local_y;
  int curr_x = oc * STRIDE_X + curr_local_x;
 #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
  int saved_y = curr_y;
 #endif
-  in_addr = input_batch_offset
+  int in_addr = input_batch_offset
-            +  (curr_y - INPUT_PAD_H) * input_width             // y tile offset
+                +  (curr_y - INPUT_PAD_H) * INPUT_WIDTH          // y tile offset
                +   curr_x - INPUT_PAD_W;                        // x tile offset
-  union {
+
-    Dtype4 in_vec[INVEC_SIZE];
+  Dtype in_buf[INVEC_SIZE];
    Dtype in_array[INVEC_SIZE * 4];
  } in_buf;
  for(int kd = 0; kd < INPUT_DEPTH; kd++)
  {
    int in_offset = in_addr;
-    int reg = 0;
+    __attribute__((opencl_unroll_hint(INVEC_SIZE)))
-    LOOP(INVEC_SIZE, reg,
+    for (int reg = 0; reg < INVEC_SIZE; reg++)
    {
-        if (curr_local_y + reg * TILE_Y_STRIDE < TILE_Y || INVEC_SIZE * TILE_Y_STRIDE <= (TILE_Y + 2) || reg < INVEC_SIZE - 1) {
+        in_buf[reg] = inputs[in_offset];
 #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
-        if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) {
+        if (!(curr_y >= INPUT_PAD_H && curr_y < INPUT_HEIGHT + INPUT_PAD_H &&
-          if (curr_x < INPUT_PAD_W) {
+              curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W))
-            in_buf.in_vec[reg].s0 = 0;
+        {
-            if (curr_x + 1 >= INPUT_PAD_W && curr_x + 1 < input_width + INPUT_PAD_W)
+          in_buf[reg] = 0;
-              in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1);
+        }
            else
              in_buf.in_vec[reg].s1 = 0;
            if (curr_x + 2 >= INPUT_PAD_W && curr_x + 2 < input_width + INPUT_PAD_W)
              in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2);
            else
              in_buf.in_vec[reg].s2 = 0;
            if (curr_x + 3 < input_width + INPUT_PAD_W)
              in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3);
            else
              in_buf.in_vec[reg].s3 = 0;
          } else {
            VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
            if (curr_x + 1 >= input_width + INPUT_PAD_W)
              in_buf.in_vec[reg].s1 = 0;
            if (curr_x + 2 >= input_width + INPUT_PAD_W)
              in_buf.in_vec[reg].s2 = 0;
            if (curr_x + 3 >= input_width + INPUT_PAD_W)
              in_buf.in_vec[reg].s3 = 0;
          }
        } else {
          in_buf.in_vec[reg] = 0;
        }
        curr_y += TILE_Y_STRIDE;
 #else
        VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
 #endif
        curr_y += 1;
        in_offset += INPUT_WIDTH;
    }
-        in_offset += input_width * TILE_Y_STRIDE;
+
-      });
+    in_addr += INPUT_PITCH;
-    in_addr += input_height * input_width;
+
 #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
    curr_y = saved_y;
 #endif
-#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
+    Dtype weight_buf[WEIGHT_PREF];
 #define WEIGHT_PREF 8
 #else
 #define WEIGHT_PREF 1
 #endif
    union {
      Dtype w[WEIGHT_PREF];
 #if KERNEL_WIDTH * KERNEL_HEIGHT != 1
      INT_TYPE8 ui8;
 #endif
    } weight_buf;
    int w_idx=0;
-    unsigned int orig_weight_addr = weight_addr;
+    for (int i = 0; i < WEIGHT_PREF; i++)
-#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
+    {
-    weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
+        weight_buf[i] = weights[weight_addr];
-    weight_addr += SIMD_SIZE * WEIGHT_PREF;
+        weight_addr += SIMD_SIZE;
-#else
+    }
    weight_buf.w[0] = as_Dtype(SUB_GROUP_BLOCK_READ((__global INT_TYPE *)&weights[weight_addr]));
    weight_addr += SIMD_SIZE * 1;
 #endif
-#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4))
+#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))
    int kr = 0;  // kr = Kernel Row
    LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop.
@ -344,51 +294,29 @@ convolve_simd(
        int kc = 0;  // kc = Kernel Column
        LOOP(KERNEL_WIDTH, kc,
        {
-                for(int br=0; br < OUT_BLOCK_HEIGHT; br++) {
+            for (int br=0; br < OUT_BLOCK_HEIGHT; br++)
-                  for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) {
+            {
-                    Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y) * TILE_X + bc * STRIDE_X + kc * DILATION_X);
+                for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)
-                    out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
+                {
-                  }
+                    Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);
                    out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
                }
 #if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF
                // We assume KERNEL_W is equal to KERNEL_H here.
                if ((w_idx + 1) % WEIGHT_PREF == 0
                #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0
                && ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))
                #endif
                    ) {
                  weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
                  weight_addr += SIMD_SIZE * WEIGHT_PREF;  // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
            }
-              #if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0
+            weight_buf[w_idx % WEIGHT_PREF] = weights[weight_addr];
-                // need to do nothing
+            weight_addr += SIMD_SIZE;
              #else
                else if ((w_idx + 1) %  WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)))
                #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1
                  weight_buf.w[0] = weights[weight_addr];
                #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2
                  weight_buf.ui8.s01 = SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)&weights[weight_addr]);
                #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4
                  weight_buf.ui8.s0123 = SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)&weights[weight_addr]);
                #else
                  weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
                #endif
              #endif
 #endif
            ++w_idx;
        });
    });
-    weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE;
+    weight_addr -= WEIGHT_PREF * SIMD_SIZE;
  }
  // dead code to work around possible compiler bug.
  if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) {
    outputs[0] = BLOCK_IN(fm % SIMD_SIZE);
  }
  fm = fm % ALIGNED_NUM_FILTERS;
-  if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) {
+#if LEFT_FILTERS > 0
-  unsigned int out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height;
+  if (fm < NUM_FILTERS)
 #endif
  {
    unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;
    out_addr += or * output_width + oc;
    // we need this address calculation for biases because we support views and batching
 #if APPLY_BIAS
@ -396,13 +324,16 @@ convolve_simd(
 #else
    Dtype bias = 0;
 #endif
-    for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) {
+
    for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)
    {
      if (r + or >= output_height) break;
-      for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) {
+      for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)
      {
        if (c + oc >= output_width) break;
-        // this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
+        // this does a scattered write to SIMD_SIZE different feature maps,
        // so that data within one map is contiguous, thus ready for input to next layer.
        ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);
      }
    }
  }