From 8c2dd5fb9a4e568466f92db5d954d5ddc351bc9d Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 8 Oct 2021 00:12:14 +0000
Subject: [PATCH] dnn(ocl4dnn): cleanup dead code, improve logging

---
 modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp   |   2 -
 .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp  | 100 ++++++------------
 modules/dnn/src/opencl/conv_layer_spatial.cl  |  39 ++-----
 3 files changed, 42 insertions(+), 99 deletions(-)
diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
index d6fb83becb..bf5fba71a1 100644
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -222,8 +222,6 @@ class OCL4DNNConvSpatial
         bool createDWConvKernel(int32_t blockWidth,
                                 int32_t blockHeight,
                                 int32_t blockDepth);
-        void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
-                             int32_t offset, int32_t size, bool write_only);
         bool convolve(const UMat &bottom, UMat &top,
                       const UMat &weight, const UMat &bias,
                       int32_t numImages,
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index 45bd249e5d..6c468fc7da 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -218,14 +218,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
 #endif
         if (!use_cache_path_)
         {
-            static int warn_ = 0;
-            if (!warn_)
-            {
-                std::cerr
-                    << "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_ << std::endl
-                    << std::endl;
-                warn_ = true;
-            }
+            CV_LOG_ONCE_ERROR(NULL, "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_);
         }
     }
 
@@ -418,7 +411,6 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
         addDef("CHANNELS", channels_ / group_);
         addDef("APPLY_BIAS", bias_term_);
         addDef("OUTPUT_Z", M_);
-        addDef("ZPAR", 1);
         setFusionDefine(fused_activ_, fused_eltwise_);
 
         src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
@@ -672,8 +664,7 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
                       int r, int c, int interleavedRows, int nonInterleavedRows,
                       int blockWidth, int rowAlignment )
 {
-    CHECK_EQ(interleavedRows % 2, 0) <<
-             "interleaveMatrix only supports even values for interleavedRows.";
+    CV_Check(interleavedRows, interleavedRows % 2 == 0, "interleaveMatrix only supports even values for interleavedRows.");
 
     size_t memSize = r * c * sizeof(float);
     size_t dstSize = memSize *
@@ -685,9 +676,12 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
     const int yStride = c * 2;
     const Dtype *pSrc = mem;
     Dtype* pDst = mem_dst;
-    for (int y = 0; y < r;) {
-        for (int rows = 0; rows < interleavedRows; rows += 2) {
-            if ( y >= r ) break;
+    for (int y = 0; y < r;)
+    {
+        for (int rows = 0; rows < interleavedRows; rows += 2)
+        {
+            if (y >= r)
+                break;
             if ((c % xStride) == 0) {
                 for (int x = 0; x < c / xStride; x++) {
                     memcpy(pDst + x * xStride * 2,                         // NOLINT
@@ -712,11 +706,14 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
             y += 2;
         }
 
-        for (int rows = 0; rows < nonInterleavedRows; rows++) {
-            if (y >= r) break;
+        for (int rows = 0; rows < nonInterleavedRows; rows++)
+        {
+            if (y >= r)
+                break;
             const int stride = rowAlignment;
             int remaining = c;
-            for (int x = 0; x < c; x += stride) {
+            for (int x = 0; x < c; x += stride)
+            {
                 if (remaining >= stride) {
                     memcpy(pDst + x * 2, pSrc + x, stride * sizeof(Dtype));    // NOLINT
                     remaining -=stride;
@@ -852,34 +849,6 @@ bool OCL4DNNConvSpatial<float>::createBasicKernel(int32_t blockWidth,
         return false;
 }
 
-template<>
-void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
-                                                int32_t offset, int32_t size, bool write_only)
-{
-    cl_mem sub_mem;
-    cl_buffer_region region;
-    cl_int err;
-    size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
-
-    region.origin = offset * element_size + buffer.offset;
-    region.size = size * element_size;
-    sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
-                                write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
-                                CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
-    if (err)
-    {
-        std::cout << "Failed to create sub buffer." << std::endl;
-        return;
-    }
-
-    int step = element_size, rows = size, cols = 1;
-    ocl::convertFromBuffer(sub_mem, step, rows, cols,
-                           (use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
-
-    //decrease ocl mem refcount
-    clReleaseMemObject(sub_mem);
-}
-
 template<>
 bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                                          const UMat &weight, const UMat &bias,
@@ -938,7 +907,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             kernel.set(argIdx++, (uint16_t)output_h_);
             if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
             {
-                std::cout << "IDLF kernel run failed." << std::endl;
+                CV_LOG_ERROR(NULL, "DNN/OpenCL: IDLF kernel run failed");
                 return false;
             }
         }
@@ -1012,7 +981,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
 
             if (!kernel.run_(3, global_size, config->local_work_size, false))
             {
-                std::cout << "GEMM like kernel run failed." << std::endl;
+                CV_LOG_ERROR(NULL, "DNN/OpenCL: GEMM like kernel run failed");
                 return false;
             }
         }
@@ -1115,14 +1084,9 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
     {
         queue = cv::ocl::Queue::getDefault();
     }
-    catch (const cv::Exception&)
+    catch (const std::exception& e)
     {
-        static int warn_ = 0;
-        if (!warn_)
-        {
-            std::cout << "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning." << std::endl;
-            warn_ = true;
-        }
+        CV_LOG_ONCE_ERROR(NULL, "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning: " << e.what());
         return 1e6;
     }
 
@@ -1326,9 +1290,9 @@ ocl::Program OCL4DNNConvSpatial<Dtype>::compileKernel()
     phash.insert(std::pair<std::string, ocl::Program>(kernel_name_, program));
     if (!program.ptr())
     {
-        std::cout << "Failed to compile kernel: " << kernel_name_
-                  << ", buildflags: " << options
-                  << ", errmsg: " << errmsg << std::endl;
+        CV_LOG_WARNING(NULL, "DNN/OpenCL: Failed to compile kernel: " << kernel_name_
+            << ", buildflags: '" << options << "', errmsg: '" << errmsg << "'"
+        );
     }
     return program;
 }
@@ -1754,7 +1718,8 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
                     fastestTime = kernelQueue[x]->executionTime;
                 }
             }
-            if (fastestKernel < 0) break;
+            if (fastestKernel < 0)
+                break;
             // Test fastest kernel
             bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop);
             if (verified == true) {
@@ -1913,17 +1878,18 @@ bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int typ
     {
         if (z == 1)
             z = 16;
-        CHECK_EQ(z == 16 || z == 8, true) << "invalid SIMD size" << std::endl;
+        CV_Check(z, z == 16 || z == 8, "DNN/OpenCL: IDLF - invalid SIMD size");
     }
     kernelQueue.clear();
     createConvolutionKernel(type, x, y, z);
-    if (kernelQueue.size() != 1) {
-        std::cerr << "Failed setup kernel by config:"
+    if (kernelQueue.size() != 1)
+    {
+        CV_LOG_ERROR(NULL, "DNN/OpenCL: Failed setup kernel by config: "
             << " x = " << x
             << " y = " << y
             << " z = " << z
             << " type = " << type
-            << std::endl;
+        );
         return false;
     }
     bestKernelConfig = kernelQueue[0];
@@ -1955,13 +1921,9 @@ bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
     {
         if (cache_path_.empty())
         {
-            static int warn_ = 0;
-            if (!warn_)
-            {
-                std::cout << "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory " << std::endl
-                          << "                 via OPENCV_OCL4DNN_CONFIG_PATH parameter." << std::endl;
-                warn_ = true;
-            }
+            CV_LOG_ONCE_WARNING(NULL, "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory "
+                "through OPENCV_OCL4DNN_CONFIG_PATH parameter."
+            );
         }
         return false;
     }
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index eb5d354020..c9ddacfb8e 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -161,23 +161,15 @@ __kernel void ConvolveBasic(
     const int out_idx = get_global_id(0);  // 1D task layout: [output_width * output_height * OUTPUT_Z]
     const int plane_size = output_width * output_height;
     const int out_plane_idx = out_idx % plane_size;
-    const int outputZ = out_idx / plane_size;
+    const int outputZ = out_idx / plane_size;  // kernelNum
     const int outputY = out_plane_idx / output_width;
     const int outputX = out_plane_idx % output_width;
-    const int kernelNum = outputZ * ZPAR;
-    if (kernelNum < OUTPUT_Z)
+    if (outputZ < OUTPUT_Z)
     {
-        Dtype sum[ZPAR];
-        for (int kern = 0; kern < ZPAR; kern++)
-        {
-            sum[kern] = 0.0f;
-        }
+        Dtype sum = 0.0f;
         const int org_y = outputY * STRIDE_Y - pad_h;
         const int org_x = outputX * STRIDE_X - pad_w;
-        const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
-#if APPLY_BIAS
-        const int biasIndex = bias_offset + kernelNum;
-#endif
+        const int currentKernelOffset = kernel_offset + outputZ*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
         const int local_image_offset = org_y * input_width + org_x;
         const int imageSize = input_width * input_height;
         __global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));
@@ -186,17 +178,13 @@ __kernel void ConvolveBasic(
         {
             for (int y = 0; y < KERNEL_HEIGHT; y++)
             {
+                int y_ = org_y + y * DILATION_Y;
                 for (int x = 0; x < KERNEL_WIDTH; x++)
                 {
-                    int y_ = org_y + y * DILATION_Y;
                     int x_ = org_x + x * DILATION_X;
-                    if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width))
-                    {
-                        continue;
-                    }
-                    for (int kern = 0; kern < ZPAR; kern++)
+                    if (y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width)
                     {
-                        sum[kern] += image_dataPtr[x * DILATION_X] * kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x];
+                        sum = mad(image_dataPtr[x * DILATION_X], kernel_dataPtr[x], sum);
                     }
                 }
                 image_dataPtr += input_width * DILATION_Y;
@@ -205,18 +193,13 @@ __kernel void ConvolveBasic(
             image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;
         }
 
-        for (int kern = 0; kern < ZPAR; kern++)
-        {
-            if (kernelNum + kern < OUTPUT_Z)
-            {
-                int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
+        int offset = convolved_image_offset + out_idx;
 #if APPLY_BIAS
-                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern], biasIndex + kern);
+        int biasIndex = bias_offset + outputZ;
+        ACTIVATION_FUNCTION(convolved_image, offset, sum + bias[biasIndex], biasIndex);
 #else
-                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern], kernelNum + kern);
+        ACTIVATION_FUNCTION(convolved_image, offset, sum, outputZ);
 #endif
-            }
-        }
     }
 }