From 5578ad5e14fe1fcf4d7171f74c665db2a578187f Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 4 Sep 2021 16:27:51 +0000 Subject: [PATCH] dnn(ocl): fix automatic globalsize adjusting - if kernel code doesn't support that --- modules/core/include/opencv2/core/ocl.hpp | 20 +++++++++++++++++-- modules/core/src/ocl.cpp | 8 ++++++++ modules/dnn/src/layers/batch_norm_layer.cpp | 2 +- modules/dnn/src/layers/mvn_layer.cpp | 2 +- modules/dnn/src/layers/slice_layer.cpp | 2 +- .../dnn/src/ocl4dnn/src/math_functions.cpp | 1 + .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp | 19 +++++++++--------- .../dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp | 2 +- modules/dnn/src/opencl/gemm_image.cl | 10 ++++++++++ 9 files changed, 50 insertions(+), 16 deletions(-) diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index f03de180fc..b51b39359d 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -562,7 +562,9 @@ public: i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); i = set(i, a11); i = set(i, a12); i = set(i, a13); i = set(i, a14); set(i, a15); return *this; } - /** @brief Run the OpenCL kernel. + + /** @brief Run the OpenCL kernel (globalsize value may be adjusted) + @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3. @param globalsize work items for each dimension. It is not the final globalsize passed to OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding @@ -571,12 +573,26 @@ public: @param localsize work-group size for each dimension. @param sync specify whether to wait for OpenCL computation to finish before return. @param q command queue + + @note Use run_() if your kernel code doesn't support adjusted globalsize. */ bool run(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue()); + + /** @brief Run the OpenCL kernel + * + * @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3. + * @param globalsize work items for each dimension. This value is passed to OpenCL without changes. + * @param localsize work-group size for each dimension. + * @param sync specify whether to wait for OpenCL computation to finish before return. + * @param q command queue + */ + bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue()); + bool runTask(bool sync, const Queue& q=Queue()); - /** @brief Similar to synchronized run() call with returning of kernel execution time + /** @brief Similar to synchronized run_() call with returning of kernel execution time + * * Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE) * @return Execution time in nanoseconds or negative number on error */ diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index daf4fcd280..a550c1d91a 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -3160,6 +3160,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], } +bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[], + bool sync, const Queue& q) +{ + CV_Assert(p); + return p->run(dims, _globalsize, _localsize, sync, NULL, q); +} + + static bool isRaiseErrorOnReuseAsyncKernel() { static bool initialized = false; diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index 42676c7938..dcb4005975 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -231,7 +231,7 @@ public: kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight)); kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias)); kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst)); - bool ret = kernel.run(2, global, NULL, false); + bool ret = kernel.run_(2, global, NULL, false); if (!ret) return false; } diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp index 8f06216df1..de2b0d5690 100644 --- a/modules/dnn/src/layers/mvn_layer.cpp +++ b/modules/dnn/src/layers/mvn_layer.cpp @@ -191,7 +191,7 @@ public: k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight)); k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias)); k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat)); - ret = k1.run(1, globalsize, localsize, false); + ret = k1.run_(1, globalsize, localsize, false); if (!ret) return false; } diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 507964edf9..16f1958879 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -482,7 +482,7 @@ public: ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output) ) - .run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false); + .run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false); if (!ret) return false; } // for outputs.size() diff --git a/modules/dnn/src/ocl4dnn/src/math_functions.cpp b/modules/dnn/src/ocl4dnn/src/math_functions.cpp index 855a21e08f..c924d66b12 100644 --- a/modules/dnn/src/ocl4dnn/src/math_functions.cpp +++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp @@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset, .args( ocl::KernelArg::PtrReadOnly(buffer), image, offset, + padded_width, padded_height, width, height, ld) .run(2, global_copy, NULL, false); diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index b4477ebfc4..3b73da801c 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -769,12 +769,11 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, swizzled_factor ); - size_t global_work_size_copy[3] = { - (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 }; + size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) }; - if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false)) + if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false)) { - std::cout << "Swizzle kernel run failed." << std::endl; + CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed"); return false; } } else { @@ -937,7 +936,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, kernel.set(argIdx++, (uint16_t)height_); kernel.set(argIdx++, (uint16_t)output_w_); kernel.set(argIdx++, (uint16_t)output_h_); - if (!kernel.run(3, config->global_work_size, config->local_work_size, false)) + if (!kernel.run_(3, config->global_work_size, config->local_work_size, false)) { std::cout << "IDLF kernel run failed." << std::endl; return false; @@ -1056,7 +1055,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, gy = alignSize(gy, blockK); size_t global_size[3] = { gx, gy, config->global_work_size[2] }; - if (!kernel.run(3, global_size, config->local_work_size, false)) + if (!kernel.run_(3, global_size, config->local_work_size, false)) { std::cout << "GEMM like kernel run failed." << std::endl; return false; @@ -1085,9 +1084,9 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, global_size[1] = output_h_; global_size[2] = num_output_ * num_; - if (!kernel.run(3, global_size, NULL, false)) + if (!kernel.run_(3, global_size, NULL, false)) { - std::cout << "DWCONV kernel run failed." << std::endl; + CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed"); return false; } } else { @@ -1127,11 +1126,11 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, kernel.set(argIdx++, (uint16_t)output_h_); kernel.set(argIdx++, (uint16_t)pad_w_); kernel.set(argIdx++, (uint16_t)pad_h_); - if (!kernel.run(3, config->global_work_size, + if (!kernel.run_(3, config->global_work_size, (config->use_null_local) ? NULL : config->local_work_size, false)) { - std::cout << "Basic kernel run failed." << std::endl; + CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed"); return false; } } diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp index 78576711a7..7b32189fdc 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp @@ -127,7 +127,7 @@ bool OCL4DNNSoftmax::Forward(const UMat& bottom, UMat& top) oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); } - ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false); + ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false); } return ret; } diff --git a/modules/dnn/src/opencl/gemm_image.cl b/modules/dnn/src/opencl/gemm_image.cl index 710637a093..f6e0020d82 100644 --- a/modules/dnn/src/opencl/gemm_image.cl +++ b/modules/dnn/src/opencl/gemm_image.cl @@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)( { const int gidx = get_global_id(0); const int gidy = get_global_id(1); + + if (gidx >= width || gidy >= height) + return; + int2 coord_dst = (int2)(gidx, gidy); __global Dtype* A_off = A + offA; Dtype srcA = A_off[gidy * ldA + gidx]; @@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)( __global Dtype* A, __write_only image2d_t ImA, int offA, + int padded_width, + int padded_height, int width, int height, int ldA) { const int gidx = get_global_id(0); const int gidy = get_global_id(1); + + if (gidx >= padded_width || gidy >= padded_height) + return; + int2 coord_dst = (int2)(gidx, gidy); #if TYPE == TYPE_HALF if (gidx >= width || gidy >= height) {