Merge pull request #20655 from alalek:dnn_ocl_fix_globalsize

pull/20661/head
Alexander Alekhin 4 years ago
commit 1e0d290f2e
  1. 20
      modules/core/include/opencv2/core/ocl.hpp
  2. 8
      modules/core/src/ocl.cpp
  3. 2
      modules/dnn/src/layers/batch_norm_layer.cpp
  4. 2
      modules/dnn/src/layers/mvn_layer.cpp
  5. 2
      modules/dnn/src/layers/slice_layer.cpp
  6. 1
      modules/dnn/src/ocl4dnn/src/math_functions.cpp
  7. 19
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
  8. 2
      modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
  9. 10
      modules/dnn/src/opencl/gemm_image.cl

@ -562,7 +562,9 @@ public:
i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); i = set(i, a11); i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); i = set(i, a11);
i = set(i, a12); i = set(i, a13); i = set(i, a14); set(i, a15); return *this; i = set(i, a12); i = set(i, a13); i = set(i, a14); set(i, a15); return *this;
} }
/** @brief Run the OpenCL kernel.
/** @brief Run the OpenCL kernel (globalsize value may be adjusted)
@param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3. @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
@param globalsize work items for each dimension. It is not the final globalsize passed to @param globalsize work items for each dimension. It is not the final globalsize passed to
OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
@ -571,12 +573,26 @@ public:
@param localsize work-group size for each dimension. @param localsize work-group size for each dimension.
@param sync specify whether to wait for OpenCL computation to finish before return. @param sync specify whether to wait for OpenCL computation to finish before return.
@param q command queue @param q command queue
@note Use run_() if your kernel code doesn't support adjusted globalsize.
*/ */
bool run(int dims, size_t globalsize[], bool run(int dims, size_t globalsize[],
size_t localsize[], bool sync, const Queue& q=Queue()); size_t localsize[], bool sync, const Queue& q=Queue());
/** @brief Run the OpenCL kernel
*
* @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
* @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
* @param localsize work-group size for each dimension.
* @param sync specify whether to wait for OpenCL computation to finish before return.
* @param q command queue
*/
bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
bool runTask(bool sync, const Queue& q=Queue()); bool runTask(bool sync, const Queue& q=Queue());
/** @brief Similar to synchronized run() call with returning of kernel execution time /** @brief Similar to synchronized run_() call with returning of kernel execution time
*
* Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE) * Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
* @return Execution time in nanoseconds or negative number on error * @return Execution time in nanoseconds or negative number on error
*/ */

@ -3160,6 +3160,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
} }
bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[],
bool sync, const Queue& q)
{
CV_Assert(p);
return p->run(dims, _globalsize, _localsize, sync, NULL, q);
}
static bool isRaiseErrorOnReuseAsyncKernel() static bool isRaiseErrorOnReuseAsyncKernel()
{ {
static bool initialized = false; static bool initialized = false;

@ -231,7 +231,7 @@ public:
kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight)); kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias)); kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst)); kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
bool ret = kernel.run(2, global, NULL, false); bool ret = kernel.run_(2, global, NULL, false);
if (!ret) if (!ret)
return false; return false;
} }

@ -191,7 +191,7 @@ public:
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight)); k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias)); k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat)); k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
ret = k1.run(1, globalsize, localsize, false); ret = k1.run_(1, globalsize, localsize, false);
if (!ret) if (!ret)
return false; return false;
} }

@ -482,7 +482,7 @@ public:
ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrReadOnly(input),
ocl::KernelArg::PtrWriteOnly(output) ocl::KernelArg::PtrWriteOnly(output)
) )
.run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false); .run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
if (!ret) if (!ret)
return false; return false;
} // for outputs.size() } // for outputs.size()

@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
.args( .args(
ocl::KernelArg::PtrReadOnly(buffer), ocl::KernelArg::PtrReadOnly(buffer),
image, offset, image, offset,
padded_width, padded_height,
width, height, width, height,
ld) ld)
.run(2, global_copy, NULL, false); .run(2, global_copy, NULL, false);

@ -769,12 +769,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
swizzled_factor swizzled_factor
); );
size_t global_work_size_copy[3] = { size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) };
(size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false)) if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false))
{ {
std::cout << "Swizzle kernel run failed." << std::endl; CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed");
return false; return false;
} }
} else { } else {
@ -937,7 +936,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
kernel.set(argIdx++, (uint16_t)height_); kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_); kernel.set(argIdx++, (uint16_t)output_w_);
kernel.set(argIdx++, (uint16_t)output_h_); kernel.set(argIdx++, (uint16_t)output_h_);
if (!kernel.run(3, config->global_work_size, config->local_work_size, false)) if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
{ {
std::cout << "IDLF kernel run failed." << std::endl; std::cout << "IDLF kernel run failed." << std::endl;
return false; return false;
@ -1056,7 +1055,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
gy = alignSize(gy, blockK); gy = alignSize(gy, blockK);
size_t global_size[3] = { gx, gy, config->global_work_size[2] }; size_t global_size[3] = { gx, gy, config->global_work_size[2] };
if (!kernel.run(3, global_size, config->local_work_size, false)) if (!kernel.run_(3, global_size, config->local_work_size, false))
{ {
std::cout << "GEMM like kernel run failed." << std::endl; std::cout << "GEMM like kernel run failed." << std::endl;
return false; return false;
@ -1085,9 +1084,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
global_size[1] = output_h_; global_size[1] = output_h_;
global_size[2] = num_output_ * num_; global_size[2] = num_output_ * num_;
if (!kernel.run(3, global_size, NULL, false)) if (!kernel.run_(3, global_size, NULL, false))
{ {
std::cout << "DWCONV kernel run failed." << std::endl; CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed");
return false; return false;
} }
} else { } else {
@ -1127,11 +1126,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
kernel.set(argIdx++, (uint16_t)output_h_); kernel.set(argIdx++, (uint16_t)output_h_);
kernel.set(argIdx++, (uint16_t)pad_w_); kernel.set(argIdx++, (uint16_t)pad_w_);
kernel.set(argIdx++, (uint16_t)pad_h_); kernel.set(argIdx++, (uint16_t)pad_h_);
if (!kernel.run(3, config->global_work_size, if (!kernel.run_(3, config->global_work_size,
(config->use_null_local) ? NULL : config->local_work_size, (config->use_null_local) ? NULL : config->local_work_size,
false)) false))
{ {
std::cout << "Basic kernel run failed." << std::endl; CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed");
return false; return false;
} }
} }

@ -127,7 +127,7 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
} }
ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false); ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false);
} }
return ret; return ret;
} }

@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
{ {
const int gidx = get_global_id(0); const int gidx = get_global_id(0);
const int gidy = get_global_id(1); const int gidy = get_global_id(1);
if (gidx >= width || gidy >= height)
return;
int2 coord_dst = (int2)(gidx, gidy); int2 coord_dst = (int2)(gidx, gidy);
__global Dtype* A_off = A + offA; __global Dtype* A_off = A + offA;
Dtype srcA = A_off[gidy * ldA + gidx]; Dtype srcA = A_off[gidy * ldA + gidx];
@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
__global Dtype* A, __global Dtype* A,
__write_only image2d_t ImA, __write_only image2d_t ImA,
int offA, int offA,
int padded_width,
int padded_height,
int width, int width,
int height, int height,
int ldA) int ldA)
{ {
const int gidx = get_global_id(0); const int gidx = get_global_id(0);
const int gidy = get_global_id(1); const int gidy = get_global_id(1);
if (gidx >= padded_width || gidy >= padded_height)
return;
int2 coord_dst = (int2)(gidx, gidy); int2 coord_dst = (int2)(gidx, gidy);
#if TYPE == TYPE_HALF #if TYPE == TYPE_HALF
if (gidx >= width || gidy >= height) { if (gidx >= width || gidy >= height) {

Loading…
Cancel
Save