dnn(ocl): fix automatic globalsize adjusting

- if kernel code doesn't support that
pull/20655/head
Alexander Alekhin 3 years ago
parent d11f0a709d
commit 5578ad5e14
  1. 20
      modules/core/include/opencv2/core/ocl.hpp
  2. 8
      modules/core/src/ocl.cpp
  3. 2
      modules/dnn/src/layers/batch_norm_layer.cpp
  4. 2
      modules/dnn/src/layers/mvn_layer.cpp
  5. 2
      modules/dnn/src/layers/slice_layer.cpp
  6. 1
      modules/dnn/src/ocl4dnn/src/math_functions.cpp
  7. 19
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
  8. 2
      modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
  9. 10
      modules/dnn/src/opencl/gemm_image.cl

@ -562,7 +562,9 @@ public:
i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); i = set(i, a11);
i = set(i, a12); i = set(i, a13); i = set(i, a14); set(i, a15); return *this;
}
/** @brief Run the OpenCL kernel.
/** @brief Run the OpenCL kernel (globalsize value may be adjusted)
@param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
@param globalsize work items for each dimension. It is not the final globalsize passed to
OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
@ -571,12 +573,26 @@ public:
@param localsize work-group size for each dimension.
@param sync specify whether to wait for OpenCL computation to finish before return.
@param q command queue
@note Use run_() if your kernel code doesn't support adjusted globalsize.
*/
bool run(int dims, size_t globalsize[],
size_t localsize[], bool sync, const Queue& q=Queue());
/** @brief Run the OpenCL kernel
*
* @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
* @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
* @param localsize work-group size for each dimension.
* @param sync specify whether to wait for OpenCL computation to finish before return.
* @param q command queue
*/
bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
bool runTask(bool sync, const Queue& q=Queue());
/** @brief Similar to synchronized run() call with returning of kernel execution time
/** @brief Similar to synchronized run_() call with returning of kernel execution time
*
* Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
* @return Execution time in nanoseconds or negative number on error
*/

@ -3160,6 +3160,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
}
bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[],
bool sync, const Queue& q)
{
CV_Assert(p);
return p->run(dims, _globalsize, _localsize, sync, NULL, q);
}
static bool isRaiseErrorOnReuseAsyncKernel()
{
static bool initialized = false;

@ -231,7 +231,7 @@ public:
kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
bool ret = kernel.run(2, global, NULL, false);
bool ret = kernel.run_(2, global, NULL, false);
if (!ret)
return false;
}

@ -191,7 +191,7 @@ public:
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
ret = k1.run(1, globalsize, localsize, false);
ret = k1.run_(1, globalsize, localsize, false);
if (!ret)
return false;
}

@ -482,7 +482,7 @@ public:
ocl::KernelArg::PtrReadOnly(input),
ocl::KernelArg::PtrWriteOnly(output)
)
.run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
.run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
if (!ret)
return false;
} // for outputs.size()

@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
.args(
ocl::KernelArg::PtrReadOnly(buffer),
image, offset,
padded_width, padded_height,
width, height,
ld)
.run(2, global_copy, NULL, false);

@ -769,12 +769,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
swizzled_factor
);
size_t global_work_size_copy[3] = {
(size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) };
if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false))
{
std::cout << "Swizzle kernel run failed." << std::endl;
CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed");
return false;
}
} else {
@ -937,7 +936,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_);
kernel.set(argIdx++, (uint16_t)output_h_);
if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
{
std::cout << "IDLF kernel run failed." << std::endl;
return false;
@ -1056,7 +1055,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
gy = alignSize(gy, blockK);
size_t global_size[3] = { gx, gy, config->global_work_size[2] };
if (!kernel.run(3, global_size, config->local_work_size, false))
if (!kernel.run_(3, global_size, config->local_work_size, false))
{
std::cout << "GEMM like kernel run failed." << std::endl;
return false;
@ -1085,9 +1084,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
global_size[1] = output_h_;
global_size[2] = num_output_ * num_;
if (!kernel.run(3, global_size, NULL, false))
if (!kernel.run_(3, global_size, NULL, false))
{
std::cout << "DWCONV kernel run failed." << std::endl;
CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed");
return false;
}
} else {
@ -1127,11 +1126,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
kernel.set(argIdx++, (uint16_t)output_h_);
kernel.set(argIdx++, (uint16_t)pad_w_);
kernel.set(argIdx++, (uint16_t)pad_h_);
if (!kernel.run(3, config->global_work_size,
if (!kernel.run_(3, config->global_work_size,
(config->use_null_local) ? NULL : config->local_work_size,
false))
{
std::cout << "Basic kernel run failed." << std::endl;
CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed");
return false;
}
}

@ -127,7 +127,7 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
}
ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false);
ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false);
}
return ret;
}

@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
{
const int gidx = get_global_id(0);
const int gidy = get_global_id(1);
if (gidx >= width || gidy >= height)
return;
int2 coord_dst = (int2)(gidx, gidy);
__global Dtype* A_off = A + offA;
Dtype srcA = A_off[gidy * ldA + gidx];
@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
__global Dtype* A,
__write_only image2d_t ImA,
int offA,
int padded_width,
int padded_height,
int width,
int height,
int ldA)
{
const int gidx = get_global_id(0);
const int gidy = get_global_id(1);
if (gidx >= padded_width || gidy >= padded_height)
return;
int2 coord_dst = (int2)(gidx, gidy);
#if TYPE == TYPE_HALF
if (gidx >= width || gidy >= height) {

Loading…
Cancel
Save