Merge pull request #20651 from alalek:issue_18361

pull/20655/head
Alexander Alekhin 4 years ago
commit 0a43b23275
  1. 2
      modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
  2. 78
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
  3. 25
      modules/dnn/src/opencl/conv_layer_spatial.cl

@ -269,7 +269,7 @@ class OCL4DNNConvSpatial
void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems, void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
int blockM, int blockK, int simd_size); int blockM, int blockK, int simd_size);
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise); void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx); void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx);
int32_t group_; int32_t group_;
bool bias_term_; bool bias_term_;

@ -270,17 +270,21 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
} }
template<typename Dtype> template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx) void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx)
{ {
if (fused_eltwise) if (fused_eltwise)
kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ)); {
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_));
if (fused_eltwise_offset >= 0)
kernel.set(argIdx++, fused_eltwise_offset);
}
switch (fused_activ) { switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU: case OCL4DNN_CONV_FUSED_ACTIV_RELU:
kernel.set(argIdx++, (float)negative_slope_); kernel.set(argIdx++, (float)negative_slope_);
break; break;
case OCL4DNN_CONV_FUSED_ACTIV_PRELU: case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_));
break; break;
case OCL4DNN_CONV_FUSED_ACTIV_POWER: case OCL4DNN_CONV_FUSED_ACTIV_POWER:
kernel.set(argIdx++, (float)power_); kernel.set(argIdx++, (float)power_);
@ -895,10 +899,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) { if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
if (!swizzleWeight(weight, config->workItem_output[2], false)) if (!swizzleWeight(weight, config->workItem_output[2], false))
return false; return false;
#if 0
size_t total_bottom_size = bottom_dim_ * numImages; size_t total_bottom_size = bottom_dim_ * numImages;
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_; size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
size_t total_bias_size = M_ * group_; size_t total_bias_size = M_ * group_;
size_t total_top_size = top_dim_ * numImages; size_t total_top_size = top_dim_ * numImages;
#endif
for (int32_t g = 0; g < group_; ++g) { for (int32_t g = 0; g < group_; ++g) {
bias_offset = M_ * g; bias_offset = M_ * g;
int32_t image_offset = width_ * height_ * (channels_ / group_) * g; int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
@ -910,72 +916,22 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false; return false;
cl_uint argIdx = 0; cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
UMat img_buffer;
if (image_offset)
{
CreateSubBuffer(bottom, img_buffer, image_offset,
total_bottom_size - image_offset, false);
if (img_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
} kernel.set(argIdx++, image_offset);
UMat kernel_buffer;
if (kernel_offset)
{
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
total_kernel_size - kernel_offset, false);
if (kernel_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
} kernel.set(argIdx++, kernel_offset);
UMat bias_buffer;
if (bias_term_) if (bias_term_)
{
if (bias_offset)
{
CreateSubBuffer(bias, bias_buffer, bias_offset,
total_bias_size - bias_offset, false);
if (bias_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
}
else
{ {
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
} kernel.set(argIdx++, bias_offset);
} }
UMat out_buffer;
if (output_image_offset)
{
CreateSubBuffer(top, out_buffer, output_image_offset,
total_top_size - output_image_offset, true);
if (out_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size)); kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
}
kernel.set(argIdx++, (uint16_t)width_); kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_); kernel.set(argIdx++, (uint16_t)height_);
@ -1005,7 +961,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false; return false;
cl_uint argIdx = 0; cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
UMat img_buffer; UMat img_buffer;
if (image_offset) if (image_offset)
@ -1112,7 +1068,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false; return false;
cl_uint argIdx = 0; cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_) if (bias_term_)
@ -1152,7 +1108,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false; return false;
cl_uint argIdx = 0; cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset); kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));

@ -74,18 +74,22 @@
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \ (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0) } while(0)
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data, #define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,
#else #else
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \ #define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
const Dtype _x_ = (_data_); \ const Dtype _x_ = (_data_); \
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \ (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0) } while(0)
#define ELTWISE_DATA_ARG #define ELTWISE_DATA_ARG
#define ELTWISE_DATA_ARG_WITH_OFFSET
#endif #endif
#if APPLY_BIAS #if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base, #define BIAS_KERNEL_ARG __global Dtype * biases_base,
#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,
#else #else
#define BIAS_KERNEL_ARG #define BIAS_KERNEL_ARG
#define BIAS_KERNEL_ARG_WITH_OFFSET
#endif #endif
#define __CAT(x, y) x##y #define __CAT(x, y) x##y
@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE))) __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
__kernel void __kernel void
convolve_simd( convolve_simd(
ELTWISE_DATA_ARG ELTWISE_DATA_ARG_WITH_OFFSET
FUSED_ARG FUSED_ARG
__global Dtype* inputs, __global Dtype* inputs_ptr, const int inputs_offset,
__global Dtype* weights, __global Dtype* weights_ptr, const int weights_offset,
BIAS_KERNEL_ARG BIAS_KERNEL_ARG_WITH_OFFSET
__global Dtype* outputs_base, __global Dtype* outputs_base, const int outputs_offset,
const int outputs_offset,
const ushort input_width, const ushort input_width,
const ushort input_height, const ushort input_height,
const ushort output_width, const ushort output_width,
const ushort output_height) const ushort output_height)
{ {
__global Dtype* inputs = inputs_ptr + inputs_offset;
__global Dtype* weights = weights_ptr + weights_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype* outputs = outputs_base + outputs_offset; __global Dtype* outputs = outputs_base + outputs_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth

Loading…
Cancel
Save