dnn(ocl): fix buffer offsets in IDLF kernel

- drop CreateSubBuffer
- fix FUSED_CONV_ELTWISE mode
pull/20651/head
Alexander Alekhin 3 years ago
parent 7c23ec90a9
commit 407adc7061
  1. 2
      modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
  2. 86
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
  3. 25
      modules/dnn/src/opencl/conv_layer_spatial.cl

@ -269,7 +269,7 @@ class OCL4DNNConvSpatial
void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
int blockM, int blockK, int simd_size);
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx);
int32_t group_;
bool bias_term_;

@ -270,17 +270,21 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx)
{
if (fused_eltwise)
kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_));
if (fused_eltwise_offset >= 0)
kernel.set(argIdx++, fused_eltwise_offset);
}
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
kernel.set(argIdx++, (float)negative_slope_);
break;
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_));
break;
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
kernel.set(argIdx++, (float)power_);
@ -895,10 +899,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
if (!swizzleWeight(weight, config->workItem_output[2], false))
return false;
#if 0
size_t total_bottom_size = bottom_dim_ * numImages;
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
size_t total_bias_size = M_ * group_;
size_t total_top_size = top_dim_ * numImages;
#endif
for (int32_t g = 0; g < group_; ++g) {
bias_offset = M_ * g;
int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
@ -910,72 +916,22 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
UMat img_buffer;
if (image_offset)
{
CreateSubBuffer(bottom, img_buffer, image_offset,
total_bottom_size - image_offset, false);
if (img_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
}
UMat kernel_buffer;
if (kernel_offset)
{
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
total_kernel_size - kernel_offset, false);
if (kernel_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
}
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
kernel.set(argIdx++, kernel_offset);
UMat bias_buffer;
if (bias_term_)
{
if (bias_offset)
{
CreateSubBuffer(bias, bias_buffer, bias_offset,
total_bias_size - bias_offset, false);
if (bias_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
}
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, bias_offset);
}
UMat out_buffer;
if (output_image_offset)
{
CreateSubBuffer(top, out_buffer, output_image_offset,
total_top_size - output_image_offset, true);
if (out_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size));
}
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
@ -1005,7 +961,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
UMat img_buffer;
if (image_offset)
@ -1112,7 +1068,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_)
@ -1152,7 +1108,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));

@ -74,18 +74,22 @@
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0)
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,
#else
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
const Dtype _x_ = (_data_); \
(_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0)
#define ELTWISE_DATA_ARG
#define ELTWISE_DATA_ARG_WITH_OFFSET
#endif
#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,
#else
#define BIAS_KERNEL_ARG
#define BIAS_KERNEL_ARG_WITH_OFFSET
#endif
#define __CAT(x, y) x##y
@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
__kernel void
convolve_simd(
ELTWISE_DATA_ARG
ELTWISE_DATA_ARG_WITH_OFFSET
FUSED_ARG
__global Dtype* inputs,
__global Dtype* weights,
BIAS_KERNEL_ARG
__global Dtype* outputs_base,
const int outputs_offset,
__global Dtype* inputs_ptr, const int inputs_offset,
__global Dtype* weights_ptr, const int weights_offset,
BIAS_KERNEL_ARG_WITH_OFFSET
__global Dtype* outputs_base, const int outputs_offset,
const ushort input_width,
const ushort input_height,
const ushort output_width,
const ushort output_height)
{
__global Dtype* inputs = inputs_ptr + inputs_offset;
__global Dtype* weights = weights_ptr + weights_offset;
#if APPLY_BIAS
__global Dtype* biases_base = biases_base_ptr + biases_base_offset;
#endif
__global Dtype* outputs = outputs_base + outputs_offset;
#ifdef FUSED_CONV_ELTWISE
__global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
#endif
unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth

Loading…
Cancel
Save