From 407adc7061c9d2126a5d27c53ff76b56a705f3e2 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 4 Sep 2021 04:35:00 +0000 Subject: [PATCH] dnn(ocl): fix buffer offsets in IDLF kernel - drop CreateSubBuffer - fix FUSED_CONV_ELTWISE mode --- modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp | 2 +- .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp | 86 +++++-------------- modules/dnn/src/opencl/conv_layer_spatial.cl | 25 ++++-- 3 files changed, 41 insertions(+), 72 deletions(-) diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp index 7bb277d102..d6fb83becb 100644 --- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp +++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp @@ -269,7 +269,7 @@ class OCL4DNNConvSpatial void generate_idlf_tuneritems(std::vector< cv::Ptr > &tunerItems, int blockM, int blockK, int simd_size); void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise); - void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx); + void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx); int32_t group_; bool bias_term_; diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index 517a663e46..b4477ebfc4 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -270,17 +270,21 @@ void OCL4DNNConvSpatial::setFusionDefine(ocl4dnnFusedActiv_t fused_activ, } template -void OCL4DNNConvSpatial::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx) +void OCL4DNNConvSpatial::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx) { if (fused_eltwise) - kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ)); + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_)); + if (fused_eltwise_offset >= 0) + kernel.set(argIdx++, fused_eltwise_offset); + } switch (fused_activ) { case OCL4DNN_CONV_FUSED_ACTIV_RELU: kernel.set(argIdx++, (float)negative_slope_); break; case OCL4DNN_CONV_FUSED_ACTIV_PRELU: - kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ)); + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_)); break; case OCL4DNN_CONV_FUSED_ACTIV_POWER: kernel.set(argIdx++, (float)power_); @@ -895,10 +899,12 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) { if (!swizzleWeight(weight, config->workItem_output[2], false)) return false; +#if 0 size_t total_bottom_size = bottom_dim_ * numImages; size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_; size_t total_bias_size = M_ * group_; size_t total_top_size = top_dim_ * numImages; +#endif for (int32_t g = 0; g < group_; ++g) { bias_offset = M_ * g; int32_t image_offset = width_ * height_ * (channels_ / group_) * g; @@ -910,72 +916,22 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; cl_uint argIdx = 0; - setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); + setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx); - UMat img_buffer; - if (image_offset) - { - CreateSubBuffer(bottom, img_buffer, image_offset, - total_bottom_size - image_offset, false); - if (img_buffer.empty()) - return false; + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + kernel.set(argIdx++, image_offset); - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); - } - - UMat kernel_buffer; - if (kernel_offset) - { - CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset, - total_kernel_size - kernel_offset, false); - if (kernel_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); + kernel.set(argIdx++, kernel_offset); - UMat bias_buffer; if (bias_term_) { - if (bias_offset) - { - CreateSubBuffer(bias, bias_buffer, bias_offset, - total_bias_size - bias_offset, false); - if (bias_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); + kernel.set(argIdx++, bias_offset); } - UMat out_buffer; - if (output_image_offset) - { - CreateSubBuffer(top, out_buffer, output_image_offset, - total_top_size - output_image_offset, true); - if (out_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); - kernel.set(argIdx++, (int)(out_buffer.offset / element_size)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); - kernel.set(argIdx++, (int)(top.offset / element_size)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset); kernel.set(argIdx++, (uint16_t)width_); kernel.set(argIdx++, (uint16_t)height_); @@ -1005,7 +961,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; cl_uint argIdx = 0; - setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); + setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx); UMat img_buffer; if (image_offset) @@ -1112,7 +1068,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; cl_uint argIdx = 0; - setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); + setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); if (bias_term_) @@ -1152,7 +1108,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; cl_uint argIdx = 0; - setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); + setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, image_offset); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl index 236e8d029a..55015557a0 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -74,18 +74,22 @@ (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \ } while(0) #define ELTWISE_DATA_ARG __global Dtype* eltwise_data, +#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset, #else #define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \ const Dtype _x_ = (_data_); \ (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \ } while(0) #define ELTWISE_DATA_ARG +#define ELTWISE_DATA_ARG_WITH_OFFSET #endif #if APPLY_BIAS #define BIAS_KERNEL_ARG __global Dtype * biases_base, +#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset, #else #define BIAS_KERNEL_ARG +#define BIAS_KERNEL_ARG_WITH_OFFSET #endif #define __CAT(x, y) x##y @@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) __attribute__((intel_reqd_sub_group_size(SIMD_SIZE))) __kernel void convolve_simd( - ELTWISE_DATA_ARG + ELTWISE_DATA_ARG_WITH_OFFSET FUSED_ARG - __global Dtype* inputs, - __global Dtype* weights, - BIAS_KERNEL_ARG - __global Dtype* outputs_base, - const int outputs_offset, + __global Dtype* inputs_ptr, const int inputs_offset, + __global Dtype* weights_ptr, const int weights_offset, + BIAS_KERNEL_ARG_WITH_OFFSET + __global Dtype* outputs_base, const int outputs_offset, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height) { + __global Dtype* inputs = inputs_ptr + inputs_offset; + __global Dtype* weights = weights_ptr + weights_offset; +#if APPLY_BIAS + __global Dtype* biases_base = biases_base_ptr + biases_base_offset; +#endif + __global Dtype* outputs = outputs_base + outputs_offset; +#ifdef FUSED_CONV_ELTWISE + __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset; +#endif + unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth