add depthwise convolution kernel

Signed-off-by: Li Peng <peng.li@intel.com>
pull/10341/head
Li Peng 7 years ago
parent 910d7dab1f
commit 436d7e4eaf
  1. 5
      modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
  2. 100
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
  3. 59
      modules/dnn/src/opencl/conv_layer_spatial.cl

@ -215,6 +215,9 @@ class OCL4DNNConvSpatial
bool createGEMMLikeConvKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
bool createDWConvKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
int32_t offset, int32_t size, bool write_only);
bool convolve(const UMat &bottom, UMat &top,
@ -282,6 +285,8 @@ class OCL4DNNConvSpatial
int32_t M_;
bool tuned_;
bool dwconv_;
std::string key_, key_sanitized_;
std::string short_key_;
std::string kernel_name_;

@ -103,6 +103,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
top_dim_ = num_output_ * output_w_ * output_h_;
cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
dwconv_ = (num_output_ == channels_ && channels_ == group_);
use_cache_path_ = false;
if (!cache_path_.empty())
@ -203,7 +204,8 @@ void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
typedef enum {
KERNEL_TYPE_INTEL_IDLF = 2,
KERNEL_TYPE_BASIC = 4,
KERNEL_TYPE_GEMM_LIKE = 5
KERNEL_TYPE_GEMM_LIKE = 5,
KERNEL_TYPE_DWCONV = 6
} ocl4dnnConvSpatialKernelType_t;
template<typename Dtype>
@ -313,6 +315,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
if (clOptionSupport("-cl-no-subgroup-ifp"))
options_ << " -cl-no-subgroup-ifp ";
addDef("KERNEL_GEMM_LIKE");
addDef("INPUT_DEPTH", channels_);
addDef("WIDTH1", M_);
addDef("OUT_PADDING_LEFT", 0);
@ -329,6 +332,28 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
setFusionDefine(fused_activ_, fused_eltwise_);
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
}
else if (kernelType == KERNEL_TYPE_DWCONV)
{
kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
kernel_name_ = "DWCONV_";
kernel_name_ += kernelUKey.c_str();
options_ << " -cl-fast-relaxed-math ";
if (clOptionSupport("-cl-no-subgroup-ifp"))
options_ << " -cl-no-subgroup-ifp ";
addDef("KERNEL_DWCONV");
addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
addDef("KERNEL_W", kernel_w_);
addDef("KERNEL_H", kernel_h_);
addDef("APPLY_BIAS", bias_term_);
addDef("OUTPUT_Z", num_output_ * num_);
addDef("CHANNELS", num_output_);
setFusionDefine(fused_activ_, fused_eltwise_);
options_ << " -D DWCONV=" << kernel_name_;
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
}
template<typename Dtype>
@ -906,6 +931,33 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
}
}
} else if (config->kernelType == KERNEL_TYPE_DWCONV) {
ocl::Kernel kernel(config->kernelName.c_str(), program);
if (kernel.empty())
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_);
kernel.set(argIdx++, (uint16_t)output_h_);
size_t global_size[3];
global_size[0] = output_w_;
global_size[1] = output_h_;
global_size[2] = num_output_ * num_;
if (!kernel.run(3, global_size, NULL, false))
{
std::cout << "DWCONV kernel run failed." << std::endl;
return false;
}
} else {
for (int32_t n = 0; n < numImages; ++n) {
for (int32_t g = 0; g < group_; ++g) {
@ -1222,6 +1274,39 @@ bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
return false;
}
template<>
bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth)
{
if (!dwconv_)
return false;
int workItemOutput[3] = { 1, 1, 1 };
size_t local_size[3] = { 1, 1, 1 };
size_t global_size[3];
global_size[0] = divUp(output_w_, workItemOutput[0]);
global_size[1] = divUp(output_h_, workItemOutput[1]);
global_size[2] = divUp(M_ * num_, workItemOutput[2]);
kernelType_ = KERNEL_TYPE_DWCONV;
blockM_ = blockWidth;
blockK_ = blockHeight;
blockN_ = blockDepth;
setupKernel();
ocl::Program program = compileKernel();
if (program.ptr())
{
kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0],
&workItemOutput[0], false, KERNEL_TYPE_DWCONV));
return true;
}
else
return false;
}
template<>
bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
int32_t blockWidth,
@ -1238,6 +1323,8 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
return createBasicKernel(blockWidth, blockHeight, blockDepth);
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
else if (kernelType == KERNEL_TYPE_DWCONV)
return createDWConvKernel(blockWidth, blockHeight, blockDepth);
else
CV_Assert(0 && "Internal error");
return false;
@ -1246,7 +1333,16 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
template<>
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
{
if (ocl::Device::getDefault().intelSubgroupsSupport()) {
if (ocl::Device::getDefault().intelSubgroupsSupport())
{
//depth_wise kernels
if (dwconv_)
{
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
if (group_ > 8)
return;
}
/* IDLF kernels are using Intel specific extension which make
them intel only. */
// Generates static key_

@ -383,7 +383,7 @@ convolve_simd(
}
}
#else // KERNEL_GEMM_LIKE
#elif defined KERNEL_GEMM_LIKE
#if APPLY_BIAS
// Dtype bias[4];
@ -1501,4 +1501,59 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
}
#endif
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE
#elif defined KERNEL_DWCONV
__kernel void DWCONV(
ELTWISE_DATA_ARG
NEGATIVE_SLOPE_ARG
__global Dtype* image_data,
__global Dtype* kernel_data,
BIAS_KERNEL_ARG
__global Dtype* convolved_image,
const ushort input_width,
const ushort input_height,
const ushort output_width,
const ushort output_height) {
const int outputX = get_global_id(0);
const int outputY = get_global_id(1);
const int outputZ = get_global_id(2);
if(outputX < output_width && outputY < output_height)
{
Dtype sum = 0.;
const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
const int biasIndex=outputZ%CHANNELS;
const int local_image_offset = org_y*input_width + org_x;
const int imageSize = input_width*input_height;
__global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
for(int y = 0; y < KERNEL_H; y++)
{
for(int x = 0; x < KERNEL_W; x++)
{
if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
{
continue;
}
sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
}
image_dataPtrFloat += input_width * DILATION_Y;
kernel_dataPtrFloat += KERNEL_W;
}
#if APPLY_BIAS
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
#else
int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
#endif
}
}
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV

Loading…
Cancel
Save