refactor candidate generation of convolution auto-tuning

Signed-off-by: Li Peng <peng.li@intel.com>
pull/10370/head
Li Peng 7 years ago
parent eecb64a973
commit 0aa5e43a14
  1. 6
      modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
  2. 148
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp

@ -258,6 +258,12 @@ class OCL4DNNConvSpatial
int lx, int ly, int lz, int lx, int ly, int lz,
bool swizzle, bool nullLocal); bool swizzle, bool nullLocal);
void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems); void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
// Appends one KERNEL_TYPE_DWCONV candidate built from (blockM, blockK, blockN)
// to tunerItems. In the float specialization nothing is appended when
// dwconv_ is false.
void generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
int blockM, int blockK, int blockN);
// Appends one KERNEL_TYPE_GEMM_LIKE candidate (blockM, blockK, blockN) to
// tunerItems, or nothing if the combination is rejected. In the float
// specialization a candidate is accepted only when all of these hold:
//   - group_ == 1, M_ % 8 == 0 and M_ % 32 != 24;
//   - blockM is 1 or 2, blockN is 32, blockK is 8 or 16;
//   - for blockK == 16: blockM == 1, kernel_w_ <= 4 and M_ % 32 == 0.
// NOTE(review): the pre-refactor generateTunerItems only added the
// (1, 16, 32) candidate when kernel_w_ < 4; the new check (kernel_w_ > 4
// rejects) also admits kernel_w_ == 4 - confirm this is intended.
void generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
int blockM, int blockK, int blockN);
// Appends one KERNEL_TYPE_INTEL_IDLF candidate (blockM, blockK, simd_size) to
// tunerItems, or nothing if the combination is rejected. generateTunerItems
// passes its loop variables width as blockM and height as blockK.
// In the float specialization the candidate is rejected when any of these hold:
//   - simd_size is neither 8 nor 16;
//   - group_ != 1 and M_ is not a multiple of simd_size;
//   - blockM exceeds 14 or output_w_, or blockK exceeds 14 or output_h_;
//   - simd_size == 8, M_ >= 16 and
//     num_ * M_ * output_w_ * output_h_ / (blockM * blockK) is at least
//     maxComputeUnits() * 7 * 16;
//   - tile_x (kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_, aligned
//     up to 4) exceeds 4 * simd_size;
//   - blockM * blockK + divUp(tile_x * tile_y, simd_size) exceeds 32;
//   - divUp(tile_y, tile_y_stride) exceeds 4.
// NOTE(review): compared with the pre-refactor loop this accepts
// divUp(tile_y, tile_y_stride) == 4 (the old code required < 4), allows
// heights up to 14 (the old limit was 8), and no longer skips candidates
// whose unaligned tile width is not a multiple of 4 - confirm each change
// is intended, since all of them enlarge the tuning search space.
void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
int blockM, int blockK, int simd_size);
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise); void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx); void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);

@ -1331,75 +1331,127 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
} }
template<> template<>
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems) void OCL4DNNConvSpatial<float>::generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
{ int blockM, int blockK, int blockN)
if (ocl::Device::getDefault().intelSubgroupsSupport())
{ {
//depth_wise kernels if (group_ != 1 || ((M_ % 8 != 0) || (M_ % 32 == 24)))
if (dwconv_) return;
if (blockM != 1 && blockM != 2)
return;
if (blockN != 32)
return;
if (blockK != 8 && blockK != 16)
return;
if (blockK == 16)
{ {
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1)); if ((blockM == 1 && (kernel_w_ > 4)) || M_ % 32 != 0)
if (group_ > 8)
return; return;
if ((blockM == 2) || M_ % 32 != 0)
return;
}
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN));
} }
/* IDLF kernels are using Intel specific extension which make template<>
them intel only. */ void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
// Generates static key_ int blockM, int blockK, int simd_size)
{
int max_compute_units = ocl::Device::getDefault().maxComputeUnits(); int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
int kernelCnt = 0;
if (group_ == 1 && ((M_ % 8 == 0) && (M_ % 32 != 24))) {
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 8, 32));
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 2, 8, 32));
if (kernel_w_ < 4 && M_ % 32 == 0) if (simd_size != 8 && simd_size != 16)
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 16, 32)); return;
}
for (int simd_size = 8; simd_size <= 16; simd_size += 8) {
if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0))) if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
continue; return;
if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0)) if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
continue; return;
const int width_max = 14, height_max = 8, block_size_max = 32;
for (uint32_t width = width_max; width > 0; width--) { int width_max, height_max, block_size_max;
int candidate = 0; width_max = 14;
if (width > output_w_) height_max = 14;
continue; block_size_max = 32;
for (uint32_t height = height_max; height > 0; height--) {
if (width * height > block_size_max || height > output_h_) if (blockM > width_max)
continue; return;
if (blockK > height_max)
return;
if (blockM > output_w_)
return;
if (blockK > output_h_)
return;
// Only when the work items count is less than the device // Only when the work items count is less than the device
// max work items or the M_ is less than 16, we will tune // max work items or the M_ is less than 16, we will tune
// for simd 8. // for simd 8.
if (simd_size == 8 && if (simd_size == 8 && M_ >= 16 &&
M_ >= 16 && ((num_ * M_ * output_w_ * output_h_ / static_cast<float>(blockM * blockK)) >=
((num_ * M_ * output_w_ * output_h_ / static_cast<float>(width * height)) >=
max_compute_units * 7 * 16)) max_compute_units * 7 * 16))
continue; return;
int actual_tile_x = kernel_w_ * dilation_w_ + (width - 1) * stride_w_;
int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
int tile_x = alignSize(actual_tile_x, 4); int tile_x = alignSize(actual_tile_x, 4);
int tile_y = kernel_h_ * dilation_h_ + (height - 1) * stride_h_; int tile_y = kernel_h_ * dilation_h_ + (blockK - 1) * stride_h_;
if (tile_x > (4 * simd_size)) if (tile_x > (4 * simd_size))
continue; return;
// If actual_tile_x is multiple of 4, we may waste some IO bandwidth.
// This could reduce 75% tuning candidates. It has slightly performance if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max)
// impact for the final tuning result, less than 2% for most cases. return;
if (actual_tile_x % 4 != 0)
continue;
if ((width * height + divUp(tile_x * tile_y, simd_size)) > block_size_max)
continue;
int tile_y_stride = (4 * simd_size) / tile_x; int tile_y_stride = (4 * simd_size) / tile_x;
int invec_size = divUp(tile_y, tile_y_stride);
if (invec_size > 4)
return;
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
}
template<>
void OCL4DNNConvSpatial<float>::generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
int blockM, int blockK, int blockN)
{
if (!dwconv_)
return;
if (divUp(tile_y, tile_y_stride) < 4) { tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, blockM, blockK, blockN));
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, width, height, simd_size));
candidate++;
} }
if (candidate >= 4 && height == 2)
template<>
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
{
if (ocl::Device::getDefault().intelSubgroupsSupport())
{
// depthwise kernel
generate_dwconv_tuneritems(tunerItems, 1, 1, 1);
if (tunerItems.size() > 0 && group_ > 8)
return;
// gemm like kernel
generate_gemmlike_tuneritems(tunerItems, 1, 8, 32);
generate_gemmlike_tuneritems(tunerItems, 2, 8, 32);
generate_gemmlike_tuneritems(tunerItems, 1, 16, 32);
// idlf kernel
for (int simd_size = 8; simd_size <= 16; simd_size += 8)
{
int width_max, height_max;
width_max = 14;
height_max = 14;
for (uint32_t width = width_max; width > 0; width--)
{
for (uint32_t height = height_max; height > 0; height--)
{
generate_idlf_tuneritems(tunerItems, width, height, simd_size);
if (tunerItems.size() >= 8 && height == 2)
break; break;
} }
kernelCnt += candidate; if (tunerItems.size() >= 12 && width == 2)
if (kernelCnt >= 12 && width == 2)
break; break;
} }
} }

Loading…
Cancel
Save