@@ -1331,75 +1331,127 @@ bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
 }
 
 template<>
-void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
-{
-    if (ocl::Device::getDefault().intelSubgroupsSupport())
-    {
-        //depth_wise kernels
-        if (dwconv_)
-        {
-            tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, 1, 1, 1));
-            if (group_ > 8)
-                return;
-        }
-
-        /* IDLF kernels are using Intel specific extension which make
-           them intel only. */
-        // Generates static key_
-        int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
-        int kernelCnt = 0;
-        if (group_ == 1 && ((M_ % 8 == 0) && (M_ % 32 != 24))) {
-            tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 8, 32));
-            tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 2, 8, 32));
-
-            if (kernel_w_ < 4 && M_ % 32 == 0)
-                tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 16, 32));
-        }
-
-        for (int simd_size = 8; simd_size <= 16; simd_size += 8) {
-            if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
-                continue;
-            if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
-                continue;
-            const int width_max = 14, height_max = 8, block_size_max = 32;
-            for (uint32_t width = width_max; width > 0; width--) {
-                int candidate = 0;
-                if (width > output_w_)
-                    continue;
-                for (uint32_t height = height_max; height > 0; height--) {
-                    if (width * height > block_size_max || height > output_h_)
-                        continue;
-                    // Only when the work items count is less than the device
-                    // max work items or the M_ is less than 16, we will tune
-                    // for simd 8.
-                    if (simd_size == 8 &&
-                        M_ >= 16 &&
-                        ((num_ * M_ * output_w_ * output_h_ / static_cast<float>(width * height)) >=
-                        max_compute_units * 7 * 16))
-                        continue;
-                    int actual_tile_x = kernel_w_ * dilation_w_ + (width - 1) * stride_w_;
-                    int tile_x = alignSize(actual_tile_x, 4);
-                    int tile_y = kernel_h_ * dilation_h_ + (height - 1) * stride_h_;
-                    if (tile_x > (4 * simd_size))
-                        continue;
-                    // If actual_tile_x is multiple of 4, we may waste some IO bandwidth.
-                    // This could reduce 75% tuning candidates. It has slightly performance
-                    // impact for the final tuning result, less than 2% for most cases.
-                    if (actual_tile_x % 4 != 0)
-                        continue;
-                    if ((width * height + divUp(tile_x * tile_y, simd_size)) > block_size_max)
-                        continue;
-                    int tile_y_stride = (4 * simd_size) / tile_x;
-
-                    if (divUp(tile_y, tile_y_stride) < 4) {
-                        tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, width, height, simd_size));
-                        candidate++;
-                    }
-                    if (candidate >= 4 && height == 2)
-                        break;
-                }
-                kernelCnt += candidate;
-                if (kernelCnt >= 12 && width == 2)
-                    break;
-            }
-        }
-    }
-}
+void OCL4DNNConvSpatial<float>::generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
+                                                             int blockM, int blockK, int blockN)
+{
+    if (group_ != 1 || ((M_ % 8 != 0) || (M_ % 32 == 24)))
+        return;
+
+    if (blockM != 1 && blockM != 2)
+        return;
+
+    if (blockN != 32)
+        return;
+
+    if (blockK != 8 && blockK != 16)
+        return;
+
+    if (blockK == 16)
+    {
+        if ((blockM == 1 && (kernel_w_ > 4)) || M_ % 32 != 0)
+            return;
+        if ((blockM == 2) || M_ % 32 != 0)
+            return;
+    }
+
+    tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN));
+}
+
+template<>
+void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
+                                                         int blockM, int blockK, int simd_size)
+{
+    int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
+
+    if (simd_size != 8 && simd_size != 16)
+        return;
+
+    if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
+        return;
+
+    if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
+        return;
+
+    int width_max, height_max, block_size_max;
+    width_max = 14;
+    height_max = 14;
+    block_size_max = 32;
+
+    if (blockM > width_max)
+        return;
+
+    if (blockK > height_max)
+        return;
+
+    if (blockM > output_w_)
+        return;
+
+    if (blockK > output_h_)
+        return;
+
+    // Only when the work items count is less than the device
+    // max work items or the M_ is less than 16, we will tune
+    // for simd 8.
+    if (simd_size == 8 && M_ >= 16 &&
+        ((num_ * M_ * output_w_ * output_h_ / static_cast<float>(blockM * blockK)) >=
+        max_compute_units * 7 * 16))
+        return;
+
+    int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
+    int tile_x = alignSize(actual_tile_x, 4);
+    int tile_y = kernel_h_ * dilation_h_ + (blockK - 1) * stride_h_;
+    if (tile_x > (4 * simd_size))
+        return;
+
+    if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max)
+        return;
+
+    int tile_y_stride = (4 * simd_size) / tile_x;
+    int invec_size = divUp(tile_y, tile_y_stride);
+    if (invec_size > 4)
+        return;
+
+    tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
+}
+
+template<>
+void OCL4DNNConvSpatial<float>::generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
+                                                           int blockM, int blockK, int blockN)
+{
+    if (!dwconv_)
+        return;
+
+    tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, blockM, blockK, blockN));
+}
+
+template<>
+void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
+{
+    if (ocl::Device::getDefault().intelSubgroupsSupport())
+    {
+        // depthwise kernel
+        generate_dwconv_tuneritems(tunerItems, 1, 1, 1);
+        if (tunerItems.size() > 0 && group_ > 8)
+            return;
+
+        // gemm like kernel
+        generate_gemmlike_tuneritems(tunerItems, 1, 8, 32);
+        generate_gemmlike_tuneritems(tunerItems, 2, 8, 32);
+        generate_gemmlike_tuneritems(tunerItems, 1, 16, 32);
+
+        // idlf kernel
+        for (int simd_size = 8; simd_size <= 16; simd_size += 8)
+        {
+            int width_max, height_max;
+            width_max = 14;
+            height_max = 14;
+            for (uint32_t width = width_max; width > 0; width--)
+            {
+                for (uint32_t height = height_max; height > 0; height--)
+                {
+                    generate_idlf_tuneritems(tunerItems, width, height, simd_size);
+                    if (tunerItems.size() >= 8 && height == 2)
+                        break;
+                }
+                if (tunerItems.size() >= 12 && width == 2)
+                    break;
+            }
+        }
+    }
+}
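
Note: to make the tile-feasibility filter in the new generate_idlf_tuneritems easier to follow, here is a minimal standalone sketch of the same arithmetic for one (blockM x blockK, simd_size) candidate. The free function idlf_candidate_fits and the helpers alignSizeUp / divUpInt are illustrative names only, not part of OCL4DNN; the real code uses the class members together with OpenCV's alignSize and divUp.

// Illustrative sketch only -- not part of OCL4DNN. It restates the tile-feasibility
// check performed by generate_idlf_tuneritems, with the class members passed in as
// plain parameters.
static int alignSizeUp(int v, int n) { return (v + n - 1) / n * n; } // stand-in for alignSize()
static int divUpInt(int a, int b)    { return (a + b - 1) / b; }     // stand-in for divUp()

bool idlf_candidate_fits(int blockM, int blockK, int simd_size,
                         int kernel_w, int kernel_h,
                         int stride_w, int stride_h,
                         int dilation_w, int dilation_h,
                         int block_size_max = 32)
{
    // Size of the input tile that one (blockM x blockK) output block has to read.
    int actual_tile_x = kernel_w * dilation_w + (blockM - 1) * stride_w;
    int tile_x = alignSizeUp(actual_tile_x, 4);
    int tile_y = kernel_h * dilation_h + (blockK - 1) * stride_h;

    // Mirrors: if (tile_x > (4 * simd_size)) return;
    if (tile_x > 4 * simd_size)
        return false;

    // Mirrors: if ((blockM * blockK + divUp(tile_x * tile_y, simd_size)) > block_size_max) return;
    if (blockM * blockK + divUpInt(tile_x * tile_y, simd_size) > block_size_max)
        return false;

    // Mirrors: invec_size = divUp(tile_y, tile_y_stride); reject when invec_size > 4.
    int tile_y_stride = (4 * simd_size) / tile_x;
    return divUpInt(tile_y, tile_y_stride) <= 4;
}

For example, with kernel_w = kernel_h = 3, stride 1, dilation 1 and simd_size = 16, a 14x1 block gives actual_tile_x = 16, tile_x = 16, tile_y = 3, tile_y_stride = 4, so the candidate passes all three checks and is kept.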