@ -55,6 +55,7 @@
# include "../include/math_functions.hpp"
# include "../include/default_kernel_config.hpp"
# include "opencv2/dnn/shape_utils.hpp"
# include "opencv2/core/utils/logger.hpp"
# if defined WIN32 || defined _WIN32
# include <windows.h>
@ -87,10 +88,13 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
{
CV_Assert ( defaultConfigLoaded = = false ) ;
CV_Assert ( kernelConfigMap . empty ( ) ) ;
const size_t numConfigs = sizeof ( default_kernel_config_intel ) / sizeof ( default_kernel_config_intel [ 0 ] ) / 2 ;
/* fp32 config */
size_t numConfigs = sizeof ( default_kernel_config_intel_fp32 ) /
sizeof ( default_kernel_config_intel_fp32 [ 0 ] ) / 2 ;
for ( size_t i = 0 ; i < numConfigs ; i + + )
{
std : : string key = std : : string ( " Intel(R) Corporation_ " ) + default_kernel_config_intel [ 2 * i ] ;
std : : string key = std : : string ( " Intel(R) Corporation_ " ) + default_kernel_config_intel_fp32 [ 2 * i ] ;
if ( ! cache_path . empty ( ) )
{
std : : string cacheFile = cache_path + sanitize ( key ) ;
@ -100,9 +104,29 @@ static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
}
std : : pair < std : : string , std : : string > entry (
key ,
default_kernel_config_intel [ 2 * i + 1 ] ) ;
default_kernel_config_intel_fp32 [ 2 * i + 1 ] ) ;
kernelConfigMap . insert ( entry ) ;
}
/* fp16 config */
numConfigs = sizeof ( default_kernel_config_intel_fp16 ) /
sizeof ( default_kernel_config_intel_fp16 [ 0 ] ) / 2 ;
for ( size_t i = 0 ; i < numConfigs ; i + + )
{
std : : string key = std : : string ( " Intel(R) Corporation_ " ) + default_kernel_config_intel_fp16 [ 2 * i ] ;
if ( ! cache_path . empty ( ) )
{
std : : string cacheFile = cache_path + sanitize ( key ) ;
std : : ifstream cachedKernel ( cacheFile . c_str ( ) ) ;
if ( cachedKernel )
continue ; // external configuration found, skip builtin
}
std : : pair < std : : string , std : : string > entry (
key ,
default_kernel_config_intel_fp16 [ 2 * i + 1 ] ) ;
kernelConfigMap . insert ( entry ) ;
}
defaultConfigLoaded = true ;
}
@ -311,40 +335,38 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
// options
options_ < < " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd= " < < kernel_name_ ;
options_ < < " -cl-mad-enable " ;
if ( clOptionSupport ( " -cl-no-subgroup-ifp " ) )
options_ < < " -cl-no-subgroup-ifp " ;
// defs
int32_t output_width = output_w_ ;
int32_t output_height = output_h_ ;
int32_t output_block_width = blockM ;
int32_t output_block_height = blockK ;
const int32_t last_block_width = ( output_width % output_block_width = = 0 ) ?
output_block_width : output_width % output_block_width ;
const int32_t last_block_height = ( output_height % output_block_height = = 0 ) ?
output_block_height : output_height % output_block_height ;
int tile_x = alignSize ( ( output_block_width - 1 ) * stride_w_ + kernel_w_ * dilation_w_ , 4 ) ;
int tile_x = ( output_block_width - 1 ) * stride_w_ + kernel_w_ * dilation_w_ ;
int tile_y = ( output_block_height - 1 ) * stride_h_ + kernel_h_ * dilation_h_ ;
int tile_y_stride = ( 4 * simd_size ) / tile_x ;
int invec_size = divUp ( tile_y , tile_y_stride ) ;
int invec_size = tile_y ;
addDef ( " SIMD_SIZE " , simd_size ) ;
addDef ( " filter_qualifier " , " __global " ) ;
addDef ( " OUT_BLOCK_WIDTH " , output_block_width ) ;
addDef ( " OUT_BLOCK_HEIGHT " , output_block_height ) ;
addDef ( " LAST_BLOCK_WIDTH " , last_block_width ) ;
addDef ( " LAST_BLOCK_HEIGHT " , last_block_height ) ;
addDef ( " INPUT_DEPTH " , channels_ / group_ ) ;
addDef ( " TOTAL_INPUT_DEPTH_SIZE " , channels_ ) ;
addDef ( " TOTAL_OUTPUT_DEPTH " , num_output_ ) ;
addDef ( " NUM_FILTERS " , M_ ) ;
addDef ( " TILE_X " , tile_x ) ;
addDef ( " TILE_Y " , tile_y ) ;
addDef ( " TILE_Y_STRIDE " , tile_y_stride ) ;
addDef ( " INVEC_SIZE " , invec_size ) ;
addDef ( " ALIGNED_NUM_FILTERS " , ( int ) alignSize ( M_ , simd_size ) ) ;
addDef ( " OUT_BLOCK_SIZE " , ( output_block_width * output_block_height ) ) ;
addDef ( " APPLY_BIAS " , bias_term_ ) ;
addDef ( " WEIGHT_PREF " , ( ( kernel_w_ * kernel_h_ ) = = 1 ) ? 1 : 8 ) ;
addDef ( " INPUT_PITCH " , ( width_ * height_ ) ) ;
addDef ( " OUTPUT_PITCH " , ( output_w_ * output_h_ ) ) ;
addDef ( " LEFT_FILTERS " , ( ( int ) alignSize ( M_ , simd_size ) - M_ ) ) ;
addDef ( " INPUT_WIDTH " , width_ ) ;
addDef ( " INPUT_HEIGHT " , height_ ) ;
addDef ( " FILTERS_IN_GROUP " , ( ( int ) alignSize ( M_ , simd_size ) / simd_size ) ) ;
setFusionDefine ( fused_activ_ , fused_eltwise_ ) ;
src_ = cv : : ocl : : dnn : : conv_layer_spatial_oclsrc ;
@ -567,13 +589,6 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
return ;
}
# define dbg
# ifdef dbg
# define dbgPrint(x) (x)
# else
# define dbgPrint(x)
# endif
// For large enough input size, we do not need to tune kernels for different
// size. The reason is with large input size, there will be enough work items
// to feed al the EUs.
@ -584,6 +599,7 @@ void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &ver
template < typename Dtype >
void OCL4DNNConvSpatial < Dtype > : : generateKey ( )
{
std : : string precision = ( use_half_ ) ? " FP16 " : " FP32 " ;
std : : stringstream keyBuilder ;
// FIXME: to support fuse?
keyBuilder < < " k " < < kernel_w_ < < " x " < < kernel_h_ < < " _ "
@ -597,7 +613,8 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
< < " num " < < num_ < < " _ "
< < " M " < < M_ < < " _ "
< < " activ " < < fused_activ_ < < " _ "
< < " eltwise " < < fused_eltwise_ ;
< < " eltwise " < < fused_eltwise_ < < " _ "
< < precision ;
key_ = ocl : : Device : : getDefault ( ) . vendorName ( ) + " _EU " + cv : : format ( " %d " , ocl : : Device : : getDefault ( ) . maxComputeUnits ( ) ) + " _ " + keyBuilder . str ( ) ;
@ -616,11 +633,6 @@ std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t
< < " _ " < < blockHeight
< < " _ " < < blockDepth ;
if ( ! use_half_ )
keyBuilder < < " _float " ;
else
keyBuilder < < " _half " ;
return keyBuilder . str ( ) ;
}
@ -1164,7 +1176,7 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
cv : : ocl : : Timer timer ( queue ) ;
timer . start ( ) ;
bool res = true ; ;
dbgPrint ( std : : cout < < " Benchmarking kernel: " < < config - > kernelName < < std : : endl ) ;
CV_LOG_INFO ( NULL , " Benchmarking kernel: " < < config - > kernelName ) ;
tuned_ = true ;
int loop_cnt = 4 ;
for ( int i = 0 ; i < loop_cnt ; i + + ) {
@ -1181,7 +1193,6 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
}
float elapsedTime = timer . durationNS ( ) * 1e-6 / loop_cnt ;
# ifdef dbg
double out_w = output_w_ ;
double out_h = output_h_ ;
double out_z = M_ ;
@ -1189,16 +1200,8 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
double k_h = kernel_h_ ;
double k_z = channels_ ;
double totalFlops = ( ( k_w * k_h * k_z - 1 ) * 2 ) * ( out_w * out_h * out_z ) * num_ ;
std : : cout < < " \t Estimated Gflops: " < < ( totalFlops * 1e-9 )
< < std : : endl ;
std : : cout < < " \t Estimated GFLOPS/S: " < < ( ( totalFlops * 1e-9 ) * ( 1000.0 / elapsedTime ) )
< < std : : endl ;
#if 0
std : : cout < < " Estimated utilization: " < <
( ( ( ( totalFlops / 1000 ) / 1000 ) / 1000 ) * ( 1000.0 / elapsedTime ) ) / 880.0
< < std : : endl ;
# endif
# endif
CV_LOG_INFO ( NULL , " \t Estimated Gflops: " < < ( totalFlops * 1e-9 ) ) ;
CV_LOG_INFO ( NULL , " \t Estimated GFLOPS/S: " < < ( ( totalFlops * 1e-9 ) * ( 1000.0 / elapsedTime ) ) ) ;
return elapsedTime ;
}
@ -1254,18 +1257,18 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
if ( use_half_ & & error_factor > 0.1 * fabs ( verify_data [ offset ] ) & &
error_factor > 0.04 & & ! ( fabs ( verify_data [ offset ] ) < 1.e-3 & & error_factor < 1.e-4 ) )
{
dbgPrint ( printf ( " test verification failed @ image %d group %d "
" out_ch %d h %d w %d got %G expected %G \n " ,
n , g , out_ch , h , w , data [ offset ] , verify_data [ offset ] ) ) ;
CV_LOG_ERROR ( NULL , " test verification failed @ image " < < n < < " group " < < g
< < " out_ch " < < out_ch < < " h " < < h < < " w " < < w
< < " got " < < data [ offset ] < < " expected " < < verify_data [ offset ] ) ;
verificationFail = 1 ;
goto out ;
}
else if ( ! use_half_ & & error_factor > 0.1 * fabs ( verify_data [ offset ] ) & &
! ( fabs ( verify_data [ offset ] ) < 1.e-3 & & error_factor < 1.e-4 ) )
{
dbgPrint ( printf ( " test verification failed @ image %d group %d "
" out_ch %d h %d w %d got %G expected %G \n " ,
n , g , out_ch , h , w , data [ offset ] , verify_data [ offset ] ) ) ;
CV_LOG_ERROR ( NULL , " test verification failed @ image " < < n < < " group " < < g
< < " out_ch " < < out_ch < < " h " < < h < < " w " < < w
< < " got " < < data [ offset ] < < " expected " < < verify_data [ offset ] ) ;
verificationFail = 1 ;
goto out ;
}
@ -1546,17 +1549,11 @@ void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tu
return ;
int actual_tile_x = kernel_w_ * dilation_w_ + ( blockM - 1 ) * stride_w_ ;
int tile_x = alignSize ( actual_tile_x , 4 ) ;
int tile_y = kernel_h_ * dilation_h_ + ( blockK - 1 ) * stride_h_ ;
if ( tile_x > ( 4 * simd_size ) )
return ;
if ( ( blockM * blockK + divUp ( tile_x * tile_y , simd_size ) ) > block_size_max )
int tile_x = alignSize ( actual_tile_x , simd_size ) ;
if ( tile_x > simd_size )
return ;
int tile_y_stride = ( 4 * simd_size ) / tile_x ;
int invec_size = divUp ( tile_y , tile_y_stride ) ;
if ( invec_size > 4 )
if ( blockM * blockK > block_size_max )
return ;
tunerItems . push_back ( makePtr < tunerParam > ( KERNEL_TYPE_INTEL_IDLF , blockM , blockK , simd_size ) ) ;
@ -1599,11 +1596,7 @@ void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerPar
for ( uint32_t height = height_max ; height > 0 ; height - - )
{
generate_idlf_tuneritems ( tunerItems , width , height , simd_size ) ;
if ( tunerItems . size ( ) > = 8 & & height = = 2 )
break ;
}
if ( tunerItems . size ( ) > = 12 & & width = = 2 )
break ;
}
}
}
@ -1690,10 +1683,8 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
if ( kernelQueue [ x ] - > tested = = false ) {
bool verified = verifyResult ( bottom , top , weight , bias , numImages , kernelQueue [ x ] , verifyTop ) ;
if ( verified = = false ) {
dbgPrint ( std : : cout < < " Kernel "
< < kernelQueue [ x ] - > kernelName
< < " failed verification " < < std : : endl ) ;
dbgPrint ( std : : cout < < " kernelQueue[x]->workItem_output[0]: "
CV_LOG_ERROR ( NULL , " Kernel " < < kernelQueue [ x ] - > kernelName < < " failed verification " ) ;
CV_LOG_ERROR ( NULL , " kernelQueue[x]->workItem_output[0]: "
< < kernelQueue [ x ] - > workItem_output [ 0 ] < < " "
< < " kernelQueue[x]->workItem_output[1]: "
< < kernelQueue [ x ] - > workItem_output [ 1 ] < < " "
@ -1714,11 +1705,9 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
< < " kernelQueue[x]->local_work_size[2]: "
< < kernelQueue [ x ] - > local_work_size [ 2 ] < < " "
< < kernelQueue [ x ] - > swizzle_weights < < " "
< < kernelQueue [ x ] - > use_null_local < < std : : end l ) ;
< < kernelQueue [ x ] - > use_null_local ) ;
} else {
dbgPrint ( std : : cout < < " Kernel "
< < kernelQueue [ x ] - > kernelName
< < " pass verification " < < std : : endl ) ;
CV_LOG_INFO ( NULL , " Kernel " < < kernelQueue [ x ] - > kernelName < < " pass verification " ) ;
}
}
# endif
@ -1747,19 +1736,28 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
break ;
} else {
kernelQueue [ fastestKernel ] - > tested = true ;
dbgPrint ( std : : cout < < " Kernel " < <
kernelQueue [ fastestKernel ] - > kernelName < <
" failed verification " < < std : : endl ) ;
CV_LOG_ERROR ( NULL , " Kernel " < < kernelQueue [ fastestKernel ] - > kernelName < <
" failed verification " ) ;
failures + + ;
}
}
}
if ( verification ) {
dbgPrint ( std : : cout < < " Kernel < " < < kernelQueue [ kernel_index_ ] - > kernelName < <
" > passed verification " < < std : : endl ) ;
dbgPrint ( std : : cout < < " Convolution Time: " < < kernelQueue [ kernel_index_ ] - > executionTime < < std : : endl ) ;
CV_LOG_INFO ( NULL , " Kernel < " < < kernelQueue [ kernel_index_ ] - > kernelName < <
" > passed verification " ) ;
CV_LOG_INFO ( NULL , " Convolution Time: " < < kernelQueue [ kernel_index_ ] - > executionTime ) ;
double out_w = output_w_ ;
double out_h = output_h_ ;
double out_z = M_ ;
double k_w = kernel_w_ ;
double k_h = kernel_h_ ;
double k_z = channels_ ;
float elapsedTime = kernelQueue [ kernel_index_ ] - > executionTime ;
double totalFlops = ( ( k_w * k_h * k_z - 1 ) * 2 ) * ( out_w * out_h * out_z ) * num_ ;
CV_LOG_INFO ( NULL , " \t Estimated Gflops: " < < ( totalFlops * 1e-9 ) ) ;
CV_LOG_INFO ( NULL , " \t Estimated GFLOPS/S: " < < ( ( totalFlops * 1e-9 ) * ( 1000.0 / elapsedTime ) ) ) ;
} else {
dbgPrint ( std : : cout < < " fallback to basic kernel " < < std : : endl ) ;
CV_LOG_INFO ( NULL , " fallback to basic kernel " ) ;
options_ . str ( " " ) ; options_ . clear ( ) ; // clear contents and state flags
createBasicKernel ( 1 , 1 , 1 ) ;
kernel_index_ = kernelQueue . size ( ) - 1 ;