@ -160,6 +160,10 @@ public:
void finalize ( InputArrayOfArrays inputs_arr , OutputArrayOfArrays outputs_arr ) CV_OVERRIDE
{
# ifdef HAVE_OPENCL
ocl_exec_cache . clear ( ) ;
# endif
std : : vector < Mat > inputs , outputs ;
inputs_arr . getMatVector ( inputs ) ;
outputs_arr . getMatVector ( outputs ) ;
@ -214,26 +218,33 @@ public:
}
# ifdef HAVE_OPENCL
bool forward_ocl ( InputArrayOfArrays inputs_ , OutputArrayOfArrays outputs_ , OutputArrayOfArrays internals_ )
struct OpenCLExecInfo
{
std : : vector < UMat > inputs ;
std : : vector < UMat > outputs ;
std : : string kernel_name ;
std : : string build_opts ;
size_t local_size [ 2 ] ;
size_t global_size [ 2 ] ;
inputs_ . getUMatVector ( inputs ) ;
outputs_ . getUMatVector ( outputs ) ;
OpenCLExecInfo ( )
{
local_size [ 0 ] = local_size [ 1 ] = 0 ;
global_size [ 0 ] = global_size [ 1 ] = 0 ;
}
} ;
std : : vector < OpenCLExecInfo > ocl_exec_cache ;
void ocl_prepare ( const std : : vector < UMat > & inputs , const std : : vector < UMat > & outputs )
{
CV_TRACE_FUNCTION ( ) ;
CV_Assert ( outputs . size ( ) = = finalSliceRanges . size ( ) ) ;
ocl_exec_cache . resize ( outputs . size ( ) ) ;
const UMat & input = inputs [ 0 ] ;
if ( input . dims > 5 )
{
CV_LOG_INFO ( NULL , " DNN/OpenCL/Slice: implementation doesn't support dims= " < < input . dims < < " . Fallback to CPU " ) ;
return false ;
}
const int dims = input . dims ;
size_t WSZ = 128 ;
const int dims = input . dims ;
const int elemSize = ( int ) input . elemSize ( ) ;
String opts0 = cv : : format (
" -DDIMS=%d -DELEMSIZE=%d " ,
@ -243,10 +254,11 @@ public:
{
opts0 + = cv : : format ( " -DSRC_STEP_%d=%d " , d , ( int ) input . step [ dims - 1 - d ] ) ;
}
String kname = cv : : format ( " slice_%d " , dims ) ;
for ( size_t i = 0 ; i < outputs . size ( ) ; i + + )
{
UMat & output = outputs [ i ] ;
OpenCLExecInfo & ocl = ocl_exec_cache [ i ] ;
const UMat & output = outputs [ i ] ;
const std : : vector < Range > & range = finalSliceRanges [ i ] ;
String opts = opts0 ;
@ -262,6 +274,8 @@ public:
CV_CheckEQ ( range [ d ] . size ( ) , ( int ) output . size [ d ] , " " ) ;
}
const size_t param_LIMIT_BLOCK_SIZE_PER_WG = WSZ * 64 ;
int block_dims = 0 ;
size_t block_size = elemSize ;
for ( int i = dims - 1 ; i > = 0 ; - - i )
@ -270,12 +284,14 @@ public:
break ;
block_size * = output . size [ i ] ;
block_dims + + ;
if ( block_size > = param_LIMIT_BLOCK_SIZE_PER_WG )
break ;
}
const size_t total = output . total ( ) * elemSize ;
size_t num_blocks = total / block_size ;
if ( ( num_blocks < = 8 & & block_size > = WSZ * 4 ) | | ( block_size > = WSZ * 64 ) )
if ( ( num_blocks < = 8 & & block_size > = WSZ * 4 ) | | ( block_size > = param_LIMIT_BLOCK_SIZE_PER_WG ) )
{
// use 1D copy mode
opts + = cv : : format ( " -DUSE_COPY_1D=1 " ) ;
@ -345,23 +361,98 @@ public:
opts + = cv : : format ( " -DWSZ=%d " , ( int ) WSZ ) ;
size_t local [ ] = { WSZ , 1 } ;
size_t global [ ] = { WSZ , num_blocks } ;
std : : ostringstream kernel_suffix ;
kernel_suffix < < dims < < ' x ' < < elemSize < < " _bsz " < < block_size ;
kernel_suffix < < " __src_ " ;
for ( int d = 0 ; d < dims ; d + + )
{
kernel_suffix < < input . size [ dims - 1 - d ] < < ' _ ' ;
}
kernel_suffix < < ' _ ' ;
/*for (int d = 0; d < dims; d++)
{
kernel_suffix < < input . step [ dims - 1 - d ] < < ' _ ' ;
}
kernel_suffix < < ' _ ' ; */
ocl : : Kernel kernel ( kname . c_str ( ) , ocl : : dnn : : slice_oclsrc , opts ) ;
kernel_suffix < < " dst_ " ;
for ( int d = 0 ; d < dims ; d + + )
{
kernel_suffix < < output . size [ dims - 1 - d ] < < ' _ ' ;
}
/*kernel_suffix << '_';
for ( int d = 0 ; d < dims ; d + + )
{
kernel_suffix < < output . step [ dims - 1 - d ] < < ' _ ' ;
} */
kernel_suffix < < " _slice_ " ;
for ( int d = 0 ; d < dims ; d + + )
{
kernel_suffix < < range [ dims - 1 - d ] . start < < ' _ ' ;
}
for ( int d = 0 ; d < dims ; d + + )
{
kernel_suffix < < ' _ ' < < range [ dims - 1 - d ] . end ;
}
std : : string kernel_suffix_str = kernel_suffix . str ( ) ;
opts + = cv : : format ( " -DSLICE_KERNEL_SUFFIX=%s " , kernel_suffix_str . c_str ( ) ) ;
ocl . kernel_name = cv : : format ( " slice_%s " , kernel_suffix_str . c_str ( ) ) ;
ocl . build_opts = opts ;
ocl . local_size [ 0 ] = WSZ ;
ocl . local_size [ 1 ] = 1 ;
ocl . global_size [ 0 ] = WSZ ;
ocl . global_size [ 1 ] = num_blocks ;
} // for outputs.size()
} // ocl_prepare
bool forward_ocl ( InputArrayOfArrays inputs_ , OutputArrayOfArrays outputs_ , OutputArrayOfArrays internals_ )
{
CV_TRACE_FUNCTION ( ) ;
std : : vector < UMat > inputs ;
std : : vector < UMat > outputs ;
inputs_ . getUMatVector ( inputs ) ;
outputs_ . getUMatVector ( outputs ) ;
CV_Assert ( outputs . size ( ) = = finalSliceRanges . size ( ) ) ;
const UMat & input = inputs [ 0 ] ;
const int dims = input . dims ;
if ( dims > 5 )
{
CV_LOG_INFO ( NULL , " DNN/OpenCL/Slice: implementation doesn't support dims= " < < dims < < " . Fallback to CPU " ) ;
return false ;
}
if ( ocl_exec_cache . empty ( ) )
{
ocl_prepare ( inputs , outputs ) ;
}
CV_CheckEQ ( ocl_exec_cache . size ( ) , outputs . size ( ) , " " ) ;
for ( size_t i = 0 ; i < outputs . size ( ) ; i + + )
{
const OpenCLExecInfo & ocl = ocl_exec_cache [ i ] ;
UMat & output = outputs [ i ] ;
ocl : : Kernel kernel ( ocl . kernel_name . c_str ( ) , ocl : : dnn : : slice_oclsrc , ocl . build_opts ) ;
if ( kernel . empty ( ) )
return false ;
bool ret = kernel . args (
ocl : : KernelArg : : PtrReadOnly ( input ) ,
ocl : : KernelArg : : PtrWriteOnly ( output )
)
. run ( 2 , global , local , false ) ;
. run ( 2 , ( size_t * ) ocl . global_size , ( size_t * ) ocl . local_size , false ) ;
if ( ! ret )
return false ;
} // for outputs.size()
return true ;
}
} // forward_ocl
# endif
void forward ( InputArrayOfArrays inputs_arr , OutputArrayOfArrays outputs_arr , OutputArrayOfArrays internals_arr ) CV_OVERRIDE