@@ -464,29 +464,34 @@ public:
}
}
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst)
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool force)
{
    std::map<LayerPin, Mat>::iterator hostIt;
    std::map<LayerPin, int>::iterator refIt;
    const int targetTotal = total(shape);
    Mat bestBlob;
    int bestBlobTotal = INT_MAX;
    LayerPin bestBlobPin;
    for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
    if (!force)
    {
        refIt = refCounter.find(hostIt->first);
        // Use only blobs that had references before because if not,
        // it might be used as output.
        if (refIt != refCounter.end() && refIt->second == 0)
        std::map<LayerPin, Mat>::iterator hostIt;
        std::map<LayerPin, int>::iterator refIt;
        const int targetTotal = total(shape);
        int bestBlobTotal = INT_MAX;
        for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
        {
            Mat& unusedBlob = hostIt->second;
            if (unusedBlob.total() >= targetTotal &&
                unusedBlob.total() < bestBlobTotal)
            refIt = refCounter.find(hostIt->first);
            // Use only blobs that had references before because if not,
            // it might be used as output.
            if (refIt != refCounter.end() && refIt->second == 0)
            {
                bestBlobPin = hostIt->first;
                bestBlob = unusedBlob;
                bestBlobTotal = unusedBlob.total();
                Mat& unusedBlob = hostIt->second;
                if (unusedBlob.total() >= targetTotal &&
                    unusedBlob.total() < bestBlobTotal)
                {
                    bestBlobPin = hostIt->first;
                    bestBlob = unusedBlob;
                    bestBlobTotal = unusedBlob.total();
                }
            }
        }
    }
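    // When force is true the reuse search above is skipped entirely, so no existing
    // host blob is picked for reuse and the requested blob ends up with its own memory.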
@@ -505,7 +510,8 @@ public:
}
void allocateBlobsForLayer(LayerData& ld, const LayerShapes& layerShapes,
                           std::vector<LayerPin>& pinsForInternalBlobs)
                           std::vector<LayerPin>& pinsForInternalBlobs,
                           bool maximizeReuse)
{
    CV_TRACE_FUNCTION();
@@ -561,6 +567,7 @@ public:
}
std::map<int, std::vector<int> >::reverse_iterator it;
bool force = !maximizeReuse && ld.inputBlobsId.size() > 1;
for (it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
{
    for (int j = 0; j < it->second.size(); j++)
@@ -569,7 +576,7 @@ public:
if (total(shapes[index]))
{
    LayerPin blobPin(ld.id, index);
    if (index < outShapes.size() && inPlace)
    if (index < outShapes.size() && inPlace && !force)
    {
        CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
        ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
@@ -577,7 +584,7 @@ public:
    }
    else
    {
        reuseOrCreate(shapes[index], blobPin, *blobs[index]);
        reuseOrCreate(shapes[index], blobPin, *blobs[index], force);
    }
}
}
@@ -628,6 +635,7 @@ struct Net::Impl
lastLayerId = 1;
netWasAllocated = false;
fusion = true;
preferableBackend = DNN_BACKEND_DEFAULT;
preferableTarget = DNN_TARGET_CPU;
}
@@ -647,6 +655,7 @@ struct Net::Impl
int lastLayerId;
bool netWasAllocated;
bool fusion;

void compileHalide()
{
@@ -695,8 +704,7 @@ struct Net::Impl
if (currLayer.empty())
    continue;
currLayer->setActivation(Ptr<ActivationLayer>());
currLayer->setBatchNorm(Ptr<BatchNormLayer>());
currLayer->unsetAttached();
Ptr<PoolingLayer> poolingLayer = currLayer.dynamicCast<PoolingLayer>();
if (!poolingLayer.empty())
@@ -704,9 +712,11 @@ struct Net::Impl
poolingLayer->computeMaxIdx = true;
}
}
it = layers.find(0);
CV_Assert(it != layers.end());
it->second.skipFlags[DNN_BACKEND_DEFAULT] = true;
}
void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
{
    CV_TRACE_FUNCTION();
@@ -783,13 +793,11 @@ struct Net::Impl
LayerData& getLayerData(const DictValue& layerDesc)
{
    CV_Assert(layerDesc.isInt() || layerDesc.isString());
    if (layerDesc.isInt())
        return getLayerData(layerDesc.get<int>());
    else if (layerDesc.isString())
    else /*if (layerDesc.isString())*/
        return getLayerData(layerDesc.get<String>());
    CV_Assert(layerDesc.isInt() || layerDesc.isString());
    return *((LayerData*)NULL);
}

static void addLayerInput(LayerData& ld, int inNum, LayerPin from)
@@ -1021,7 +1029,8 @@ struct Net::Impl
CV_Assert(layerShapesIt != layersShapes.end());
std::vector<LayerPin> pinsForInternalBlobs;
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
bool maximizeReuse = preferableBackend == DNN_BACKEND_HALIDE;
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, maximizeReuse);
Ptr<Layer> layerPtr = ld.getLayerInstance();
{
@@ -1044,8 +1053,17 @@ struct Net::Impl
ld.flag = 1;
}
#if 0
#define printf_(args) printf args
#else
#define printf_(args)
#endif
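// Debug aid: flip the "#if 0" above to "#if 1" to make the printf_((...)) statements
// below report which layers get fused, skipped or optimized out.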
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
    if( !fusion || preferableBackend == DNN_BACKEND_HALIDE )
        return;

    CV_TRACE_FUNCTION();
// scan through all the layers. If there is a convolution layer followed by the activation layer,
@@ -1060,11 +1078,17 @@ struct Net::Impl
LayerData& ld = layers[lid];
if( ld.skipFlags[DNN_BACKEND_DEFAULT] )
{
    printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
    continue;
}
printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
if (ld.consumers.size() == 0)
    outnames.push_back(ld.layerInstance->name);
// the optimization #1. try to fuse batch norm, scaling and/or activation layers
// with the current layer if they follow it. Normally, they are fused with the convolution layer,
// but some of them (like activation) may be fused with fully-connected, elemwise (+) and
// some other layers.
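// For example, a Convolution -> BatchNorm -> Scale -> ReLU chain collapses into a single
// Convolution node whose forward pass applies all four operations.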
Ptr<Layer>& currLayer = ld.layerInstance;
if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
{
@@ -1078,10 +1102,29 @@ struct Net::Impl
    nextData = 0;
    if (currLayer->setBatchNorm(nextBNormLayer))
    {
        printf_(("\tfused with %s\n", nextBNormLayer->name.c_str()));
        bnormData->skipFlags[DNN_BACKEND_DEFAULT] = true;
        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
        if (bnormData->consumers.size() == 1)
            nextData = &layers[bnormData->consumers[0].lid];
        lpNext = LayerPin(bnormData->consumers[0].lid, 0);
    }
}

Ptr<ScaleLayer> nextScaleLayer;
if (nextData)
    nextScaleLayer = nextData->layerInstance.dynamicCast<ScaleLayer>();
if (!nextScaleLayer.empty() && pinsToKeep.count(lpNext) == 0)
{
    LayerData* scaleData = nextData;
    nextData = 0;
    if (currLayer->setScale(nextScaleLayer))
    {
        printf_(("\tfused with %s\n", nextScaleLayer->name.c_str()));
        scaleData->skipFlags[DNN_BACKEND_DEFAULT] = true;
        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
        if (scaleData->consumers.size() == 1)
            nextData = &layers[scaleData->consumers[0].lid];
    }
}
@@ -1091,11 +1134,16 @@ struct Net::Impl
    if (!nextActivLayer.empty() && currLayer->setActivation(nextActivLayer))
    {
        //printf("successfully merged %s and %s\n", currLayer->name.c_str(), nextActivLayer->name.c_str());
        printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
        nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
    }
}
// the optimization #2. if there is no layer that takes the max pooling layer's computed
// max indices (and only some semantic segmentation networks might need this;
// many others only take the maximum values), then we switch the max pooling
// layer to the faster operating mode.
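// For example, when no consumer ever reads the max indices output, computeMaxIdx is set
// to false and the indices are simply not computed during the forward pass.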
Ptr<PoolingLayer> poolingLayer = ld.layerInstance.dynamicCast<PoolingLayer>();
if( !poolingLayer.empty() && !ld.consumers.empty() )
{
@@ -1108,7 +1156,71 @@ struct Net::Impl
    if( i >= nconsumers )
    {
        poolingLayer->computeMaxIdx = false;
        //printf("simplified pooling layer %s\n", poolingLayer->name.c_str());
        printf_(("\tsimplified pooling layer %s\n", poolingLayer->name.c_str()));
    }
}
// the optimization #3. if there is a concat layer that concatenates channels
// from the inputs together (i.e. axis == 1), then we make the inputs of
// the concat layer write to the concatenation output buffer
// (and so we eliminate the concatenation layer, because the channels
// are concatenated implicitly).
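// For example, if conv_a (Ca channels) and conv_b (Cb channels) feed a Concat over axis 1,
// conv_a writes channels [0, Ca) and conv_b writes channels [Ca, Ca + Cb) of the shared
// output blob, and the Concat layer itself is marked as skipped.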
Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
if( !concatLayer.empty() && concatLayer->axis == 1 &&
    ld.outputBlobs.size() == 1 )
{
    Mat& output = ld.outputBlobs[0];
    // TODO: in general, this optimization can always be done, but
    // many layers currently check that the input/output blobs are
    // continuous arrays. Unfortunately, this is not true when
    // the concatenation optimization is applied with batch_size > 1.
    // so, for now, we only apply this optimization in the most popular
    // case batch_size == 1.
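    // (With batch_size > 1 each per-input channel range of the NCHW output is split across
    // the batch dimension, so the slice handed to the producing layer would not be continuous.)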
    if( output.dims == 4 && output.size[0] == 1 )
    {
        size_t i, ninputs = ld.inputBlobsId.size();
        std::vector<LayerPin> realinputs(ninputs);
        for( i = 0; i < ninputs; i++ )
        {
            LayerPin pin = ld.inputBlobsId[i];
            LayerData* inp_i_data = &layers[pin.lid];
            while(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT] &&
                  inp_i_data->inputBlobsId.size() == 1)
            {
                pin = inp_i_data->inputBlobsId[0];
                inp_i_data = &layers[pin.lid];
            }
            printf_(("\treal input for %s is %s\n",
                     layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
                     inp_i_data->getLayerInstance()->name.c_str()));
            if (inp_i_data->skipFlags[DNN_BACKEND_DEFAULT])
                break;
            realinputs[i] = pin;
        }
        if( i >= ninputs )
        {
            Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
            int ofs = 0;
            for( i = 0; i < ninputs; i++ )
            {
                LayerPin pin = realinputs[i];
                LayerData* inp_i_data = &layers[pin.lid];
                int channels_i = ld.inputBlobs[i]->size[1];
                chrange[1] = Range(ofs, ofs + channels_i);
                printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
                         pin.oid, ofs, ofs + channels_i));
                ofs += channels_i;
                Mat output_slice = output(chrange);
                Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
                CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
                curr_output = output_slice;
            }
            ld.skipFlags[DNN_BACKEND_DEFAULT] = true;
            printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
        }
    }
}
}
@@ -1458,9 +1570,12 @@ void Net::setPreferableBackend(int backendId)
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG(backendId);

    impl->netWasAllocated = impl->netWasAllocated &&
                            impl->preferableBackend == backendId;
    impl->preferableBackend = backendId;
    if( impl->preferableBackend != backendId )
    {
        impl->preferableBackend = backendId;
        impl->netWasAllocated = false;
        impl->clear();
    }
}

void Net::setPreferableTarget(int targetId)
@@ -1468,9 +1583,12 @@ void Net::setPreferableTarget(int targetId)
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG(targetId);

    impl->netWasAllocated = impl->netWasAllocated &&
                            impl->preferableTarget == targetId;
    impl->preferableTarget = targetId;
    if( impl->preferableTarget != targetId )
    {
        impl->preferableTarget = targetId;
        impl->netWasAllocated = false;
        impl->clear();
    }
}

void Net::setInputsNames(const std::vector<String>& inputBlobNames)
@@ -1825,6 +1943,16 @@ void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>&
                          weights, blobs);
}

void Net::enableFusion(bool fusion)
{
    if( impl->fusion != fusion )
    {
        impl->fusion = fusion;
        impl->netWasAllocated = false;
        impl->clear();
    }
}

void Net::setHalideScheduler(const String& scheduler)
{
    CV_TRACE_FUNCTION();
@@ -1950,6 +2078,13 @@ Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
bool Layer::setBatchNorm(const Ptr<BatchNormLayer>&) { return false; }
bool Layer::setScale(const Ptr<ScaleLayer>&) { return false; }
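// Detaches any layers previously fused into this one (activation, batch norm, scale).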
void Layer::unsetAttached()
{
    setActivation(Ptr<ActivationLayer>());
    setBatchNorm(Ptr<BatchNormLayer>());
    setScale(Ptr<ScaleLayer>());
}

template <typename T>
static void vecToPVec(const std::vector<T>& v, std::vector<T*>& pv)