|
|
|
@@ -464,29 +464,34 @@ public:
         }
     }

-    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst)
+    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool force)
     {
-        std::map<LayerPin, Mat>::iterator hostIt;
-        std::map<LayerPin, int>::iterator refIt;
-
-        const int targetTotal = total(shape);
         Mat bestBlob;
-        int bestBlobTotal = INT_MAX;
         LayerPin bestBlobPin;
-        for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
+
+        if( !force )
         {
-            refIt = refCounter.find(hostIt->first);
-            // Use only blobs that had references before because if not,
-            // it might be used as output.
-            if (refIt != refCounter.end() && refIt->second == 0)
+            std::map<LayerPin, Mat>::iterator hostIt;
+            std::map<LayerPin, int>::iterator refIt;
+
+            const int targetTotal = total(shape);
+            int bestBlobTotal = INT_MAX;
+
+            for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
             {
-                Mat& unusedBlob = hostIt->second;
-                if (unusedBlob.total() >= targetTotal &&
-                    unusedBlob.total() < bestBlobTotal)
+                refIt = refCounter.find(hostIt->first);
+                // Use only blobs that had references before because if not,
+                // it might be used as output.
+                if (refIt != refCounter.end() && refIt->second == 0)
                 {
-                    bestBlobPin = hostIt->first;
-                    bestBlob = unusedBlob;
-                    bestBlobTotal = unusedBlob.total();
+                    Mat& unusedBlob = hostIt->second;
+                    if (unusedBlob.total() >= targetTotal &&
+                        unusedBlob.total() < bestBlobTotal)
+                    {
+                        bestBlobPin = hostIt->first;
+                        bestBlob = unusedBlob;
+                        bestBlobTotal = unusedBlob.total();
+                    }
                 }
             }
         }
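The reuse policy above is a best-fit search: among blobs whose reference count has dropped to zero, take the smallest one that is still large enough, and fall back to a fresh allocation when nothing fits or when `force` requests a dedicated buffer. A minimal standalone sketch of the same idea (hypothetical names, not the OpenCV API):

#include <climits>
#include <cstdint>
#include <vector>

struct Buffer { size_t capacity; int refCount; };

// Returns the index of the best reusable buffer, or -1 if a new one must be created.
static int pickBuffer(const std::vector<Buffer>& pool, size_t needed, bool force)
{
    if (force)
        return -1;                          // caller wants a dedicated buffer
    int best = -1;
    size_t bestCapacity = SIZE_MAX;
    for (size_t i = 0; i < pool.size(); i++)
    {
        // only buffers that are no longer referenced may be recycled
        if (pool[i].refCount == 0 && pool[i].capacity >= needed &&
            pool[i].capacity < bestCapacity)
        {
            best = (int)i;
            bestCapacity = pool[i].capacity;
        }
    }
    return best;
}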
|
|
|
@@ -505,7 +510,8 @@ public:
     }

     void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
-                               std::vector<LayerPin>& pinsForInternalBlobs)
+                               std::vector<LayerPin>& pinsForInternalBlobs,
+                               bool maximizeReuse)
     {
         CV_TRACE_FUNCTION();
|
|
|
|
|
|
|
|
@@ -561,6 +567,7 @@ public:
         }

         std::map<int, std::vector<int> >::reverse_iterator it;
+        bool force = !maximizeReuse && ld.inputBlobsId.size() > 1;
         for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
         {
             for(int j = 0; j < it->second.size(); j++)
|
|
|
@@ -569,7 +576,7 @@ public:
                 if (total(shapes[index]))
                 {
                     LayerPin blobPin(ld.id, index);
-                    if (index < outShapes.size() && inPlace)
+                    if (index < outShapes.size() && inPlace && !force)
                     {
                         CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
                         ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
|
|
|
@@ -577,7 +584,7 @@ public:
                     }
                     else
                     {
-                        reuseOrCreate(shapes[index], blobPin, *blobs[index]);
+                        reuseOrCreate(shapes[index], blobPin, *blobs[index], force);
                     }
                 }
             }
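For the in-place branch above, note that cv::Mat::reshape only creates a new matrix header over the same data, so the output blob aliases the input without copying. A tiny self-contained illustration:

#include <opencv2/core.hpp>

int main()
{
    cv::Mat input(1, 8, CV_32F);
    cv::Mat output = input.reshape(1, 2);   // 2x4 header over the very same buffer
    CV_Assert(output.data == input.data);   // no copy was made
    return 0;
}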
|
|
|
@@ -628,6 +635,7 @@ struct Net::Impl
         lastLayerId = 1;
         netWasAllocated = false;
+        fusion = true;
         preferableBackend = DNN_BACKEND_DEFAULT;
         preferableTarget = DNN_TARGET_CPU;
     }
|
|
|
@@ -647,6 +655,7 @@ struct Net::Impl
     int lastLayerId;

     bool netWasAllocated;
+    bool fusion;

     void compileHalide()
     {
|
|
|
@@ -695,8 +704,7 @@ struct Net::Impl
             if( currLayer.empty() )
                 continue;

-            currLayer->setActivation(Ptr<ActivationLayer>());
-            currLayer->setBatchNorm(Ptr<BatchNormLayer>());
+            currLayer->unsetAttached();

             Ptr<PoolingLayer> poolingLayer = currLayer.dynamicCast<PoolingLayer>();
             if( !poolingLayer.empty() )
|
|
|
@@ -704,9 +712,11 @@ struct Net::Impl
|
|
|
|
poolingLayer->computeMaxIdx = true; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
it = layers.find(0); |
|
|
|
|
CV_Assert(it != layers.end()); |
|
|
|
|
it->second.skipFlags[DNN_BACKEND_DEFAULT] = true; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>()) |
|
|
|
|
{ |
|
|
|
|
CV_TRACE_FUNCTION(); |
|
|
|
@@ -783,13 +793,11 @@ struct Net::Impl

     LayerData& getLayerData(const DictValue &layerDesc)
     {
+        CV_Assert(layerDesc.isInt() || layerDesc.isString());
         if (layerDesc.isInt())
             return getLayerData(layerDesc.get<int>());
-        else if (layerDesc.isString())
+        else /*if (layerDesc.isString())*/
             return getLayerData(layerDesc.get<String>());
-
-        CV_Assert(layerDesc.isInt() || layerDesc.isString());
-        return *((LayerData*)NULL);
     }

     static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
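The rewrite above is not only cosmetic: the old fall-through path ended in `return *((LayerData*)NULL);`, a null-pointer dereference (undefined behavior) whenever the DictValue was neither an int nor a string. Asserting first keeps every remaining path well defined; the same pattern in isolation:

#include <opencv2/core.hpp>

// validate-then-dispatch: no unreachable return statement is needed afterwards
static int dispatch(bool isInt, bool isString)
{
    CV_Assert(isInt || isString);
    if (isInt)
        return 1;
    else /*if (isString)*/
        return 2;
}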
|
|
|
@@ -1021,7 +1029,8 @@ struct Net::Impl
         CV_Assert(layerShapesIt != layersShapes.end());

         std::vector<LayerPin> pinsForInternalBlobs;
-        blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
+        bool maximizeReuse = preferableBackend == DNN_BACKEND_HALIDE;
+        blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, maximizeReuse);

         Ptr<Layer> layerPtr = ld.getLayerInstance();
         {
|
|
|
@@ -1044,8 +1053,17 @@ struct Net::Impl
|
|
|
|
ld.flag = 1; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#if 0 |
|
|
|
|
#define printf_(args) printf args |
|
|
|
|
#else |
|
|
|
|
#define printf_(args) |
|
|
|
|
#endif |
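The double parentheses are what make this lightweight logging macro work: the whole argument list is passed as a single macro argument, so the disabled variant compiles the call away entirely. A minimal standalone demonstration (not part of the patch):

#include <cstdio>

#if 1                                   // flip to 0 to compile the logging away
#define printf_(args) printf args
#else
#define printf_(args)
#endif

int main()
{
    const char* name = "conv1";
    printf_(("fused with %s\n", name)); // expands to printf("fused with %s\n", name);
    return 0;
}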
|
|
|
|
|
|
|
|
|
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_) |
|
|
|
|
{ |
|
|
|
|
if( !fusion || preferableBackend == DNN_BACKEND_HALIDE ) |
|
|
|
|
return; |
|
|
|
|
|
|
|
|
|
CV_TRACE_FUNCTION(); |
|
|
|
|
|
|
|
|
|
// scan through all the layers. If there is a convolution layer followed by an activation layer,
|
|
|
|
@@ -1060,11 +1078,17 @@ struct Net::Impl
|
|
|
|
LayerData& ld = layers[lid]; |
|
|
|
|
if( ld.skipFlags[DNN_BACKEND_DEFAULT] ) |
|
|
|
|
{ |
|
|
|
|
printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str())); |
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str())); |
|
|
|
|
if( ld.consumers.size() == 0 ) |
|
|
|
|
outnames.push_back(ld.layerInstance->name); |
|
|
|
|
|
|
|
|
|
// the optimization #1. try to fuse batch norm, scaling and/or activation layers
|
|
|
|
|
// with the current layer if they follow it. Normally, they are fused with the convolution layer,
|
|
|
|
|
// but some of them (like activation) may be fused with fully-connected, elemwise (+) and
|
|
|
|
|
// some other layers.
|
|
|
|
|
Ptr<Layer>& currLayer = ld.layerInstance; |
|
|
|
|
if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 ) |
|
|
|
|
{ |
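Why batch normalization can be absorbed by the preceding layer: at inference time it is just a per-channel affine map with scale s = gamma / sqrt(var + eps) and shift beta - s * mean, which folds directly into the previous layer's weights and bias. A hedged sketch with plain arrays (hypothetical helper, not the OpenCV dnn API):

#include <cmath>
#include <vector>

static void foldBatchNorm(std::vector<float>& weights,   // weights grouped per output channel
                          std::vector<float>& bias,      // one bias per output channel
                          const std::vector<float>& gamma,
                          const std::vector<float>& beta,
                          const std::vector<float>& mean,
                          const std::vector<float>& var,
                          float eps,
                          int weightsPerChannel)
{
    for (size_t c = 0; c < bias.size(); c++)
    {
        float s = gamma[c] / std::sqrt(var[c] + eps);
        for (int k = 0; k < weightsPerChannel; k++)
            weights[c * weightsPerChannel + k] *= s;     // W' = s * W
        bias[c] = s * (bias[c] - mean[c]) + beta[c];     // b' = s * (b - mean) + beta
    }
}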
|
|
|
@@ -1078,10 +1102,29 @@ struct Net::Impl
|
|
|
|
nextData = 0; |
|
|
|
|
if( currLayer->setBatchNorm(nextBNormLayer) ) |
|
|
|
|
{ |
|
|
|
|
printf_(("\tfused with %s\n", nextBNormLayer->name.c_str())); |
|
|
|
|
bnormData->skipFlags[DNN_BACKEND_DEFAULT] = true; |
|
|
|
|
ld.outputBlobs = layers[lpNext.lid].outputBlobs; |
|
|
|
|
if( bnormData->consumers.size() == 1 ) |
|
|
|
|
nextData = &layers[bnormData->consumers[0].lid]; |
|
|
|
|
lpNext = LayerPin(bnormData->consumers[0].lid, 0); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
Ptr<ScaleLayer> nextScaleLayer; |
|
|
|
|
if( nextData ) |
|
|
|
|
nextScaleLayer = nextData->layerInstance.dynamicCast<ScaleLayer>(); |
|
|
|
|
if( !nextScaleLayer.empty() && pinsToKeep.count(lpNext) == 0 ) |
|
|
|
|
{ |
|
|
|
|
LayerData* scaleData = nextData; |
|
|
|
|
nextData = 0; |
|
|
|
|
if( currLayer->setScale(nextScaleLayer) ) |
|
|
|
|
{ |
|
|
|
|
printf_(("\tfused with %s\n", nextScaleLayer->name.c_str())); |
|
|
|
|
scaleData->skipFlags[DNN_BACKEND_DEFAULT] = true; |
|
|
|
|
ld.outputBlobs = layers[lpNext.lid].outputBlobs; |
|
|
|
|
if( scaleData->consumers.size() == 1 ) |
|
|
|
|
nextData = &layers[scaleData->consumers[0].lid]; |
|
|
|
|
} |
|
|
|
|
} |
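The Scale layer can be absorbed for the same reason as batch norm: both are per-channel affine maps, and two affine maps compose into a single one, so a Convolution -> BatchNorm -> Scale chain still collapses into one convolution. In isolation:

#include <utility>

// f is applied first (y = f.first * x + f.second), then g; the composition is again affine
static std::pair<float, float> composeAffine(std::pair<float, float> f,
                                             std::pair<float, float> g)
{
    return std::make_pair(g.first * f.first, g.first * f.second + g.second);
}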
|
|
|
|
|
|
|
|
@@ -1091,11 +1134,16 @@ struct Net::Impl
|
|
|
|
|
|
|
|
|
if( !nextActivLayer.empty() && currLayer->setActivation(nextActivLayer) ) |
|
|
|
|
{ |
|
|
|
|
//printf("successfully merged %s and %s\n", currLayer->name.c_str(), nextActivLayer->name.c_str());
|
|
|
|
|
printf_(("\tfused with %s\n", nextActivLayer->name.c_str())); |
|
|
|
|
nextData->skipFlags[DNN_BACKEND_DEFAULT] = true; |
|
|
|
|
ld.outputBlobs = layers[lpNext.lid].outputBlobs; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// the optimization #2. if there is no layer that takes max pooling layer's computed
|
|
|
|
|
// max indices (and only some semantic segmentation networks might need this;
|
|
|
|
|
// many others only take the maximum values), then we switch the max pooling
|
|
|
|
|
// layer to the faster operating mode.
|
|
|
|
|
Ptr<PoolingLayer> poolingLayer = ld.layerInstance.dynamicCast<PoolingLayer>(); |
|
|
|
|
if( !poolingLayer.empty() && !ld.consumers.empty() ) |
|
|
|
|
{ |
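What optimization #2 saves: when no consumer needs the argmax indices, the pooling kernel can skip the per-element bookkeeping and only track the running maximum. A simplified scalar sketch of the two modes:

#include <algorithm>
#include <cstddef>
#include <vector>

static float maxPool1D(const std::vector<float>& src, size_t begin, size_t end,
                       bool computeMaxIdx, size_t* maxIdx)
{
    float best = src[begin];
    if (!computeMaxIdx)
    {
        for (size_t i = begin + 1; i < end; i++)
            best = std::max(best, src[i]);              // cheaper inner loop
        return best;
    }
    size_t bestIdx = begin;
    for (size_t i = begin + 1; i < end; i++)
        if (src[i] > best) { best = src[i]; bestIdx = i; }
    *maxIdx = bestIdx;                                  // extra output, only when requested
    return best;
}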
|
|
|
@@ -1108,7 +1156,71 @@ struct Net::Impl
|
|
|
|
if( i >= nconsumers ) |
|
|
|
|
{ |
|
|
|
|
poolingLayer->computeMaxIdx = false; |
|
|
|
|
//printf("simplified pooling layer %s\n", poolingLayer->name.c_str());
|
|
|
|
|
printf_(("\tsimplified pooling layer %s\n", poolingLayer->name.c_str())); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// the optimization #3. if there is concat layer that concatenates channels
|
|
|
|
|
// from the inputs together (i.e. axis == 1) then we make the inputs of
|
|
|
|
|
// the concat layer to write to the concatenation output buffer
|
|
|
|
|
// (and so we eliminate the concatenation layer, because the channels
|
|
|
|
|
// are concatenated implicitly).
|
|
|
|
|
Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>(); |
|
|
|
|
if( !concatLayer.empty() && concatLayer->axis == 1 && |
|
|
|
|
ld.outputBlobs.size() == 1 ) |
|
|
|
|
{ |
|
|
|
|
Mat& output = ld.outputBlobs[0]; |
|
|
|
|
|
|
|
|
|
// TODO: in general, this optimization can always be done, but
|
|
|
|
|
// many layers currently check that the input/output blobs are
|
|
|
|
|
// continuous arrays. Unfortunately, this is not true when
|
|
|
|
|
// the concatenation optimization is applied with batch_size > 1.
|
|
|
|
|
// so, for now, we only apply this optimization in the most popular
|
|
|
|
|
// case batch_size == 1.
|
|
|
|
|
if( output.dims == 4 && output.size[0] == 1 ) |
|
|
|
|
{ |
|
|
|
|
size_t i, ninputs = ld.inputBlobsId.size(); |
|
|
|
|
std::vector<LayerPin> realinputs(ninputs); |
|
|
|
|
for( i = 0; i < ninputs; i++ ) |
|
|
|
|
{ |
|
|
|
|
LayerPin pin = ld.inputBlobsId[i]; |
|
|
|
|
LayerData* inp_i_data = &layers[pin.lid]; |
|
|
|
|
while(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT] && |
|
|
|
|
inp_i_data->inputBlobsId.size() == 1) |
|
|
|
|
{ |
|
|
|
|
pin = inp_i_data->inputBlobsId[0]; |
|
|
|
|
inp_i_data = &layers[pin.lid]; |
|
|
|
|
} |
|
|
|
|
printf_(("\treal input for %s is %s\n", |
|
|
|
|
layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(), |
|
|
|
|
inp_i_data->getLayerInstance()->name.c_str())); |
|
|
|
|
|
|
|
|
|
if(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT]) |
|
|
|
|
break; |
|
|
|
|
realinputs[i] = pin; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if( i >= ninputs ) |
|
|
|
|
{ |
|
|
|
|
Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() }; |
|
|
|
|
int ofs = 0; |
|
|
|
|
for( i = 0; i < ninputs; i++ ) |
|
|
|
|
{ |
|
|
|
|
LayerPin pin = realinputs[i]; |
|
|
|
|
LayerData* inp_i_data = &layers[pin.lid]; |
|
|
|
|
int channels_i = ld.inputBlobs[i]->size[1]; |
|
|
|
|
chrange[1] = Range(ofs, ofs + channels_i); |
|
|
|
|
printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(), |
|
|
|
|
pin.oid, ofs, ofs + channels_i)); |
|
|
|
|
ofs += channels_i; |
|
|
|
|
Mat output_slice = output(chrange); |
|
|
|
|
Mat& curr_output = inp_i_data->outputBlobs[pin.oid]; |
|
|
|
|
CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size); |
|
|
|
|
curr_output = output_slice; |
|
|
|
|
} |
|
|
|
|
ld.skipFlags[DNN_BACKEND_DEFAULT] = true; |
|
|
|
|
printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str())); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
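The essence of optimization #3 in a self-contained example: every producer gets a channel-range view of the concat output and writes into it directly, so the concatenation itself costs nothing. With a batch of 1 the slices stay continuous, which is exactly what the CV_Assert above checks:

#include <opencv2/core.hpp>

int main()
{
    const int sz[] = {1, 5, 4, 4};              // N=1, C=2+3, H=4, W=4
    cv::Mat concatOutput(4, sz, CV_32F);

    cv::Range slice0[] = { cv::Range::all(), cv::Range(0, 2), cv::Range::all(), cv::Range::all() };
    cv::Range slice1[] = { cv::Range::all(), cv::Range(2, 5), cv::Range::all(), cv::Range::all() };

    cv::Mat out0 = concatOutput(slice0);        // "output blob" of producer #0
    cv::Mat out1 = concatOutput(slice1);        // "output blob" of producer #1

    out0.setTo(cv::Scalar::all(1.0f));          // producers write in place...
    out1.setTo(cv::Scalar::all(2.0f));
    // ...and concatOutput already holds the concatenation, with no extra copy.
    CV_Assert(out0.isContinuous() && out1.isContinuous());
    return 0;
}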
|
|
|
@@ -1458,9 +1570,12 @@ void Net::setPreferableBackend(int backendId)
     CV_TRACE_FUNCTION();
     CV_TRACE_ARG(backendId);

-    impl->netWasAllocated = impl->netWasAllocated &&
-                            impl->preferableBackend == backendId;
-    impl->preferableBackend = backendId;
+    if( impl->preferableBackend != backendId )
+    {
+        impl->preferableBackend = backendId;
+        impl->netWasAllocated = false;
+        impl->clear();
+    }
 }

 void Net::setPreferableTarget(int targetId)
|
|
|
@@ -1468,9 +1583,12 @@ void Net::setPreferableTarget(int targetId)
     CV_TRACE_FUNCTION();
     CV_TRACE_ARG(targetId);

-    impl->netWasAllocated = impl->netWasAllocated &&
-                            impl->preferableTarget == targetId;
-    impl->preferableTarget = targetId;
+    if( impl->preferableTarget != targetId )
+    {
+        impl->preferableTarget = targetId;
+        impl->netWasAllocated = false;
+        impl->clear();
+    }
 }

 void Net::setInputsNames(const std::vector<String> &inputBlobNames)
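Behavioral consequence of the two setters above: the network is now torn down and re-initialized only when the preferred backend or target actually changes, so repeated calls with the same value are cheap no-ops. A hedged usage sketch:

#include <opencv2/dnn.hpp>

void configure(cv::dnn::Net& net)
{
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    // calling again with an identical value no longer resets the allocated network
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
}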
|
|
|
@@ -1825,6 +1943,16 @@ void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>&
                          weights, blobs);
 }

+void Net::enableFusion(bool fusion)
+{
+    if( impl->fusion != fusion )
+    {
+        impl->fusion = fusion;
+        impl->netWasAllocated = false;
+        impl->clear();
+    }
+}
+
 void Net::setHalideScheduler(const String& scheduler)
 {
     CV_TRACE_FUNCTION();
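The new Net::enableFusion switch complements this: fusion stays enabled by default, and toggling it invalidates the allocated network so the next forward pass is set up again. A short usage sketch (assumes an already loaded net and a prepared input blob):

#include <opencv2/dnn.hpp>

void compareWithAndWithoutFusion(cv::dnn::Net& net, const cv::Mat& blob)
{
    net.setInput(blob);
    cv::Mat fused = net.forward().clone();

    net.enableFusion(false);          // invalidates the allocated net
    net.setInput(blob);               // set the input again and rerun without fusion
    cv::Mat plain = net.forward().clone();

    net.enableFusion(true);           // restore the default
    CV_Assert(fused.size == plain.size);
}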
|
|
|
@@ -1950,6 +2078,13 @@ Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)

 bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
 bool Layer::setBatchNorm(const Ptr<BatchNormLayer>&) { return false; }
+bool Layer::setScale(const Ptr<ScaleLayer>&) { return false; }
+void Layer::unsetAttached()
+{
+    setActivation(Ptr<ActivationLayer>());
+    setBatchNorm(Ptr<BatchNormLayer>());
+    setScale(Ptr<ScaleLayer>());
+}

 template <typename T>
 static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
|
|
|
|