From 09b73b2dc79aeb9fc16ab25c8b0cd019e00e89e1 Mon Sep 17 00:00:00 2001 From: Aleksandr Rybnikov Date: Fri, 9 Jun 2017 21:36:19 +0300 Subject: [PATCH] Blobs reuse improvement (#1205) * Reuse deep learning output blobs * Changed order for iterating through blobs while seeking memory. Refactored a little. --- modules/dnn/include/opencv2/dnn/dnn.hpp | 15 + modules/dnn/misc/python/pyopencv_dnn.hpp | 1 + modules/dnn/src/dnn.cpp | 382 ++++++++++++++---- modules/dnn/src/layers/batch_norm_layer.cpp | 9 + modules/dnn/src/layers/blank_layer.cpp | 7 +- modules/dnn/src/layers/elementwise_layers.cpp | 14 +- modules/dnn/src/layers/reshape_layer.cpp | 7 +- modules/dnn/src/layers/scale_layer.cpp | 9 + modules/dnn/src/layers/split_layer.cpp | 10 +- modules/dnn/test/test_caffe_importer.cpp | 4 + 10 files changed, 374 insertions(+), 84 deletions(-) diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index ce671a8b7..384bcb530 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -369,6 +369,21 @@ namespace dnn //! This namespace is used for dnn module functionlaity. CV_WRAP void getMemoryConsumption(const int layerId, const MatShape& netInputShape, size_t& weights, size_t& blobs) const; + + /** @brief Computes bytes number which are required to store + * all weights and intermediate blobs for each layer. + * @param netInputShapes vector of shapes for all net inputs. + * @param layerIds output vector to save layer IDs. + * @param weights output parameter to store resulting bytes for weights. + * @param blobs output parameter to store resulting bytes for intermediate blobs. 
+ */ + CV_WRAP void getMemoryConsumption(const std::vector& netInputShapes, + std::vector& layerIds, std::vector& weights, + std::vector& blobs) const; + /** @overload */ + CV_WRAP void getMemoryConsumption(const MatShape& netInputShape, + std::vector& layerIds, std::vector& weights, + std::vector& blobs) const; private: struct Impl; diff --git a/modules/dnn/misc/python/pyopencv_dnn.hpp b/modules/dnn/misc/python/pyopencv_dnn.hpp index 15365d896..3ab5a3cd2 100644 --- a/modules/dnn/misc/python/pyopencv_dnn.hpp +++ b/modules/dnn/misc/python/pyopencv_dnn.hpp @@ -2,6 +2,7 @@ typedef dnn::DictValue LayerId; typedef std::vector vector_MatShape; typedef std::vector > vector_vector_MatShape; +typedef std::vector vector_size_t; template<> bool pyopencv_to(PyObject *o, dnn::DictValue &dv, const char *name) diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 0fa9a2f8f..cecf5aa95 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -55,6 +55,22 @@ using std::map; using std::make_pair; using std::set; +namespace +{ + typedef std::vector ShapesVec; + + struct LayerShapes + { + ShapesVec in, out, internal; + // No guarantees that layers which support in-place computations + // will be computed in-place (input.data_ptr == output.data_ptr). + // If layer said that it could work in-place and layers after it + // no longer use input blob, we'll set output = input. + bool supportInPlace; + LayerShapes() {supportInPlace = false;} + }; +} + namespace cv { namespace dnn @@ -154,6 +170,11 @@ struct LayerPin { return (lid == r.lid && oid == r.oid); } + + bool operator<(const LayerPin &r) const + { + return lid < r.lid || lid == r.lid && oid < r.oid; + } }; struct LayerData @@ -219,16 +240,222 @@ private: std::vector outNames; }; -struct Net::Impl +struct BlobManager { - typedef std::vector ShapesVec; - struct LayerShapes +public: + // Increase references counter to layer output. 
+ void addReference(const LayerPin& lp) { - ShapesVec in, out, internal; - bool inplace; - LayerShapes() {inplace = false;} - }; + std::map::iterator it = refCounter.find(lp); + if (it == refCounter.end()) + refCounter[lp] = 1; + else + it->second += 1; + } + void addReferences(const std::vector& pins) + { + for (int i = 0; i < pins.size(); i++) + { + addReference(pins[i]); + } + } + + // Returns number of references to allocated memory that is used in specific + // layer blob. + int numReferences(const LayerPin& lp) + { + std::map::iterator mapIt = reuseMap.find(lp); + CV_Assert(mapIt != reuseMap.end()); + LayerPin memHost = mapIt->second; + + std::map::iterator refIt = refCounter.find(memHost); + CV_Assert(refIt != refCounter.end()); + return refIt->second; + } + + // Reuse data allocated inside the blob. + void reuse(const LayerPin& host, const LayerPin& user) + { + CV_Assert(reuseMap.find(user) == reuseMap.end()); + CV_Assert(reuseMap.find(host) != reuseMap.end()); + LayerPin memHost = reuseMap[host]; + reuseMap[user] = memHost; + if (refCounter.find(memHost) != refCounter.end()) + { + std::map::iterator userRefIt = refCounter.find(user); + if (userRefIt != refCounter.end()) + { + refCounter[memHost] += userRefIt->second; + refCounter.erase(userRefIt); + } + else + refCounter[memHost] += 1; + } + } + + // Decrease references counter to allocated memory inside specific blob. 
+ void releaseReference(const LayerPin& lp) + { + std::map::iterator mapIt = reuseMap.find(lp); + CV_Assert(mapIt != reuseMap.end()); + + std::map::iterator refIt = refCounter.find(mapIt->second); + CV_Assert(refIt != refCounter.end()); + CV_Assert(refIt->second > 0); + refIt->second -= 1; + } + + void releaseReferences(const std::vector& pins) + { + for (int i = 0; i < pins.size(); i++) + { + releaseReference(pins[i]); + } + } + + void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst) + { + std::map::iterator hostIt; + std::map::iterator refIt; + + const int targetTotal = total(shape); + Mat bestBlob; + int bestBlobTotal = INT_MAX; + LayerPin bestBlobPin; + for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt) + { + refIt = refCounter.find(hostIt->first); + // Use only blobs that had references before because if not, + // it might be used as output. + if (refIt != refCounter.end() && refIt->second == 0) + { + Mat& unusedBlob = hostIt->second; + if (unusedBlob.total() >= targetTotal && + unusedBlob.total() < bestBlobTotal) + { + bestBlobPin = hostIt->first; + bestBlob = unusedBlob; + bestBlobTotal = unusedBlob.total(); + } + } + } + if (!bestBlob.empty()) + { + reuse(bestBlobPin, lp); + dst = Mat(shape, CV_32F, bestBlob.data); + } + else + { + dst.create(shape, CV_32F); + addHost(lp, dst); + } + } + + void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes, + std::vector& pinsForInternalBlobs) + { + pinsForInternalBlobs.clear(); + + std::vector& outputBlobs = ld.outputBlobs, + &internalBlobs = ld.internals; + + const ShapesVec& outShapes = layerShapes.out, + internalShapes = layerShapes.internal; + + outputBlobs.resize(std::max((size_t)1, outShapes.size())); // layer produces at least one output blob + internalBlobs.resize(internalShapes.size()); + + CV_Assert(ld.requiredOutputs.size() <= outShapes.size()); + + // Check that layer could work in-place. 
+ bool inPlace = false; + if (layerShapes.supportInPlace) + { + if (ld.inputBlobs.size() == 1) + { + // Get number of references to the input memory. + int numRef = numReferences(ld.inputBlobsId[0]); + // If current layer is one and only customer of this blob. + inPlace = numRef == 1; + } + } + + ShapesVec shapes(outShapes); + shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end()); + std::vector blobs; + for(int i = 0; i < outputBlobs.size(); i++) + { + blobs.push_back(&outputBlobs[i]); + } + + for(int i = 0; i < internalBlobs.size(); i++) + { + blobs.push_back(&internalBlobs[i]); + if (total(internalShapes[i])) + { + pinsForInternalBlobs.push_back(LayerPin(ld.id, ld.outputBlobs.size() + i)); + } + } + + addReferences(pinsForInternalBlobs); + + std::map > idxSizes; + for(int i = 0; i < shapes.size(); i++) + { + idxSizes[total(shapes[i])].push_back(i); + } + + std::map >::reverse_iterator it; + for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++) + { + for(int j = 0; j < it->second.size(); j++) + { + int index = it->second[j]; + if (total(shapes[index])) + { + LayerPin blobPin(ld.id, index); + if (index < outShapes.size() && inPlace) + { + CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index])); + ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]); + reuse(ld.inputBlobsId[0], blobPin); + } + else + { + reuseOrCreate(shapes[index], blobPin, *blobs[index]); + } + } + } + } + } + + // Clear internal state. Called before every reallocation. + void reset() + { + refCounter.clear(); + reuseMap.clear(); + memHosts.clear(); + } + +private: + // Registers allocated memory. + void addHost(const LayerPin& lp, const Mat& mat) + { + CV_Assert(memHosts.find(lp) == memHosts.end()); + reuseMap[lp] = lp; + memHosts[lp] = mat; + } + + std::map refCounter; + // Maps pin to origin blob (for which memory was allocated first). + // For origin blobs key == value. 
+ std::map reuseMap; + std::map memHosts; +}; + +struct Net::Impl +{ typedef std::map LayersShapesMap; typedef std::map MapIdToLayerData; @@ -252,6 +479,7 @@ struct Net::Impl MapIdToLayerData layers; std::map layerNameToId; + BlobManager blobManager; int lastLayerId; @@ -469,37 +697,11 @@ struct Net::Impl LayersShapesMap::const_iterator layerShapesIt = layersShapes.find(lid); CV_Assert(layerShapesIt != layersShapes.end()); - const ShapesVec& outShapes = layerShapesIt->second.out; - CV_Assert(ld.requiredOutputs.size() <= outShapes.size()); - - ld.outputBlobs.resize(std::max((size_t)1, outShapes.size())); //layer produce at least one output blob - for(int i = 0; i < outShapes.size(); i++) - { - if (shape(ld.outputBlobs[i]) != outShapes[i]) - { - if (layerShapesIt->second.inplace) - { - CV_Assert(ld.inputBlobs.size() == ld.outputBlobs.size()); - CV_Assert(ld.inputBlobs[i]->total() == total(outShapes[i])); - ld.outputBlobs[i] = ld.inputBlobs[i]->reshape(1, outShapes[i]); - } - else - { - ld.outputBlobs[i].create(outShapes[i], CV_32F); - } - } - } - const ShapesVec& intShapes = layerShapesIt->second.internal; - ld.internals.resize(intShapes.size()); - for(int i = 0; i < intShapes.size(); i++) - { - if (shape(ld.internals[i]) != intShapes[i] && total(intShapes[i])) - ld.internals[i].create(intShapes[i], CV_32F); - } + std::vector pinsForInternalBlobs; + blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs); Ptr layerPtr = ld.getLayerInstance(); - //try { layerPtr->finalize(ld.inputBlobs, ld.outputBlobs); #if 0 std::cout << "\n"; #endif } - /*catch (const cv::Exception &err) - { - CV_RETHROW_ERROR(err, format("The following error occured while making allocate() for layer \"%s\": %s", ld.name.c_str(), err.err.c_str())); - }*/ + + // After allocation of layer, we decrease counters to its input blobs. 
+ blobManager.releaseReferences(ld.inputBlobsId); + blobManager.releaseReferences(pinsForInternalBlobs); ld.flag = 1; } @@ -536,6 +738,13 @@ struct Net::Impl LayersShapesMap layersShapes; getLayersShapes(inputShapes, layersShapes); + blobManager.reset(); + for (it = layers.begin(); it != layers.end(); ++it) + { + const LayerData& ld = it->second; + blobManager.addReferences(ld.inputBlobsId); + } + for (it = layers.begin(); it != layers.end(); it++) { int lid = it->first; @@ -609,7 +818,7 @@ struct Net::Impl ShapesVec& os = inOutShapes[id].out; ShapesVec& ints = inOutShapes[id].internal; int requiredOutputs = layers[id].requiredOutputs.size(); - inOutShapes[id].inplace = + inOutShapes[id].supportInPlace = layers[id].getLayerInstance()->getMemoryShapes(is, requiredOutputs, os, ints); } @@ -718,9 +927,13 @@ void Net::setBlob(String outputName, const Mat &blob_) LayerData &ld = impl->layers[pin.lid]; ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) ); MatShape prevShape = shape(ld.outputBlobs[pin.oid]); - ld.outputBlobs[pin.oid] = blob_.clone(); + bool oldShape = prevShape == shape(blob_); + if (oldShape) + blob_.copyTo(ld.outputBlobs[pin.oid]); + else + ld.outputBlobs[pin.oid] = blob_.clone(); - impl->netWasAllocated = impl->netWasAllocated && prevShape == shape(blob_); + impl->netWasAllocated = impl->netWasAllocated && oldShape; } Mat Net::getBlob(String outputName) @@ -827,10 +1040,10 @@ std::vector Net::getUnconnectedOutLayers() const return layersIds; } -void Net::getLayersShapes(const Net::Impl::ShapesVec& netInputShapes, +void Net::getLayersShapes(const ShapesVec& netInputShapes, std::vector* layersIds, - std::vector* inLayersShapes, - std::vector* outLayersShapes) const + std::vector* inLayersShapes, + std::vector* outLayersShapes) const { if ((layersIds || inLayersShapes || outLayersShapes) == false) return; @@ -856,29 +1069,29 @@ void Net::getLayersShapes(const Net::Impl::ShapesVec& netInputShapes, void Net::getLayersShapes(const 
MatShape& netInputShape, std::vector* layerIds, - std::vector* inLayersShapes, - std::vector* outLayersShapes) const + std::vector* inLayersShapes, + std::vector* outLayersShapes) const { - getLayersShapes(Net::Impl::ShapesVec(1, netInputShape), + getLayersShapes(ShapesVec(1, netInputShape), layerIds, inLayersShapes, outLayersShapes); } void Net::getLayerShapes(const MatShape& netInputShape, const int layerId, - Net::Impl::ShapesVec* inLayerShapes, - Net::Impl::ShapesVec* outLayerShapes) const + ShapesVec* inLayerShapes, + ShapesVec* outLayerShapes) const { - getLayerShapes(Net::Impl::ShapesVec(1, netInputShape), + getLayerShapes(ShapesVec(1, netInputShape), layerId, inLayerShapes, outLayerShapes); } -void Net::getLayerShapes(const Net::Impl::ShapesVec& netInputShapes, +void Net::getLayerShapes(const ShapesVec& netInputShapes, const int layerId, - Net::Impl::ShapesVec* inLayerShapes, - Net::Impl::ShapesVec* outLayerShapes) const + ShapesVec* inLayerShapes, + ShapesVec* outLayerShapes) const { - Impl::LayerShapes shapes; + LayerShapes shapes; impl->getLayerShapes(netInputShapes, layerId, shapes); if (inLayerShapes) *inLayerShapes = shapes.in; @@ -915,7 +1128,7 @@ int64 Net::getFLOPS(const int layerId, Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId); CV_Assert(layer != impl->layers.end()); - Impl::LayerShapes shapes; + LayerShapes shapes; impl->getLayerShapes(netInputShapes, layerId, shapes); return layer->second.getLayerInstance()->getFLOPS(shapes.in, shapes.out); @@ -986,41 +1199,70 @@ void Net::getMemoryConsumption(const std::vector& netInputShapes, size_t& weights, size_t& blobs) const { std::vector layerIds; + std::vector w, b; + getMemoryConsumption(netInputShapes, layerIds, w, b); + + weights = blobs = 0; + for(int i = 0; i < layerIds.size(); i++) + { + weights += w[i]; + blobs += b[i]; + } +} + +void Net::getMemoryConsumption(const int layerId, + const MatShape& netInputShape, + size_t& weights, size_t& blobs) const +{ + 
getMemoryConsumption(layerId, std::vector(1, netInputShape), + weights, blobs); +} + +void Net::getMemoryConsumption(const MatShape& netInputShape, + size_t& weights, size_t& blobs) const +{ + getMemoryConsumption(std::vector(1, netInputShape), + weights, blobs); +} + +void Net::getMemoryConsumption(const std::vector& netInputShapes, + std::vector& layerIds, std::vector& weights, + std::vector& blobs) const +{ + layerIds.clear(); + weights.clear(); + blobs.clear(); + std::vector > outLayerShapes; getLayersShapes(netInputShapes, &layerIds, 0, &outLayerShapes); - weights = blobs = 0; for(int i = 0; i < layerIds.size(); i++) { + int w = 0, b = 0; Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerIds[i]); CV_Assert(layer != impl->layers.end()); for(int j = 0; j < layer->second.params.blobs.size(); j++) { const Mat& weightsBlob = layer->second.params.blobs[j]; - weights += weightsBlob.total()*weightsBlob.elemSize(); + w += weightsBlob.total()*weightsBlob.elemSize(); } for(int j = 0; j < outLayerShapes[i].size(); j++) { - blobs += total(outLayerShapes[i][j]) * sizeof(float); + b += total(outLayerShapes[i][j]) * sizeof(float); } - } -} -void Net::getMemoryConsumption(const int layerId, - const MatShape& netInputShape, - size_t& weights, size_t& blobs) const -{ - getMemoryConsumption(layerId, std::vector(1, netInputShape), - weights, blobs); + weights.push_back(w); + blobs.push_back(b); + } } -void Net::getMemoryConsumption(const MatShape& netInputShape, - size_t& weights, size_t& blobs) const +void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector& layerIds, + std::vector& weights, std::vector& blobs) const { - getMemoryConsumption(std::vector(1, netInputShape), + getMemoryConsumption(std::vector(1, netInputShape), layerIds, weights, blobs); } diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index 0b05b9345..e5f5b68a0 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ 
b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -30,6 +30,15 @@ public: epsilon = params.get("eps", 1E-5); } + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const + { + Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + return true; + } + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_Assert(blobs.size() >= 2); diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 6b2a7dea1..f90f238c9 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -61,7 +61,12 @@ public: return true; } - void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) {} + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) + { + for (int i = 0, n = outputs.size(); i < n; ++i) + if (outputs[i].data != inputs[i]->data) + inputs[i]->copyTo(outputs[i]); + } }; Ptr BlankLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index e114b797b..87e2d6908 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -20,17 +20,17 @@ public: class PBody : public cv::ParallelLoopBody { Func &func; - Dtype *data; + Dtype *src, *dst; public: - PBody(Mat &mat, Func &func_) : - func(func_), data(mat.ptr()) + PBody(Mat &src, Mat &dst, Func &func_) : + func(func_), src(src.ptr()), dst(dst.ptr()) {} void operator()(const Range &r) const { for (int i = r.start; i < r.end; i++) - data[i] = func(data[i]); + dst[i] = func(src[i]); } }; @@ -49,13 +49,13 @@ public: { for (size_t i = 0; i < inputs.size(); i++) { - const Mat &src = *inputs[i]; + Mat &src = *inputs[i]; Mat &dst = outputs[i]; - CV_Assert(src.ptr() == dst.ptr() && src.isContinuous()); + CV_Assert(src.isContinuous() && dst.isContinuous()); 
Range sizeRange = Range(0, dst.total()); CV_Assert(src.type() == CV_32F); - PBody body(dst, func); + PBody body(src, dst, func); if( run_parallel ) cv::parallel_for_(sizeRange, body); else diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index a98e4e962..4fa089e64 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -178,7 +178,7 @@ public: for (size_t i = 0; i < inputs.size(); i++) { Mat srcBlob = *inputs[i]; - MatShape inputShape = shape(srcBlob); + MatShape inputShape = shape(srcBlob), outShape = shape(outputs[i]); if (performReordering) { @@ -204,6 +204,11 @@ public: } internals[i].copyTo(outputs[i]); } + else + { + if (outputs[i].data != srcBlob.data) + srcBlob.reshape(1, outShape).copyTo(outputs[i]); + } } } diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index 37db031f3..473b1b38b 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -27,6 +27,15 @@ public: hasBias = params.get("bias_term", false); } + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const + { + Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + return true; + } + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_Assert(blobs.size() == 1 + hasBias); diff --git a/modules/dnn/src/layers/split_layer.cpp b/modules/dnn/src/layers/split_layer.cpp index d15702aa2..975230173 100644 --- a/modules/dnn/src/layers/split_layer.cpp +++ b/modules/dnn/src/layers/split_layer.cpp @@ -72,17 +72,17 @@ public: { CV_Assert(inputs.size() == 1); - outputs.resize(outputsCount >= 0 ? outputsCount : requiredOutputs, - inputs[0]); - - return false; + Layer::getMemoryShapes(inputs, outputsCount >= 0 ? 
outputsCount : requiredOutputs, + outputs, internals); + return true; } void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { for (size_t i = 0; i < outputs.size(); i++) { - inputs[0]->copyTo(outputs[i]); + if (outputs[i].data != inputs[0]->data) + inputs[0]->copyTo(outputs[i]); } } }; diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp index 52869badf..8b8a4e7fe 100644 --- a/modules/dnn/test/test_caffe_importer.cpp +++ b/modules/dnn/test/test_caffe_importer.cpp @@ -121,6 +121,10 @@ TEST(Reproducibility_FCN, Accuracy) if (sample.size() != inputSize) resize(sample, sample, inputSize); + std::vector layerIds; + std::vector weights, blobs; + net.getMemoryConsumption(shape(1,3,227,227), layerIds, weights, blobs); + net.setBlob(".data", blobFromImage(sample, 1.)); net.forward();