Merge pull request #1156 from arrybn:layers_shapes

8 years ago · 4b1834acea
parent 3e9b1f669e 9b73fee29d
commit 4b1834acea
37 changed files with 1102 additions and 897 deletions
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@ -121,21 +121,7 @@ namespace dnn
          * @details If this parameter is empty or unset then @p outTailShape = [`Wh`.size(0)] will be used,
          * where `Wh` is parameter from setWeights().
          */
-        virtual void setOutShape(const std::vector<int> &outTailShape = std::vector<int>()) = 0;
-
-        /** @brief Set @f$ h_{t-1} @f$ value that will be used in next forward() calls.
-          * @details By-default @f$ h_{t-1} @f$ is inited by zeros and updated after each forward() call.
-          */
-        virtual void setH(const Mat &H) = 0;
-        /** @brief Returns current @f$ h_{t-1} @f$ value (deep copy). */
-        virtual Mat getH() const = 0;
-
-        /** @brief Set @f$ c_{t-1} @f$ value that will be used in next forward() calls.
-          * @details By-default @f$ c_{t-1} @f$ is inited by zeros and updated after each forward() call.
-          */
-        virtual void setC(const Mat &C) = 0;
-        /** @brief Returns current @f$ c_{t-1} @f$ value (deep copy). */
-        virtual Mat getC() const = 0;
+        virtual void setOutShape(const MatShape &outTailShape = MatShape()) = 0;

        /** @brief Specifies whether to interpret the first dimension of input blob as timestamp dimension or as sample.
          *
@ -289,7 +275,7 @@ namespace dnn
    class CV_EXPORTS ReshapeLayer : public Layer
    {
    public:
-        std::vector<int> newShapeDesc;
+        MatShape newShapeDesc;
        Range newShapeRange;

        static Ptr<ReshapeLayer> create(const LayerParams& params);
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@ -53,6 +53,8 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
 //! @addtogroup dnn
 //! @{

+    typedef std::vector<int> MatShape;
+
    /** @brief Initialize dnn module and built-in layers.
     *
     * This function automatically called on most of OpenCV builds,
@ -87,33 +89,35 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
        //! List of learned parameters must be stored here to allow read them by using Net::getParam().
        CV_PROP_RW std::vector<Mat> blobs;

-        /** @brief Allocates internal buffers and output blobs with respect to the shape of inputs.
+        /** @brief Computes and sets internal parameters according to inputs, outputs and blobs.
         *  @param[in]  input  vector of already allocated input blobs
-         *  @param[out] output vector of output blobs, which must be allocated
+         *  @param[out] output vector of already allocated output blobs
         *
-         * This method must create each produced blob according to shape of @p input blobs and internal layer params.
-         * If this method is called first time then @p output vector consists from empty blobs and its size determined by number of output connections.
-         * This method can be called multiple times if size of any @p input blob was changed.
+         * This method is called after the network has allocated all memory for input and output blobs
+         * and before inferencing.
         */
-        virtual void allocate(const std::vector<Mat*> &input, std::vector<Mat> &output) = 0;
+        virtual void finalize(const std::vector<Mat*> &input, std::vector<Mat> &output);

        /** @brief Given the @p input blobs, computes the output @p blobs.
         *  @param[in]  input  the input blobs.
         *  @param[out] output allocated output blobs, which will store results of the computation.
+         *  @param[out] internals allocated internal blobs
         */
-        virtual void forward(std::vector<Mat*> &input, std::vector<Mat> &output) = 0;
+        virtual void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals) = 0;

        /** @brief @overload */
-        CV_WRAP void allocate(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs);
+        CV_WRAP void finalize(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs);

        /** @brief @overload */
-        CV_WRAP std::vector<Mat> allocate(const std::vector<Mat> &inputs);
+        CV_WRAP std::vector<Mat> finalize(const std::vector<Mat> &inputs);

        /** @brief @overload */
-        CV_WRAP void forward(const std::vector<Mat> &inputs, CV_IN_OUT std::vector<Mat> &outputs);
+        CV_WRAP void forward(const std::vector<Mat> &inputs, CV_IN_OUT std::vector<Mat> &outputs,
+                             CV_IN_OUT std::vector<Mat> &internals);

        /** @brief Allocates layer and computes output. */
-        CV_WRAP void run(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs);
+        CV_WRAP void run(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs,
+                         CV_IN_OUT std::vector<Mat> &internals);

        /** @brief Returns index of input blob into the input array.
         *  @param inputName label of input blob
@ -127,6 +131,11 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
         */
        virtual int outputNameToIndex(String outputName);

+        virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                                     const int requiredOutputs,
+                                     std::vector<MatShape> &outputs,
+                                     std::vector<MatShape> &internals) const;
+
        CV_PROP String name; //!< Name of the layer instance, can be used for logging or other internal purposes.
        CV_PROP String type; //!< Type name which was used for creating layer by layer factory.

@ -275,6 +284,45 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
        /** @brief Returns indexes of layers with unconnected outputs.
         */
        CV_WRAP std::vector<int> getUnconnectedOutLayers() const;
+        /** @brief Returns input and output shapes for all layers in loaded model;
+          *  preliminary inferencing isn't necessary.
+          *  @param netInputShapes shapes for all input blobs in net input layer.
+          *  @param layersIds output parameter for layer IDs.
+          *  @param inLayersShapes output parameter for input layers shapes;
+          * order is the same as in layersIds
+          *  @param outLayersShapes output parameter for output layers shapes;
+          * order is the same as in layersIds
+          */
+         CV_WRAP void getLayersShapes(const std::vector<MatShape>& netInputShapes,
+                                      std::vector<int>* layersIds,
+                                      std::vector<std::vector<MatShape> >* inLayersShapes,
+                                      std::vector<std::vector<MatShape> >* outLayersShapes) const;
+
+         /** @overload */
+         CV_WRAP void getLayersShapes(const MatShape& netInputShape,
+                                      std::vector<int>* layersIds,
+                                      std::vector<std::vector<MatShape> >* inLayersShapes,
+                                      std::vector<std::vector<MatShape> >* outLayersShapes) const;
+
+         /** @brief Returns input and output shapes for layer with specified
+          * id in loaded model; preliminary inferencing isn't necessary.
+          *  @param netInputShape shape of the input blob in net input layer.
+          *  @param layerId id for layer.
+          *  @param inLayerShapes output parameter for shapes of the layer's inputs;
+          * may be NULL if not needed
+          *  @param outLayerShapes output parameter for shapes of the layer's outputs;
+          * may be NULL if not needed
+          */
+         CV_WRAP void getLayerShapes(const MatShape& netInputShape,
+                                     const int layerId,
+                                     std::vector<MatShape>* inLayerShapes,
+                                     std::vector<MatShape>* outLayerShapes) const;
+
+         /** @overload */
+         CV_WRAP void getLayerShapes(const std::vector<MatShape>& netInputShapes,
+                                     const int layerId,
+                                     std::vector<MatShape>* inLayerShapes,
+                                     std::vector<MatShape>* outLayerShapes) const;
    private:

        struct Impl;
--- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
@ -55,22 +55,6 @@ inline std::ostream &operator<< (std::ostream &s, cv::Range &r)
    return s << "[" << r.start << ", " << r.end << ")";
 }

-//Reshaping
-//TODO: add -1 specifier for automatic size inferring
-
-/*template<typename Mat>
-void reshape(Mat &m, const BlobShape &shape)
-{
-    m = m.reshape(1, shape.dims(), shape.ptr());
-}
-
-template<typename Mat>
-Mat reshaped(const Mat &m, const BlobShape &shape)
-{
-    return m.reshape(1, shape.dims(), shape.ptr());
-}*/
-
-
 //Slicing

 struct _Range : public cv::Range
@ -139,12 +123,76 @@ static inline Mat getPlane(const Mat &m, int n, int cn)
    return m(range).reshape(1, m.dims-2, sz);
 }

-static inline size_t shapeTotal(const std::vector<int>& shape)
+/** @brief Builds a MatShape from the first @p n entries of a raw dimensions array.
+ *  @param dims pointer to at least @p n dimension sizes.
+ *  @param n    number of entries to copy (4 by default).
+ *  @return a MatShape holding dims[0] .. dims[n-1].
+ */
+static inline MatShape shape(const int* dims, const int n = 4)
+{
+    return MatShape(dims, dims + n);
+}
+
+/** @brief Converts a MatSize into a MatShape holding all size.dims() extents.
+ *  @param size size descriptor; it is read through its conversion to const int*.
+ */
+static inline MatShape shape(const MatSize& size)
+{
+    const int* extents = (const int*)size;
+    return shape(extents, size.dims());
+}
+
+/** @brief Returns the shape of @p mat, taken from its size member. */
+static inline MatShape shape(const Mat& mat)
+{
+    const MatSize& extents = mat.size;
+    return shape(extents);
+}
+
+//! Predicate used by shape(int, int, int, int): true for negative values.
+static inline bool is_neg(int i) { return i < 0; }
+
+/** @brief Builds a MatShape from up to four explicitly listed dimensions.
+ *
+ * Every negative argument is removed from the result (not only trailing ones),
+ * so the default value -1 marks an unused position.
+ */
+static inline MatShape shape(int a0, int a1=-1, int a2=-1, int a3=-1)
+{
+    const int candidates[] = {a0, a1, a2, a3};
+    MatShape result = shape(candidates, 4);
+    result.erase(std::remove_if(result.begin(), result.end(), is_neg), result.end());
+    return result;
+}
+
+/** @brief Returns the product of the dimensions of @p shape in the half-open range [start, end).
+ *  @param shape dimensions to multiply.
+ *  @param start index of the first dimension; -1 (default) means 0.
+ *  @param end   index one past the last dimension; -1 (default) means shape.size().
+ *  @return 0 for an empty @p shape, 1 for an empty range, otherwise the product.
+ *
+ * The range is validated with CV_Assert: 0 <= start <= end <= shape.size().
+ */
+static inline int total(const MatShape& shape, int start = -1, int end = -1)
+{
+    const int dims = (int)shape.size();
+    if (start == -1) start = 0;
+    if (end == -1) end = dims;
+
+    if (shape.empty())
+        return 0;
+
+    // Compare as signed ints so a stray negative index fails explicitly instead of
+    // relying on conversion to a huge unsigned value. start == dims is accepted so
+    // that the empty range [dims, dims) yields 1 like every other empty range.
+    CV_Assert(0 <= start && start <= end && end <= dims);
+
+    int elems = 1;
+    for (int i = start; i < end; i++)
+    {
+        elems *= shape[i];
+    }
+    return elems;
+}
+
+static inline MatShape concat(const MatShape& a, const MatShape& b)
 {
-    size_t i, n = shape.size(), p = 1;
-    for( i = 0; i < n; i++ ) p *= shape[i];
+    MatShape c = a;
+    c.insert(c.end(), b.begin(), b.end());

-    return p;
+    return c;
+}
+
+/** @brief Prints @p shape to stdout in the form "<name>: [ d0 d1 ... ]" followed by a newline.
+ *  @param shape dimensions to print.
+ *  @param name  label printed before the colon (empty by default).
+ */
+inline void print(const MatShape& shape, const String& name = "")
+{
+    printf("%s: [", name.c_str());
+    for (MatShape::const_iterator it = shape.begin(); it != shape.end(); ++it)
+        printf(" %d", *it);
+    printf(" ]\n");
+}
+
+/** @brief Converts a possibly negative axis index into a non-negative one.
+ *  @param ax   axis index; a negative value is counted from the end.
+ *  @param dims number of dimensions.
+ *  @return ax + dims when @p ax is negative, otherwise @p ax unchanged.
+ *
+ * Despite the name no range check is performed: the result is not limited to [0, dims).
+ */
+inline int clamp(int ax, int dims)
+{
+    if (ax < 0)
+        return ax + dims;
+    return ax;
+}
+
+/** @brief Overload of clamp(int, int) that uses the number of dimensions of @p shape. */
+inline int clamp(int ax, const MatShape& shape)
+{
+    const int dims = (int)shape.size();
+    return clamp(ax, dims);
 }

 }
--- a/modules/dnn/misc/python/pyopencv_dnn.hpp
+++ b/modules/dnn/misc/python/pyopencv_dnn.hpp
@ -1,5 +1,7 @@
 #ifdef HAVE_OPENCV_DNN
 typedef dnn::DictValue LayerId;
+typedef std::vector<dnn::MatShape> vector_MatShape;
+typedef std::vector<std::vector<dnn::MatShape> > vector_vector_MatShape;

 template<>
 bool pyopencv_to(PyObject *o, dnn::DictValue &dv, const char *name)
--- a/modules/dnn/perf/perf_convolution.cpp
+++ b/modules/dnn/perf/perf_convolution.cpp
@ -1,4 +1,5 @@
 #include "perf_precomp.hpp"
+#include <opencv2/dnn/shape_utils.hpp>

 namespace cvtest
 {
@ -21,14 +22,14 @@ CV_ENUM(GroupSize, GROUP_OFF, GROUP_2);
 //Squared Size
 #define SSZ(n) cv::Size(n, n)

-typedef std::pair<std::vector<int>, int> InpShapeNumOut;
+typedef std::pair<MatShape, int> InpShapeNumOut;
 typedef tuple<Size, InpShapeNumOut, GroupSize, StrideSize> ConvParam; //kernel_size, inp shape, groups, stride
 typedef TestBaseWithParam<ConvParam> ConvolutionPerfTest;

-static inline std::vector<int> blobShape(int count, int nplanes, int height, int width)
+static inline MatShape blobShape(int count, int nplanes, int height, int width)
 {
    int data[] = {count, nplanes, height, width};
-    return std::vector<int>(data, data+4);
+    return MatShape(data, data+4);
 }

 PERF_TEST_P( ConvolutionPerfTest, perf, Combine(
@ -44,7 +45,7 @@ PERF_TEST_P( ConvolutionPerfTest, perf, Combine(

    ConvParam params = GetParam();
    int ksz     = get<0>(params).width;
-    std::vector<int> inpShape = get<1>(params).first;
+    MatShape inpShape = get<1>(params).first;
    int outCn   = get<1>(params).second;
    int groups  = get<2>(params);
    int stride  = (ksz >= 11) ? 4 : (int)get<3>(params);
@ -69,12 +70,25 @@ PERF_TEST_P( ConvolutionPerfTest, perf, Combine(
    lp.blobs.push_back(biasBlob);

    std::vector<Mat*> inpBlobs(1, &inpBlob);
-    std::vector<Mat> outBlobs;
+    std::vector<Mat> outBlobs, internalBlobs;

    cv::setNumThreads(cv::getNumberOfCPUs());

    Ptr<Layer> layer = cv::dnn::LayerFactory::createLayerInstance("Convolution", lp);
-    layer->allocate(inpBlobs, outBlobs);
+    std::vector<MatShape> inputShapes(1, shape(inpBlob)), outShapes, internals;
+    layer->getMemoryShapes(inputShapes, 0, outShapes, internals);
+    for (int i = 0; i < outShapes.size(); i++)
+    {
+        outBlobs.push_back(Mat(outShapes[i], CV_32F));
+    }
+    for (int i = 0; i < internals.size(); i++)
+    {
+        internalBlobs.push_back(Mat());
+        if (total(internals[i]))
+            internalBlobs.back().create(internals[i], CV_32F);
+    }
+
+    layer->finalize(inpBlobs, outBlobs);

    Mat inpBlob2D = inpBlob.reshape(1, outCn);
    Mat wgtBlob2D = wgtBlob.reshape(1, outCn*(inpCn/groups));
@ -83,7 +97,7 @@ PERF_TEST_P( ConvolutionPerfTest, perf, Combine(

    TEST_CYCLE_N(10)
    {
-        layer->forward(inpBlobs, outBlobs);
+        layer->forward(inpBlobs, outBlobs, internalBlobs);
    }

    SANITY_CHECK_NOTHING();
--- a/modules/dnn/src/caffe/caffe_importer.cpp
+++ b/modules/dnn/src/caffe/caffe_importer.cpp
@ -192,7 +192,7 @@ public:
        }
    }

-    void blobShapeFromProto(const caffe::BlobProto &pbBlob, std::vector<int>& shape)
+    void blobShapeFromProto(const caffe::BlobProto &pbBlob, MatShape& shape)
    {
        shape.clear();
        if (pbBlob.has_num() || pbBlob.has_channels() || pbBlob.has_height() || pbBlob.has_width())
@ -215,7 +215,7 @@ public:

    void blobFromProto(const caffe::BlobProto &pbBlob, cv::Mat &dstBlob)
    {
-        std::vector<int> shape;
+        MatShape shape;
        blobShapeFromProto(pbBlob, shape);

        dstBlob.create((int)shape.size(), &shape[0], CV_32F);
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@ -45,6 +45,7 @@
 #include <iostream>
 #include <sstream>
 #include <iterator>
+#include <opencv2/dnn/shape_utils.hpp>

 using namespace cv;
 using namespace cv::dnn;
@ -168,6 +169,7 @@ struct LayerData
    Ptr<Layer> layerInstance;
    std::vector<Mat> outputBlobs;
    std::vector<Mat*> inputBlobs;
+    std::vector<Mat> internals;

    int flag;

@ -189,8 +191,8 @@ struct LayerData
 //fake layer containing network input blobs
 struct DataLayer : public Layer
 {
-    void allocate(const std::vector<Mat*>&, std::vector<Mat>&) {}
-    void forward(std::vector<Mat*>&, std::vector<Mat>&) {}
+    void finalize(const std::vector<Mat*>&, std::vector<Mat>&) {}
+    void forward(std::vector<Mat*>&, std::vector<Mat>&, std::vector<Mat> &) {}

    int outputNameToIndex(String tgtName)
    {
@ -209,6 +211,17 @@ private:

 struct Net::Impl
 {
+    typedef std::vector<MatShape> ShapesVec;
+    // Shapes inferred for one layer: its inputs, outputs and internal buffers.
+    // `inplace` stores the value returned by the layer's getMemoryShapes(); when
+    // set, allocateLayer() reshapes the input blob instead of creating an output.
+    struct LayerShapes
+    {
+        ShapesVec in, out, internal;
+        bool inplace;
+        LayerShapes() : inplace(false) {}
+    };
+
+    typedef std::map<int, LayerShapes> LayersShapesMap;
+    typedef std::map<int, LayerData> MapIdToLayerData;
+
    Impl()
    {
        //allocate fake net input layer
@ -227,8 +240,7 @@ struct Net::Impl
    Ptr<DataLayer> netInputLayer;
    std::vector<int> netOutputs;

-    typedef std::map<int, LayerData> MapIdToLayerData;
-    std::map<int, LayerData> layers;
+    MapIdToLayerData layers;
    std::map<String, int> layerNameToId;

    int lastLayerId;
@ -399,7 +411,7 @@ struct Net::Impl
    #define CV_RETHROW_ERROR(err, newmsg)\
        cv::error(err.code, newmsg, err.func.c_str(), err.file.c_str(), err.line)

-    void allocateLayer(int lid)
+    void allocateLayer(int lid, const LayersShapesMap& layersShapes)
    {
        LayerData &ld = layers[lid];

@ -432,7 +444,7 @@ struct Net::Impl

        //allocate parents
        for (set<int>::iterator i = ld.inputLayersId.begin(); i != ld.inputLayersId.end(); i++)
-            allocateLayer(*i);
+            allocateLayer(*i, layersShapes);

        //bind inputs
        ld.inputBlobs.resize(ninputs);
@ -444,12 +456,42 @@ struct Net::Impl
            ld.inputBlobs[i] = &layers[from.lid].outputBlobs[from.oid];
        }

-        //allocate layer
-        ld.outputBlobs.resize(std::max((size_t)1, ld.requiredOutputs.size())); //layer produce at least one output blob
+        LayersShapesMap::const_iterator layerShapesIt = layersShapes.find(lid);
+
+        CV_Assert(layerShapesIt != layersShapes.end());
+        const ShapesVec& outShapes = layerShapesIt->second.out;
+        CV_Assert(ld.requiredOutputs.size() <= outShapes.size());
+
+        ld.outputBlobs.resize(std::max((size_t)1, outShapes.size())); //layer produce at least one output blob
+        for(int i = 0; i < outShapes.size(); i++)
+        {
+            if (shape(ld.outputBlobs[i]) != outShapes[i])
+            {
+                if (layerShapesIt->second.inplace)
+                {
+                    CV_Assert(ld.inputBlobs.size() == ld.outputBlobs.size());
+                    CV_Assert(ld.inputBlobs[i]->total() == total(outShapes[i]));
+                    ld.outputBlobs[i] = ld.inputBlobs[i]->reshape(1, outShapes[i]);
+                }
+                else
+                {
+                    ld.outputBlobs[i].create(outShapes[i], CV_32F);
+                }
+            }
+        }
+
+        const ShapesVec& intShapes = layerShapesIt->second.internal;
+        ld.internals.resize(intShapes.size());
+        for(int i = 0; i < intShapes.size(); i++)
+        {
+            if (shape(ld.internals[i]) != intShapes[i] && total(intShapes[i]))
+                ld.internals[i].create(intShapes[i], CV_32F);
+        }
+
+        Ptr<Layer> layerPtr = ld.getLayerInstance();
        //try
        {
-            Ptr<Layer> layerPtr = ld.getLayerInstance();
-            layerPtr->allocate(ld.inputBlobs, ld.outputBlobs);
+            layerPtr->finalize(ld.inputBlobs, ld.outputBlobs);
 #if 0
            std::cout << "\toutputs:";
            size_t noutputs = ld.outputBlobs.size();
@ -474,10 +516,20 @@ struct Net::Impl
        for (it = layers.begin(); it != layers.end(); it++)
            it->second.flag = 0;

+        CV_Assert(!layers[0].outputBlobs.empty());
+        ShapesVec inputShapes;
+        for(int i = 0; i < layers[0].outputBlobs.size(); i++)
+        {
+            CV_Assert(layers[0].outputBlobs[i].total());
+            inputShapes.push_back(shape(layers[0].outputBlobs[i]));
+        }
+        LayersShapesMap layersShapes;
+        getLayersShapes(inputShapes, layersShapes);
+
        for (it = layers.begin(); it != layers.end(); it++)
        {
            int lid = it->first;
-            allocateLayer(lid);
+            allocateLayer(lid, layersShapes);
        }
    }

@ -503,7 +555,7 @@ struct Net::Impl
        //forward itself
        //try
        {
-            ld.layerInstance->forward(ld.inputBlobs, ld.outputBlobs);
+            ld.layerInstance->forward(ld.inputBlobs, ld.outputBlobs, ld.internals);
        }
        /*catch (const cv::Exception &err)
        {
@ -522,6 +574,57 @@ struct Net::Impl
        for (it = layers.begin(); it != layers.end(); it++)
            forwardLayer(it->second, false);
    }
+
+    // Infers input, output and internal shapes for layer `id`, first resolving
+    // (recursively) every producer layer whose output shapes are not yet present
+    // in inOutShapes. Results are stored in inOutShapes[id]; the value returned by
+    // the layer's getMemoryShapes() is recorded as its `inplace` flag.
+    void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
+    {
+        const std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
+
+        // Input shapes may already be present (pre-seeded by the caller for layer 0,
+        // or left by an earlier visit); gather them from producers only when missing.
+        if (inOutShapes[id].in.empty())
+        {
+            for (size_t i = 0; i < inputLayerIds.size(); i++)
+            {
+                const int layerId = inputLayerIds[i].lid;
+                const int outputIdx = inputLayerIds[i].oid;
+                LayersShapesMap::iterator it = inOutShapes.find(layerId);
+                if (it == inOutShapes.end() || it->second.out.empty())
+                {
+                    getLayerShapesRecursively(layerId, inOutShapes);
+                }
+                const ShapesVec& producerOut = inOutShapes[layerId].out;
+                // Fail loudly instead of reading past the producer's output list.
+                CV_Assert(outputIdx >= 0 && outputIdx < (int)producerOut.size());
+                inOutShapes[id].in.push_back(producerOut[outputIdx]);
+            }
+        }
+        const ShapesVec& is = inOutShapes[id].in;
+        ShapesVec& os = inOutShapes[id].out;
+        ShapesVec& ints = inOutShapes[id].internal;
+        int requiredOutputs = (int)layers[id].requiredOutputs.size();
+        inOutShapes[id].inplace =
+                layers[id].getLayerInstance()->getMemoryShapes(is, requiredOutputs, os, ints);
+    }
+
+    // Recomputes the shapes of every layer in `layers` for the given network
+    // input shapes. inOutShapes is cleared first, then seeded with the input
+    // shapes of layer 0 before each layer is visited.
+    void getLayersShapes(const ShapesVec& netInputShapes,
+                         LayersShapesMap& inOutShapes)
+    {
+        inOutShapes.clear();
+        inOutShapes[0].in = netInputShapes; //insert shape for first input layer
+
+        MapIdToLayerData::iterator layerIt = layers.begin();
+        while (layerIt != layers.end())
+        {
+            getLayerShapesRecursively(layerIt->first, inOutShapes);
+            ++layerIt;
+        }
+    }
+
+    // Computes the shapes of a single layer (and, through the recursion, of the
+    // layers it depends on) for the given network input shapes, and copies the
+    // result for `layerId` into `shapes`.
+    void getLayerShapes(const ShapesVec& netInputShapes,
+                        const int layerId,
+                        LayerShapes& shapes)
+    {
+        LayersShapesMap inferred;
+        inferred[0].in = netInputShapes; //insert shape for first input layer
+
+        getLayerShapesRecursively(layerId, inferred);
+
+        shapes = inferred[layerId];
+    }
 };

 Net::Net() : impl(new Net::Impl)
@ -604,10 +707,10 @@ void Net::setBlob(String outputName, const Mat &blob_)

    LayerData &ld = impl->layers[pin.lid];
    ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) );
-    bool oldShape = ld.outputBlobs[pin.oid].size == blob_.size;
+    MatShape prevShape = shape(ld.outputBlobs[pin.oid]);
    ld.outputBlobs[pin.oid] = blob_.clone();

-    impl->netWasAllocated = impl->netWasAllocated && oldShape;
+    impl->netWasAllocated = impl->netWasAllocated && prevShape == shape(blob_);
 }

 Mat Net::getBlob(String outputName)
@ -714,6 +817,64 @@ std::vector<int> Net::getUnconnectedOutLayers() const
    return layersIds;
 }

+// Collects the id and the input/output shapes of every layer for the given
+// network input shapes. Each output pointer is optional: a NULL pointer is
+// skipped, and when all three are NULL nothing is computed. Requested vectors
+// are cleared and then filled in parallel, in the iteration order of the
+// id-keyed shapes map.
+void Net::getLayersShapes(const Net::Impl::ShapesVec& netInputShapes,
+                          std::vector<int>* layersIds,
+                          std::vector<Net::Impl::ShapesVec>* inLayersShapes,
+                          std::vector<Net::Impl::ShapesVec>* outLayersShapes) const
+{
+    const bool nothingRequested = !layersIds && !inLayersShapes && !outLayersShapes;
+    if (nothingRequested)
+        return;
+
+    if (layersIds)
+        layersIds->clear();
+    if (inLayersShapes)
+        inLayersShapes->clear();
+    if (outLayersShapes)
+        outLayersShapes->clear();
+
+    Impl::LayersShapesMap inOutShapes;
+    impl->getLayersShapes(netInputShapes, inOutShapes);
+
+    Impl::LayersShapesMap::const_iterator it = inOutShapes.begin();
+    for (; it != inOutShapes.end(); ++it)
+    {
+        const Impl::LayerShapes& layerShapes = it->second;
+        if (layersIds)
+            layersIds->push_back(it->first);
+        if (inLayersShapes)
+            inLayersShapes->push_back(layerShapes.in);
+        if (outLayersShapes)
+            outLayersShapes->push_back(layerShapes.out);
+    }
+}
+
+// Convenience overload for a network with a single input blob: wraps the shape
+// into a one-element vector and forwards to the multi-input overload.
+void Net::getLayersShapes(const MatShape& netInputShape,
+                          std::vector<int>* layerIds,
+                          std::vector<Net::Impl::ShapesVec>* inLayersShapes,
+                          std::vector<Net::Impl::ShapesVec>* outLayersShapes) const
+{
+    const Net::Impl::ShapesVec singleInput(1, netInputShape);
+    getLayersShapes(singleInput, layerIds, inLayersShapes, outLayersShapes);
+}
+
+// Convenience overload for a network with a single input blob: wraps the shape
+// into a one-element vector and forwards to the multi-input overload.
+void Net::getLayerShapes(const MatShape& netInputShape,
+                         const int layerId,
+                         Net::Impl::ShapesVec* inLayerShapes,
+                         Net::Impl::ShapesVec* outLayerShapes) const
+{
+    const Net::Impl::ShapesVec singleInput(1, netInputShape);
+    getLayerShapes(singleInput, layerId, inLayerShapes, outLayerShapes);
+}
+
+// Computes the input and output shapes of layer `layerId` for the given network
+// input shapes. Either output pointer may be NULL, in which case that result is
+// not returned.
+void Net::getLayerShapes(const Net::Impl::ShapesVec& netInputShapes,
+                         const int layerId,
+                         Net::Impl::ShapesVec* inLayerShapes,
+                         Net::Impl::ShapesVec* outLayerShapes) const
+{
+    Impl::LayerShapes inferred;
+    impl->getLayerShapes(netInputShapes, layerId, inferred);
+
+    if (inLayerShapes)
+    {
+        *inLayerShapes = inferred.in;
+    }
+    if (outLayerShapes)
+    {
+        *outLayerShapes = inferred.out;
+    }
+}

 //////////////////////////////////////////////////////////////////////////

@ -752,37 +913,52 @@ static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
        pv[i] = const_cast<T*>(&v[i]);
 }

-void Layer::allocate(const std::vector<Mat> &inputs, std::vector<Mat> &outputs)
+void Layer::finalize(const std::vector<Mat> &inputs, std::vector<Mat> &outputs)
 {
    std::vector<Mat*> inputsp;
    vecToPVec(inputs, inputsp);
-    this->allocate(inputsp, outputs);
+    this->finalize(inputsp, outputs);
+}
+
+// Default implementation: does nothing. Both arguments are intentionally unused.
+void Layer::finalize(const std::vector<Mat*> &input, std::vector<Mat> &output)
+{
+    (void)input;
+    (void)output;
 }

-std::vector<Mat> Layer::allocate(const std::vector<Mat> &inputs)
+std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
 {
    std::vector<Mat> outputs;
-    this->allocate(inputs, outputs);
+    this->finalize(inputs, outputs);
    return outputs;
 }

-void Layer::forward(const std::vector<Mat> &inputs, std::vector<Mat> &outputs)
+void Layer::forward(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
 {
    std::vector<Mat*> inputsp;
    vecToPVec(inputs, inputsp);
-    this->forward(inputsp, outputs);
+    this->forward(inputsp, outputs, internals);
 }

-void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs)
+void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
 {
    std::vector<Mat*> inputsp;
    vecToPVec(inputs, inputsp);
-    this->allocate(inputsp, outputs);
-    this->forward(inputsp, outputs);
+    this->finalize(inputsp, outputs);
+    this->forward(inputsp, outputs, internals);
 }

 Layer::~Layer() {}

+// Default shape inference: every output gets the shape of the first input.
+// The number of outputs is the larger of requiredOutputs and the number of
+// inputs. `internals` is left untouched. At least one input is required
+// (checked with CV_Assert). Always returns false.
+bool Layer::getMemoryShapes(const std::vector<MatShape> &inputs,
+                            const int requiredOutputs,
+                            std::vector<MatShape> &outputs,
+                            std::vector<MatShape> &internals) const
+{
+    CV_Assert(inputs.size());
+    (void)internals;
+
+    const int outputCount = std::max(requiredOutputs, (int)inputs.size());
+    outputs.assign(outputCount, inputs[0]);
+    return false;
+}
+
 //////////////////////////////////////////////////////////////////////////

 struct LayerFactory::Impl : public std::map<String, LayerFactory::Constuctor>
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@ -29,32 +29,20 @@ public:
        epsilon = params.get<float>("eps", 1E-5);
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_Assert(blobs.size() >= 2);
+        CV_Assert(inputs.size() == 1);

-        outputs.resize(inputs.size());
-        for (size_t i = 0; i < inputs.size(); i++)
-        {
-            CV_Assert(blobs[0].total() == inputs[i]->size[1]);
-            CV_Assert(blobs[1].total() == inputs[i]->size[1]);
-            Mat* inp = inputs[i];
-            outputs[i].create(inp->dims, &inp->size.p[0], inp->type());
-        }
-
-        varMeanScale = 1.f;
+        float varMeanScale = 1.f;
        if (!hasWeights && !hasBias) {
            varMeanScale = *blobs[2].ptr<float>();
            if (varMeanScale != 0)
                varMeanScale = 1/varMeanScale;
        }

+        Mat invStdMat;
        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
-    }
-
-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-    {
-        CV_Assert(inputs.size() == 1);

        Mat &inpBlob = *inputs[0];

@ -91,8 +79,7 @@ public:
    }

    bool hasWeights, hasBias;
-    float epsilon, varMeanScale;
-    Mat invStdMat;
+    float epsilon;
 };

 Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/blank_layer.cpp
+++ b/modules/dnn/src/layers/blank_layer.cpp
@ -56,7 +56,7 @@ public:
            outputs[i] = *inputs[i];
    }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        for (size_t i = 0; i < inputs.size(); i++)
            outputs[i] = *inputs[i];
--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@ -56,49 +56,50 @@ public:
        axis = params.get<int>("axis", 1);
    }

-    void allocate(const std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
+    virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                                 const int requiredOutputs,
+                                 std::vector<MatShape> &outputs,
+                                 std::vector<MatShape> &internals) const
    {
        CV_Assert(inputs.size() > 0);
-
-        int dims = inputs[0]->dims, dtype = inputs[0]->type();
-        std::vector<int> refShape(inputs[0]->size.p, inputs[0]->size.p + dims);
-        axisIdx = axis < 0 ? axis + dims : axis;
+        outputs.clear();
+        outputs.push_back(inputs[0]);
+        int cAxis = clamp(axis, inputs[0]);

        int axisSum = 0;
        for (size_t i = 0; i < inputs.size(); i++)
        {
-            CV_Assert(inputs[i]->type() == dtype);
-            for (int curAxis = 0; curAxis < dims; curAxis++)
+            MatShape curShape = inputs[i];
+
+            CV_Assert(curShape.size() == outputs.back().size());
+            for (int curAxis = 0; curAxis < outputs.back().size(); curAxis++)
            {
-                if (curAxis != axisIdx && inputs[0]->size[curAxis] != inputs[i]->size[curAxis])
+                if (curAxis != cAxis && outputs.back()[curAxis] != curShape[curAxis])
                    CV_Error(Error::StsBadSize, "Inconsitent shape for ConcatLayer");
            }

-            axisSum += inputs[i]->size[axisIdx];
+            axisSum += curShape[cAxis];
        }

-        refShape[axisIdx] = axisSum;
+        outputs.back()[cAxis] = axisSum;

-        outputs.resize(1);
-        outputs[0].create(dims, &refShape[0], dtype);
+        return false;
    }

-
-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
+        int cAxis = clamp(axis, inputs[0]->dims);
        Mat& outMat = outputs[0];
        std::vector<Range> ranges(outputs[0].dims, Range::all());

-        ranges[axisIdx].start = 0;
+        ranges[cAxis].start = 0;
        for (size_t i = 0; i < inputs.size(); i++)
        {
-            ranges[axisIdx].end = ranges[axisIdx].start + inputs[i]->size[axisIdx];
+            ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
            inputs[i]->copyTo(outMat(&ranges[0]));
-            ranges[axisIdx].start = ranges[axisIdx].end;
+            ranges[cAxis].start = ranges[cAxis].end;
        }
    }
-
-    int axisIdx;
 };

 Ptr<ConcatLayer> ConcatLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@ -43,7 +43,6 @@
 #include "layers_common.hpp"
 #include "op_im2col.hpp"
 #include "op_blas.hpp"
-#include <opencv2/dnn/shape_utils.hpp>
 #include <iostream>

 namespace cv
@ -56,13 +55,6 @@ class BaseConvolutionLayerImpl : public ConvolutionLayer
 public:
    BaseConvolutionLayerImpl()
    {
-        numOutput = -1;
-        group = -1;
-        inpH = inpW = inpCn = 0;
-        outH = outW = outCn = 0;
-        inpGroupCn = outGroupCn = 0;
-        ksize = 0;
-        bias = false;
 #ifdef HAVE_LAPACK
        int nthreads = cv::getThreadNum();
        if (getBlasThreads() != nthreads)
@ -71,11 +63,12 @@ public:
        }
 #endif
    }
-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
        CV_Assert(inputs.size() > 0);

-        init();
+        CV_Assert(blobs.size() >= 1 && blobs.size() <= 2);
+        CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);

        const Mat &input = *inputs[0];
        CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F));
@ -86,103 +79,104 @@ public:
            CV_Assert(inputs[i]->size[2] == input.size[2] && inputs[i]->size[3] == input.size[3]);
        }

-        computeInpOutShape(input);
-
-        if (bias)
-        {
-            biasOnesBlob.create(1, outH * outW, input.type());
-            biasOnesBlob.setTo(1);
-        }
-
-        outputs.resize(inputs.size());
-        for (size_t i = 0; i < inputs.size(); i++)
-        {
-            int sz[] = { inputs[i]->size[0], outCn, outH, outW };
-            outputs[i].create(4, sz, input.type());
-        }
-
-        if (!is1x1())
-        {
-            colRowBlob.create((int)colRowBlobShape.size(), &colRowBlobShape[0], input.type());
-            colRowBlob.setTo(0);
-        }
+        Size outSize = Size(outputs[0].size[3], outputs[0].size[2]);
+        getConvPoolPaddings(Size(input.size[3], input.size[2]), outSize,
+                kernel, stride, padMode, pad);
    }

-    void init()
+    // True when a second blob is present; forward() reads blobs[1] as the bias in that case.
+    bool hasBias() const
     {
-        CV_Assert(blobs.size() >= 1 && blobs.size() <= 2);
-        CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height);
-
-        bias = (blobs.size() >= 2);
+        return blobs.size() > 1;
     }
-    virtual void computeInpOutShape(const Mat &inpBlob) = 0;
+
+    virtual MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const = 0;
    bool is1x1() const
    {
        return (kernel.height == 1 && kernel.width == 1) &&
        (stride.height == 1 && stride.width == 1) &&
        (dilation.height == 1 && dilation.width == 1);
    }
-
-    int numOutput, group;
-    int inpH, inpW, inpCn;
-    int outH, outW, outCn;
-    int inpGroupCn, outGroupCn;
-    int ksize;
-    std::vector<int> colRowBlobShape;
-
-    bool bias;
-    Mat colRowBlob, biasOnesBlob;
 };

 //TODO: simultaneously convolution and bias addition for cache optimization
 class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
 {
 public:
-    void computeInpOutShape(const Mat &input)
+    // Shape of the im2row scratch buffer:
+    // (output pixels) x (per-group input channels * kernel height * kernel width).
+    // inpShape is not read here; the per-group channel count comes from the weights blob.
+    MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
+    {
+        const int outPixels = outShape[3] * outShape[2];
+        const int kernelVolume = blobs[0].size[1] * kernel.height * kernel.width;
+        return shape(outPixels, kernelVolume);
+    }
+
+
+    // Computes the output shape (N, outCn, outH, outW) for every input plus the
+    // scratch shapes used by forward(): internals[0] is the im2row buffer (left
+    // empty for 1x1 kernels) and internals[1], present only with a bias, is a
+    // 1 x (outH*outW) row that forward() fills with ones.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
     {
-        CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
+        CV_Assert(blobs.size() != 0);
+        CV_Assert(!hasBias() || blobs[1].total() == (size_t)blobs[0].size[0]);
+        CV_Assert(inputs.size() != 0);
+
+        internals.clear();

-        numOutput = blobs[0].size[0];
+        int inpCn = inputs[0][1];
+        int inpH = inputs[0][2];
+        int inpW = inputs[0][3];

-        inpH = input.size[2];
-        inpW = input.size[3];
-        inpCn = input.size[1];
-        outCn = numOutput;
+        int outCn = blobs[0].size[0];
+        Size out;

         if (padMode.empty())
         {
-            outH = (inpH + 2 * pad.height - (dilation.height * (kernel.height - 1) + 1)) / stride.height + 1;
-            outW = (inpW + 2 * pad.width - (dilation.width * (kernel.width - 1) + 1)) / stride.width + 1;
+            out.height = (inpH + 2 * pad.height - (dilation.height * (kernel.height - 1) + 1)) / stride.height + 1;
+            out.width = (inpW + 2 * pad.width - (dilation.width * (kernel.width - 1) + 1)) / stride.width + 1;
         }
         else
         {
-            getConvPoolOutParams(inpH, inpW, kernel, stride, pad, padMode, outH, outW);
+            // Size is constructed as (width, height); passing (inpH, inpW) would
+            // transpose the result for non-square inputs.
+            getConvPoolOutParams(Size(inpW, inpH), kernel, stride, padMode, out);
         }

-        group = inpCn / blobs[0].size[1];
+        int group = inpCn / blobs[0].size[1];

         CV_Assert(inpCn % group == 0 && outCn % group == 0);
-        CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);
+        CV_Assert(blobs[0].size[0] == outCn);
+
+        int dims[] = {inputs[0][0], outCn, out.height, out.width};
+        outputs.resize(inputs.size(), shape(dims));
+
+        // Slot 0 is always reserved so that the bias row, when present, sits at index 1.
+        internals.push_back(MatShape());
+        if (!is1x1())
+            internals[0] = computeColRowShape(inputs[0], outputs[0]);

-        outGroupCn = outCn / group;
-        inpGroupCn = inpCn / group;
-        ksize = inpGroupCn * kernel.height * kernel.width;
+        if (hasBias())
+            internals.push_back(shape(1, out.area()));

-        colRowBlobShape.clear();
-        colRowBlobShape.push_back(outH*outW);
-        colRowBlobShape.push_back(ksize);
+        return false;
     }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_Assert(inputs.size() > 0);

+        internals[0].setTo(0);
+
+        if (hasBias())
+            internals[1].setTo(1);
+
+        int outCn = blobs[0].size[0];
+        int inpCn = inputs[0]->size[1];
+        int inpGroupCn = blobs[0].size[1];
+
        Mat weightsMat = blobs[0].reshape(1, outCn);
-        Mat biasesMat  = bias ? blobs[1].reshape(1, outCn) : Mat();
+        Mat biasesMat  = hasBias() ? blobs[1].reshape(1, outCn) : Mat();

        for (size_t ii = 0; ii < outputs.size(); ii++)
        {
            int numImg = inputs[ii]->size[0];
+            int group = inpCn / blobs[0].size[1];
+            int outGroupCn = outCn / group;
            Mat inpMat = *inputs[ii];
            Mat outMat = outputs[ii].reshape(1, numImg*group*outGroupCn);

@ -192,7 +186,7 @@ public:
                {
                    Mat curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn));

-                    im2row(curInp, colRowBlob);
+                    im2row(curInp, internals[0], shape(inpMat), shape(outputs[ii]));

                    _Range kerRange(g * outGroupCn, outGroupCn);
                    Mat kerMat = weightsMat.rowRange(kerRange);
@ -200,19 +194,25 @@ public:
                    _Range outRange((g + n * group) * outGroupCn, outGroupCn);
                    Mat dstMat = outMat.rowRange(outRange);

-                    dnn::gemm(kerMat, colRowBlob, 1, dstMat, 0, GEMM_2_T);
+                    dnn::gemm(kerMat, internals[0], 1, dstMat, 0, GEMM_2_T);

-                    if (bias)
+                    if (hasBias())
                    {
-                        dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob, 1, dstMat, 1);
+                        dnn::gemm(biasesMat.rowRange(kerRange), internals[1], 1, dstMat, 1);
                    }
                }
            }
        }
    }

-    void im2row(const  Mat &srcImg, Mat &dstRow)
+    void im2row(const  Mat &srcImg, Mat &dstRow, const MatShape& inShape, const MatShape& outShape)
    {
+        int inpH = inShape[2];
+        int inpW = inShape[3];
+        int outH = outShape[2], outW = outShape[3];
+        int inpGroupCn = blobs[0].size[1];
+        int ksize = inpGroupCn * kernel.height * kernel.width;
+
        if (is1x1())
        {
            transpose(srcImg.reshape(1, ksize), dstRow);
@ -229,52 +229,71 @@ public:
 class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl
 {
 public:
-    void computeInpOutShape(const Mat &inpBlob)
+    // Shape of the column scratch buffer handed to col2im:
+    // (per-group output channels * kernel height * kernel width) x (input pixels).
+    MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
     {
-        CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]);
+        // The group count is implied by the weights blob: input channels / channels per group.
+        const int groups = inpShape[1] / blobs[0].size[1];
+        const int outChannelsPerGroup = outShape[1] / groups;
+        const int rows = outChannelsPerGroup * kernel.height * kernel.width;
+        const int cols = inpShape[2] * inpShape[3];
+        return shape(rows, cols);
+    }

-        numOutput = blobs[0].size[0];
+    // Computes the deconvolution output shape (N, outCn, outH, outW) for every
+    // input plus the scratch shapes used by forward(): internals[0] is the column
+    // buffer (left empty for 1x1 kernels) and internals[1], present only with a
+    // bias, is a 1 x (outH*outW) row that forward() fills with ones.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
+    {
+        // blobs[0] is read unconditionally below, so it must exist
+        // (same guard as the convolution layer).
+        CV_Assert(blobs.size() != 0);
+        CV_Assert(!hasBias() || blobs[1].total() == (size_t)blobs[0].size[0]);
+        CV_Assert(inputs.size() != 0);
+
+        // Start from an empty list so that internals[0] / internals[1] below
+        // are the entries pushed by this call.
+        internals.clear();

-        inpH = inpBlob.size[2];
-        inpW = inpBlob.size[3];
-        inpCn = inpBlob.size[1];
+        int inpCn = inputs[0][1];
+        int inpH = inputs[0][2];
+        int inpW = inputs[0][3];

-        outH = stride.height * (inpH - 1) + kernel.height - 2 * pad.height + adjustPad.height;
-        outW = stride.width * (inpW - 1) + kernel.width - 2 * pad.width + adjustPad.width;
-        outCn = numOutput;
+        int outH = stride.height * (inpH - 1) + kernel.height - 2 * pad.height + adjustPad.height;
+        int outW = stride.width * (inpW - 1) + kernel.width - 2 * pad.width + adjustPad.width;
+        int outCn = blobs[0].size[0];

-        group = inpCn / blobs[0].size[1];
-        outGroupCn = outCn / group;
-        inpGroupCn = inpCn / group;
-        ksize = outGroupCn * kernel.height * kernel.width;
+        int group = inpCn / blobs[0].size[1];

         CV_Assert(inpCn % group == 0 && outCn % group == 0);
         CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group);

-        colRowBlobShape.clear();
-        colRowBlobShape.push_back(ksize);
-        colRowBlobShape.push_back(inpH * inpW);
+        int dims[] = {inputs[0][0], outCn, outH, outW};
+        outputs.resize(inputs.size(), shape(dims));

-        ofsbuf.resize(ksize*3);
-        for( int k = 0; k < ksize; k++ )
-        {
-            int w_offset = k % kernel.width;
-            int h_offset = (k / kernel.width) % kernel.height;
-            int c_im = k / kernel.height / kernel.width;
-            ofsbuf[k*3] = w_offset;
-            ofsbuf[k*3+1] = h_offset;
-            ofsbuf[k*3+2] = c_im;
-        }
+        // Slot 0 is always reserved so that the bias row, when present, sits at index 1.
+        internals.push_back(MatShape());
+        if (!is1x1())
+            internals[0] = computeColRowShape(inputs[0], outputs[0]);
+
+        if (hasBias())
+            internals.push_back(shape(1, outH*outW));
+
+        return false;
     }

-    void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
+
+    void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
+        internals[0].setTo(0);
+        if (hasBias())
+            internals[1].setTo(1);
+
+        int outCn = blobs[0].size[0];
+        int inpCn = inputs[0]->size[1];
        Mat weightsMat = blobs[0].reshape(1, inpCn);
-        Mat biasesMat  = bias ? blobs[1].reshape(1, outCn) : Mat();
+        Mat biasesMat  = hasBias() ? blobs[1].reshape(1, outCn) : Mat();

        for (size_t ii = 0; ii < outputs.size(); ii++)
        {
+            int group = inpCn / blobs[0].size[1];
+            int inpGroupCn = blobs[0].size[1];
+            int outGroupCn = outCn / group;
            int numImg = inputs[ii]->size[0];
+
            Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn);
            Mat decnBlob = outputs[ii].reshape(1, numImg*outCn);

@ -283,7 +302,7 @@ public:
                for (int g = 0; g < group; g++)
                {
                    Mat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn));
-                    Mat &colMat = (is1x1()) ? dstMat : colRowBlob;
+                    Mat &colMat = (is1x1()) ? dstMat : internals[0];

                    Mat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn));
                    Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn));
@ -291,20 +310,25 @@ public:
                    dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T);

                    if (!is1x1())
-                        col2im(colMat, dstMat);
+                        col2im(colMat, dstMat, shape(*inputs[ii]), shape(outputs[ii]));

-                    if (bias)
+                    if (hasBias())
                    {
                        Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn));
-                        dnn::gemm(curBiasMat, biasOnesBlob, 1, dstMat, 1);
+                        dnn::gemm(curBiasMat, internals[1], 1, dstMat, 1);
                    }
                }
            }
        }
    }

-    void col2im(const Mat &colMat, Mat &dstImg)
+    void col2im(const Mat &colMat, Mat &dstImg, const MatShape& inShape, const MatShape& outShape)
    {
+        int outCn = outShape[1], outH = outShape[2], outW = outShape[3];
+        int inpCn = inShape[1];
+        int group = inpCn / blobs[0].size[1];
+        int outGroupCn = outCn / group;
+
        if (is1x1())
        {
            dstImg = colMat;
--- a/modules/dnn/src/layers/crop_layer.cpp
+++ b/modules/dnn/src/layers/crop_layer.cpp
@ -63,7 +63,26 @@ public:
        }
    }

-    void allocate(const std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
+    // Output shape: inputs[0] with every dimension from the start axis onward
+    // replaced by the matching dimension of inputs[1]. Produces a single output.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
+    {
+        CV_Assert(inputs.size() == 2);
+
+        MatShape dstShape = inputs[0];
+        // The loop below indexes inputs[1] with dstShape's indices, so the
+        // reference shape must be at least as long.
+        CV_Assert(inputs[1].size() >= dstShape.size());
+
+        int start = clamp(startAxis, dstShape);
+        for (size_t i = (size_t)start; i < dstShape.size(); i++)
+        {
+            dstShape[i] = inputs[1][i];
+        }
+
+        outputs.resize(1, dstShape);
+
+        return false;
+    }
+
+
+    void finalize(const std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
    {
        CV_Assert(2 == inputs.size());

@ -71,7 +90,7 @@ public:
        const Mat &inpSzBlob = *inputs[1];

        int dims = inpBlob.dims;
-        int start_axis = startAxis < 0 ? startAxis + dims : startAxis;
+        int start_axis = clamp(startAxis, dims);

        std::vector<int> offset_final(dims, 0);
        if (offset.size() == 1)
@ -82,17 +101,16 @@ public:
        else if (offset.size() > 1)
        {
            if ((int)offset.size() != dims - start_axis)
-                CV_Error(Error::StsBadArg, "number of offset values specified must be equal to the number of dimensions following axis.");
+                CV_Error(Error::StsBadArg, "number of offset values specified must be "
+                                           "equal to the number of dimensions following axis.");

            for (int i = start_axis; i < dims; i++)
                offset_final[i] = offset[i - start_axis];
        }

-        std::vector<int> dstShape(dims);
        crop_ranges.resize(dims, Range::all());
        for (int i = 0; i < dims; i++)
        {
-            dstShape[i] = inpSzBlob.size[i];
            if( i < start_axis )
                continue;

@ -112,12 +130,9 @@ public:
                crop_ranges[i] = Range(cur_crop, cur_crop + inpSzBlob.size[i]);
            }
        }
-
-        outputs.resize(1);
-        outputs[0].create(dims, &dstShape[0], inpBlob.type());
    }

-    void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        Mat &input = *inputs[0];
        Mat &output = outputs[0];
--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
@ -94,9 +94,6 @@ public:
    int _keepTopK;
    float _confidenceThreshold;

-    int _num;
-    int _numPriors;
-
    float _nmsThreshold;
    int _topK;

@ -184,58 +181,62 @@ public:
        }
    }

-    void allocate(const std::vector<Mat*> &inputs,
-                                        std::vector<Mat> &outputs)
+    // Validates the location / confidence / prior input shapes against each other
+    // and reports a fixed 1x1x1x7 output shape.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
     {
-        CV_Assert(inputs.size() > 0);
-        CV_Assert(inputs[0]->size[0] == inputs[1]->size[0]);
-        _num = inputs[0]->size[0];
+        // inputs[0], inputs[1] and inputs[2] are all read below.
+        CV_Assert(inputs.size() >= 3);
+        CV_Assert(inputs[0][0] == inputs[1][0]);

-        _numPriors = inputs[2]->size[2] / 4;
-        CV_Assert((_numPriors * _numLocClasses * 4) == inputs[0]->size[1]);
-        CV_Assert(int(_numPriors * _numClasses) == inputs[1]->size[1]);
+        int numPriors = inputs[2][2] / 4;
+        CV_Assert((numPriors * _numLocClasses * 4) == inputs[0][1]);
+        CV_Assert(int(numPriors * _numClasses) == inputs[1][1]);

         // num() and channels() are 1.
         // Since the number of bboxes to be kept is unknown before nms, we manually
         // set it to (fake) 1.
         // Each row is a 7 dimension std::vector, which stores
         // [image_id, label, confidence, xmin, ymin, xmax, ymax]
-        int outputShape[] = {1, 1, 1, 7};
-        outputs[0].create(4, outputShape, CV_32F);
+        outputs.resize(1, shape(1, 1, 1, 7));
+
+        return false;
     }

-    void forward(std::vector<Mat*> &inputs,
-                                       std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        const float* locationData = inputs[0]->ptr<float>();
        const float* confidenceData = inputs[1]->ptr<float>();
        const float* priorData = inputs[2]->ptr<float>();

+        int num = inputs[0]->size[0];
+        int numPriors = inputs[2]->size[2] / 4;
+
        // Retrieve all location predictions.
        std::vector<LabelBBox> allLocationPredictions;
-        GetLocPredictions(locationData, _num, _numPriors, _numLocClasses,
+        GetLocPredictions(locationData, num, numPriors, _numLocClasses,
                          _shareLocation, &allLocationPredictions);

        // Retrieve all confidences.
        std::vector<std::map<int, std::vector<float> > > allConfidenceScores;
-        GetConfidenceScores(confidenceData, _num, _numPriors, _numClasses,
+        GetConfidenceScores(confidenceData, num, numPriors, _numClasses,
                            &allConfidenceScores);

        // Retrieve all prior bboxes. It is same within a batch since we assume all
        // images in a batch are of same dimension.
        std::vector<caffe::NormalizedBBox> priorBBoxes;
        std::vector<std::vector<float> > priorVariances;
-        GetPriorBBoxes(priorData, _numPriors, &priorBBoxes, &priorVariances);
+        GetPriorBBoxes(priorData, numPriors, &priorBBoxes, &priorVariances);

        // Decode all loc predictions to bboxes.
        std::vector<LabelBBox> allDecodedBBoxes;
-        DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, _num,
+        DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, num,
                        _shareLocation, _numLocClasses, _backgroundLabelId,
                        _codeType, _varianceEncodedInTarget, &allDecodedBBoxes);

        int numKept = 0;
        std::vector<std::map<int, std::vector<int> > > allIndices;
-        for (int i = 0; i < _num; ++i)
+        for (int i = 0; i < num; ++i)
        {
            const LabelBBox& decodeBBoxes = allDecodedBBoxes[i];
            const std::map<int, std::vector<float> >& confidenceScores =
@ -324,7 +325,7 @@ public:
        float* outputsData = outputs[0].ptr<float>();

        int count = 0;
-        for (int i = 0; i < _num; ++i)
+        for (int i = 0; i < num; ++i)
        {
            const std::map<int, std::vector<float> >& confidenceScores =
            allConfidenceScores[i];
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@ -36,16 +36,16 @@ public:

    ElementWiseLayer(bool run_parallel_=false, const Func &f=Func()) : func(f), run_parallel(run_parallel_) {}

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    // Delegates shape inference to Layer::getMemoryShapes and discards its result.
+    // NOTE(review): returning true presumably marks the layer as able to run in place
+    // (the removed allocate() aliased outputs[i] to *inputs[i]) - confirm against the base class.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
     {
-        outputs.resize(inputs.size());
-        for (size_t i = 0; i < inputs.size(); i++)
-        {
-            outputs[i] = *inputs[i];
-        }
+        (void)Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
+        const bool result = true;
+        return result;
     }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        for (size_t i = 0; i < inputs.size(); i++)
        {
@ -169,20 +169,16 @@ public:
        setParamsFrom(params);
    }

-    ////////////////////////////////////////////////////////////////////////////
-
-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    // Delegates shape inference to Layer::getMemoryShapes and discards its result.
+    // NOTE(review): the meaning of the `true` return value is not visible in this
+    // file - confirm against the base class.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
     {
-        CV_Assert(blobs.size() == 1);
-
-        outputs.resize(inputs.size());
-        for (size_t i = 0; i < inputs.size(); i++)
-        {
-            outputs[i].create(inputs[i]->dims, inputs[i]->size.p, inputs[i]->type());
-        }
+        (void)Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
+        const bool result = true;
+        return result;
     }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_Assert(inputs.size() == 1);
        Mat &inpBlob = *inputs[0];
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@ -41,7 +41,6 @@

 #include "../precomp.hpp"
 #include "layers_common.hpp"
-
 namespace cv
 {
 namespace dnn
@ -82,21 +81,26 @@ public:
        }
    }

-    void allocate(const std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
+    // Requires at least two inputs of identical shape (and coefficients only for
+    // SUM, one per input when given); the single output takes the shape of inputs[0].
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
     {
-        CV_Assert(2 <= inputs.size());
+        CV_Assert(inputs.size() >= 2);
         CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
         CV_Assert(op == SUM || coeffs.size() == 0);

-        for (size_t i = 1; i < inputs.size(); ++i)
+        // size_t index: avoids a signed/unsigned comparison against inputs.size().
+        for (size_t i = 1; i < inputs.size(); i++)
         {
-            CV_Assert(inputs[i]->size == inputs[0]->size);
+            CV_Assert(inputs[0] == inputs[i]);
         }
-        outputs.resize(1);
-        outputs[0].create(inputs[0]->dims, inputs[0]->size.p, inputs[0]->type());
+
+        outputs.assign(1, inputs[0]);
+
+        return false;
     }

-    void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        Mat& output = outputs[0];
        switch (op)
--- a/modules/dnn/src/layers/flatten_layer.cpp
+++ b/modules/dnn/src/layers/flatten_layer.cpp
@ -43,6 +43,7 @@
 #include "layers_common.hpp"
 #include <float.h>
 #include <algorithm>
+#include <opencv2/dnn/shape_utils.hpp>

 namespace cv
 {
@ -59,56 +60,60 @@ public:
        setParamsFrom(params);
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    // Collapses axes [startAxis, endAxis] (inclusive) of the input shape into one
+    // dimension. All inputs must share the same shape; every output gets the
+    // flattened shape.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
     {
-        size_t i, ninputs = inputs.size();
-        CV_Assert(ninputs > 0);
-        const Mat& inp0 = *inputs[0];
+        CV_Assert(inputs.size() > 0);

-        for (i = 1; i < ninputs; i++)
+        int numAxes = inputs[0].size();
+        int startAxis = clamp(_startAxis, numAxes);
+        int endAxis = clamp(_endAxis, numAxes);
+
+        for (size_t i = 1; i < inputs.size(); i++)
         {
-            CV_Assert(inputs[i]->size == inp0.size);
+            CV_Assert(inputs[i] == inputs[0]);
         }

-        _numAxes = inp0.dims;
-        _endAxis = _endAxis < 0 ? _endAxis + _numAxes : _endAxis;
-        CV_Assert(_startAxis >= 0);
-        CV_Assert(_endAxis >= _startAxis && _endAxis < (int)_numAxes);

-        size_t flattenedDimensionSize = inp0.total(_startAxis, _endAxis+1);
+        CV_Assert(startAxis >= 0);
+        CV_Assert(endAxis >= startAxis && endAxis < (int)numAxes);
+
+        // The end argument is exclusive (the replaced call was
+        // inp0.total(_startAxis, _endAxis+1)), and the tail loop below resumes
+        // at endAxis + 1, so endAxis itself belongs to the flattened extent.
+        size_t flattenedDimensionSize = total(inputs[0], startAxis, endAxis + 1);

-        resultShape.clear();
-        for (int j = 0; j < _startAxis; j++)
+        MatShape outputShapeVec;
+        for (int i = 0; i < startAxis; i++)
         {
-            resultShape.push_back(inp0.size[j]);
+            outputShapeVec.push_back(inputs[0][i]);
         }
-        resultShape.push_back(flattenedDimensionSize);
-        for (int j = _endAxis + 1; j < _numAxes; j++)
+        outputShapeVec.push_back(flattenedDimensionSize);
+        for (int i = endAxis + 1; i < numAxes; i++)
         {
-            resultShape.push_back(inp0.size[j]);
+            outputShapeVec.push_back(inputs[0][i]);
         }
-        CV_Assert(resultShape.size() <= 4);
+        CV_Assert(outputShapeVec.size() <= 4);

-        for (i = 0; i < ninputs; i++)
-        {
-            //in-place
-            outputs[i] = inputs[i]->reshape(1, (int)resultShape.size(), &resultShape[0]);
-        }
+        outputs.resize(inputs.size(), outputShapeVec);
+
+        return true;
     }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    // Replaces each output with its input reshaped to the shape the output currently has.
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         for (size_t i = 0; i < inputs.size(); i++)
         {
-            outputs[i] = inputs[i]->reshape(1, (int)resultShape.size(), &resultShape[0]);
+            // Capture the target shape first: the assignment below overwrites outputs[i].
+            const MatShape targetShape = shape(outputs[i]);
+            const int targetDims = (int)targetShape.size();
+            outputs[i] = inputs[i]->reshape(1, targetDims, &targetShape[0]);
         }
     }

    int _startAxis;
    int _endAxis;
-    size_t _numAxes;
-
-    std::vector<int> resultShape;
 };

 Ptr<FlattenLayer> FlattenLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@ -57,8 +57,8 @@ public:
        setParamsFrom(params);
        CV_Assert(1 <= blobs.size() && blobs.size() <= 2);

-        numOutput = params.get<int>("num_output");
-        innerSize = (int)blobs[0].total() / numOutput;
+        int numOutput = params.get<int>("num_output");
+        int innerSize = (int)blobs[0].total() / numOutput;
        bias = params.get<bool>("bias_term", true);
        axis = params.get<int>("axis", 1);

@ -70,43 +70,39 @@ public:
            blobs[1] = blobs[1].reshape(1, 1);
    }

-    void allocate(const std::vector<Mat*> &input, std::vector<Mat> &output)
+    // Every output is (outerSize x numOutput), where outerSize is the product of
+    // the input dimensions before the resolved axis and numOutput is the number
+    // of weight rows. internals gets one (outerSize x 1) shape, which forward()
+    // fills with ones.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
     {
-        CV_Assert(input.size() > 0);
-        const Mat& inp0 = *input[0];
-
+        CV_Assert(inputs.size() > 0);
         CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
         CV_Assert(blobs[0].dims == 2);

-        bias = (blobs.size() >= 1);
-        axisCan = axis < 0 ? axis + inp0.dims : axis;
-        dtype = inp0.type();
-        numOutput = blobs[0].size[0];
-        innerSize = blobs[0].size[1];
-        outerSize = inp0.total(0, axisCan);
-        size_t innerSize0 = inp0.total(axisCan);
+        int cAxis = clamp(axis, inputs[0]);
+        int outerSize = total(inputs[0], 0, cAxis);
+        int numOutput = blobs[0].size[0];
+        outputs.resize(inputs.size(), shape(outerSize, numOutput));

-        CV_Assert((size_t)innerSize == innerSize0);
-        CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
+        internals.push_back(shape(outerSize, 1));

-        biasOnesBlob.create(outerSize, 1, dtype);
-        biasOnesBlob.setTo(1.);
+        // `bias` comes from the layer params, so blobs[1] may be absent even when
+        // it is set; check that it exists before reading it.
+        CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));

-        output.resize(input.size());
-        for (size_t i = 0; i < input.size(); i++)
-        {
-            CV_Assert(i == 0 || (input[i]->size == input[0]->size && input[i]->type() == dtype));
-            output[i].create(outerSize, numOutput, dtype);
-        }
+        return false;
     }

-    void forward(std::vector<Mat*> &input, std::vector<Mat> &output)
+    void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
    {
+        internals[0].setTo(1.);
        const Mat &weight = blobs[0];
        const Mat *biasMat = NULL, *biasOnesMat = NULL;
+
+        int axisCan = clamp(axis, input[0]->dims);
+        int outerSize = input[0]->total(0, axisCan);
+
        if (bias)
        {
-            biasOnesMat = &biasOnesBlob;
+            biasOnesMat = &internals[0];
            biasMat = &blobs[1];
        }

@ -121,10 +117,7 @@ public:
        }
    }

-    int axisCan, dtype;
-    int numOutput, innerSize, outerSize;
    bool bias;
-    Mat biasOnesBlob;
 };

 Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/layers_common.cpp
+++ b/modules/dnn/src/layers/layers_common.cpp
@ -163,25 +163,19 @@ void getConvolutionKernelParams(const LayerParams &params, int &kernelH, int &ke
 // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
 // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
 // we pad more on the right and bottom than on the top and left.
-void getConvPoolOutParams(const int inputH, const int inputW, const cv::Size &kernel,
-                          const cv::Size &stride, cv::Size& pad, const cv::String &padMode,
-                          int &outH, int &outW)
+void getConvPoolOutParams(const Size& inp, const Size &kernel,
+                          const Size &stride, const String &padMode,
+                          Size& out)
 {
    if (padMode == "VALID")
    {
-        outH = (inputH - kernel.height + stride.height) / stride.height;
-        outW = (inputW - kernel.width + stride.width) / stride.width;
-        pad = cv::Size(0,0);
+        out.height = (inp.height - kernel.height + stride.height) / stride.height;
+        out.width = (inp.width- kernel.width + stride.width) / stride.width;
    }
    else if (padMode == "SAME")
    {
-        outH = (inputH - 1 + stride.height) / stride.height;
-        outW = (inputW - 1 + stride.width) / stride.width;
-        int Ph = std::max(0, (outH - 1) * stride.height + kernel.height - inputH);
-        int Pw = std::max(0, (outW - 1) * stride.width + kernel.width - inputW);
-        // For odd values of total padding, add more padding at the 'right'
-        // side of the given dimension.
-        pad = cv::Size(Pw / 2, Ph / 2);
+        out.height = (inp.height - 1 + stride.height) / stride.height;
+        out.width = (inp.width - 1 + stride.width) / stride.width;
    }
    else
    {
@ -189,5 +183,23 @@ void getConvPoolOutParams(const int inputH, const int inputW, const cv::Size &ke
    }
 }

+void getConvPoolPaddings(const Size& inp, const Size& out,
+                         const Size &kernel, const Size &stride,
+                         const String &padMode, Size &pad)
+{ // Writes pad for padMode "VALID" or "SAME"; for any other padMode, pad is left unchanged.
+    if (padMode == "VALID")
+    {
+        pad = cv::Size(0,0);
+    }
+    else if (padMode == "SAME")
+    {
+        int Ph = std::max(0, (out.height - 1) * stride.height + kernel.height - inp.height); // total vertical padding, never negative
+        int Pw = std::max(0, (out.width - 1) * stride.width + kernel.width - inp.width); // total horizontal padding, never negative
+        // For odd values of total padding, add more padding at the 'right'
+        // side of the given dimension.
+        pad = cv::Size(Pw / 2, Ph / 2); // stores the integer half of each total (Pw -> width, Ph -> height)
+    }
+}
+
 }
 }
--- a/modules/dnn/src/layers/layers_common.hpp
+++ b/modules/dnn/src/layers/layers_common.hpp
@ -44,6 +44,7 @@
 #include <opencv2/dnn.hpp>
 #include "op_blas.hpp"
 #include "op_im2col.hpp"
+#include <opencv2/dnn/shape_utils.hpp>

 namespace cv
 {
@ -56,10 +57,13 @@ void getConvolutionKernelParams(const LayerParams &params, int &kernelH, int &ke
 void getPoolingKernelParams(const LayerParams &params, int &kernelH, int &kernelW, bool &globalPooling,
                            int &padH, int &padW, int &strideH, int &strideW, cv::String& padMode);

-void getConvPoolOutParams(const int inputH, const int inputW, const cv::Size& kernel,
-                          const cv::Size& stride, cv::Size &pad, const cv::String& padMode,
-                          int &outH, int &outW);
+void getConvPoolOutParams(const Size& inp, const Size &kernel,
+                          const Size &stride, const String &padMode,
+                          Size& out);

+void getConvPoolPaddings(const Size& inp, const Size& out,
+                         const Size &kernel, const Size &stride,
+                         const String &padMode, Size &pad);
 }
 }

--- a/modules/dnn/src/layers/lrn_layer.cpp
+++ b/modules/dnn/src/layers/lrn_layer.cpp
@ -75,36 +75,28 @@ public:
        normBySize = params.get<bool>("norm_by_size", true);
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
-        CV_Assert(inputs.size() == 1 && inputs[0]->dims == 4);
-        CV_Assert(type == CHANNEL_NRM || type == SPATIAL_NRM);
-
-        const Mat& inp0 = *inputs[0];
-
-        if (type == SPATIAL_NRM)
-            buf.create(inp0.size[2], inp0.size[3], inp0.type());
-
-        outputs.resize(1);
-        outputs[0].create(inp0.dims, inp0.size.p, inp0.type());
-    }
+        CV_Assert(inputs.size() == outputs.size());
+        for (int i = 0; i < inputs.size(); i++)
+        {
+            CV_Assert(inputs[i]->dims == 4);

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-    {
-        Mat &src = *inputs[0];
-        Mat &dst = outputs[0];
+            Mat &src = *inputs[i];
+            Mat &dst = outputs[i];

-        switch (type)
-        {
-            case CHANNEL_NRM:
-                channelNormalization(src, dst);
-                break;
-            case SPATIAL_NRM:
-                spatialNormalization(src, dst);
-                break;
-            default:
-                CV_Error(Error::StsNotImplemented, "Unimplemented mode of LRN layer");
-                break;
+            switch (type)
+            {
+                case CHANNEL_NRM:
+                    channelNormalization(src, dst);
+                    break;
+                case SPATIAL_NRM:
+                    spatialNormalization(src, dst);
+                    break;
+                default:
+                    CV_Error(Error::StsNotImplemented, "Unimplemented mode of LRN layer");
+                    break;
+            }
        }
    }

@ -179,8 +171,6 @@ public:
            }
        }
    }
-
-    Mat buf;
 };

 Ptr<LRNLayer> LRNLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/max_unpooling_layer.cpp
+++ b/modules/dnn/src/layers/max_unpooling_layer.cpp
@ -29,22 +29,25 @@ public:
        poolStride = Size(params.get<int>("pool_stride_w"), params.get<int>("pool_stride_h"));
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    // Output shape of max-unpooling: inputs[0]'s shape with axes 2 (height) and 3 (width) expanded.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs, const int requiredOutputs,
+                         std::vector<MatShape> &outputs, std::vector<MatShape> &internals) const
    {
        CV_Assert(inputs.size() == 2);
-        const Mat& inp0 = *inputs[0];
-        CV_Assert(inp0.total() == inputs[1]->total());
-        CV_Assert(inp0.dims == 4);
+        CV_Assert(total(inputs[0]) == total(inputs[1]));
+        CV_Assert(inputs[0].size() == 4); // axes 2 and 3 are indexed below

-        int outShape[] = { inp0.size[0], inp0.size[1], inp0.size[2], inp0.size[3] };
+        MatShape outShape = inputs[0];
        outShape[2] = (outShape[2] - 1) * poolStride.height + poolKernel.height - 2 * poolPad.height;
        outShape[3] = (outShape[3] - 1) * poolStride.width + poolKernel.width - 2 * poolPad.width;

-        outputs.resize(1);
-        outputs[0].create(4, outShape, inp0.type());
+        outputs.clear();
+        outputs.push_back(outShape);
+
+        return false;
    }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_Assert(inputs.size() == 2);
        Mat& input = *inputs[0];
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@ -59,18 +59,7 @@ public:
        eps = params.get<double>("eps", 1e-9);
    }

-    void allocate(const std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
-    {
-        outputs.resize(inputs.size());
-        for (size_t i = 0; i < inputs.size(); i++)
-        {
-            int dims = inputs[i]->dims;
-            CV_Assert(!acrossChannels || dims >= 2);
-            outputs[i].create(dims, inputs[i]->size.p, inputs[i]->type());
-        }
-    }
-
-    void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat *> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
        {
--- a/modules/dnn/src/layers/normalize_bbox_layer.cpp
+++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp
@ -51,31 +51,17 @@ namespace cv
 namespace dnn
 {

-class NormalizeBBoxLayerImpl : public NormalizeBBoxLayer
+namespace
 {
-public:
-    Mat _buffer;
-
-    Mat _sumChannelMultiplier;
-    Mat _sumSpatialMultiplier;
-
-    Mat _scale;
+    const std::string layerName = "NormalizeBBox";
+}

+class NormalizeBBoxLayerImpl : public NormalizeBBoxLayer
+{
    float _eps;
    bool _across_spatial;
    bool _channel_shared;
-
-    size_t _num;
-    size_t _channels;
-    size_t _rows;
-    size_t _cols;
-
-    size_t _channelSize;
-    size_t _imageSize;
-
-    static const size_t _numAxes = 4;
-    static const std::string _layerName;
-
+public:
    bool getParameterDict(const LayerParams &params,
                          const std::string &parameterName,
                          DictValue& result)
@ -102,7 +88,7 @@ public:
        {
            if(required)
            {
-                std::string message = _layerName;
+                std::string message = layerName;
                message += " layer parameter does not contain ";
                message += parameterName;
                message += " parameter.";
@ -127,60 +113,63 @@ public:
    void checkInputs(const std::vector<Mat*> &inputs)
    {
        CV_Assert(inputs.size() > 0);
+        CV_Assert(inputs[0]->dims == 4 && inputs[0]->type() == CV_32F);
        for (size_t i = 1; i < inputs.size(); i++)
        {
+            CV_Assert(inputs[i]->dims == 4 && inputs[i]->type() == CV_32F);
            CV_Assert(inputs[i]->size == inputs[0]->size);
        }
        CV_Assert(inputs[0]->dims > 2);
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const // adds three scratch-buffer shapes to internals
    {
-        checkInputs(inputs);
-
-        const Mat& inp0 = *inputs[0];
-        CV_Assert(inp0.dims == 4 && inp0.type() == CV_32F);
+        bool inplace = Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); // base-class result is returned unchanged below
+        size_t channels = inputs[0][1];
+        size_t rows = inputs[0][2];
+        size_t cols = inputs[0][3];
+        size_t channelSize = rows * cols;

-        _num = inp0.size[0];
-        _channels = inp0.size[1];
-        _rows = inp0.size[2];
-        _cols = inp0.size[3];
+        internals.assign(1, shape(channels, channelSize)); // internals[0]: read as `buffer` in forward()
+        internals.push_back(shape(channels, 1)); // internals[1]: read as `sumChannelMultiplier` in forward()
+        internals.push_back(shape(1, channelSize)); // internals[2]: read as `sumSpatialMultiplier` in forward()

-        _channelSize = _rows * _cols;
-        _imageSize = _channelSize * _channels;
+        return inplace;
+    }

-        _buffer = Mat(_channels, _channelSize, CV_32F);
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+    {
+        checkInputs(inputs);

-        _sumChannelMultiplier = Mat(_channels, 1, CV_32F, Scalar(1.0));
-        _sumSpatialMultiplier = Mat(1, _channelSize, CV_32F, Scalar(1.0));
+        Mat& buffer = internals[0], sumChannelMultiplier = internals[1],
+                sumSpatialMultiplier = internals[2];

-        _scale = blobs[0];
-        size_t i, ninputs = inputs.size();
-        outputs.resize(ninputs);
+        sumChannelMultiplier.setTo(1.0);
+        sumSpatialMultiplier.setTo(1.0);

-        for(i = 0; i < ninputs; i++)
-        {
-            outputs[i].create(inp0.dims, inp0.size.p, inp0.type());
-        }
-    }
+        const Mat& inp0 = *inputs[0];
+        size_t num = inp0.size[0];
+        size_t channels = inp0.size[1];
+        size_t channelSize = inp0.size[2] * inp0.size[3];

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-    {
-        Mat zeroBuffer(_channels, _channelSize, CV_32F, Scalar(0));
+        Mat zeroBuffer(channels, channelSize, CV_32F, Scalar(0));
        Mat absDiff;
-
+        Mat scale = blobs[0];
        for (size_t j = 0; j < inputs.size(); j++)
        {
-            for (size_t n = 0; n < _num; ++n)
+            for (size_t n = 0; n < num; ++n)
            {
-                Mat src = Mat(_channels, _channelSize, CV_32F, inputs[j]->ptr<float>(n));
-                Mat dst = Mat(_channels, _channelSize, CV_32F, outputs[j].ptr<float>(n));
+                Mat src = Mat(channels, channelSize, CV_32F, inputs[j]->ptr<float>(n));
+                Mat dst = Mat(channels, channelSize, CV_32F, outputs[j].ptr<float>(n));

-                _buffer = src.mul(src);
+                buffer = src.mul(src);

                if (_across_spatial)
                {
-                    absdiff(_buffer, zeroBuffer, absDiff);
+                    absdiff(buffer, zeroBuffer, absDiff);

                    // add eps to avoid overflow
                    double absSum = sum(absDiff)[0] + _eps;
@ -190,34 +179,34 @@ public:
                }
                else
                {
-                    Mat norm(_channelSize, 1, _buffer.type()); // 1 x _channelSize
+                    Mat norm(channelSize, 1, buffer.type()); // 1 x channelSize

-                    // (_channels x_channelSize)T * _channels x 1 -> _channelSize x 1
-                    gemmCPU(_buffer, _sumChannelMultiplier, 1, norm, 0, GEMM_1_T);
+                    // (_channels x channelSize)T * _channels x 1 -> channelSize x 1
+                    gemmCPU(buffer, sumChannelMultiplier, 1, norm, 0, GEMM_1_T);

                    // compute norm
                    pow(norm, 0.5f, norm);

                    // scale the layer
-                    // _channels x 1 * (_channelSize x 1)T -> _channels x _channelSize
-                    gemmCPU(_sumChannelMultiplier, norm, 1, _buffer, 0, GEMM_2_T);
+                    // _channels x 1 * (channelSize x 1)T -> _channels x channelSize
+                    gemmCPU(sumChannelMultiplier, norm, 1, buffer, 0, GEMM_2_T);

-                    dst = src / _buffer;
+                    dst = src / buffer;
                }

                // scale the output
                if (_channel_shared)
                {
                    // _scale: 1 x 1
-                    dst *= _scale.at<float>(0, 0);
+                    dst *= scale.at<float>(0, 0);
                }
                else
                {
                    // _scale: _channels x 1
-                    // _channels x 1 * 1 x _channelSize -> _channels x _channelSize
-                    gemmCPU(_scale, _sumSpatialMultiplier, 1, _buffer, 0);
+                    // _channels x 1 * 1 x channelSize -> _channels x channelSize
+                    gemmCPU(scale, sumSpatialMultiplier, 1, buffer, 0);

-                    dst = dst.mul(_buffer);
+                    dst = dst.mul(buffer);
                }
            }
        }
@ -225,7 +214,6 @@ public:

 };

-const std::string NormalizeBBoxLayerImpl::_layerName = std::string("NormalizeBBox");

 Ptr<NormalizeBBoxLayer> NormalizeBBoxLayer::create(const LayerParams &params)
 {
--- a/modules/dnn/src/layers/padding_layer.cpp
+++ b/modules/dnn/src/layers/padding_layer.cpp
@ -33,25 +33,26 @@ public:
            CV_Error(cv::Error::StsNotImplemented, "Negative padding and dim aren't supported");
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const // produces one output shape per input
    {
-        size_t i, ninputs = inputs.size();
-        outputs.resize(ninputs);
-
-        for( i = 0; i < ninputs; i++ )
+        outputs.clear();
+        for(int i = 0; i < inputs.size(); i++)
        {
-            const Mat& inp = *inputs[i];
-            int dims = inp.dims;
-            std::vector<int> shape(inp.size.p, inp.size.p + dims);
+            MatShape shape = inputs[i]; // local copy; the padded axis is modified below
            int dim = getPadDim(shape);
-            CV_Assert(dim < dims);
+            CV_Assert(dim < shape.size()); // the padded axis must exist in this input

            shape[dim] += padding;
-            outputs[i].create(dims, &shape[0], inp.type());
+            outputs.push_back(shape); // input shape with axis `dim` grown by `padding`
        }
+
+        return false;
    }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        for(int i = 0; i < inputs.size(); i++)
        {
@ -59,8 +60,8 @@ public:
            const Mat& inp = *inputs[i];
            Mat& out = outputs[i];
            int dims = inp.dims;
-            std::vector<int> inShape(inp.size.p, inp.size.p + dims);
-            std::vector<int> outShape(out.size.p, out.size.p + dims);
+            MatShape inShape(inp.size.p, inp.size.p + dims);
+            MatShape outShape(out.size.p, out.size.p + dims);
            int dim = getPadDim(inShape);

            int actualIndex = index;
@ -88,7 +89,7 @@ public:
        }
    }

-    int getPadDim(const std::vector<int>& shape) const
+    int getPadDim(const MatShape& shape) const
    {
        return inputDims > 0 && (int)shape.size() > inputDims ? paddingDim + 1 : paddingDim;
    }
--- a/modules/dnn/src/layers/permute_layer.cpp
+++ b/modules/dnn/src/layers/permute_layer.cpp
@ -110,7 +110,35 @@ public:
        checkNeedForPermutation();
    }

-    void computeStrides()
+    // Output shape for every input: the axes of inputs[0] reordered by _order.
+    // Returns true without touching outputs when no permutation is needed, false otherwise.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs, const int requiredOutputs,
+                         std::vector<MatShape> &outputs, std::vector<MatShape> &internals) const
+    {
+        if(!_needsPermute)
+            return true;
+
+        CV_Assert(inputs.size() > 0);
+        CV_Assert((int)_numAxes == inputs[0].size());
+        // shapeAfter starts empty, so it is grown with push_back (indexing it would be out of bounds).
+        MatShape shapeBefore = inputs[0], shapeAfter;
+        for (size_t i = 0; i < _numAxes; i++)
+        {
+            shapeAfter.push_back(shapeBefore[_order[i]]);
+        }
+
+        outputs.clear();
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            CV_Assert(inputs[i][2] == shapeBefore[2] && inputs[i][3] == shapeBefore[3]);
+            outputs.push_back(shapeAfter);
+        }
+
+        return false;
+    }
+
+    void computeStrides(const MatShape &shapeBefore, const MatShape &shapeAfter)
    {
        _oldStride.resize(_numAxes);
        _newStride.resize(_numAxes);
@ -120,14 +148,14 @@ public:

        for(int i = _numAxes - 2; i >= 0; i--)
        {
-            _oldStride[i] = _oldStride[i + 1] * _oldDimensionSize[i + 1];
-            _newStride[i] = _newStride[i + 1] * _newDimensionSize[i + 1];
+            _oldStride[i] = _oldStride[i + 1] * shapeBefore[i + 1];
+            _newStride[i] = _newStride[i + 1] * shapeAfter[i + 1];
        }

-        _count = _oldStride[0] * _oldDimensionSize[0];
+        _count = _oldStride[0] * shapeBefore[0];
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
        if(!_needsPermute)
        {
@ -138,27 +166,10 @@ public:
        const Mat& inp0 = *inputs[0];
        CV_Assert((int)_numAxes == inp0.dims);

-        outputs.resize(inputs.size());
-
-        _newDimensionSize.resize(_numAxes);
-        _oldDimensionSize.resize(_numAxes);
-
-        for (size_t i = 0; i < _numAxes; i++)
-        {
-            _oldDimensionSize[i] = inp0.size[i];
-            _newDimensionSize[i] = inp0.size[_order[i]];
-        }
-
-        for (size_t i = 0; i < inputs.size(); i++)
-        {
-            CV_Assert(inputs[i]->size == inp0.size);
-            outputs[i].create(_numAxes, &_newDimensionSize[0], CV_32F);
-        }
-
-        computeStrides();
+        computeStrides(shape(*inputs[0]), shape(outputs[0]));
    }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        size_t k, ninputs = inputs.size();
        if(!_needsPermute)
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@ -77,39 +77,22 @@ public:
        setParamsFrom(params);
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs) // sets kernel (global pooling) and pad from the actual blob sizes
    {
        CV_Assert(inputs.size() == 1);

-        inp = Size(inputs[0]->size[3], inputs[0]->size[2]);
+        cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
+                out(outputs[0].size[3], outputs[0].size[2]); // both built from blob axes 3 and 2, in that order

        if(globalPooling)
        {
            kernel = inp;
        }

-        computeOutputShape(inp);
-
-        outputs.resize(type == MAX ? 2 * inputs.size() : inputs.size());
-        for (size_t i = 0; i < inputs.size(); i++)
-        {
-            const Mat& inp_i = *inputs[i];
-            CV_Assert(inp_i.size[2] == inp.height && inp_i.size[3] == inp.width);
-            int outsz[] = { inp_i.size[0], inp_i.size[1], out.height, out.width };
-
-            if (type == MAX)
-            {
-                outputs[2 * i].create(4, outsz, CV_32F);
-                outputs[2 * i + 1].create(4, outsz, CV_32F);
-            }
-            else
-            {
-                outputs[i].create(4, outsz, CV_32F);
-            }
-        }
+        getConvPoolPaddings(inp, out, kernel, stride, padMode, pad); // overwrites pad only for "VALID"/"SAME" padMode
    }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        for (size_t ii = 0; ii < inputs.size(); ii++)
        {
@ -130,7 +113,8 @@ public:

    void maxPooling(Mat &src, Mat &dst, Mat &mask)
    {
-        CV_DbgAssert(dst.size[2] == out.height && dst.size[3] == out.width);
+        Size inp(src.size[3], src.size[2]),
+            out(dst.size[3], dst.size[2]);

        for (int n = 0; n < src.size[0]; ++n)
        {
@ -175,6 +159,8 @@ public:

    void avePooling(Mat &src, Mat &dst)
    {
+        Size inp(src.size[3], src.size[2]),
+            out(dst.size[3], dst.size[2]);
        for (int n = 0; n < src.size[0]; ++n)
        {
            for (int c = 0; c < src.size[1]; ++c)
@ -209,35 +195,52 @@ public:
        }
    }

-    void computeOutputShape(Size inpSz)
+    // Computes output shapes: one per input, or two per input when type == MAX.
+    // Global pooling uses the input size as kernel here; finalize() sets the member only once outputs exist.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs, const int requiredOutputs,
+                         std::vector<MatShape> &outputs, std::vector<MatShape> &internals) const
    {
+        CV_Assert(inputs.size() != 0);
+        Size in(inputs[0][3], inputs[0][2]), out;
+        const Size kernel = globalPooling ? in : this->kernel; // intentionally shadows the member below
        if (padMode.empty()) {
            //Yeah, something strange Caffe scheme-)
-            out.height = static_cast<int>(ceil(static_cast<float>(inpSz.height + 2 * pad.height -
+            out.height = static_cast<int>(ceil(static_cast<float>(in.height + 2 * pad.height -
                                                                  kernel.height) / stride.height)) + 1;
-            out.width = static_cast<int>(ceil(static_cast<float>(inpSz.width + 2 * pad.width -
+            out.width = static_cast<int>(ceil(static_cast<float>(in.width + 2 * pad.width -
                                                                 kernel.width) / stride.width)) + 1;

            if (pad.height || pad.width)
            {
                // If we have padding, ensure that the last pooling starts strictly
                // inside the image (instead of at the padding); otherwise clip the last.
-                if ((out.height - 1) * stride.height >= inpSz.height + pad.height)
+                if ((out.height - 1) * stride.height >= in.height + pad.height)
                    --out.height;
-                if ((out.width - 1) * stride.width >= inpSz.width + pad.width)
+                if ((out.width - 1) * stride.width >= in.width + pad.width)
                    --out.width;
-                CV_Assert((out.height - 1) * stride.height < inpSz.height + pad.height);
-                CV_Assert((out.width - 1) * stride.width < inpSz.width + pad.width);
+                CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
+                CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
            }
        }
        else
        {
-            getConvPoolOutParams(inpSz.height, inpSz.width, kernel, stride, pad,
-                                 padMode, out.height, out.width);
+            getConvPoolOutParams(in, kernel, stride,
+                                 padMode, out);
        }
-    }

-    Size inp, out;
+        outputs.resize(type == MAX ? 2 * inputs.size() : inputs.size());
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            size_t index = type == MAX ? 2*i : i;
+            int dims[] = {inputs[i][0], inputs[i][1], out.height, out.width};
+            outputs[index] = shape(dims);
+
+            if (type == MAX)
+                outputs[index + 1] = shape(dims);
+        }
+
+        return false;
+    }
 };

 Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@ -185,34 +185,41 @@ public:
        }
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    // Declares the single output: shape (1, 2, layerHeight * layerWidth * _numPriors * 4).
+    // The layer size comes from inputs[0] (axes 2 and 3); inputs[1] is not read for the shape.
+    bool getMemoryShapes(const std::vector<MatShape> &inputs, const int requiredOutputs,
+                         std::vector<MatShape> &outputs, std::vector<MatShape> &internals) const
    {
        CV_Assert(inputs.size() == 2);

-        _layerWidth = inputs[0]->size[3];
-        _layerHeight = inputs[0]->size[2];
-
-        _imageWidth = inputs[1]->size[3];
-        _imageHeight = inputs[1]->size[2];
-
-        _stepX = static_cast<float>(_imageWidth) / _layerWidth;
-        _stepY = static_cast<float>(_imageHeight) / _layerHeight;
+        int layerHeight = inputs[0][2];
+        int layerWidth = inputs[0][3];

        // Since all images in a batch has same height and width, we only need to
        // generate one set of priors which can be shared across all images.
-        int outNum = 1;
+        size_t outNum = 1;
        // 2 channels. First channel stores the mean of each prior coordinate.
        // Second channel stores the variance of each prior coordinate.
-        int outChannels = 2;
-        _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4;
+        size_t outChannels = 2;
+        // assign() rather than resize(): resize(1, v) would keep a pre-existing first element.
+        outputs.assign(1, shape(outNum, outChannels,
+                                layerHeight * layerWidth * _numPriors * 4));

-        int outsz[] = { outNum, outChannels, (int)_outChannelSize };
-        outputs[0].create(3, outsz, CV_32F);
+        return false;
    }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
-        (void)inputs; // to suppress unused parameter warning
+        int _layerWidth = inputs[0]->size[3];
+        int _layerHeight = inputs[0]->size[2];
+
+        int _imageWidth = inputs[1]->size[3];
+        int _imageHeight = inputs[1]->size[2];
+
+        float _stepX = static_cast<float>(_imageWidth) / _layerWidth;
+        float _stepY = static_cast<float>(_imageHeight) / _layerHeight;
+
+        int _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4;

        float* outputPtr = outputs[0].ptr<float>();

@ -305,17 +312,6 @@ public:
        }
    }

-    size_t _layerWidth;
-    size_t _layerHeight;
-
-    size_t _imageWidth;
-    size_t _imageHeight;
-
-    size_t _outChannelSize;
-
-    float _stepX;
-    float _stepY;
-
    float _minSize;
    float _maxSize;

--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@ -82,14 +82,11 @@ static void sigmoid(const Mat &src, Mat &dst)

 class LSTMLayerImpl : public LSTMLayer
 {
-    int numOut, numTimeStamps, numSamples, numInp;
-    Mat hInternal, cInternal;
-    Mat gates, dummyOnes;
+    int numTimeStamps, numSamples;
    bool allocated;

-    std::vector<int> outTailShape;                 //shape of single output sample
-    std::vector<int> outTsMatShape, outTsShape;    //shape of N output samples
-    std::vector<int> outResShape;                  //shape of T timestamps and N output samples
+    MatShape outTailShape;                 //shape of single output sample
+    MatShape outTsShape;    //shape of N output samples

    bool useTimestampDim;
    bool produceCellOutput;
@ -118,47 +115,9 @@ public:
        produceCellOutput = produce;
    }

-    void setC(const Mat &C)
+    void setOutShape(const MatShape &outTailShape_)
    {
-        CV_Assert(C.type() == CV_32F);
-        if (!cInternal.empty())
-        {
-            CV_Assert(C.total() == cInternal.total() && cInternal.isContinuous());
-            Mat cInternal_(C.dims, &C.size.p[0], C.type(), cInternal.ptr());
-            C.copyTo(cInternal_);
-        }
-        else
-            C.copyTo(cInternal);
-    }
-
-    void setH(const Mat &H)
-    {
-        CV_Assert(H.type() == CV_32F);
-        if (!hInternal.empty())
-        {
-            CV_Assert(H.total() == hInternal.total() && hInternal.isContinuous());
-            Mat hInternal_(H.dims, &H.size.p[0], H.type(), hInternal.ptr());
-            H.copyTo(hInternal_);
-        }
-        else
-            H.copyTo(hInternal);
-    }
-
-    Mat getC() const
-    {
-        CV_Assert(shapeTotal(outTsShape) == cInternal.total());
-        return Mat((int)outTsShape.size(), &outTsShape[0], cInternal.type(), (char*)cInternal.ptr());
-    }
-
-    Mat getH() const
-    {
-        CV_Assert(shapeTotal(outTsShape) == hInternal.total());
-        return Mat((int)outTsShape.size(), &outTsShape[0], hInternal.type(), (char*)hInternal.ptr());
-    }
-
-    void setOutShape(const std::vector<int> &outTailShape_)
-    {
-        CV_Assert(!allocated || shapeTotal(outTailShape) == shapeTotal(outTailShape_));
+        CV_Assert(!allocated || total(outTailShape) == total(outTailShape_));
        outTailShape = outTailShape_;
    }

@ -176,92 +135,103 @@ public:
        blobs[2] = Mat(bias.clone()).reshape(1, 1);
    }

-    void allocate(const std::vector<Mat*> &input, std::vector<Mat> &output)
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
    {
        CV_Assert(blobs.size() == 3);
-        CV_Assert(input.size() == 1);
-        const Mat& inp0 = *input[0];
+        CV_Assert(inputs.size() == 1);
+        const MatShape& inp0 = inputs[0];

-        Mat &Wh = blobs[0], &Wx = blobs[1];
-        numOut = Wh.size[1];
-        numInp = Wx.size[1];
+        const Mat &Wh = blobs[0], &Wx = blobs[1];
+        int _numOut = Wh.size[1];
+        int _numInp = Wx.size[1];
+        MatShape outTailShape_(outTailShape), outResShape;

-        if (!outTailShape.empty())
-            CV_Assert(shapeTotal(outTailShape) == numOut);
+        if (!outTailShape_.empty())
+            CV_Assert(total(outTailShape_) == _numOut);
        else
-            outTailShape.assign(1, numOut);
+            outTailShape_.assign(1, _numOut);

-        outResShape.clear();
+        int _numTimeStamps, _numSamples;
        if (useTimestampDim)
        {
-            CV_Assert(inp0.dims >= 2 && (int)inp0.total(2) == numInp);
-            numTimeStamps = inp0.size[0];
-            numSamples = inp0.size[1];
-            outResShape.push_back(numTimeStamps);
+            CV_Assert(inp0.size() >= 2 && total(inp0, 2) == _numInp);
+            _numTimeStamps = inp0[0];
+            _numSamples = inp0[1];
+            outResShape.push_back(_numTimeStamps);
        }
        else
        {
-            CV_Assert(inp0.dims >= 2 && (int)inp0.total(1) == numInp);
-            numTimeStamps = 1;
-            numSamples = inp0.size[0];
+            CV_Assert(inp0.size() >= 2 && total(inp0, 1) == _numInp);
+            _numTimeStamps = 1;
+            _numSamples = inp0[0];
        }

-        outResShape.push_back(numSamples);
-        outResShape.insert(outResShape.end(), outTailShape.begin(), outTailShape.end());
+        outResShape.push_back(_numSamples);
+        outResShape.insert(outResShape.end(), outTailShape_.begin(), outTailShape_.end());

-        outTsMatShape.clear();
-        outTsMatShape.push_back(numSamples);
-        outTsMatShape.push_back(numOut);
+        size_t noutputs = produceCellOutput ? 2 : 1;
+        outputs.assign(noutputs, outResShape);

-        outTsShape.clear();
-        outTsShape.push_back(numSamples);
-        outTsShape.insert(outTsShape.end(), outTailShape.begin(), outTailShape.end());
+        internals.assign(1, shape(_numSamples, _numOut)); // hInternal
+        internals.push_back(shape(_numSamples, _numOut)); // cInternal
+        internals.push_back(shape(_numSamples, 1)); // dummyOnes
+        internals.push_back(shape(_numSamples, 4*_numOut)); // gates

-        const int dtype = CV_32F;
-        CV_Assert(inp0.type() == dtype && Wh.type() == dtype);
+        return false;
+    }

-        size_t i, noutputs = produceCellOutput ? 2 : 1;
-        output.resize(noutputs);
+    void finalize(const std::vector<Mat*> &input, std::vector<Mat> &output)
+    {
+        CV_Assert(blobs.size() == 3);
+        CV_Assert(input.size() == 1);
+        const Mat& inp0 = *input[0];

-        for( i = 0; i < noutputs; i++ )
-            output[i].create(outResShape, dtype);
+        Mat &Wh = blobs[0], &Wx = blobs[1];
+        int numOut = Wh.size[1];
+        int numInp = Wx.size[1];

-        if (hInternal.empty())
-        {
-            hInternal.create(outTsMatShape, dtype);
-            hInternal.setTo(0.);
-        }
+        if (!outTailShape.empty())
+            CV_Assert(total(outTailShape) == numOut);
        else
-        {
-            CV_Assert(hInternal.total() == (size_t)numSamples*numOut);
-            hInternal = hInternal.reshape(1, outTsMatShape);
-        }
+            outTailShape.assign(1, numOut);

-        if (cInternal.empty())
+        if (useTimestampDim)
        {
-            cInternal.create(outTsMatShape, dtype);
-            cInternal.setTo(0.);
+            CV_Assert(inp0.dims >= 2 && (int)inp0.total(2) == numInp);
+            numTimeStamps = inp0.size[0];
+            numSamples = inp0.size[1];
        }
        else
        {
-            CV_Assert(cInternal.total() == (size_t)numSamples*numOut);
-            cInternal = cInternal.reshape(1, outTsMatShape);
+            CV_Assert(inp0.dims >= 2 && (int)inp0.total(1) == numInp);
+            numTimeStamps = 1;
+            numSamples = inp0.size[0];
        }

-        gates.create(numSamples, 4*numOut, dtype);
-
-        dummyOnes.create(numSamples, 1, dtype);
-        dummyOnes.setTo(1.);
+        outTsShape.clear();
+        outTsShape.push_back(numSamples);
+        outTsShape.insert(outTsShape.end(), outTailShape.begin(), outTailShape.end());

        allocated = true;
    }

-    void forward(std::vector<Mat*> &input, std::vector<Mat> &output)
+    void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
    {
        const Mat &Wh = blobs[0];
        const Mat &Wx = blobs[1];
        const Mat &bias = blobs[2];

+        int numOut = Wh.size[1];
+
+        Mat hInternal = internals[0], cInternal = internals[1],
+                dummyOnes = internals[2], gates = internals[3];
+        hInternal.setTo(0.);
+        cInternal.setTo(0.);
+        dummyOnes.setTo(1.);
+
        int numSamplesTotal = numTimeStamps*numSamples;
        Mat xTs = input[0]->reshape(1, numSamplesTotal);

@ -332,7 +302,6 @@ class RNNLayerImpl : public RNNLayer
    int dtype;
    Mat Whh, Wxh, bh;
    Mat Who, bo;
-    Mat hCurr, hPrev, dummyBiasOnes;
    bool produceH;

 public:
@ -364,7 +333,36 @@ public:
        blobs[4] = Mat(b_o.clone());
    }

-    void allocate(const std::vector<Mat*> &input, std::vector<Mat> &output)
+    /** @brief Reports output and scratch-buffer shapes for the RNN layer.
+     *
+     *  With T = inputs[0][0], N = inputs[0][1], numO = rows of blobs[3] and
+     *  numH = rows of blobs[0]:
+     *   - outputs[0] is [T, N, numO]; when produceH is set, outputs[1] is [T, N, numH].
+     *   - internals holds two [N, numH] buffers followed by one [N, 1] buffer;
+     *     forward() reads them as hCurr, hPrev and dummyBiasOnes, in that order.
+     *
+     *  @param inputs           one or two input shapes; only inputs[0] is read.
+     *  @param requiredOutputs  not read by this implementation.
+     *  @param outputs          cleared and refilled as described above.
+     *  @param internals        replaced as described above.
+     *  @return always false (NOTE(review): presumably "not in-place" -- confirm
+     *          against the Layer::getMemoryShapes contract).
+     */
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
+    {
+        CV_Assert(inputs.size() >= 1 && inputs.size() <= 2);
+        // inputs[0][0] and inputs[0][1] are indexed below, so the shape needs
+        // at least two entries; fail loudly instead of reading out of bounds.
+        CV_Assert(inputs[0].size() >= 2);
+
+        Mat Who_ = blobs[3];
+        Mat Wxh_ = blobs[0];
+
+        int numTimestamps_ = inputs[0][0];
+        int numSamples_ = inputs[0][1];
+
+        int numO_ = Who_.rows;
+        int numH_ = Wxh_.rows;
+
+        outputs.clear();
+        int dims[] = {numTimestamps_, numSamples_, numO_};
+        outputs.push_back(shape(dims, 3));
+        dims[2] = numH_; // reuse the same array for the optional hidden-state output
+        if (produceH)
+            outputs.push_back(shape(dims, 3));
+
+        internals.assign(2, shape(numSamples_, numH_));
+        internals.push_back(shape(numSamples_, 1));
+
+        return false;
+    }
+
+    void finalize(const std::vector<Mat*> &input, std::vector<Mat> &output)
    {
        CV_Assert(input.size() >= 1 && input.size() <= 2);

@ -388,16 +386,8 @@ public:
        numSamples = inp0.size[1];
        numSamplesTotal = numTimestamps * numSamples;

-        hCurr.create(numSamples, numH, dtype);
-        hPrev.create(numSamples, numH, dtype);
-        hPrev.setTo(0.);
-
-        dummyBiasOnes.create(numSamples, 1, dtype);
-        dummyBiasOnes.setTo(1.);
        bh = bh.reshape(1, 1); //is 1 x numH Mat
        bo = bo.reshape(1, 1); //is 1 x numO Mat
-
-        reshapeOutput(output);
    }

    void reshapeOutput(std::vector<Mat> &output)
@ -412,11 +402,17 @@ public:
        }
    }

-    void forward(std::vector<Mat*> &input, std::vector<Mat> &output)
+    void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
    {
        Mat xTs = input[0]->reshape(1, numSamplesTotal);
        Mat oTs = output[0].reshape(1, numSamplesTotal);
        Mat hTs = produceH ? output[1].reshape(1, numSamplesTotal) : Mat();
+        Mat hCurr = internals[0];
+        Mat hPrev = internals[1];
+        Mat dummyBiasOnes = internals[2];
+
+        hPrev.setTo(0.);
+        dummyBiasOnes.setTo(1.);

        for (int ts = 0; ts < numTimestamps; ts++)
        {
--- a/modules/dnn/src/layers/reshape_layer.cpp
+++ b/modules/dnn/src/layers/reshape_layer.cpp
@ -48,10 +48,10 @@ namespace cv
 namespace dnn
 {

-static void computeShapeByReshapeMask(const std::vector<int> &srcShape,
-                                      const std::vector<int> &maskShape,
+static void computeShapeByReshapeMask(const MatShape &srcShape,
+                                      const MatShape &maskShape,
                                      Range srcRange /*= Range::all()*/,
-                                      std::vector<int>& dstShape)
+                                      MatShape& dstShape)
 {
    int srcShapeSize = (int)srcShape.size();
    int maskShapeSize = (int)maskShape.size();
@ -61,7 +61,7 @@ static void computeShapeByReshapeMask(const std::vector<int> &srcShape,
    else
    {
        int sz = srcRange.size();
-        srcRange.start = srcRange.start < 0 ? srcRange.start + srcShapeSize : srcRange.start;
+        srcRange.start = clamp(srcRange.start, srcShapeSize);
        srcRange.end = srcRange.end == INT_MAX ? srcShapeSize : srcRange.start + sz;
    }

@ -96,8 +96,8 @@ static void computeShapeByReshapeMask(const std::vector<int> &srcShape,
            CV_Error(Error::StsBadArg, "maskShape[i] >= -1");
    }

-    size_t srcTotal = shapeTotal(srcShape);
-    size_t dstTotal = shapeTotal(dstShape);
+    size_t srcTotal = total(srcShape);
+    size_t dstTotal = total(dstShape);

    if (inferDim != -1)
    {
@ -116,7 +116,8 @@ static void computeShapeByReshapeMask(const std::vector<int> &srcShape,
 class ReshapeLayerImpl : public ReshapeLayer
 {
 public:
-    ReshapeLayerImpl(const LayerParams& params)
+    ReshapeLayerImpl(const LayerParams& params):
+        performReordering(false)
    {
        setParamsFrom(params);
        int axis = params.get<int>("axis", 0);
@ -136,29 +137,40 @@ public:
        }
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    bool getMemoryShapes(const std::vector<MatShape> &inputs, // one shape per input blob
+                         const int requiredOutputs, // not read by this implementation
+                         std::vector<MatShape> &outputs, // rebuilt here: one reshaped shape per input
+                         std::vector<MatShape> &internals) const // not modified here
     {
-        outputs.resize(inputs.size());
-        outShapes.resize(inputs.size());
+        outputs.clear(); // discard whatever the caller passed in before appending

         for (size_t i = 0; i < inputs.size(); i++)
         {
-            std::vector<int> inputShape(inputs[i]->size.p, inputs[i]->size.p + inputs[i]->dims);
-            computeShapeByReshapeMask(inputShape, newShapeDesc, newShapeRange, outShapes[i]);
-            outputs[i] = inputs[i]->reshape(1, outShapes[i]);
+            outputs.push_back(MatShape()); // fresh destination handed to the helper on the next line
+            computeShapeByReshapeMask(inputs[i], newShapeDesc, newShapeRange, outputs.back()); // result lands in the entry just appended
         }
+
+        return true; // NOTE(review): presumably marks the layer as in-place (output may share the input buffer) -- confirm against Layer::getMemoryShapes
+    }
+
+    // Decides once, from the first input/output pair only, whether forward()
+    // has to reorder data before reshaping; the verdict is cached in
+    // performReordering.
+    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        CV_Assert(inputs.size());
+        CV_Assert(outputs.size());
+
+        const Mat &firstInput = *inputs[0];
+        const int inputRank = firstInput.dims;
+        const MatShape srcShape = shape(firstInput);
+        const MatShape dstShape = shape(outputs[0]);
+
+        // "Channels reduced" means either the output has fewer dimensions than
+        // the input, or both are handled as 4-D and axis 1 shrinks.
+        bool fewerChannels = false;
+        if (inputRank > (int)dstShape.size())
+            fewerChannels = true;
+        else if (inputRank == 4)
+            fewerChannels = srcShape[1] > dstShape[1];
+
+        performReordering = enableReordering && inputRank == 4 && fewerChannels;
     }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
-        for (size_t i = 0; i < outputs.size(); i++)
+        for (size_t i = 0; i < inputs.size(); i++)
        {
            Mat srcBlob = *inputs[i];
-            int dims = srcBlob.dims;
-            std::vector<int> inputShape(srcBlob.size.p, srcBlob.size.p + dims);
-            bool channelsReduced = dims > (int)outShapes[i].size() ||
-            (dims == 4 && inputShape[1] > outShapes[i][1]);
-            bool performReordering = enableReordering && dims == 4 && channelsReduced;
+            MatShape inputShape = shape(srcBlob), outShape = shape(outputs[i]);

            if (performReordering)
            {
@ -185,16 +197,14 @@ public:
                    }
                }

-                srcBlob = reordered_blob;
+                outputs[i] = reordered_blob.reshape(1, outShape);
            }
-
-            // TODO: we should not assign srcBlob if performReordering is true.
-            outputs[i] = srcBlob.reshape(1, outShapes[i]);
        }
    }

+private:
    std::vector<std::vector<int> > outShapes;
-    bool enableReordering;
+    bool enableReordering, performReordering;
 };

 Ptr<ReshapeLayer> ReshapeLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@ -27,20 +27,10 @@ public:
        hasBias = params.get<bool>("bias_term", false);
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_Assert(blobs.size() == 1 + hasBias);

-        outputs.resize(inputs.size());
-        for (size_t i = 0; i < inputs.size(); i++)
-        {
-            const Mat& inp = *inputs[i];
-            outputs[i].create(inp.dims, inp.size.p, inp.type());
-        }
-    }
-
-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
-    {
        for (size_t ii = 0; ii < outputs.size(); ii++)
        {
            Mat &inpBlob = *inputs[ii];
--- a/modules/dnn/src/layers/shift_layer.cpp
+++ b/modules/dnn/src/layers/shift_layer.cpp
@ -11,6 +11,7 @@ Implementation of shift layer, which adds up const values to blob.

 #include "../precomp.hpp"
 #include "op_blas.hpp"
+#include <opencv2/dnn/shape_utils.hpp>

 namespace cv
 {
@ -35,42 +36,17 @@ public:
 #endif
    }

-    virtual void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    bool getMemoryShapes(const std::vector<MatShape> &inputs, // input blob shapes; inputs[0] sizes the scratch buffer
+                         const int requiredOutputs, // forwarded unchanged to the base implementation
+                         std::vector<MatShape> &outputs, // filled by the base implementation
+                         std::vector<MatShape> &internals) const // replaced with a single scratch shape
     {
-        CV_Assert(inputs.size() > 0);
-        CV_Assert(blobs.size() > 0);
-        const Mat &inpBlob = *inputs[0];
-        CV_Assert(inpBlob.dims == 4 && inpBlob.type() == CV_32F);
-        const Mat &biasBlob = blobs[0];
-        outputs.resize(inputs.size());
-
-        if(inpBlob.dims == biasBlob.dims)
-        {
-            for (size_t i = 0; i < inputs.size(); i++)
-            {
-                CV_Assert(inputs[i]->type() == inpBlob.type());
-                CV_Assert(inputs[i]->dims == inpBlob.dims);
-
-                outputs[i] = *inputs[i];
-            }
-        }
-        else
-        {
-            CV_Assert(biasBlob.total() == (size_t)inpBlob.size[1]);
-
-            for (size_t i = 0; i < inputs.size(); i++)
-            {
-                CV_Assert(inputs[i]->type() == inpBlob.type());
-                CV_Assert(inputs[i]->dims == 4 && inputs[i]->size[1] == inpBlob.size[1]);
-
-                outputs[i] = *inputs[i];
-            }
-
-            biasOnesMat = Mat::ones(1, inpBlob.size[2] * inpBlob.size[3], inpBlob.type());
-        }
+        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); // output shapes come from the base Layer; its return value is deliberately ignored
+        internals.assign(1, shape(1, total(inputs[0], 2))); // one [1 x total(inputs[0], 2)] buffer (presumably the product of dims from axis 2 on -- confirm); forward() fills it with ones as biasOnesMat
+        return true; // NOTE(review): presumably marks the layer as in-place -- confirm against Layer::getMemoryShapes
     }

-    virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    virtual void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_Assert(inputs.size() > 0);
        CV_Assert(blobs.size() > 0);
@ -87,6 +63,8 @@ public:
        }
        else
        {
+            Mat biasOnesMat = internals[0];
+            biasOnesMat.setTo(1);
            for (size_t ii = 0; ii < outputs.size(); ii++)
            {
                Mat &inpBlob = *inputs[ii];
@ -103,8 +81,6 @@ public:
            }
        }
    }
-
-    Mat biasOnesMat;
 };

 Ptr<ShiftLayer> ShiftLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@ -66,66 +66,69 @@ public:
        }
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    // Computes the shapes of the slices cut from the single input along `axis`.
+    // With explicit sliceIndices there is one output per interval between
+    // consecutive indices (plus the tail); otherwise the axis is divided evenly
+    // into `requiredOutputs` parts. `internals` is not modified. Returns false
+    // (NOTE(review): presumably "not in-place" -- confirm against Layer::getMemoryShapes).
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                            const int requiredOutputs,
+                            std::vector<MatShape> &outputs,
+                            std::vector<MatShape> &internals) const
     {
         CV_Assert(inputs.size() == 1);
-        const Mat &inpBlob = *inputs[0];
-        int dims = inpBlob.dims;

-        axisIdx = axis < 0 ? axis + dims : axis;
-        int axisSize = inpBlob.size[axisIdx];
-        std::vector<int> inpShape(inpBlob.size.p, inpBlob.size.p + dims);
+        outputs.clear();
+
+        MatShape inpShape = inputs[0]; // working copy; only the sliced axis is overwritten per output
+        int cAxis = clamp(axis, inpShape.size());
+        int axisSize = inpShape[cAxis];

         if (sliceIndices.size()) //divide blob with respect to passed parameters
         {
-            std::vector<int> outAxisSize;
-            int prevSlice = 0;
+           std::vector<int> outAxisSize;
+           int prevSlice = 0;

-            for (size_t i = 0; i < sliceIndices.size(); i++)
-            {
-                if (!(prevSlice < sliceIndices[i] && sliceIndices[i] < axisSize))
-                    CV_Error(Error::StsBadArg, "Slice indices should be positive, increased and don't exceed size of sliced dimension");
+           for (size_t i = 0; i < sliceIndices.size(); i++)
+           {
+               if (!(prevSlice < sliceIndices[i] && sliceIndices[i] < axisSize))
+                   CV_Error(Error::StsBadArg, "Slice indices should be positive, increased and don't exceed size of sliced dimension");

-                outAxisSize.push_back(sliceIndices[i] - prevSlice);
-                prevSlice = sliceIndices[i];
+               outAxisSize.push_back(sliceIndices[i] - prevSlice);
+               prevSlice = sliceIndices[i];
             }
             outAxisSize.push_back(axisSize - prevSlice);

-            outputs.resize(outAxisSize.size());
             for (size_t i = 0; i < outAxisSize.size(); i++)
             {
-                inpShape[axisIdx] = outAxisSize[i];
-                outputs[i].create(inpShape, inpBlob.type());
+               inpShape[cAxis] = outAxisSize[i];
+              outputs.push_back(inpShape);
             }
         }
         else //divide blob with respect to count of output blobs
         {
-            CV_Assert(outputs.size() > 0 && axisSize % outputs.size() == 0);
-            int outAxisSize = axisSize / (int)outputs.size();
+           CV_Assert(requiredOutputs > 0 && axisSize % requiredOutputs == 0);
+           int outAxisSize = axisSize / requiredOutputs;

-            for (size_t i = 0; i < outputs.size(); i++)
+           for (int i = 0; i < requiredOutputs; i++) // int index: requiredOutputs is signed, avoid a mixed-sign comparison
             {
-                inpShape[axisIdx] = outAxisSize;
-                outputs[i].create(inpShape, inpBlob.type());
+               inpShape[cAxis] = outAxisSize;
+               outputs.push_back(inpShape);
             }
         }
+
+        return false;
     }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals) // copies consecutive windows of inputs[0] along the slice axis into each output; internals is not used
     {
         const Mat& inpMat = *inputs[0];
         std::vector<Range> ranges(inpMat.dims, Range::all());
+        int cAxis = clamp(axis, inpMat.dims); // NOTE(review): clamp presumably maps a negative axis to axis + dims, as the removed axisIdx code did -- confirm

-        ranges[axisIdx].start = 0;
+        ranges[cAxis].start = 0; // the first window starts at the beginning of the sliced axis
         for (size_t i = 0; i < outputs.size(); i++)
         {
-            ranges[axisIdx].end = ranges[axisIdx].start + outputs[i].size[axisIdx];
+            ranges[cAxis].end = ranges[cAxis].start + outputs[i].size[cAxis]; // window length is taken from the i-th output's extent along cAxis
             inpMat(&ranges[0]).copyTo(outputs[i]);
-            ranges[axisIdx].start = ranges[axisIdx].end;
+            ranges[cAxis].start = ranges[cAxis].end; // the next window begins where this one ended
         }
     }
-
-    int axisIdx;
 };

 Ptr<SliceLayer> SliceLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@ -60,36 +60,34 @@ public:
        setParamsFrom(params);
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    bool getMemoryShapes(const std::vector<MatShape> &inputs, // input blob shapes; inputs[0] sizes the scratch buffer
+                         const int requiredOutputs, // forwarded unchanged to the base implementation
+                         std::vector<MatShape> &outputs, // filled by the base implementation
+                         std::vector<MatShape> &internals) const // replaced with a single scratch shape
     {
-        CV_Assert(inputs.size() == 1);
-        const Mat& inp0 = *inputs[0];
-        int dims = inp0.dims;
-        axis = axisRaw < 0 ? axisRaw + dims : axisRaw;
-
-        outerSize = inp0.total(0, axis);
-        channels = inp0.size[axis];
-        innerSize = inp0.total(axis + 1);
-
-        std::vector<int> shape(inp0.size.p, inp0.size.p + dims);
-        shape[axis] = 1;
-        buf.create(shape, inp0.type());
-
-        outputs.resize(1);
-        outputs[0].create(inp0.dims, inp0.size.p, inp0.type());
+        bool inplace = Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); // output shapes and the in-place verdict both come from the base Layer
+        MatShape shape = inputs[0]; // local copy named `shape`; it is only indexed below, never called
+        int cAxis = clamp(axisRaw, shape.size()); // NOTE(review): clamp presumably maps a negative axisRaw to axisRaw + dims, as the removed code did -- confirm
+        shape[cAxis] = 1; // scratch buffer keeps every extent of the input except the softmax axis, which collapses to 1
+        internals.assign(1, shape); // internals[0] is the buffer forward() reads through bufPtr
+        return inplace; // propagate the base class result unchanged
     }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        const Mat &src = *inputs[0];
        Mat &dst = outputs[0];

+        int axis = clamp(axisRaw, src.dims);
+        size_t outerSize = src.total(0, axis), channels = src.size[axis],
+                innerSize = src.total(axis + 1);
+
        CV_Assert(src.type() == CV_32F);
        CV_Assert(src.isContinuous() && dst.isContinuous());

        const float *srcPtr = src.ptr<float>();
        float *dstPtr = dst.ptr<float>();
-        float *bufPtr = buf.ptr<float>();
+        float *bufPtr = internals[0].ptr<float>();

        size_t outerStep = src.total(axis);
        size_t cnStep = src.total(axis + 1);
@ -148,9 +146,7 @@ public:
        }
    }

-    int axis, axisRaw;
-    Mat buf;
-    size_t outerSize, channels, innerSize;
+    int axisRaw;
 };

 Ptr<SoftmaxLayer> SoftmaxLayer::create(const LayerParams& params)
--- a/modules/dnn/src/layers/split_layer.cpp
+++ b/modules/dnn/src/layers/split_layer.cpp
@ -65,19 +65,20 @@ public:
        }
    }

-    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    // Every output of the split layer has the shape of the single input.
+    // The number of outputs is outputsCount when it is non-negative, otherwise
+    // requiredOutputs. `internals` is not modified. Returns false
+    // (NOTE(review): presumably "not in-place" -- confirm against Layer::getMemoryShapes).
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const
     {
         CV_Assert(inputs.size() == 1);
-        const Mat& inp0 = *inputs[0];

-        if (outputsCount >= 0)
-            outputs.resize(outputsCount);
+        // assign(), not resize(): resize(n, v) only initialises newly appended
+        // elements, so shapes already present in `outputs` would survive unchanged.
+        outputs.assign(outputsCount >= 0 ? outputsCount : requiredOutputs,
+                       inputs[0]);

-        for (size_t i = 0; i < outputs.size(); i++)
-            outputs[i].create(inp0.dims, inp0.size.p, inp0.type());
+        return false;
     }

-    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        for (size_t i = 0; i < outputs.size(); i++)
        {
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@ -51,7 +51,7 @@ struct Pin
    int blobIndex;
 };

-void blobShapeFromTensor(const tensorflow::TensorProto &tensor, std::vector<int>& shape)
+void blobShapeFromTensor(const tensorflow::TensorProto &tensor, MatShape& shape)
 {
    shape.clear();
    if (tensor.has_tensor_shape())
@ -72,7 +72,7 @@ void blobShapeFromTensor(const tensorflow::TensorProto &tensor, std::vector<int>
 template <typename T>
 void parseTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob)
 {
-    std::vector<int> shape;
+    MatShape shape;
    blobShapeFromTensor(tensor, shape);
    int dims = (int)shape.size();

@ -236,7 +236,7 @@ void setStrides(LayerParams &layerParams, const tensorflow::NodeDef &layer)
 }

 DictValue parseDims(const tensorflow::TensorProto &tensor) {
-    std::vector<int> shape;
+    MatShape shape;
    blobShapeFromTensor(tensor, shape);
    int dims = (int)shape.size();

@ -396,7 +396,7 @@ TFImporter::TFImporter(const char *model)

 void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob)
 {
-    std::vector<int> shape;
+    MatShape shape;
    blobShapeFromTensor(tensor, shape);
    int dims = (int)shape.size();

--- a/modules/dnn/test/npy_blob.hpp
+++ b/modules/dnn/test/npy_blob.hpp
@ -60,75 +60,6 @@ inline void saveBlobToNPY(const Mat &blob, const String &path)
    cnpy::npy_save(path.c_str(), blob.ptr<float>(), (unsigned*)&blob.size.p[0], blob.dims);
 }

-inline size_t shapeTotal(const std::vector<int>& shape)
-{
-    size_t p = 1, i, n = shape.size();
-    for( i = 0; i < n; i++)
-        p *= shape[i];
-    return p;
-}
-
-inline bool shapeEqual(const std::vector<int>& shape1, const std::vector<int>& shape2)
-{
-    size_t i, n1 = shape1.size(), n2 = shape2.size();
-    if( n1 != n2 )
-        return false;
-    for( i = 0; i < n1; i++ )
-        if( shape1[i] != shape2[i] )
-            return false;
-    return true;
-}
-
-inline std::vector<int> getShape(const Mat& m)
-{
-    return m.empty() ? std::vector<int>() : std::vector<int>(&m.size.p[0], &m.size.p[0] + m.dims);
-}
-
-inline std::vector<int> makeShape(int a0, int a1=-1, int a2=-1, int a3=-1, int a4=-1, int a5=-1)
-{
-    std::vector<int> s;
-    s.push_back(a0);
-    if(a1 > 0)
-    {
-        s.push_back(a1);
-        if(a2 > 0)
-        {
-            s.push_back(a2);
-            if(a3 > 0)
-            {
-                s.push_back(a3);
-                if(a4 > 0)
-                {
-                    s.push_back(a4);
-                    if(a5 > 0)
-                        s.push_back(a5);
-                }
-            }
-        }
-    }
-    return s;
-}
-
-inline std::vector<int> concatShape(const std::vector<int>& a, const std::vector<int>& b)
-{
-    size_t na = a.size(), nb = b.size();
-    std::vector<int> c(na + nb);
-
-    std::copy(a.begin(), a.end(), c.begin());
-    std::copy(b.begin(), b.end(), c.begin() + na);
-
-    return c;
-}
-
-inline void printShape(const String& name, const std::vector<int>& shape)
-{
-    printf("%s: [", name.c_str());
-    size_t i, n = shape.size();
-    for( i = 0; i < n; i++ )
-        printf(" %d", shape[i]);
-    printf(" ]\n");
-}
-
 }

 #endif
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@ -43,6 +43,7 @@
 #include <opencv2/core/ocl.hpp>
 #include <iostream>
 #include "npy_blob.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
 #include <opencv2/dnn/all_layers.hpp>
 #include <opencv2/ts/ocl_test.hpp>

@ -67,16 +68,28 @@ void runLayer(Ptr<Layer> layer, std::vector<Mat> &inpBlobs, std::vector<Mat> &ou
    size_t i, ninputs = inpBlobs.size();
    std::vector<Mat> inp_(ninputs);
    std::vector<Mat*> inp(ninputs);
-    std::vector<Mat> outp;
+    std::vector<Mat> outp, intp;
+    std::vector<MatShape> inputs, outputs, internals;

    for( i = 0; i < ninputs; i++ )
    {
        inp_[i] = inpBlobs[i].clone();
        inp[i] = &inp_[i];
+        inputs.push_back(shape(inp_[i]));
    }

-    layer->allocate(inp, outp);
-    layer->forward(inp, outp);
+    layer->getMemoryShapes(inputs, 0, outputs, internals);
+    for(int i = 0; i < outputs.size(); i++)
+    {
+        outp.push_back(Mat(outputs[i], CV_32F));
+    }
+    for(int i = 0; i < internals.size(); i++)
+    {
+        intp.push_back(Mat(internals[i], CV_32F));
+    }
+
+    layer->finalize(inp, outp);
+    layer->forward(inp, outp, intp);

    size_t noutputs = outp.size();
    outBlobs.resize(noutputs);
@ -165,18 +178,17 @@ TEST(Layer_Test_Reshape, squeeze)

    int sz[] = {4, 3, 1, 2};
    Mat inp(4, sz, CV_32F);
-    std::vector<Mat*> inpVec(1, &inp);
-    std::vector<Mat> outVec;
+    std::vector<Mat> inpVec(1, inp);
+    std::vector<Mat> outVec, intVec;

    Ptr<Layer> rl = LayerFactory::createLayerInstance("Reshape", params);
-    rl->allocate(inpVec, outVec);
-    rl->forward(inpVec, outVec);
+    runLayer(rl, inpVec, outVec);

    Mat& out = outVec[0];
-    std::vector<int> shape(out.size.p, out.size.p + out.dims);
+    MatShape shape(out.size.p, out.size.p + out.dims);
    int sh0[] = {4, 3, 2};
-    std::vector<int> shape0(sh0, sh0+3);
-    EXPECT_TRUE(shapeEqual(shape, shape0));
+    MatShape shape0(sh0, sh0+3);
+    EXPECT_EQ(shape, shape0);
 }

 TEST(Layer_Test_BatchNorm, Accuracy)
@ -253,10 +265,10 @@ public:

    Layer_LSTM_Test() {}

-    void init(const std::vector<int> &inpShape_, const std::vector<int> &outShape_)
+    void init(const MatShape &inpShape_, const MatShape &outShape_)
    {
-        numInp = (int)shapeTotal(inpShape_);
-        numOut = (int)shapeTotal(outShape_);
+        numInp = total(inpShape_);
+        numOut = total(outShape_);

        Wh = Mat::ones(4 * numOut, numOut, CV_32F);
        Wx = Mat::ones(4 * numOut, numInp, CV_32F);
@ -271,10 +283,10 @@ public:
 TEST_F(Layer_LSTM_Test, get_set_test)
 {
    const int TN = 4;
-    std::vector<int> inpShape = makeShape(5, 3, 2);
-    std::vector<int> outShape = makeShape(3, 1, 2);
-    std::vector<int> inpResShape = concatShape(makeShape(TN), inpShape);
-    std::vector<int> outResShape = concatShape(makeShape(TN), outShape);
+    MatShape inpShape = shape(5, 3, 2);
+    MatShape outShape = shape(3, 1, 2);
+    MatShape inpResShape = concat(shape(TN), inpShape);
+    MatShape outResShape = concat(shape(TN), outShape);

    init(inpShape, outShape);
    layer->setProduceCellOutput(true);
@ -285,8 +297,6 @@ TEST_F(Layer_LSTM_Test, get_set_test)
    randu(C, -1., 1.);
    Mat H = C.clone();
    randu(H, -1., 1.);
-    layer->setC(C);
-    layer->setH(H);

    Mat inp((int)inpResShape.size(), &inpResShape[0], CV_32F);
    randu(inp, -1., 1.);
@ -296,17 +306,12 @@ TEST_F(Layer_LSTM_Test, get_set_test)

    EXPECT_EQ(2u, outputs.size());

-    printShape("outResShape", outResShape);
-    printShape("out0", getShape(outputs[0]));
-    printShape("out1", getShape(outputs[0]));
-    printShape("C", getShape(layer->getC()));
-    printShape("H", getShape(layer->getH()));
-
-    EXPECT_TRUE(shapeEqual(outResShape, getShape(outputs[0])));
-    EXPECT_TRUE(shapeEqual(outResShape, getShape(outputs[1])));
+    print(outResShape, "outResShape");
+    print(shape(outputs[0]), "out0");
+    print(shape(outputs[0]), "out1");

-    EXPECT_TRUE(shapeEqual(outResShape, getShape(layer->getC())));
-    EXPECT_TRUE(shapeEqual(outResShape, getShape(layer->getH())));
+    EXPECT_EQ(outResShape, shape(outputs[0]));
+    EXPECT_EQ(outResShape, shape(outputs[1]));

    EXPECT_EQ(0, layer->inputNameToIndex("x"));
    EXPECT_EQ(0, layer->outputNameToIndex("h"));
@ -387,8 +392,8 @@ TEST_F(Layer_RNN_Test, get_set_test)
    runLayer(layer, inputs, outputs);

    EXPECT_EQ(outputs.size(), 2u);
-    EXPECT_TRUE(shapeEqual(getShape(outputs[0]), makeShape(nT, nS, nO)));
-    EXPECT_TRUE(shapeEqual(getShape(outputs[1]), makeShape(nT, nS, nH)));
+    EXPECT_EQ(shape(outputs[0]), shape(nT, nS, nO));
+    EXPECT_EQ(shape(outputs[1]), shape(nT, nS, nH));
 }

 }