From 0488d9bdb24b3b9f75003971e6b91eed2bcb474e Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky
Date: Tue, 4 Jul 2017 17:23:47 +0300
Subject: [PATCH] optimize out scaleLayer & concatLayer whenever possible

fixed problem in concat layer by disabling memory re-use in layers with
multiple inputs

trying to fix the tests when Halide is used to run deep nets

another attempt to fix Halide tests

see if the Halide tests will pass with concat layer fusion turned off

trying to fix failures in halide tests; another try

one more experiment to make halide_concat & halide_enet tests pass

continue attempts to fix halide tests

moving on

uncomment parallel concat layer

seemingly fixed failures in Halide tests and re-enabled concat layer
fusion; thanks to dkurt for the patch
---
 modules/dnn/include/opencv2/dnn/dnn.hpp      |  32 ++-
 modules/dnn/src/dnn.cpp                      | 207 +++++++++++++++----
 modules/dnn/src/layers/concat_layer.cpp      |  93 ++++++++-
 modules/dnn/src/layers/convolution_layer.cpp |  59 +++++-
 modules/dnn/test/test_halide_layers.cpp     |   8 +-
 5 files changed, 337 insertions(+), 62 deletions(-)

diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index f4369eef4e..8324fe9d05 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -152,6 +152,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
 
     class CV_EXPORTS ActivationLayer;
     class CV_EXPORTS BatchNormLayer;
+    class CV_EXPORTS ScaleLayer;
 
     /** @brief This interface class allows to build new Layers - are building blocks of networks.
      *
@@ -269,6 +270,19 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
          */
         virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer);
 
+        /**
+         * @brief Tries to attach to the layer the subsequent scaling layer, i.e. performs a partial case of the layer fusion.
+         * @param[in] layer The subsequent scaling layer.
+         *
+         * Returns true if the scaling layer has been attached successfully.
+         */
+        virtual bool setScale(const Ptr<ScaleLayer>& layer);
+
+        /**
+         * @brief Detaches all the layers attached to the particular layer.
+         */
+        virtual void unsetAttached();
+
         virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
                                      const int requiredOutputs,
                                      std::vector<MatShape> &outputs,
@@ -495,9 +509,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
 
         /** @overload */
         CV_WRAP void getLayerShapes(const std::vector<MatShape>& netInputShapes,
-                                            const int layerId,
-                                            std::vector<MatShape>* inLayerShapes,
-                                            std::vector<MatShape>* outLayerShapes) const;
+                                    const int layerId,
+                                    std::vector<MatShape>* inLayerShapes,
+                                    std::vector<MatShape>* outLayerShapes) const;
+
         /** @brief Computes FLOP for whole loaded model with specified input shapes.
          * @param netInputShapes vector of shapes for all net inputs.
         * @returns computed FLOP.
         */
@@ -507,10 +522,10 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
         CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
         /** @overload */
         CV_WRAP int64 getFLOPS(const int layerId,
-                                      const std::vector<MatShape>& netInputShapes) const;
+                               const std::vector<MatShape>& netInputShapes) const;
         /** @overload */
         CV_WRAP int64 getFLOPS(const int layerId,
-                                      const MatShape& netInputShape) const;
+                               const MatShape& netInputShape) const;
 
         /** @brief Returns list of types for layer used in model.
          * @param layersTypes output parameter for returning types.
          */
@@ -557,8 +572,13 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
         CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
                                           CV_OUT std::vector<int>& layerIds,
                                           CV_OUT std::vector<size_t>& weights,
                                           CV_OUT std::vector<size_t>& blobs) const;
-    private:
 
+        /** @brief Enables or disables layer fusion in the network.
+         * @param fusion true to enable the fusion, false to disable. The fusion is enabled by default.
+         */
+        CV_WRAP void enableFusion(bool fusion);
+
+    private:
         struct Impl;
         Ptr<Impl> impl;
     };
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index a371b18540..27433282db 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -464,29 +464,34 @@ public:
         }
     }
 
-    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst)
+    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool force)
     {
-        std::map<LayerPin, Mat>::iterator hostIt;
-        std::map<LayerPin, int>::iterator refIt;
-
-        const int targetTotal = total(shape);
         Mat bestBlob;
-        int bestBlobTotal = INT_MAX;
         LayerPin bestBlobPin;
-        for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
+
+        if( !force )
         {
-            refIt = refCounter.find(hostIt->first);
-            // Use only blobs that had references before because if not,
-            // it might be used as output.
-            if (refIt != refCounter.end() && refIt->second == 0)
+            std::map<LayerPin, Mat>::iterator hostIt;
+            std::map<LayerPin, int>::iterator refIt;
+
+            const int targetTotal = total(shape);
+            int bestBlobTotal = INT_MAX;
+
+            for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
             {
-                Mat& unusedBlob = hostIt->second;
-                if (unusedBlob.total() >= targetTotal &&
-                    unusedBlob.total() < bestBlobTotal)
+                refIt = refCounter.find(hostIt->first);
+                // Use only blobs that had references before because if not,
+                // it might be used as output.
+                if (refIt != refCounter.end() && refIt->second == 0)
                 {
-                    bestBlobPin = hostIt->first;
-                    bestBlob = unusedBlob;
-                    bestBlobTotal = unusedBlob.total();
+                    Mat& unusedBlob = hostIt->second;
+                    if (unusedBlob.total() >= targetTotal &&
+                        unusedBlob.total() < bestBlobTotal)
+                    {
+                        bestBlobPin = hostIt->first;
+                        bestBlob = unusedBlob;
+                        bestBlobTotal = unusedBlob.total();
+                    }
                 }
             }
         }
@@ -505,7 +510,8 @@ public:
     }
 
     void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
-                               std::vector<LayerPin>& pinsForInternalBlobs)
+                               std::vector<LayerPin>& pinsForInternalBlobs,
+                               bool maximizeReuse)
     {
         CV_TRACE_FUNCTION();
 
@@ -561,6 +567,7 @@ public:
         }
 
         std::map<int, std::vector<int> >::reverse_iterator it;
+        bool force = !maximizeReuse && ld.inputBlobsId.size() > 1;
         for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
         {
             for(int j = 0; j < it->second.size(); j++)
@@ -569,7 +576,7 @@ public:
                 if (total(shapes[index]))
                 {
                     LayerPin blobPin(ld.id, index);
-                    if (index < outShapes.size() && inPlace)
+                    if (index < outShapes.size() && inPlace && !force)
                     {
                         CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
                         ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
@@ -577,7 +584,7 @@ public:
                     }
                     else
                     {
-                        reuseOrCreate(shapes[index], blobPin, *blobs[index]);
+                        reuseOrCreate(shapes[index], blobPin, *blobs[index], force);
                     }
                 }
             }
@@ -628,6 +635,7 @@ struct Net::Impl
         lastLayerId = 1;
         netWasAllocated = false;
+        fusion = true;
         preferableBackend = DNN_BACKEND_DEFAULT;
         preferableTarget = DNN_TARGET_CPU;
     }
@@ -647,6 +655,7 @@ struct Net::Impl
     int lastLayerId;
 
     bool netWasAllocated;
+    bool fusion;
 
     void compileHalide()
     {
@@ -695,8 +704,7 @@ struct Net::Impl
             if( currLayer.empty() )
                 continue;
 
-            currLayer->setActivation(Ptr<ActivationLayer>());
-            currLayer->setBatchNorm(Ptr<BatchNormLayer>());
+            currLayer->unsetAttached();
 
             Ptr<PoolingLayer> poolingLayer = currLayer.dynamicCast<PoolingLayer>();
             if( !poolingLayer.empty() )
             {
                 poolingLayer->computeMaxIdx = true;
             }
         }
+        it = layers.find(0);
+        CV_Assert(it != layers.end());
+        it->second.skipFlags[DNN_BACKEND_DEFAULT] = true;
     }
-
     void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
     {
         CV_TRACE_FUNCTION();
@@ -783,13 +793,11 @@ struct Net::Impl
 
     LayerData& getLayerData(const DictValue &layerDesc)
     {
+        CV_Assert(layerDesc.isInt() || layerDesc.isString());
         if (layerDesc.isInt())
             return getLayerData(layerDesc.get<int>());
-        else if (layerDesc.isString())
+        else /*if (layerDesc.isString())*/
             return getLayerData(layerDesc.get<String>());
-
-        CV_Assert(layerDesc.isInt() || layerDesc.isString());
-        return *((LayerData*)NULL);
     }
 
     static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
@@ -1021,7 +1029,8 @@ struct Net::Impl
         CV_Assert(layerShapesIt != layersShapes.end());
 
         std::vector<LayerPin> pinsForInternalBlobs;
-        blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
+        bool maximizeReuse = preferableBackend == DNN_BACKEND_HALIDE;
+        blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, maximizeReuse);
 
         Ptr<Layer> layerPtr = ld.getLayerInstance();
         {
@@ -1044,8 +1053,17 @@ struct Net::Impl
         ld.flag = 1;
     }
 
+#if 0
+#define printf_(args) printf args
+#else
+#define printf_(args)
+#endif
+
     void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
     {
+        if( !fusion || preferableBackend == DNN_BACKEND_HALIDE )
+            return;
+
         CV_TRACE_FUNCTION();
 
         // scan through all the layers. If there is convolution layer followed by the activation layer,
@@ -1060,11 +1078,17 @@ struct Net::Impl
             LayerData& ld = layers[lid];
             if( ld.skipFlags[DNN_BACKEND_DEFAULT] )
             {
+                printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
                 continue;
             }
+            printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
             if( ld.consumers.size() == 0 )
                 outnames.push_back(ld.layerInstance->name);
 
+            // the optimization #1. try to fuse batch norm, scaling and/or activation layers
+            // with the current layer if they follow it. Normally, they are fused with the convolution layer,
+            // but some of them (like activation) may be fused with fully-connected, elemwise (+) and
+            // some other layers.
             Ptr<Layer>& currLayer = ld.layerInstance;
             if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
             {
@@ -1078,10 +1102,29 @@ struct Net::Impl
                     nextData = 0;
                     if( currLayer->setBatchNorm(nextBNormLayer) )
                     {
+                        printf_(("\tfused with %s\n", nextBNormLayer->name.c_str()));
                         bnormData->skipFlags[DNN_BACKEND_DEFAULT] = true;
                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
                         if( bnormData->consumers.size() == 1 )
                             nextData = &layers[bnormData->consumers[0].lid];
+                        lpNext = LayerPin(bnormData->consumers[0].lid, 0);
+                    }
+                }
+
+                Ptr<ScaleLayer> nextScaleLayer;
+                if( nextData )
+                    nextScaleLayer = nextData->layerInstance.dynamicCast<ScaleLayer>();
+                if( !nextScaleLayer.empty() && pinsToKeep.count(lpNext) == 0 )
+                {
+                    LayerData* scaleData = nextData;
+                    nextData = 0;
+                    if( currLayer->setScale(nextScaleLayer) )
+                    {
+                        printf_(("\tfused with %s\n", nextScaleLayer->name.c_str()));
+                        scaleData->skipFlags[DNN_BACKEND_DEFAULT] = true;
+                        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+                        if( scaleData->consumers.size() == 1 )
+                            nextData = &layers[scaleData->consumers[0].lid];
                     }
                 }
@@ -1091,11 +1134,16 @@ struct Net::Impl
                 if( !nextActivLayer.empty() && currLayer->setActivation(nextActivLayer) )
                 {
-                    //printf("successfully merged %s and %s\n", currLayer->name.c_str(), nextActivLayer->name.c_str());
+                    printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
                     nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
                     ld.outputBlobs = layers[lpNext.lid].outputBlobs;
                 }
             }
+
+            // the optimization #2. if there is no layer that takes the max pooling layer's computed
+            // max indices (and only some semantic segmentation networks might need this;
+            // many others only take the maximum values), then we switch the max pooling
+            // layer to the faster operating mode.
             Ptr<PoolingLayer> poolingLayer = ld.layerInstance.dynamicCast<PoolingLayer>();
             if( !poolingLayer.empty() && !ld.consumers.empty() )
             {
@@ -1108,7 +1156,71 @@ struct Net::Impl
                 if( i >= nconsumers )
                 {
                     poolingLayer->computeMaxIdx = false;
-                    //printf("simplified pooling layer %s\n", poolingLayer->name.c_str());
+                    printf_(("\tsimplified pooling layer %s\n", poolingLayer->name.c_str()));
+                }
+            }
+
+            // the optimization #3. if there is a concat layer that concatenates channels
+            // from the inputs together (i.e. axis == 1) then we make the inputs of
+            // the concat layer write to the concatenation output buffer directly
+            // (and so we eliminate the concatenation layer, because the channels
+            // are concatenated implicitly).
+            Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
+            if( !concatLayer.empty() && concatLayer->axis == 1 &&
+                ld.outputBlobs.size() == 1 )
+            {
+                Mat& output = ld.outputBlobs[0];
+
+                // TODO: in general, this optimization can always be done, but
+                // many layers currently check that the input/output blobs are
+                // continuous arrays. Unfortunately, this is not true when
+                // the concatenation optimization is applied with batch_size > 1.
+                // so, for now, we only apply this optimization in the most popular
+                // case batch_size == 1.
+                if( output.dims == 4 && output.size[0] == 1 )
+                {
+                    size_t i, ninputs = ld.inputBlobsId.size();
+                    std::vector<LayerPin> realinputs(ninputs);
+                    for( i = 0; i < ninputs; i++ )
+                    {
+                        LayerPin pin = ld.inputBlobsId[i];
+                        LayerData* inp_i_data = &layers[pin.lid];
+                        while(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT] &&
+                              inp_i_data->inputBlobsId.size() == 1)
+                        {
+                            pin = inp_i_data->inputBlobsId[0];
+                            inp_i_data = &layers[pin.lid];
+                        }
+                        printf_(("\treal input for %s is %s\n",
+                                 layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
+                                 inp_i_data->getLayerInstance()->name.c_str()));
+
+                        if(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT])
+                            break;
+                        realinputs[i] = pin;
+                    }
+
+                    if( i >= ninputs )
+                    {
+                        Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
+                        int ofs = 0;
+                        for( i = 0; i < ninputs; i++ )
+                        {
+                            LayerPin pin = realinputs[i];
+                            LayerData* inp_i_data = &layers[pin.lid];
+                            int channels_i = ld.inputBlobs[i]->size[1];
+                            chrange[1] = Range(ofs, ofs + channels_i);
+                            printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
+                                     pin.oid, ofs, ofs + channels_i));
+                            ofs += channels_i;
+                            Mat output_slice = output(chrange);
+                            Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
+                            CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
+                            curr_output = output_slice;
+                        }
+                        ld.skipFlags[DNN_BACKEND_DEFAULT] = true;
+                        printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
+                    }
                 }
             }
         }
@@ -1458,9 +1570,12 @@ void Net::setPreferableBackend(int backendId)
     CV_TRACE_FUNCTION();
     CV_TRACE_ARG(backendId);
 
-    impl->netWasAllocated = impl->netWasAllocated &&
-                            impl->preferableBackend == backendId;
-    impl->preferableBackend = backendId;
+    if( impl->preferableBackend != backendId )
+    {
+        impl->preferableBackend = backendId;
+        impl->netWasAllocated = false;
+        impl->clear();
+    }
 }
 
 void Net::setPreferableTarget(int targetId)
 {
     CV_TRACE_FUNCTION();
     CV_TRACE_ARG(targetId);
 
-    impl->netWasAllocated = impl->netWasAllocated &&
-                            impl->preferableTarget == targetId;
-    impl->preferableTarget = targetId;
+    if( impl->preferableTarget != targetId )
+    {
+        impl->preferableTarget = targetId;
+        impl->netWasAllocated = false;
+        impl->clear();
+    }
 }
 
 void Net::setInputsNames(const std::vector<String> &inputBlobNames)
@@ -1825,6 +1943,16 @@ void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector
                          weights, blobs);
 }
 
+void Net::enableFusion(bool fusion)
+{
+    if( impl->fusion != fusion )
+    {
+        impl->fusion = fusion;
+        impl->netWasAllocated = false;
+        impl->clear();
+    }
+}
+
 void Net::setHalideScheduler(const String& scheduler)
 {
     CV_TRACE_FUNCTION();
@@ -1950,6 +2078,13 @@ Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
 
 bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
 bool Layer::setBatchNorm(const Ptr<BatchNormLayer>&) { return false; }
+bool Layer::setScale(const Ptr<ScaleLayer>&) { return false; }
+void Layer::unsetAttached()
+{
+    setActivation(Ptr<ActivationLayer>());
+    setBatchNorm(Ptr<BatchNormLayer>());
+    setScale(Ptr<ScaleLayer>());
+}
 
 template <typename T>
 static void vecToPVec(const std::vector<Ptr<T> > &v, std::vector<T*> &pv)
diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp
index f2d6d4e93c..662be1d096 100644
--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@@ -94,6 +94,78 @@ public:
                backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1;  // By channels
     }
 
+    class ChannelConcatInvoker : public ParallelLoopBody
+    {
+    public:
+        std::vector<Mat*>* inputs;
+        Mat* output;
+        int nstripes;
+        std::vector<const float*> chptrs;
+
+        static void run(std::vector<Mat*>& inputs, Mat& output, int nstripes)
+        {
+            ChannelConcatInvoker cc;
+            cc.inputs = &inputs;
+            cc.output = &output;
+            cc.nstripes = nstripes;
+
+            size_t i, ninputs = inputs.size();
+            int nchannels = 0, batchsz = output.size[0];
+            for( i = 0; i < ninputs; i++ )
+            {
+                Mat& inp = *inputs[i];
+                CV_Assert( inp.isContinuous() && inp.type() == CV_32F &&
+                           inp.dims == 4 && inp.size[0] == output.size[0] &&
+                           inp.size[2] == output.size[2] &&
+                           inp.size[3] == output.size[3] );
+                nchannels += inp.size[1];
+            }
+            CV_Assert( nchannels == output.size[1] );
+            CV_Assert( output.isContinuous() && output.type() == CV_32F );
+
+            cc.chptrs.resize(nchannels*batchsz);
+
+            int ofs = 0;
+            for( i = 0; i < ninputs; i++)
+            {
+                Mat& inp = *inputs[i];
+                for( int j = 0; j < batchsz; j++ )
+                    for( int k = 0; k < inp.size[1]; k++ )
+                    {
+                        const float* ptr = inp.ptr<float>(j, k);
+                        cc.chptrs[ofs + j*nchannels + k] = ptr;
+                    }
+                ofs += inp.size[1];
+            }
+
+            parallel_for_(Range(0, nstripes), cc, nstripes);
+        }
+
+        ChannelConcatInvoker() {}
+
+        void operator()(const Range& r) const
+        {
+            size_t planeSize = (size_t)output->size[2]*output->size[3];
+            size_t nch = chptrs.size();
+            size_t total = nch*planeSize;
+            size_t stripeSize = (total + nstripes - 1)/nstripes;
+            size_t stripeStart = r.start*stripeSize;
+            size_t stripeEnd = std::min(total, r.end*stripeSize);
+            const float** ptrs = (const float**)&chptrs[0];
+            float* outptr = output->ptr<float>();
+            size_t blockSize0 = 1 << 16;
+
+            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
+            {
+                size_t ch = ofs0/planeSize;
+                size_t ofs = ofs0 - ch*planeSize;
+                size_t blockSize = std::min(blockSize0, planeSize - ofs);
+                memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
+                ofs0 += blockSize;
+            }
+        }
+    };
+
     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
     {
         CV_TRACE_FUNCTION();
@@ -101,14 +173,23 @@ public:
         int cAxis = clamp(axis, inputs[0]->dims);
         Mat& outMat = outputs[0];
 
-        std::vector<Range> ranges(outputs[0].dims, Range::all());
-
-        ranges[cAxis].start = 0;
-        for (size_t i = 0; i < inputs.size(); i++)
+        if( cAxis == 1 && outMat.dims == 4 )
+        {
+            int nstripes = getNumThreads();
+            ChannelConcatInvoker::run(inputs, outMat, nstripes);
+        }
+        else
         {
-            ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
-            inputs[i]->copyTo(outMat(&ranges[0]));
-            ranges[cAxis].start = ranges[cAxis].end;
+            std::vector<Range> ranges(outputs[0].dims, Range::all());
+
+            ranges[cAxis].start = 0;
+            for (size_t i = 0; i < inputs.size(); i++)
+            {
+                ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
+                inputs[i]->copyTo(outMat(&ranges[0]));
+                ranges[cAxis].start = ranges[cAxis].end;
+            }
         }
     }
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 6e09c8ca98..3dd63a3c36 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -148,6 +148,7 @@ public:
     std::vector<float> reluslope;
     Ptr<ActivationLayer> activ;
    Ptr<BatchNormLayer> bnorm;
+    Ptr<ScaleLayer> scaleLayer;
 
     MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
     {
@@ -202,6 +203,9 @@ public:
 
     bool setBatchNorm(const Ptr<BatchNormLayer>& layer )
     {
+        // for now the scale layer followed by the batch norm cannot be fused, only vice versa.
+        if( !scaleLayer.empty() )
+            return false;
         bnorm = layer;
         // we will need to re-compute the weights with the batch
         // norm coefficients taken into account
         weightsMat.release();
         return !bnorm.empty();
     }
 
+    bool setScale(const Ptr<ScaleLayer>& layer)
+    {
+        scaleLayer = layer;
+        // we will need to re-compute the weights with the scaling
+        // coefficients taken into account
+        weightsMat.release();
+        return !scaleLayer.empty();
+    }
+
     virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
     {
 #ifdef HAVE_HALIDE
@@ -678,32 +691,56 @@ public:
                     biasvec[k] = biasMat.at<float>(k);
             }
 
-            if( !bnorm.empty() )
+            if( !bnorm.empty() || !scaleLayer.empty() )
             {
-                Mat scale, shift;
-                bnorm->getScaleShift(scale, shift);
+                Mat scale, shift, scale2, shift2;
+                const float *scaleptr = 0, *shiftptr = 0;
+                const float *scaleptr2 = 0, *shiftptr2 = 0;
 
-                CV_Assert( scale.isContinuous() && shift.isContinuous() &&
-                           scale.type() == CV_32F && shift.type() == CV_32F &&
-                           scale.total() == (size_t)outCn &&
-                           shift.total() == (size_t)outCn );
+                if( !bnorm.empty() )
+                {
+                    bnorm->getScaleShift(scale, shift);
+                    CV_Assert( scale.isContinuous() && shift.isContinuous() &&
+                               scale.type() == CV_32F && shift.type() == CV_32F &&
+                               scale.total() == (size_t)outCn &&
+                               shift.total() == (size_t)outCn );
+                    scaleptr = scale.ptr<float>();
+                    shiftptr = shift.ptr<float>();
+                }
+                if( !scaleLayer.empty() )
+                {
+                    scale2 = scaleLayer->blobs[0];
+                    CV_Assert( scale2.isContinuous() && scale2.type() == CV_32F &&
+                               scale2.total() == (size_t)outCn );
+                    scaleptr2 = scale2.ptr<float>();
+                    if( scaleLayer->hasBias )
+                    {
+                        shift2 = scaleLayer->blobs[1];
+                        CV_Assert( shift2.isContinuous() && shift2.type() == CV_32F &&
+                                   shift2.total() == (size_t)outCn );
+                        shiftptr2 = shift2.ptr<float>();
+                    }
+                }
 
                 for( int i = 0; i < outCn; i++ )
                 {
-                    float s = scale.at<float>(i);
-                    float delta = shift.at<float>(i);
+                    float s1 = scaleptr ? scaleptr[i] : 1.f;
+                    float delta1 = shiftptr ? shiftptr[i] : 0.f;
+                    float s2 = scaleptr2 ? scaleptr2[i] : 1.f;
+                    float delta2 = shiftptr2 ? shiftptr2[i] : 0.f;
                     float* w_i = weightsMat.ptr<float>(i);
                     int j, wcols = weightsMat.cols;
 
                     for( j = 0; j < wcols; j++ )
-                        w_i[j] *= s;
+                        w_i[j] *= (s1*s2);
 
-                    biasvec[i] = biasvec[i]*s + delta;
+                    biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
                 }
             }
 
             biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
         }
 
+        reluslope.clear();
         if( activ )
         {
             Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp
index b2edf3af93..6801a7cba7 100644
--- a/modules/dnn/test/test_halide_layers.cpp
+++ b/modules/dnn/test/test_halide_layers.cpp
@@ -517,7 +517,8 @@ TEST_P(Concat, Accuracy)
 
     Net net;
 
-    std::vector<int> convLayerIds(numChannels.channels);
+    std::vector<int> convLayerIds;
+    convLayerIds.reserve(numChannels.channels);
     for (int i = 0, n = numChannels.channels; i < n; ++i)
     {
         if (!numChannels[i])
@@ -537,8 +538,9 @@ TEST_P(Concat, Accuracy)
         convParam.name = ss.str();
         convParam.blobs.push_back(weights);
 
-        convLayerIds[i] = net.addLayer(convParam.name, convParam.type, convParam);
-        net.connect(0, 0, convLayerIds[i], 0);
+        int layerId = net.addLayer(convParam.name, convParam.type, convParam);
+        convLayerIds.push_back(layerId);
+        net.connect(0, 0, layerId, 0);
     }
 
     LayerParams concatParam;
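
---

Note for reviewers (not part of the patch): a minimal standalone sketch of the idea behind optimization #3. With batch size 1 and axis == 1, a channel range of the concat output is a continuous view, so each input layer can write its result straight into that view and the Concat layer itself can be skipped. The 1x5x4x4 shape below is made up for illustration.

    #include <opencv2/core.hpp>
    #include <cstdio>

    using namespace cv;

    int main()
    {
        // NCHW output of a hypothetical concat of a 2-channel and a 3-channel input
        const int dims[] = {1, 5, 4, 4};
        Mat output(4, dims, CV_32F, Scalar(0));

        // carve out the first input's channel slice, like chrange[1] in fuseLayers
        Range chrange[] = { Range::all(), Range(0, 2), Range::all(), Range::all() };
        Mat slice = output(chrange);

        // with size[0] == 1 the slice is continuous, which is what the patch asserts
        printf("slice continuous: %d\n", (int)slice.isContinuous());

        // writing through the slice fills 'output' directly - no copy at concat time
        slice.setTo(Scalar(1.0f));
        printf("output(0,0,0,0) = %.1f\n", output.ptr<float>(0, 0)[0]);
        return 0;
    }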
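And a sketch of how the new Net::enableFusion() switch can be exercised end to end when chasing fusion-related regressions like the Halide test failures above; the model files, the image and the 224x224 geometry are placeholders, not something this patch ships:

    #include <opencv2/dnn.hpp>
    #include <opencv2/imgcodecs.hpp>
    #include <cstdio>

    using namespace cv;
    using namespace cv::dnn;

    int main()
    {
        // hypothetical Caffe model; nets with conv+bn+scale and concat layers benefit most
        Net net = readNetFromCaffe("model.prototxt", "model.caffemodel");
        Mat blob = blobFromImage(imread("image.jpg"), 1.0, Size(224, 224));

        net.setInput(blob);
        Mat outFused = net.forward().clone();  // fusion is enabled by default

        net.enableFusion(false);  // invalidates the allocated net; it is set up again on the next forward()
        net.setInput(blob);
        Mat outPlain = net.forward();

        // the fused and non-fused paths should agree up to float rounding
        printf("max abs diff: %g\n", norm(outFused, outPlain, NORM_INF));
        return 0;
    }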