@@ -464,29 +464,34 @@ public:
}
}
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst)
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool force)
{
    std::map<LayerPin, Mat>::iterator hostIt;
    std::map<LayerPin, int>::iterator refIt;
    const int targetTotal = total(shape);
    Mat bestBlob;
    int bestBlobTotal = INT_MAX;
    LayerPin bestBlobPin;
    for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
    if (!force)
    {
        refIt = refCounter.find(hostIt->first);
        // Use only blobs that had references before because if not,
        // it might be used as output.
        if (refIt != refCounter.end() && refIt->second == 0)
        std::map<LayerPin, Mat>::iterator hostIt;
        std::map<LayerPin, int>::iterator refIt;
        const int targetTotal = total(shape);
        int bestBlobTotal = INT_MAX;
        for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
        {
            Mat& unusedBlob = hostIt->second;
            if (unusedBlob.total() >= targetTotal &&
                unusedBlob.total() < bestBlobTotal)
            refIt = refCounter.find(hostIt->first);
            // Use only blobs that had references before because if not,
            // it might be used as output.
            if (refIt != refCounter.end() && refIt->second == 0)
            {
                bestBlobPin = hostIt->first;
                bestBlob = unusedBlob;
                bestBlobTotal = unusedBlob.total();
                Mat& unusedBlob = hostIt->second;
                if (unusedBlob.total() >= targetTotal &&
                    unusedBlob.total() < bestBlobTotal)
                {
                    bestBlobPin = hostIt->first;
                    bestBlob = unusedBlob;
                    bestBlobTotal = unusedBlob.total();
                }
            }
        }
    }
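    // When force is true the reuse search above is skipped entirely, so no existing
    // host blob is picked for reuse and the requested blob ends up with its own memory.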
@@ -505,7 +510,8 @@ public:
}
void allocateBlobsForLayer(LayerData& ld, const LayerShapes& layerShapes,
                           std::vector<LayerPin>& pinsForInternalBlobs)
                           std::vector<LayerPin>& pinsForInternalBlobs,
                           bool maximizeReuse)
{
    CV_TRACE_FUNCTION();
@@ -561,6 +567,7 @@ public:
}
std::map<int, std::vector<int> >::reverse_iterator it;
bool force = !maximizeReuse && ld.inputBlobsId.size() > 1;
for (it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
{
    for (int j = 0; j < it->second.size(); j++)
@@ -569,7 +576,7 @@ public:
if (total(shapes[index]))
{
    LayerPin blobPin(ld.id, index);
    if (index < outShapes.size() && inPlace)
    if (index < outShapes.size() && inPlace && !force)
    {
        CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
        ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
@@ -577,7 +584,7 @@ public:
    }
    else
    {
        reuseOrCreate(shapes[index], blobPin, *blobs[index]);
        reuseOrCreate(shapes[index], blobPin, *blobs[index], force);
    }
}
}
@@ -628,6 +635,7 @@ struct Net::Impl
lastLayerId = 1;
netWasAllocated = false;
fusion = true;
preferableBackend = DNN_BACKEND_DEFAULT;
preferableTarget = DNN_TARGET_CPU;
}
@@ -647,6 +655,7 @@ struct Net::Impl
int lastLayerId;
bool netWasAllocated;
bool fusion;

void compileHalide()
{
@@ -695,8 +704,7 @@ struct Net::Impl
if (currLayer.empty())
    continue;
currLayer->setActivation(Ptr<ActivationLayer>());
currLayer->setBatchNorm(Ptr<BatchNormLayer>());
currLayer->unsetAttached();
Ptr<PoolingLayer> poolingLayer = currLayer.dynamicCast<PoolingLayer>();
if (!poolingLayer.empty())
@@ -704,9 +712,11 @@ struct Net::Impl
poolingLayer->computeMaxIdx = true;
}
}
it = layers.find(0);
CV_Assert(it != layers.end());
it->second.skipFlags[DNN_BACKEND_DEFAULT] = true;
}
void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
{
    CV_TRACE_FUNCTION();
@@ -783,13 +793,11 @@ struct Net::Impl
LayerData& getLayerData(const DictValue& layerDesc)
{
    CV_Assert(layerDesc.isInt() || layerDesc.isString());
    if (layerDesc.isInt())
        return getLayerData(layerDesc.get<int>());
    else if (layerDesc.isString())
    else /*if (layerDesc.isString())*/
        return getLayerData(layerDesc.get<String>());
    CV_Assert(layerDesc.isInt() || layerDesc.isString());
    return *((LayerData*)NULL);
}

static void addLayerInput(LayerData& ld, int inNum, LayerPin from)
@@ -1021,7 +1029,8 @@ struct Net::Impl
CV_Assert(layerShapesIt != layersShapes.end());
std::vector<LayerPin> pinsForInternalBlobs;
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
bool maximizeReuse = preferableBackend == DNN_BACKEND_HALIDE;
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, maximizeReuse);
Ptr<Layer> layerPtr = ld.getLayerInstance();
{
@@ -1044,8 +1053,17 @@ struct Net::Impl
ld.flag = 1;
}
#if 0
#define printf_(args) printf args
#else
#define printf_(args)
#endif
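// Debug aid: flip the "#if 0" above to "#if 1" to make the printf_((...)) statements
// below report which layers get fused, skipped or optimized out.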
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
    if( !fusion || preferableBackend == DNN_BACKEND_HALIDE )
        return;

    CV_TRACE_FUNCTION();
// scan through all the layers. If there is a convolution layer followed by the activation layer,
@@ -1060,11 +1078,17 @@ struct Net::Impl
LayerData& ld = layers[lid];
if( ld.skipFlags[DNN_BACKEND_DEFAULT] )
{
    printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
    continue;
}
printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
if (ld.consumers.size() == 0)
    outnames.push_back(ld.layerInstance->name);
// the optimization #1. try to fuse batch norm, scaling and/or activation layers
// with the current layer if they follow it. Normally, they are fused with the convolution layer,
// but some of them (like activation) may be fused with fully-connected, elemwise (+) and
// some other layers.
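// For example, a Convolution -> BatchNorm -> Scale -> ReLU chain collapses into a single
// Convolution node whose forward pass applies all four operations.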
Ptr<Layer>& currLayer = ld.layerInstance;
if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
{
@@ -1078,10 +1102,29 @@ struct Net::Impl
    nextData = 0;
    if (currLayer->setBatchNorm(nextBNormLayer))
    {
        printf_(("\tfused with %s\n", nextBNormLayer->name.c_str()));
        bnormData->skipFlags[DNN_BACKEND_DEFAULT] = true;
        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
        if (bnormData->consumers.size() == 1)
            nextData = &layers[bnormData->consumers[0].lid];
        lpNext = LayerPin(bnormData->consumers[0].lid, 0);
    }
}

Ptr<ScaleLayer> nextScaleLayer;
if (nextData)
    nextScaleLayer = nextData->layerInstance.dynamicCast<ScaleLayer>();
if (!nextScaleLayer.empty() && pinsToKeep.count(lpNext) == 0)
{
    LayerData* scaleData = nextData;
    nextData = 0;
    if (currLayer->setScale(nextScaleLayer))
    {
        printf_(("\tfused with %s\n", nextScaleLayer->name.c_str()));
        scaleData->skipFlags[DNN_BACKEND_DEFAULT] = true;
        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
        if (scaleData->consumers.size() == 1)
            nextData = &layers[scaleData->consumers[0].lid];
    }
}
@@ -1091,11 +1134,16 @@ struct Net::Impl
    if (!nextActivLayer.empty() && currLayer->setActivation(nextActivLayer))
    {
        //printf("successfully merged %s and %s\n", currLayer->name.c_str(), nextActivLayer->name.c_str());
        printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
        nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
        ld.outputBlobs = layers[lpNext.lid].outputBlobs;
    }
}
// the optimization #2. if there is no layer that takes the max pooling layer's computed
// max indices (and only some semantic segmentation networks might need this;
// many others only take the maximum values), then we switch the max pooling
// layer to the faster operating mode.
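// For example, when no consumer ever reads the max indices output, computeMaxIdx is set
// to false and the indices are simply not computed during the forward pass.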
Ptr<PoolingLayer> poolingLayer = ld.layerInstance.dynamicCast<PoolingLayer>();
if( !poolingLayer.empty() && !ld.consumers.empty() )
{
@@ -1108,7 +1156,71 @@ struct Net::Impl
    if( i >= nconsumers )
    {
        poolingLayer->computeMaxIdx = false;
        //printf("simplified pooling layer %s\n", poolingLayer->name.c_str());
        printf_(("\tsimplified pooling layer %s\n", poolingLayer->name.c_str()));
    }
}
// the optimization #3. if there is a concat layer that concatenates channels
// from the inputs together (i.e. axis == 1), then we make the inputs of
// the concat layer write to the concatenation output buffer
// (and so we eliminate the concatenation layer, because the channels
// are concatenated implicitly).
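// For example, if conv_a (Ca channels) and conv_b (Cb channels) feed a Concat over axis 1,
// conv_a writes channels [0, Ca) and conv_b writes channels [Ca, Ca + Cb) of the shared
// output blob, and the Concat layer itself is marked as skipped.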
Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
if( !concatLayer.empty() && concatLayer->axis == 1 &&
    ld.outputBlobs.size() == 1 )
{
    Mat& output = ld.outputBlobs[0];
    // TODO: in general, this optimization can always be done, but
    // many layers currently check that the input/output blobs are
    // continuous arrays. Unfortunately, this is not true when
    // the concatenation optimization is applied with batch_size > 1.
    // so, for now, we only apply this optimization in the most popular
    // case batch_size == 1.
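    // (With batch_size > 1 each per-input channel range of the NCHW output is split across
    // the batch dimension, so the slice handed to the producing layer would not be continuous.)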
    if( output.dims == 4 && output.size[0] == 1 )
    {
        size_t i, ninputs = ld.inputBlobsId.size();
        std::vector<LayerPin> realinputs(ninputs);
        for( i = 0; i < ninputs; i++ )
        {
            LayerPin pin = ld.inputBlobsId[i];
            LayerData* inp_i_data = &layers[pin.lid];
            while(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT] &&
                  inp_i_data->inputBlobsId.size() == 1)
            {
                pin = inp_i_data->inputBlobsId[0];
                inp_i_data = &layers[pin.lid];
            }
            printf_(("\treal input for %s is %s\n",
                     layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
                     inp_i_data->getLayerInstance()->name.c_str()));
            if (inp_i_data->skipFlags[DNN_BACKEND_DEFAULT])
                break;
            realinputs[i] = pin;
        }
        if( i >= ninputs )
        {
            Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
            int ofs = 0;
            for( i = 0; i < ninputs; i++ )
            {
                LayerPin pin = realinputs[i];
                LayerData* inp_i_data = &layers[pin.lid];
                int channels_i = ld.inputBlobs[i]->size[1];
                chrange[1] = Range(ofs, ofs + channels_i);
                printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
                         pin.oid, ofs, ofs + channels_i));
                ofs += channels_i;
                Mat output_slice = output(chrange);
                Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
                CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
                curr_output = output_slice;
            }
            ld.skipFlags[DNN_BACKEND_DEFAULT] = true;
            printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
        }
    }
}
}
@@ -1458,9 +1570,12 @@ void Net::setPreferableBackend(int backendId)
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG(backendId);

    impl->netWasAllocated = impl->netWasAllocated &&
                            impl->preferableBackend == backendId;
    impl->preferableBackend = backendId;
    if( impl->preferableBackend != backendId )
    {
        impl->preferableBackend = backendId;
        impl->netWasAllocated = false;
        impl->clear();
    }
}

void Net::setPreferableTarget(int targetId)
@@ -1468,9 +1583,12 @@ void Net::setPreferableTarget(int targetId)
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG(targetId);

    impl->netWasAllocated = impl->netWasAllocated &&
                            impl->preferableTarget == targetId;
    impl->preferableTarget = targetId;
    if( impl->preferableTarget != targetId )
    {
        impl->preferableTarget = targetId;
        impl->netWasAllocated = false;
        impl->clear();
    }
}

void Net::setInputsNames(const std::vector<String>& inputBlobNames)
@@ -1825,6 +1943,16 @@ void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>&
                          weights, blobs);
}

void Net::enableFusion(bool fusion)
{
    if( impl->fusion != fusion )
    {
        impl->fusion = fusion;
        impl->netWasAllocated = false;
        impl->clear();
    }
}

void Net::setHalideScheduler(const String& scheduler)
{
    CV_TRACE_FUNCTION();
@@ -1950,6 +2078,13 @@ Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
bool Layer::setBatchNorm(const Ptr<BatchNormLayer>&) { return false; }
bool Layer::setScale(const Ptr<ScaleLayer>&) { return false; }
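// Detaches any layers previously fused into this one (activation, batch norm, scale).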
void Layer::unsetAttached()
{
    setActivation(Ptr<ActivationLayer>());
    setBatchNorm(Ptr<BatchNormLayer>());
    setScale(Ptr<ScaleLayer>());
}

template <typename T>
static void vecToPVec(const std::vector<T>& v, std::vector<T*>& pv)