Merge pull request #9058 from alalek:dnn_minor_fixes

pull/9156/merge
Alexander Alekhin 7 years ago
commit 4238add35b
  1. 2
      modules/dnn/CMakeLists.txt
  2. 2
      modules/dnn/include/opencv2/dnn.hpp
  3. 25
      modules/dnn/include/opencv2/dnn/all_layers.hpp
  4. 198
      modules/dnn/include/opencv2/dnn/dnn.hpp
  5. 3
      modules/dnn/src/dnn.cpp
  6. 8
      modules/dnn/src/layers/convolution_layer.cpp
  7. 713
      modules/dnn/src/layers/detection_output_layer.cpp
  8. 12
      modules/dnn/src/layers/fully_connected_layer.cpp
  9. 54
      modules/dnn/src/layers/layers_common.avx.cpp
  10. 51
      modules/dnn/src/layers/layers_common.avx2.cpp
  11. 30
      modules/dnn/src/layers/layers_common.hpp
  12. 48
      modules/dnn/src/layers/layers_common.simd.hpp

@ -9,6 +9,8 @@ endif()
set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
ocv_add_dispatched_file("layers/layers_common" AVX AVX2)
ocv_add_module(dnn opencv_core opencv_imgproc WRAP python matlab java)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-shadow -Wno-parentheses -Wmaybe-uninitialized -Wsign-promo
-Wmissing-declarations -Wmissing-prototypes

@ -44,7 +44,7 @@
// This is an umbrella header to include into your project.
// We are free to change headers layout in dnn subfolder, so please include
// this header for future compartibility
// this header for future compatibility
/** @defgroup dnn Deep Neural Network module

@ -152,7 +152,19 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
int outputNameToIndex(String outputName);
};
//! Classical recurrent layer
/** @brief Classical recurrent layer
Accepts two inputs @f$x_t@f$ and @f$h_{t-1}@f$ and compute two outputs @f$o_t@f$ and @f$h_t@f$.
- input: should contain packed input @f$x_t@f$.
- output: should contain output @f$o_t@f$ (and @f$h_t@f$ if setProduceHiddenOutput() is set to true).
input[0] should have shape [`T`, `N`, `data_dims`] where `T` and `N` is number of timestamps and number of independent samples of @f$x_t@f$ respectively.
output[0] will have shape [`T`, `N`, @f$N_o@f$], where @f$N_o@f$ is number of rows in @f$ W_{xo} @f$ matrix.
If setProduceHiddenOutput() is set to true then @p output[1] will contain a Mat with shape [`T`, `N`, @f$N_h@f$], where @f$N_h@f$ is number of rows in @f$ W_{hh} @f$ matrix.
*/
class CV_EXPORTS RNNLayer : public Layer
{
public:
@ -180,17 +192,6 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
virtual void setProduceHiddenOutput(bool produce = false) = 0;
/** Accepts two inputs @f$x_t@f$ and @f$h_{t-1}@f$ and compute two outputs @f$o_t@f$ and @f$h_t@f$.
@param input should contain packed input @f$x_t@f$.
@param output should contain output @f$o_t@f$ (and @f$h_t@f$ if setProduceHiddenOutput() is set to true).
@p input[0] should have shape [`T`, `N`, `data_dims`] where `T` and `N` is number of timestamps and number of independent samples of @f$x_t@f$ respectively.
@p output[0] will have shape [`T`, `N`, @f$N_o@f$], where @f$N_o@f$ is number of rows in @f$ W_{xo} @f$ matrix.
If setProduceHiddenOutput() is set to true then @p output[1] will contain a Mat with shape [`T`, `N`, @f$N_h@f$], where @f$N_h@f$ is number of rows in @f$ W_{hh} @f$ matrix.
*/
};
class CV_EXPORTS BaseConvolutionLayer : public Layer

@ -371,28 +371,28 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
/** @brief Runs forward pass to compute output of layer with name @p outputName.
* @param outputName name for layer which output is needed to get
* @return blob for first output of specified layer.
* @details By default runs forward pass for the whole network.
*/
* @details By default runs forward pass for the whole network.
*/
CV_WRAP Mat forward(const String& outputName = String());
/** @brief Runs forward pass to compute output of layer with name @p outputName.
* @param outputBlobs contains all output blobs for specified layer.
* @param outputName name for layer which output is needed to get
* @details If @p outputName is empty, runs forward pass for the whole network.
*/
* @details If @p outputName is empty, runs forward pass for the whole network.
*/
CV_WRAP void forward(std::vector<Mat>& outputBlobs, const String& outputName = String());
/** @brief Runs forward pass to compute outputs of layers listed in @p outBlobNames.
* @param outputBlobs contains blobs for first outputs of specified layers.
* @param outBlobNames names for layers which outputs are needed to get
*/
*/
CV_WRAP void forward(std::vector<Mat>& outputBlobs,
const std::vector<String>& outBlobNames);
/** @brief Runs forward pass to compute outputs of layers listed in @p outBlobNames.
* @param outputBlobs contains all output blobs for each layer specified in @p outBlobNames.
* @param outBlobNames names for layers which outputs are needed to get
*/
*/
CV_WRAP void forward(std::vector<std::vector<Mat> >& outputBlobs,
const std::vector<String>& outBlobNames);
@ -460,103 +460,103 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
*/
CV_WRAP std::vector<int> getUnconnectedOutLayers() const;
/** @brief Returns input and output shapes for all layers in loaded model;
* preliminary inferencing isn't necessary.
* @param netInputShapes shapes for all input blobs in net input layer.
* @param layersIds output parameter for layer IDs.
* @param inLayersShapes output parameter for input layers shapes;
* order is the same as in layersIds
* @param outLayersShapes output parameter for output layers shapes;
* order is the same as in layersIds
*/
CV_WRAP void getLayersShapes(const std::vector<MatShape>& netInputShapes,
std::vector<int>* layersIds,
std::vector<std::vector<MatShape> >* inLayersShapes,
std::vector<std::vector<MatShape> >* outLayersShapes) const;
/** @overload */
CV_WRAP void getLayersShapes(const MatShape& netInputShape,
std::vector<int>* layersIds,
std::vector<std::vector<MatShape> >* inLayersShapes,
std::vector<std::vector<MatShape> >* outLayersShapes) const;
/** @brief Returns input and output shapes for layer with specified
* id in loaded model; preliminary inferencing isn't necessary.
* @param netInputShape shape input blob in net input layer.
* @param layerId id for layer.
* @param inLayerShapes output parameter for input layers shapes;
* order is the same as in layersIds
* @param outLayerShapes output parameter for output layers shapes;
* order is the same as in layersIds
*/
CV_WRAP void getLayerShapes(const MatShape& netInputShape,
const int layerId,
std::vector<MatShape>* inLayerShapes,
std::vector<MatShape>* outLayerShapes) const;
* preliminary inferencing isn't necessary.
* @param netInputShapes shapes for all input blobs in net input layer.
* @param layersIds output parameter for layer IDs.
* @param inLayersShapes output parameter for input layers shapes;
* order is the same as in layersIds
* @param outLayersShapes output parameter for output layers shapes;
* order is the same as in layersIds
*/
CV_WRAP void getLayersShapes(const std::vector<MatShape>& netInputShapes,
std::vector<int>* layersIds,
std::vector<std::vector<MatShape> >* inLayersShapes,
std::vector<std::vector<MatShape> >* outLayersShapes) const;
/** @overload */
CV_WRAP void getLayersShapes(const MatShape& netInputShape,
std::vector<int>* layersIds,
std::vector<std::vector<MatShape> >* inLayersShapes,
std::vector<std::vector<MatShape> >* outLayersShapes) const;
/** @brief Returns input and output shapes for layer with specified
* id in loaded model; preliminary inferencing isn't necessary.
* @param netInputShape shape input blob in net input layer.
* @param layerId id for layer.
* @param inLayerShapes output parameter for input layers shapes;
* order is the same as in layersIds
* @param outLayerShapes output parameter for output layers shapes;
* order is the same as in layersIds
*/
CV_WRAP void getLayerShapes(const MatShape& netInputShape,
const int layerId,
std::vector<MatShape>* inLayerShapes,
std::vector<MatShape>* outLayerShapes) const;
/** @overload */
CV_WRAP void getLayerShapes(const std::vector<MatShape>& netInputShapes,
/** @overload */
CV_WRAP void getLayerShapes(const std::vector<MatShape>& netInputShapes,
const int layerId,
std::vector<MatShape>* inLayerShapes,
std::vector<MatShape>* outLayerShapes) const;
/** @brief Computes FLOP for whole loaded model with specified input shapes.
* @param netInputShapes vector of shapes for all net inputs.
* @returns computed FLOP.
*/
CV_WRAP int64 getFLOPS(const std::vector<MatShape>& netInputShapes) const;
/** @overload */
CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const std::vector<MatShape>& netInputShapes) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const MatShape& netInputShape) const;
/** @brief Returns list of types for layer used in model.
* @param layersTypes output parameter for returning types.
*/
CV_WRAP void getLayerTypes(CV_OUT std::vector<String>& layersTypes) const;
/** @brief Returns count of layers of specified type.
* @param layerType type.
* @returns count of layers
*/
CV_WRAP int getLayersCount(const String& layerType) const;
/** @brief Computes bytes number which are requered to store
* all weights and intermediate blobs for model.
* @param netInputShapes vector of shapes for all net inputs.
* @param weights output parameter to store resulting bytes for weights.
* @param blobs output parameter to store resulting bytes for intermediate blobs.
*/
CV_WRAP void getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
CV_OUT size_t& weights, CV_OUT size_t& blobs) const;
/** @overload */
CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
CV_OUT size_t& weights, CV_OUT size_t& blobs) const;
/** @overload */
CV_WRAP void getMemoryConsumption(const int layerId,
const std::vector<MatShape>& netInputShapes,
CV_OUT size_t& weights, CV_OUT size_t& blobs) const;
/** @overload */
CV_WRAP void getMemoryConsumption(const int layerId,
const MatShape& netInputShape,
CV_OUT size_t& weights, CV_OUT size_t& blobs) const;
/** @brief Computes bytes number which are required to store
* all weights and intermediate blobs for each layer.
* @param netInputShapes vector of shapes for all net inputs.
* @param layerIds output vector to save layer IDs.
* @param weights output parameter to store resulting bytes for weights.
* @param blobs output parameter to store resulting bytes for intermediate blobs.
*/
CV_WRAP void getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
CV_OUT std::vector<int>& layerIds, CV_OUT std::vector<size_t>& weights,
CV_OUT std::vector<size_t>& blobs) const;
/** @overload */
CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
CV_OUT std::vector<int>& layerIds, CV_OUT std::vector<size_t>& weights,
CV_OUT std::vector<size_t>& blobs) const;
/** @brief Computes FLOP for whole loaded model with specified input shapes.
* @param netInputShapes vector of shapes for all net inputs.
* @returns computed FLOP.
*/
CV_WRAP int64 getFLOPS(const std::vector<MatShape>& netInputShapes) const;
/** @overload */
CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const std::vector<MatShape>& netInputShapes) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
const MatShape& netInputShape) const;
/** @brief Returns list of types for layer used in model.
* @param layersTypes output parameter for returning types.
*/
CV_WRAP void getLayerTypes(CV_OUT std::vector<String>& layersTypes) const;
/** @brief Returns count of layers of specified type.
* @param layerType type.
* @returns count of layers
*/
CV_WRAP int getLayersCount(const String& layerType) const;
/** @brief Computes bytes number which are requered to store
* all weights and intermediate blobs for model.
* @param netInputShapes vector of shapes for all net inputs.
* @param weights output parameter to store resulting bytes for weights.
* @param blobs output parameter to store resulting bytes for intermediate blobs.
*/
CV_WRAP void getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
CV_OUT size_t& weights, CV_OUT size_t& blobs) const;
/** @overload */
CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
CV_OUT size_t& weights, CV_OUT size_t& blobs) const;
/** @overload */
CV_WRAP void getMemoryConsumption(const int layerId,
const std::vector<MatShape>& netInputShapes,
CV_OUT size_t& weights, CV_OUT size_t& blobs) const;
/** @overload */
CV_WRAP void getMemoryConsumption(const int layerId,
const MatShape& netInputShape,
CV_OUT size_t& weights, CV_OUT size_t& blobs) const;
/** @brief Computes bytes number which are required to store
* all weights and intermediate blobs for each layer.
* @param netInputShapes vector of shapes for all net inputs.
* @param layerIds output vector to save layer IDs.
* @param weights output parameter to store resulting bytes for weights.
* @param blobs output parameter to store resulting bytes for intermediate blobs.
*/
CV_WRAP void getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
CV_OUT std::vector<int>& layerIds, CV_OUT std::vector<size_t>& weights,
CV_OUT std::vector<size_t>& blobs) const;
/** @overload */
CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
CV_OUT std::vector<int>& layerIds, CV_OUT std::vector<size_t>& weights,
CV_OUT std::vector<size_t>& blobs) const;
private:
struct Impl;

@ -969,9 +969,6 @@ struct Net::Impl
}
}
#define CV_RETHROW_ERROR(err, newmsg)\
cv::error(err.code, newmsg, err.func.c_str(), err.file.c_str(), err.line)
void allocateLayer(int lid, const LayersShapesMap& layersShapes)
{
CV_TRACE_FUNCTION();

@ -506,13 +506,13 @@ public:
int bsz = ofs1 - ofs0;
#if CV_TRY_AVX2
if(useAVX2)
fastConv_avx2(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
else
#endif
#if CV_TRY_AVX
if(useAVX)
fastConv_avx(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
opt_AVX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
else
#endif
@ -824,12 +824,12 @@ public:
#if CV_TRY_AVX2
if( useAVX2 )
fastGEMM_avx2( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
opt_AVX2::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
else
#endif
#if CV_TRY_AVX
if( useAVX )
fastGEMM_avx( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
opt_AVX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
else
#endif
for( m = 0; m < mmax; m += 2 )

@ -55,29 +55,13 @@ namespace util
{
template <typename T>
std::string to_string(T value)
{
std::ostringstream stream;
stream << value;
return stream.str();
}
template <typename T>
void make_error(const std::string& message1, const T& message2)
{
std::string error(message1);
error += std::string(util::to_string<int>(message2));
CV_Error(Error::StsBadArg, error.c_str());
}
template <typename T>
bool SortScorePairDescend(const std::pair<float, T>& pair1,
static inline bool SortScorePairDescend(const std::pair<float, T>& pair1,
const std::pair<float, T>& pair2)
{
return pair1.first > pair2.first;
}
}
} // namespace
class DetectionOutputLayerImpl : public DetectionOutputLayer
{
@ -133,7 +117,7 @@ public:
message += " layer parameter does not contain ";
message += parameterName;
message += " parameter.";
CV_Error(Error::StsBadArg, message);
CV_ErrorNoReturn(Error::StsBadArg, message);
}
else
{
@ -209,180 +193,173 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
const float* locationData = inputs[0]->ptr<float>();
const float* confidenceData = inputs[1]->ptr<float>();
const float* priorData = inputs[2]->ptr<float>();
std::vector<LabelBBox> allDecodedBBoxes;
std::vector<std::vector<std::vector<float> > > allConfidenceScores;
int num = inputs[0]->size[0];
int numPriors = inputs[2]->size[2] / 4;
// Retrieve all location predictions.
std::vector<LabelBBox> allLocationPredictions;
GetLocPredictions(locationData, num, numPriors, _numLocClasses,
_shareLocation, &allLocationPredictions);
// extract predictions from input layers
{
int numPriors = inputs[2]->size[2] / 4;
// Retrieve all confidences.
std::vector<std::vector<std::vector<float> > > allConfidenceScores;
GetConfidenceScores(confidenceData, num, numPriors, _numClasses,
&allConfidenceScores);
const float* locationData = inputs[0]->ptr<float>();
const float* confidenceData = inputs[1]->ptr<float>();
const float* priorData = inputs[2]->ptr<float>();
// Retrieve all prior bboxes. It is same within a batch since we assume all
// images in a batch are of same dimension.
std::vector<caffe::NormalizedBBox> priorBBoxes;
std::vector<std::vector<float> > priorVariances;
GetPriorBBoxes(priorData, numPriors, &priorBBoxes, &priorVariances);
// Retrieve all location predictions
std::vector<LabelBBox> allLocationPredictions;
GetLocPredictions(locationData, num, numPriors, _numLocClasses,
_shareLocation, allLocationPredictions);
const bool clip_bbox = false;
// Decode all loc predictions to bboxes.
std::vector<LabelBBox> allDecodedBBoxes;
DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, num,
_shareLocation, _numLocClasses, _backgroundLabelId,
_codeType, _varianceEncodedInTarget, clip_bbox, &allDecodedBBoxes);
// Retrieve all confidences
GetConfidenceScores(confidenceData, num, numPriors, _numClasses, allConfidenceScores);
int numKept = 0;
// Retrieve all prior bboxes
std::vector<caffe::NormalizedBBox> priorBBoxes;
std::vector<std::vector<float> > priorVariances;
GetPriorBBoxes(priorData, numPriors, priorBBoxes, priorVariances);
// Decode all loc predictions to bboxes
DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, num,
_shareLocation, _numLocClasses, _backgroundLabelId,
_codeType, _varianceEncodedInTarget, false, allDecodedBBoxes);
}
size_t numKept = 0;
std::vector<std::map<int, std::vector<int> > > allIndices;
for (int i = 0; i < num; ++i)
{
const LabelBBox& decodeBBoxes = allDecodedBBoxes[i];
const std::vector<std::vector<float> >& confidenceScores =
allConfidenceScores[i];
std::map<int, std::vector<int> > indices;
int numDetections = 0;
for (int c = 0; c < (int)_numClasses; ++c)
{
if (c == _backgroundLabelId)
{
// Ignore background class.
continue;
}
if (confidenceScores.size() <= c)
{
// Something bad happened if there are no predictions for current label.
util::make_error<int>("Could not find confidence predictions for label ", c);
}
const std::vector<float>& scores = confidenceScores[c];
int label = _shareLocation ? -1 : c;
if (decodeBBoxes.find(label) == decodeBBoxes.end())
{
// Something bad happened if there are no predictions for current label.
util::make_error<int>("Could not find location predictions for label ", label);
continue;
}
const std::vector<caffe::NormalizedBBox>& bboxes =
decodeBBoxes.find(label)->second;
ApplyNMSFast(bboxes, scores, _confidenceThreshold, _nmsThreshold, 1.0,
_topK, &(indices[c]));
numDetections += indices[c].size();
}
if (_keepTopK > -1 && numDetections > _keepTopK)
{
std::vector<std::pair<float, std::pair<int, int> > > scoreIndexPairs;
for (std::map<int, std::vector<int> >::iterator it = indices.begin();
it != indices.end(); ++it)
{
int label = it->first;
const std::vector<int>& labelIndices = it->second;
if (confidenceScores.size() <= label)
{
// Something bad happened for current label.
util::make_error<int>("Could not find location predictions for label ", label);
continue;
}
const std::vector<float>& scores = confidenceScores[label];
for (size_t j = 0; j < labelIndices.size(); ++j)
{
size_t idx = labelIndices[j];
CV_Assert(idx < scores.size());
scoreIndexPairs.push_back(
std::make_pair(scores[idx], std::make_pair(label, idx)));
}
}
// Keep outputs k results per image.
std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(),
util::SortScorePairDescend<std::pair<int, int> >);
scoreIndexPairs.resize(_keepTopK);
// Store the new indices.
std::map<int, std::vector<int> > newIndices;
for (size_t j = 0; j < scoreIndexPairs.size(); ++j)
{
int label = scoreIndexPairs[j].second.first;
int idx = scoreIndexPairs[j].second.second;
newIndices[label].push_back(idx);
}
allIndices.push_back(newIndices);
numKept += _keepTopK;
}
else
{
allIndices.push_back(indices);
numKept += numDetections;
}
numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices);
}
if (numKept == 0)
{
CV_ErrorNoReturn(Error::StsError, "Couldn't find any detections");
return;
}
int outputShape[] = {1, 1, numKept, 7};
int outputShape[] = {1, 1, (int)numKept, 7};
outputs[0].create(4, outputShape, CV_32F);
float* outputsData = outputs[0].ptr<float>();
int count = 0;
size_t count = 0;
for (int i = 0; i < num; ++i)
{
const std::vector<std::vector<float> >& confidenceScores =
allConfidenceScores[i];
const LabelBBox& decodeBBoxes = allDecodedBBoxes[i];
for (std::map<int, std::vector<int> >::iterator it = allIndices[i].begin();
it != allIndices[i].end(); ++it)
count += outputDetections_(i, &outputsData[count * 7],
allDecodedBBoxes[i], allConfidenceScores[i],
allIndices[i]);
}
CV_Assert(count == numKept);
}
// Writes all kept detections for image `i` into `outputsData`,
// 7 floats per detection: [image_id, label, score, xmin, ymin, xmax, ymax].
// `indicesMap` maps a class label to the indices of the boxes kept for that
// class (presumably after NMS upstream — confirm against the caller);
// `confidenceScores[label]` holds per-box scores for that label.
// Returns the number of detections written.
size_t outputDetections_(
const int i, float* outputsData,
const LabelBBox& decodeBBoxes, const std::vector<std::vector<float> >& confidenceScores,
const std::map<int, std::vector<int> >& indicesMap
)
{
size_t count = 0;
for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)
{
int label = it->first;
// A label present in indicesMap must have a score vector; anything else
// means the layer inputs are inconsistent.
if (confidenceScores.size() <= label)
CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label));
const std::vector<float>& scores = confidenceScores[label];
// When box locations are shared across classes, decoded boxes are stored
// under the sentinel key -1 instead of the class label.
int locLabel = _shareLocation ? -1 : label;
LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(locLabel);
if (label_bboxes == decodeBBoxes.end())
CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", locLabel));
const std::vector<int>& indices = it->second;
for (size_t j = 0; j < indices.size(); ++j, ++count)
{
int idx = indices[j];
const caffe::NormalizedBBox& decode_bbox = label_bboxes->second[idx];
// Pack one 7-value detection record per kept box.
outputsData[count * 7] = i;
outputsData[count * 7 + 1] = label;
outputsData[count * 7 + 2] = scores[idx];
outputsData[count * 7 + 3] = decode_bbox.xmin();
outputsData[count * 7 + 4] = decode_bbox.ymin();
outputsData[count * 7 + 5] = decode_bbox.xmax();
outputsData[count * 7 + 6] = decode_bbox.ymax();
}
}
return count;
}
size_t processDetections_(
const LabelBBox& decodeBBoxes, const std::vector<std::vector<float> >& confidenceScores,
std::vector<std::map<int, std::vector<int> > >& allIndices
)
{
std::map<int, std::vector<int> > indices;
size_t numDetections = 0;
for (int c = 0; c < (int)_numClasses; ++c)
{
if (c == _backgroundLabelId)
continue; // Ignore background class.
if (c >= confidenceScores.size())
CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", c));
const std::vector<float>& scores = confidenceScores[c];
int label = _shareLocation ? -1 : c;
LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(label);
if (label_bboxes == decodeBBoxes.end())
CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
ApplyNMSFast(label_bboxes->second, scores, _confidenceThreshold, _nmsThreshold, 1.0, _topK, indices[c]);
numDetections += indices[c].size();
}
if (_keepTopK > -1 && numDetections > (size_t)_keepTopK)
{
std::vector<std::pair<float, std::pair<int, int> > > scoreIndexPairs;
for (std::map<int, std::vector<int> >::iterator it = indices.begin();
it != indices.end(); ++it)
{
int label = it->first;
if (confidenceScores.size() <= label)
{
// Something bad happened if there are no predictions for current label.
util::make_error<int>("Could not find confidence predictions for label ", label);
continue;
}
const std::vector<int>& labelIndices = it->second;
if (label >= confidenceScores.size())
CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
const std::vector<float>& scores = confidenceScores[label];
int locLabel = _shareLocation ? -1 : label;
if (decodeBBoxes.find(locLabel) == decodeBBoxes.end())
for (size_t j = 0; j < labelIndices.size(); ++j)
{
// Something bad happened if there are no predictions for current label.
util::make_error<int>("Could not find location predictions for label ", locLabel);
continue;
size_t idx = labelIndices[j];
CV_Assert(idx < scores.size());
scoreIndexPairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
}
const std::vector<caffe::NormalizedBBox>& bboxes =
decodeBBoxes.find(locLabel)->second;
std::vector<int>& indices = it->second;
}
// Keep outputs k results per image.
std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(),
util::SortScorePairDescend<std::pair<int, int> >);
scoreIndexPairs.resize(_keepTopK);
for (size_t j = 0; j < indices.size(); ++j)
{
int idx = indices[j];
outputsData[count * 7] = i;
outputsData[count * 7 + 1] = label;
outputsData[count * 7 + 2] = scores[idx];
caffe::NormalizedBBox clipBBox = bboxes[idx];
outputsData[count * 7 + 3] = clipBBox.xmin();
outputsData[count * 7 + 4] = clipBBox.ymin();
outputsData[count * 7 + 5] = clipBBox.xmax();
outputsData[count * 7 + 6] = clipBBox.ymax();
++count;
}
std::map<int, std::vector<int> > newIndices;
for (size_t j = 0; j < scoreIndexPairs.size(); ++j)
{
int label = scoreIndexPairs[j].second.first;
int idx = scoreIndexPairs[j].second.second;
newIndices[label].push_back(idx);
}
allIndices.push_back(newIndices);
return (size_t)_keepTopK;
}
else
{
allIndices.push_back(indices);
return numDetections;
}
}
// Compute bbox size.
float BBoxSize(const caffe::NormalizedBBox& bbox,
const bool normalized=true)
// **************************************************************
// Utility functions
// **************************************************************
// Compute bbox size
template<bool normalized>
static float BBoxSize(const caffe::NormalizedBBox& bbox)
{
if (bbox.xmax() < bbox.xmin() || bbox.ymax() < bbox.ymin())
{
// If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
return 0;
return 0; // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
}
else
{
@ -407,193 +384,155 @@ public:
}
}
// Clip the caffe::NormalizedBBox such that the range for each corner is [0, 1].
void ClipBBox(const caffe::NormalizedBBox& bbox,
caffe::NormalizedBBox* clipBBox)
{
clipBBox->set_xmin(std::max(std::min(bbox.xmin(), 1.f), 0.f));
clipBBox->set_ymin(std::max(std::min(bbox.ymin(), 1.f), 0.f));
clipBBox->set_xmax(std::max(std::min(bbox.xmax(), 1.f), 0.f));
clipBBox->set_ymax(std::max(std::min(bbox.ymax(), 1.f), 0.f));
clipBBox->clear_size();
clipBBox->set_size(BBoxSize(*clipBBox));
clipBBox->set_difficult(bbox.difficult());
}
// Decode a bbox according to a prior bbox.
void DecodeBBox(
// Decode a bbox according to a prior bbox
template<bool variance_encoded_in_target>
static void DecodeBBox(
const caffe::NormalizedBBox& prior_bbox, const std::vector<float>& prior_variance,
const CodeType code_type, const bool variance_encoded_in_target,
const CodeType code_type,
const bool clip_bbox, const caffe::NormalizedBBox& bbox,
caffe::NormalizedBBox* decode_bbox) {
if (code_type == caffe::PriorBoxParameter_CodeType_CORNER) {
if (variance_encoded_in_target) {
// variance is encoded in target, we simply need to add the offset
// predictions.
decode_bbox->set_xmin(prior_bbox.xmin() + bbox.xmin());
decode_bbox->set_ymin(prior_bbox.ymin() + bbox.ymin());
decode_bbox->set_xmax(prior_bbox.xmax() + bbox.xmax());
decode_bbox->set_ymax(prior_bbox.ymax() + bbox.ymax());
} else {
// variance is encoded in bbox, we need to scale the offset accordingly.
decode_bbox->set_xmin(
prior_bbox.xmin() + prior_variance[0] * bbox.xmin());
decode_bbox->set_ymin(
prior_bbox.ymin() + prior_variance[1] * bbox.ymin());
decode_bbox->set_xmax(
prior_bbox.xmax() + prior_variance[2] * bbox.xmax());
decode_bbox->set_ymax(
prior_bbox.ymax() + prior_variance[3] * bbox.ymax());
}
} else if (code_type == caffe::PriorBoxParameter_CodeType_CENTER_SIZE) {
float prior_width = prior_bbox.xmax() - prior_bbox.xmin();
CV_Assert(prior_width > 0);
float prior_height = prior_bbox.ymax() - prior_bbox.ymin();
CV_Assert(prior_height > 0);
float prior_center_x = (prior_bbox.xmin() + prior_bbox.xmax()) / 2.;
float prior_center_y = (prior_bbox.ymin() + prior_bbox.ymax()) / 2.;
float decode_bbox_center_x, decode_bbox_center_y;
float decode_bbox_width, decode_bbox_height;
if (variance_encoded_in_target) {
// variance is encoded in target, we simply need to restore the offset
// predictions.
decode_bbox_center_x = bbox.xmin() * prior_width + prior_center_x;
decode_bbox_center_y = bbox.ymin() * prior_height + prior_center_y;
decode_bbox_width = exp(bbox.xmax()) * prior_width;
decode_bbox_height = exp(bbox.ymax()) * prior_height;
} else {
// variance is encoded in bbox, we need to scale the offset accordingly.
decode_bbox_center_x =
prior_variance[0] * bbox.xmin() * prior_width + prior_center_x;
decode_bbox_center_y =
prior_variance[1] * bbox.ymin() * prior_height + prior_center_y;
decode_bbox_width =
exp(prior_variance[2] * bbox.xmax()) * prior_width;
decode_bbox_height =
exp(prior_variance[3] * bbox.ymax()) * prior_height;
caffe::NormalizedBBox& decode_bbox)
{
float bbox_xmin = variance_encoded_in_target ? bbox.xmin() : prior_variance[0] * bbox.xmin();
float bbox_ymin = variance_encoded_in_target ? bbox.ymin() : prior_variance[1] * bbox.ymin();
float bbox_xmax = variance_encoded_in_target ? bbox.xmax() : prior_variance[2] * bbox.xmax();
float bbox_ymax = variance_encoded_in_target ? bbox.ymax() : prior_variance[3] * bbox.ymax();
switch(code_type)
{
case caffe::PriorBoxParameter_CodeType_CORNER:
decode_bbox.set_xmin(prior_bbox.xmin() + bbox_xmin);
decode_bbox.set_ymin(prior_bbox.ymin() + bbox_ymin);
decode_bbox.set_xmax(prior_bbox.xmax() + bbox_xmax);
decode_bbox.set_ymax(prior_bbox.ymax() + bbox_ymax);
break;
case caffe::PriorBoxParameter_CodeType_CENTER_SIZE:
{
float prior_width = prior_bbox.xmax() - prior_bbox.xmin();
CV_Assert(prior_width > 0);
float prior_height = prior_bbox.ymax() - prior_bbox.ymin();
CV_Assert(prior_height > 0);
float prior_center_x = (prior_bbox.xmin() + prior_bbox.xmax()) * .5;
float prior_center_y = (prior_bbox.ymin() + prior_bbox.ymax()) * .5;
float decode_bbox_center_x, decode_bbox_center_y;
float decode_bbox_width, decode_bbox_height;
decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
decode_bbox_width = exp(bbox_xmax) * prior_width;
decode_bbox_height = exp(bbox_ymax) * prior_height;
decode_bbox.set_xmin(decode_bbox_center_x - decode_bbox_width * .5);
decode_bbox.set_ymin(decode_bbox_center_y - decode_bbox_height * .5);
decode_bbox.set_xmax(decode_bbox_center_x + decode_bbox_width * .5);
decode_bbox.set_ymax(decode_bbox_center_y + decode_bbox_height * .5);
break;
}
default:
CV_ErrorNoReturn(Error::StsBadArg, "Unknown type.");
};
if (clip_bbox)
{
// Clip the caffe::NormalizedBBox such that the range for each corner is [0, 1]
decode_bbox.set_xmin(std::max(std::min(decode_bbox.xmin(), 1.f), 0.f));
decode_bbox.set_ymin(std::max(std::min(decode_bbox.ymin(), 1.f), 0.f));
decode_bbox.set_xmax(std::max(std::min(decode_bbox.xmax(), 1.f), 0.f));
decode_bbox.set_ymax(std::max(std::min(decode_bbox.ymax(), 1.f), 0.f));
}
decode_bbox->set_xmin(decode_bbox_center_x - decode_bbox_width / 2.);
decode_bbox->set_ymin(decode_bbox_center_y - decode_bbox_height / 2.);
decode_bbox->set_xmax(decode_bbox_center_x + decode_bbox_width / 2.);
decode_bbox->set_ymax(decode_bbox_center_y + decode_bbox_height / 2.);
} else {
CV_Error(Error::StsBadArg, "Unknown LocLossType.");
}
float bbox_size = BBoxSize(*decode_bbox);
decode_bbox->set_size(bbox_size);
if (clip_bbox) {
ClipBBox(*decode_bbox, decode_bbox);
}
decode_bbox.clear_size();
decode_bbox.set_size(BBoxSize<true>(decode_bbox));
}
// Decode a set of bboxes according to a set of prior bboxes.
void DecodeBBoxes(
// Decode a set of bboxes according to a set of prior bboxes
static void DecodeBBoxes(
const std::vector<caffe::NormalizedBBox>& prior_bboxes,
const std::vector<std::vector<float> >& prior_variances,
const CodeType code_type, const bool variance_encoded_in_target,
const bool clip_bbox, const std::vector<caffe::NormalizedBBox>& bboxes,
std::vector<caffe::NormalizedBBox>* decode_bboxes) {
CV_Assert(prior_bboxes.size() == prior_variances.size());
CV_Assert(prior_bboxes.size() == bboxes.size());
int num_bboxes = prior_bboxes.size();
if (num_bboxes >= 1) {
CV_Assert(prior_variances[0].size() == 4);
}
decode_bboxes->clear();
for (int i = 0; i < num_bboxes; ++i) {
caffe::NormalizedBBox decode_bbox;
DecodeBBox(prior_bboxes[i], prior_variances[i], code_type,
variance_encoded_in_target, clip_bbox, bboxes[i], &decode_bbox);
decode_bboxes->push_back(decode_bbox);
}
std::vector<caffe::NormalizedBBox>& decode_bboxes)
{
CV_Assert(prior_bboxes.size() == prior_variances.size());
CV_Assert(prior_bboxes.size() == bboxes.size());
size_t num_bboxes = prior_bboxes.size();
CV_Assert(num_bboxes == 0 || prior_variances[0].size() == 4);
decode_bboxes.clear(); decode_bboxes.resize(num_bboxes);
if(variance_encoded_in_target)
{
for (int i = 0; i < num_bboxes; ++i)
DecodeBBox<true>(prior_bboxes[i], prior_variances[i], code_type,
clip_bbox, bboxes[i], decode_bboxes[i]);
}
else
{
for (int i = 0; i < num_bboxes; ++i)
DecodeBBox<false>(prior_bboxes[i], prior_variances[i], code_type,
clip_bbox, bboxes[i], decode_bboxes[i]);
}
}
// Decode all bboxes in a batch.
void DecodeBBoxesAll(const std::vector<LabelBBox>& all_loc_preds,
// Decode all bboxes in a batch
static void DecodeBBoxesAll(const std::vector<LabelBBox>& all_loc_preds,
const std::vector<caffe::NormalizedBBox>& prior_bboxes,
const std::vector<std::vector<float> >& prior_variances,
const int num, const bool share_location,
const int num_loc_classes, const int background_label_id,
const CodeType code_type, const bool variance_encoded_in_target,
const bool clip, std::vector<LabelBBox>* all_decode_bboxes) {
CV_Assert(all_loc_preds.size() == num);
all_decode_bboxes->clear();
all_decode_bboxes->resize(num);
for (int i = 0; i < num; ++i) {
// Decode predictions into bboxes.
LabelBBox& decode_bboxes = (*all_decode_bboxes)[i];
for (int c = 0; c < num_loc_classes; ++c) {
int label = share_location ? -1 : c;
if (label == background_label_id) {
// Ignore background class.
continue;
}
if (all_loc_preds[i].find(label) == all_loc_preds[i].end()) {
// Something bad happened if there are no predictions for current label.
util::make_error<int>("Could not find location predictions for label ", label);
}
const std::vector<caffe::NormalizedBBox>& label_loc_preds =
all_loc_preds[i].find(label)->second;
DecodeBBoxes(prior_bboxes, prior_variances,
code_type, variance_encoded_in_target, clip,
label_loc_preds, &(decode_bboxes[label]));
const bool clip, std::vector<LabelBBox>& all_decode_bboxes)
{
CV_Assert(all_loc_preds.size() == num);
all_decode_bboxes.clear();
all_decode_bboxes.resize(num);
for (int i = 0; i < num; ++i)
{
// Decode predictions into bboxes.
const LabelBBox& loc_preds = all_loc_preds[i];
LabelBBox& decode_bboxes = all_decode_bboxes[i];
for (int c = 0; c < num_loc_classes; ++c)
{
int label = share_location ? -1 : c;
if (label == background_label_id)
continue; // Ignore background class.
LabelBBox::const_iterator label_loc_preds = loc_preds.find(label);
if (label_loc_preds == loc_preds.end())
CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label));
DecodeBBoxes(prior_bboxes, prior_variances,
code_type, variance_encoded_in_target, clip,
label_loc_preds->second, decode_bboxes[label]);
}
}
}
}
// Get prior bounding boxes from prior_data.
// Get prior bounding boxes from prior_data
// prior_data: 1 x 2 x num_priors * 4 x 1 blob.
// num_priors: number of priors.
// prior_bboxes: stores all the prior bboxes in the format of caffe::NormalizedBBox.
// prior_variances: stores all the variances needed by prior bboxes.
void GetPriorBBoxes(const float* priorData, const int& numPriors,
std::vector<caffe::NormalizedBBox>* priorBBoxes,
std::vector<std::vector<float> >* priorVariances)
static void GetPriorBBoxes(const float* priorData, const int& numPriors,
std::vector<caffe::NormalizedBBox>& priorBBoxes,
std::vector<std::vector<float> >& priorVariances)
{
priorBBoxes->clear();
priorVariances->clear();
priorBBoxes.clear(); priorBBoxes.resize(numPriors);
priorVariances.clear(); priorVariances.resize(numPriors);
for (int i = 0; i < numPriors; ++i)
{
int startIdx = i * 4;
caffe::NormalizedBBox bbox;
caffe::NormalizedBBox& bbox = priorBBoxes[i];
bbox.set_xmin(priorData[startIdx]);
bbox.set_ymin(priorData[startIdx + 1]);
bbox.set_xmax(priorData[startIdx + 2]);
bbox.set_ymax(priorData[startIdx + 3]);
float bboxSize = BBoxSize(bbox);
bbox.set_size(bboxSize);
priorBBoxes->push_back(bbox);
bbox.set_size(BBoxSize<true>(bbox));
}
for (int i = 0; i < numPriors; ++i)
{
int startIdx = (numPriors + i) * 4;
std::vector<float> var;
// not needed here: priorVariances[i].clear();
for (int j = 0; j < 4; ++j)
{
var.push_back(priorData[startIdx + j]);
priorVariances[i].push_back(priorData[startIdx + j]);
}
priorVariances->push_back(var);
}
}
// Scale a caffe::NormalizedBBox from relative [0,1] coordinates to
// absolute pixel coordinates given the image height and width.
// The size field is recomputed (normalized mode only when both dims are <= 1)
// and the 'difficult' flag is copied through unchanged.
void ScaleBBox(const caffe::NormalizedBBox& bbox,
               const int height, const int width,
               caffe::NormalizedBBox* scaleBBox)
{
    caffe::NormalizedBBox& out = *scaleBBox;
    out.set_xmin(bbox.xmin() * width);
    out.set_ymin(bbox.ymin() * height);
    out.set_xmax(bbox.xmax() * width);
    out.set_ymax(bbox.ymax() * height);
    // Drop any stale cached size before recomputing it for the scaled box.
    out.clear_size();
    // De Morgan of !(width > 1 || height > 1): still "normalized" only if
    // neither dimension exceeds 1.
    const bool normalized = (width <= 1 && height <= 1);
    out.set_size(BBoxSize(out, normalized));
    out.set_difficult(bbox.difficult());
}
// Get location predictions from loc_data.
// loc_data: num x num_preds_per_class * num_loc_classes * 4 blob.
// num: the number of images.
@ -603,19 +542,19 @@ public:
// share_location: if true, all classes share the same location prediction.
// loc_preds: stores the location prediction, where each item contains
// location prediction for an image.
void GetLocPredictions(const float* locData, const int num,
static void GetLocPredictions(const float* locData, const int num,
const int numPredsPerClass, const int numLocClasses,
const bool shareLocation, std::vector<LabelBBox>* locPreds)
const bool shareLocation, std::vector<LabelBBox>& locPreds)
{
locPreds->clear();
locPreds.clear();
if (shareLocation)
{
CV_Assert(numLocClasses == 1);
}
locPreds->resize(num);
for (int i = 0; i < num; ++i)
locPreds.resize(num);
for (int i = 0; i < num; ++i, locData += numPredsPerClass * numLocClasses * 4)
{
LabelBBox& labelBBox = (*locPreds)[i];
LabelBBox& labelBBox = locPreds[i];
for (int p = 0; p < numPredsPerClass; ++p)
{
int startIdx = p * numLocClasses * 4;
@ -626,13 +565,13 @@ public:
{
labelBBox[label].resize(numPredsPerClass);
}
labelBBox[label][p].set_xmin(locData[startIdx + c * 4]);
labelBBox[label][p].set_ymin(locData[startIdx + c * 4 + 1]);
labelBBox[label][p].set_xmax(locData[startIdx + c * 4 + 2]);
labelBBox[label][p].set_ymax(locData[startIdx + c * 4 + 3]);
caffe::NormalizedBBox& bbox = labelBBox[label][p];
bbox.set_xmin(locData[startIdx + c * 4]);
bbox.set_ymin(locData[startIdx + c * 4 + 1]);
bbox.set_xmax(locData[startIdx + c * 4 + 2]);
bbox.set_ymax(locData[startIdx + c * 4 + 3]);
}
}
locData += numPredsPerClass * numLocClasses * 4;
}
}
@ -643,25 +582,24 @@ public:
// num_classes: number of classes.
// conf_preds: stores the confidence prediction, where each item contains
// confidence prediction for an image.
void GetConfidenceScores(const float* confData, const int num,
static void GetConfidenceScores(const float* confData, const int num,
const int numPredsPerClass, const int numClasses,
std::vector<std::vector<std::vector<float> > >* confPreds)
std::vector<std::vector<std::vector<float> > >& confPreds)
{
confPreds->clear();
confPreds->resize(num);
for (int i = 0; i < num; ++i)
confPreds.clear(); confPreds.resize(num);
for (int i = 0; i < num; ++i, confData += numPredsPerClass * numClasses)
{
std::vector<std::vector<float> >& labelScores = (*confPreds)[i];
std::vector<std::vector<float> >& labelScores = confPreds[i];
labelScores.resize(numClasses);
for (int p = 0; p < numPredsPerClass; ++p)
for (int c = 0; c < numClasses; ++c)
{
int startIdx = p * numClasses;
for (int c = 0; c < numClasses; ++c)
std::vector<float>& classLabelScores = labelScores[c];
classLabelScores.resize(numPredsPerClass);
for (int p = 0; p < numPredsPerClass; ++p)
{
labelScores[c].push_back(confData[startIdx + c]);
classLabelScores[p] = confData[p * numClasses + c];
}
}
confData += numPredsPerClass * numClasses;
}
}
@ -674,40 +612,35 @@ public:
// nms_threshold: a threshold used in non maximum suppression.
// top_k: if not -1, keep at most top_k picked indices.
// indices: the kept indices of bboxes after nms.
void ApplyNMSFast(const std::vector<caffe::NormalizedBBox>& bboxes,
static void ApplyNMSFast(const std::vector<caffe::NormalizedBBox>& bboxes,
const std::vector<float>& scores, const float score_threshold,
const float nms_threshold, const float eta, const int top_k,
std::vector<int>* indices) {
// Sanity check.
CV_Assert(bboxes.size() == scores.size());
// Get top_k scores (with corresponding indices).
std::vector<std::pair<float, int> > score_index_vec;
GetMaxScoreIndex(scores, score_threshold, top_k, &score_index_vec);
// Do nms.
float adaptive_threshold = nms_threshold;
indices->clear();
while (score_index_vec.size() != 0) {
const int idx = score_index_vec.front().second;
bool keep = true;
for (int k = 0; k < indices->size(); ++k) {
if (keep) {
const int kept_idx = (*indices)[k];
float overlap = JaccardOverlap(bboxes[idx], bboxes[kept_idx]);
keep = overlap <= adaptive_threshold;
} else {
break;
}
}
if (keep) {
indices->push_back(idx);
}
score_index_vec.erase(score_index_vec.begin());
if (keep && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
std::vector<int>& indices)
{
CV_Assert(bboxes.size() == scores.size());
// Get top_k scores (with corresponding indices).
std::vector<std::pair<float, int> > score_index_vec;
GetMaxScoreIndex(scores, score_threshold, top_k, score_index_vec);
// Do nms.
float adaptive_threshold = nms_threshold;
indices.clear();
while (score_index_vec.size() != 0) {
const int idx = score_index_vec.front().second;
bool keep = true;
for (int k = 0; k < (int)indices.size() && keep; ++k) {
const int kept_idx = indices[k];
float overlap = JaccardOverlap<true>(bboxes[idx], bboxes[kept_idx]);
keep = overlap <= adaptive_threshold;
}
if (keep)
indices.push_back(idx);
score_index_vec.erase(score_index_vec.begin());
if (keep && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
}
}
}
}
// Get max scores with corresponding indices.
@ -715,74 +648,66 @@ public:
// threshold: only consider scores higher than the threshold.
// top_k: if -1, keep all; otherwise, keep at most top_k.
// score_index_vec: store the sorted (score, index) pair.
void GetMaxScoreIndex(const std::vector<float>& scores, const float threshold,const int top_k,
std::vector<std::pair<float, int> >* score_index_vec)
static void GetMaxScoreIndex(const std::vector<float>& scores, const float threshold, const int top_k,
std::vector<std::pair<float, int> >& score_index_vec)
{
CV_DbgAssert(score_index_vec.empty());
// Generate index score pairs.
for (size_t i = 0; i < scores.size(); ++i)
{
if (scores[i] > threshold)
{
score_index_vec->push_back(std::make_pair(scores[i], i));
score_index_vec.push_back(std::make_pair(scores[i], i));
}
}
// Sort the score pair according to the scores in descending order
std::stable_sort(score_index_vec->begin(), score_index_vec->end(),
std::stable_sort(score_index_vec.begin(), score_index_vec.end(),
util::SortScorePairDescend<int>);
// Keep top_k scores if needed.
if (top_k > -1 && top_k < (int)score_index_vec->size())
if (top_k > -1 && top_k < (int)score_index_vec.size())
{
score_index_vec->resize(top_k);
score_index_vec.resize(top_k);
}
}
// Compute the intersection between two bboxes.
void IntersectBBox(const caffe::NormalizedBBox& bbox1,
const caffe::NormalizedBBox& bbox2,
caffe::NormalizedBBox* intersect_bbox) {
// Compute the jaccard (intersection over union IoU) overlap between two bboxes.
template<bool normalized>
static float JaccardOverlap(const caffe::NormalizedBBox& bbox1,
const caffe::NormalizedBBox& bbox2)
{
caffe::NormalizedBBox intersect_bbox;
if (bbox2.xmin() > bbox1.xmax() || bbox2.xmax() < bbox1.xmin() ||
bbox2.ymin() > bbox1.ymax() || bbox2.ymax() < bbox1.ymin())
{
// Return [0, 0, 0, 0] if there is no intersection.
intersect_bbox->set_xmin(0);
intersect_bbox->set_ymin(0);
intersect_bbox->set_xmax(0);
intersect_bbox->set_ymax(0);
intersect_bbox.set_xmin(0);
intersect_bbox.set_ymin(0);
intersect_bbox.set_xmax(0);
intersect_bbox.set_ymax(0);
}
else
{
intersect_bbox->set_xmin(std::max(bbox1.xmin(), bbox2.xmin()));
intersect_bbox->set_ymin(std::max(bbox1.ymin(), bbox2.ymin()));
intersect_bbox->set_xmax(std::min(bbox1.xmax(), bbox2.xmax()));
intersect_bbox->set_ymax(std::min(bbox1.ymax(), bbox2.ymax()));
intersect_bbox.set_xmin(std::max(bbox1.xmin(), bbox2.xmin()));
intersect_bbox.set_ymin(std::max(bbox1.ymin(), bbox2.ymin()));
intersect_bbox.set_xmax(std::min(bbox1.xmax(), bbox2.xmax()));
intersect_bbox.set_ymax(std::min(bbox1.ymax(), bbox2.ymax()));
}
}
// Compute the jaccard (intersection over union IoU) overlap between two bboxes.
float JaccardOverlap(const caffe::NormalizedBBox& bbox1,
const caffe::NormalizedBBox& bbox2,
const bool normalized=true)
{
caffe::NormalizedBBox intersect_bbox;
IntersectBBox(bbox1, bbox2, &intersect_bbox);
float intersect_width, intersect_height;
if (normalized)
{
intersect_width = intersect_bbox.xmax() - intersect_bbox.xmin();
intersect_height = intersect_bbox.ymax() - intersect_bbox.ymin();
}
else
{
intersect_width = intersect_bbox.xmax() - intersect_bbox.xmin() + 1;
intersect_height = intersect_bbox.ymax() - intersect_bbox.ymin() + 1;
}
intersect_width = intersect_bbox.xmax() - intersect_bbox.xmin();
intersect_height = intersect_bbox.ymax() - intersect_bbox.ymin();
if (intersect_width > 0 && intersect_height > 0)
{
if (!normalized)
{
intersect_width++;
intersect_height++;
}
float intersect_size = intersect_width * intersect_height;
float bbox1_size = BBoxSize(bbox1);
float bbox2_size = BBoxSize(bbox2);
float bbox1_size = BBoxSize<true>(bbox1);
float bbox2_size = BBoxSize<true>(bbox2);
return intersect_size / (bbox1_size + bbox2_size - intersect_size);
}
else

@ -177,12 +177,12 @@ public:
#if CV_TRY_AVX2
if( useAVX2 )
fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
#if CV_TRY_AVX
if( useAVX )
fastGEMM1T_avx( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
else
#endif
{
@ -191,19 +191,19 @@ public:
#if CV_SIMD128
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
{
vfloat32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
vfloat32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);
v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);
for( k = 0; k < vecsize; k += 4 )
{
vfloat32x4 v = v_load_aligned(sptr + k);
v_float32x4 v = v_load_aligned(sptr + k);
vs0 += v*v_load_aligned(wptr + k);
vs1 += v*v_load_aligned(wptr + wstep + k);
vs2 += v*v_load_aligned(wptr + wstep*2 + k);
vs3 += v*v_load_aligned(wptr + wstep*3 + k);
}
vfloat32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
s += v_load(biasptr + i);
v_store(dptr + i, s);
}

@ -1,54 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#define fastConv_some_avx fastConv_avx
#define fastGEMM1T_some_avx fastGEMM1T_avx
#define fastGEMM_some_avx fastGEMM_avx
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#include "layers_common.simd.hpp"

@ -1,51 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#define fastConv_some_avx fastConv_avx2
#define fastGEMM1T_some_avx fastGEMM1T_avx2
#define fastGEMM_some_avx fastGEMM_avx2
#include "layers_common.simd.hpp"

@ -45,6 +45,10 @@
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/shape_utils.hpp>
// dispatched AVX/AVX2 optimizations
#include "layers/layers_common.simd.hpp"
#include "layers/layers_common.simd_declarations.hpp"
namespace cv
{
namespace dnn
@ -64,32 +68,6 @@ void getConvPoolPaddings(const Size& inp, const Size& out,
const Size &kernel, const Size &stride,
const String &padMode, Size &pad);
#if CV_TRY_AVX
void fastConv_avx(const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned,
const float* relu, bool initOutput);
void fastGEMM1T_avx( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
void fastGEMM_avx( const float* aptr, size_t astep, const float* bptr0,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb );
#endif
#if CV_TRY_AVX2
void fastConv_avx2(const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned,
const float* relu, bool initOutput);
void fastGEMM1T_avx2( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr0,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb );
#endif
}
}

@ -40,16 +40,34 @@
//
//M*/
#ifndef __DNN_LAYERS_COMMON_SIMD_HPP__
#define __DNN_LAYERS_COMMON_SIMD_HPP__
#include "opencv2/core/hal/intrin.hpp"
namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void fastConv( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned,
const float* relu, bool initOutput );
void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize );
void fastGEMM( const float* aptr, size_t astep, const float* bptr,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb );
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX
#if !CV_FMA // AVX workaround
#undef _mm256_fmadd_ps
#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
#endif
void fastConv_some_avx( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned,
const float* relu, bool initOutput )
void fastConv( const float* weights, size_t wstep, const float* bias,
const float* rowbuf, float* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned,
const float* relu, bool initOutput )
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2]*outShape[3];
@ -214,9 +232,9 @@ void fastConv_some_avx( const float* weights, size_t wstep, const float* bias,
}
// dst = vec * weights^t + bias
void fastGEMM1T_some_avx( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize )
void fastGEMM1T( const float* vec, const float* weights,
size_t wstep, const float* bias,
float* dst, int nvecs, int vecsize )
{
int i = 0;
@ -276,9 +294,9 @@ void fastGEMM1T_some_avx( const float* vec, const float* weights,
_mm256_zeroupper();
}
void fastGEMM_some_avx( const float* aptr, size_t astep, const float* bptr,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb )
void fastGEMM( const float* aptr, size_t astep, const float* bptr,
size_t bstep, float* cptr, size_t cstep,
int ma, int na, int nb )
{
int n = 0;
for( ; n <= nb - 16; n += 16 )
@ -346,7 +364,7 @@ void fastGEMM_some_avx( const float* aptr, size_t astep, const float* bptr,
_mm256_zeroupper();
}
}
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#endif
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace

Loading…
Cancel
Save