From 14da5ec311891859489a63a04faa83081d073ac8 Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev <dmitry.kurtaev+github@gmail.com>
Date: Sun, 15 Mar 2020 22:33:05 +0300
Subject: [PATCH 01/12] LSTM scalar

---
 modules/dnn/src/layers/recurrent_layers.cpp   |  10 ++
 .../dnn/src/onnx/onnx_graph_simplifier.cpp    |  25 +++
 modules/dnn/src/onnx/onnx_importer.cpp        | 165 ++++++++++++++++--
 modules/dnn/src/tensorflow/tf_importer.cpp    |   7 +
 modules/dnn/test/test_onnx_importer.cpp       |  11 ++
 5 files changed, 204 insertions(+), 14 deletions(-)
diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index 3f9a229516..a3962db127 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -215,6 +215,8 @@ public:
         internals.push_back(shape(_numSamples, 1)); // dummyOnes
         internals.push_back(shape(_numSamples, 4*_numOut)); // gates
 
+
+        std::cout << "LSTM out: " << outputs[0] << '\n';
         return false;
     }
 
@@ -301,6 +303,8 @@ public:
             tsEnd = numTimeStamps;
             tsInc = 1;
         }
+        std::cout << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" << '\n';
+        std::cout << tsStart << " " << tsEnd << '\n';
         for (int ts = tsStart; ts != tsEnd; ts += tsInc)
         {
             Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
@@ -314,6 +318,7 @@ public:
             Mat gateF = gates.colRange(1*numOut, 2*numOut);
             Mat gateO = gates.colRange(2*numOut, 3*numOut);
             Mat gateG = gates.colRange(3*numOut, 4*numOut);
+            std::cout << "i " << gateI << '\n';
 
             if (forgetBias)
                 add(gateF, forgetBias, gateF);
@@ -329,6 +334,7 @@ public:
             {
                 Mat gatesIFO = gates.colRange(0, 3*numOut);
                 sigmoid(gatesIFO, gatesIFO);
+                std::cout << "ifo " << gatesIFO << '\n';
             }
 
             tanh(gateG, gateG);
@@ -345,12 +351,15 @@ public:
             }
             if (usePeephole)
             {
+                std::cout << "if (usePeephole)" << '\n';
                 gemm(cInternal, blobs[5], 1, gateO, 1, gateO);
                 sigmoid(gateO, gateO);
             }
 
             //compute h_t
             tanh(cInternal, hInternal);
+            std::cout << "o " << gateO << '\n';
+            std::cout << "tanh(o) " << hInternal << '\n';
             multiply(gateO, hInternal, hInternal);
 
             //save results in output blobs
@@ -358,6 +367,7 @@ public:
             if (produceCellOutput)
                 cInternal.copyTo(cOutTs.rowRange(curRowRange));
         }
+        std::cout << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" << '\n';
     }
 };
 
diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
index fe96927840..6693a75ff4 100644
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@@ -290,6 +290,30 @@ public:
     }
 };
 
+// // To remove Squeeze after LSTM for non-bidirectional LSTM
+// class LSTMSqueeze : public Subgraph
+// {
+// public:
+//     LSTMSqueeze()
+//     {
+//         int input = addNodeToMatch("");
+//
+//         std::vector<int> lstmInps(7);
+//         lstmInps[0] = input;
+//
+//         for (int i = 1; i < 4; ++i)
+//             lstmInps[i] = addNodeToMatch("Unsqueeze");
+//         lstmInps[4] = addNodeToMatch("");
+//         for (int i = 5; i < 7; ++i)
+//             lstmInps[i] = addNodeToMatch("ConstantOfShape");
+//
+//         int lstm = addNodeToMatch("LSTM", lstmInps);
+//         addNodeToMatch("Squeeze", lstm);
+//
+//         setFusedNode("LSTM", lstmInps);
+//     }
+// };
+
 void simplifySubgraphs(opencv_onnx::GraphProto& net)
 {
     std::vector<Ptr<Subgraph> > subgraphs;
@@ -299,6 +323,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
     subgraphs.push_back(makePtr<ResizeSubgraph1>());
     subgraphs.push_back(makePtr<ResizeSubgraph2>());
     subgraphs.push_back(makePtr<SoftMaxSubgraph>());
+    // subgraphs.push_back(makePtr<LSTMSqueeze>());
 
     simplifySubgraphs(Ptr<ImportGraphWrapper>(new ONNXGraphWrapper(net)), subgraphs);
 }
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 7913fa729d..bcf3d28eed 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -322,7 +322,7 @@ void ONNXImporter::populateNet(Net dstNet)
 
         std::string layer_type = node_proto.op_type();
         layerParams.type = layer_type;
-
+        std::cout << layerParams.name << " " << layer_type << '\n';
 
         if (layer_type == "MaxPool")
         {
@@ -457,6 +457,19 @@ void ONNXImporter::populateNet(Net dstNet)
                 constBlobs.insert(std::make_pair(layerParams.name, sliced[0]));
                 continue;
             }
+
+            layerParams.set("begin", DictValue::arrayInt(&begin[0], begin.size()));
+            layerParams.set("end", DictValue::arrayInt(&end[0], end.size()));
+
+            CV_Assert(node_proto.input_size() == 1);
+            if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
+            {
+                std::vector<Mat> inputs(1, getBlob(node_proto, constBlobs, 0)), sliced;
+                runLayer(layerParams, inputs, sliced);
+                CV_Assert(sliced.size() == 1);
+                constBlobs.insert(std::make_pair(layerParams.name, sliced[0]));
+                continue;
+            }
         }
         else if (layer_type == "Split")
         {
@@ -579,6 +592,117 @@ void ONNXImporter::populateNet(Net dstNet)
             constBlobs.insert(std::make_pair(layerParams.name, layerParams.blobs[0]));
             continue;
         }
+        else if (layer_type == "ConstantFill" || layer_type == "ConstantOfShape")
+        {
+            CV_Assert_N(node_proto.input_size());
+            MatShape inpShape = getBlob(node_proto, constBlobs, 0);
+            float value = layerParams.get("value", 0);
+            Mat fill(inpShape.size(), &inpShape[0], CV_32F, Scalar(value));
+            constBlobs.insert(std::make_pair(layerParams.name, fill));
+            continue;
+        }
+        else if (layer_type == "LSTM")
+        {
+            std::cout << "~~~~~~" << '\n';
+            std::cout << layerParams << '\n';
+            for (int i = 1; i < node_proto.input_size(); ++i) {
+              std::cout << "i: " << node_proto.input(i) << " " << constBlobs[node_proto.input(i)].size << '\n';
+            }
+
+            CV_Assert(node_proto.input_size() == 7);
+            Mat Wx = getBlob(node_proto, constBlobs, 1);
+            Mat Wh = getBlob(node_proto, constBlobs, 2);
+            Mat b = getBlob(node_proto, constBlobs, 3);
+
+
+            std::cout << Wx.size << '\n';
+            std::cout << Wh.size << '\n';
+
+            int Wx_shape[] = {Wx.size[1], Wx.size[2]};
+            int Wh_shape[] = {Wh.size[1], Wh.size[2]};
+            std::cout << "b.size " <<  b.size << '\n';
+            int b_shape[] = {2, b.size[1] / 2};
+
+            Wx = Wx.reshape(1, 2, &Wx_shape[0]);
+            b = b.reshape(1, 2, &b_shape[0]);
+
+            std::cout << "b ----------------" << '\n';
+
+            std::cout << b << '\n';
+            reduce(b, b, 0, REDUCE_SUM);
+            std::cout << b << '\n';
+
+            // https://pytorch.org/docs/stable/nn.html#lstm
+            // IFGO->IFOG
+            // swap each 3rd and 4th rows
+            // Wx = Wx.t();
+
+            float* weightData = (float*)Wx.data;
+            std::swap(weightData[1], weightData[2]);
+
+            float* biasData = (float*)b.data;
+            std::swap(biasData[1], biasData[2]);
+
+            // std::swap(weightData[2], weightData[3]);
+            //
+            // weightData = (float*)Wh.data;
+            // std::swap(weightData[1], weightData[2]);
+            // std::swap(weightData[2], weightData[3]);
+
+
+            // const int outSize = Wx.cols / 4;
+            // for (int i = 0; i < Wx.rows; ++i)
+            //     for (int j = 0; j < outSize; ++j)
+            //     {
+            //         // std::swap(weightData[i * W.cols + 1 * outSize + j],
+            //         //           weightData[i * W.cols + 2 * outSize + j]);
+            //         std::swap(weightData[i * Wx.cols + 2 * outSize + j],
+            //                   weightData[i * Wx.cols + 3 * outSize + j]);
+            //     }
+
+            // float* weightData = Wx.ptr<float>();
+            // for (int j = 0; j < 5; ++j)
+            // {
+            //     std::cout << "swap " << (10 + j) << " " << (15 + j) << '\n';
+            //     for (int i = 0; i < 12; ++i)
+            //         std::swap(weightData[(10 + j) * 12 + i],
+            //                   weightData[(15 + j) * 12 + i]);
+            // }
+
+            layerParams.blobs.resize(3);
+            layerParams.blobs[0] = Wh.reshape(1, 2, &Wh_shape[0]);
+            layerParams.blobs[1] = Wx;
+            layerParams.blobs[2] = b;
+
+            std::cout << "Wx" << '\n';
+            std::cout << layerParams.blobs[1] << '\n';
+
+            std::cout << "Wh" << '\n';
+            std::cout << layerParams.blobs[0] << '\n';
+
+            // layerParams.set("reverse", true);
+
+
+            // layerParams.set("use_peephole", true);
+            // layerParams.blobs.resize(6);
+            // for (int i = 0; i < 3; ++i)
+            // {
+            //     Mat w = Mat::eye(layerParams.blobs[0].cols, layerParams.blobs[0].cols, CV_32F);
+            //     layerParams.blobs[3 + i] = w;
+            // }
+
+            // std::cout << layerParams.blobs[1] << '\n';
+
+            // int lstmId = dstNet.addLayer(layerParams.name, layerParams.type, layerParams);
+            //
+            // layerParams = LayerParams();
+            //
+            // // Add reshape
+            // int shape[] = {1, 10, 11, 5};
+            // layerParams.name = node_proto.output(0) + "/reshape";
+            // layerParams.type = "Reshape";
+            // layerParams.set("dim", DictValue::arrayInt(&shape[0], 4));
+        }
         else if (layer_type == "ImageScaler")
         {
             const float scale = layerParams.has("scale") ? layerParams.get<float>("scale") : 1.0f;
@@ -881,14 +1005,14 @@ void ONNXImporter::populateNet(Net dstNet)
         else if (layer_type == "Squeeze")
         {
             CV_Assert_N(node_proto.input_size() == 1, layerParams.has("axes"));
-            DictValue axes_dict = layerParams.get("axes");
-            if (axes_dict.size() != 1)
-                CV_Error(Error::StsNotImplemented, "Multidimensional squeeze");
-
-            int axis = axes_dict.getIntValue(0);
-            layerParams.set("axis", axis - 1);
-            layerParams.set("end_axis", axis);
-            layerParams.type = "Flatten";
+            // DictValue axes_dict = layerParams.get("axes");
+            // if (axes_dict.size() != 1)
+            //     CV_Error(Error::StsNotImplemented, "Multidimensional squeeze");
+            //
+            // int axis = axes_dict.getIntValue(0);
+            // layerParams.set("axis", axis - 1);
+            // layerParams.set("end_axis", axis);
+            layerParams.type = "Identity";
         }
         else if (layer_type == "Flatten")
         {
@@ -1032,17 +1156,30 @@ void ONNXImporter::populateNet(Net dstNet)
         else if (layer_type == "Gather")
         {
             CV_Assert(node_proto.input_size() == 2);
-            CV_Assert(layerParams.has("axis"));
             Mat input = getBlob(node_proto, constBlobs, 0);
             Mat indexMat = getBlob(node_proto, constBlobs, 1);
             CV_Assert_N(indexMat.type() == CV_32S, indexMat.total() == 1);
             int index = indexMat.at<int>(0);
-            int axis = layerParams.get<int>("axis");
 
-            std::vector<cv::Range> ranges(input.dims, Range::all());
-            ranges[axis] = Range(index, index + 1);
+            Mat out;
+            if (layerParams.has("axis"))
+            {
+                int axis = layerParams.get<int>("axis");
+
+                std::vector<cv::Range> ranges(input.dims, Range::all());
+                ranges[axis] = Range(index, index + 1);
 
-            Mat out = input(ranges);
+                out = input(ranges);
+            }
+            else
+            {
+                CV_Assert(index < input.total());
+                const int dims = input.dims;
+                input = input.reshape(1, 1);
+                input.dims = 2;
+                out = input.reshape(1, 1).colRange(index, index + 1);
+                out.dims = dims;
+            }
             constBlobs.insert(std::make_pair(layerParams.name, out));
             continue;
         }
diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index fe7e47f7a0..60ba6d39c5 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -1826,10 +1826,12 @@ void TFImporter::populateNet(Net dstNet)
             const int outSize = W.cols / 4;
 
             // IGFO->IFOG
+            std::cout << "(TF) W " << W.size << '\n';
             float* weightData = (float*)W.data;
             for (int i = 0; i < W.rows; ++i)
                 for (int j = 0; j < outSize; ++j)
                 {
+                    // std::cout << "swap " << i * W.cols + 1 * outSize << " " << i * W.cols + 2 * outSize << '\n';
                     std::swap(weightData[i * W.cols + 1 * outSize + j],
                               weightData[i * W.cols + 2 * outSize + j]);
                     std::swap(weightData[i * W.cols + 2 * outSize + j],
@@ -1838,6 +1840,11 @@ void TFImporter::populateNet(Net dstNet)
             Wx = W.rowRange(0, W.rows - outSize).t();
             Wh = W.rowRange(W.rows - outSize, W.rows).t();
 
+            std::cout << "(TF) Wx " << Wx.size << '\n';
+            std::cout << "(TF) Wh " << Wh.size << '\n';
+            std::cout << "(TF) b " << b.size << '\n';
+
+
             layerParams.blobs.resize(3);
             layerParams.blobs[0] = Wh;
             layerParams.blobs[1] = Wx;
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 40110d2542..c5b243b8ab 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -79,6 +79,12 @@ public:
             netSoftmax.setInput(ref);
             ref = netSoftmax.forward();
         }
+        std::cout << "ref: " << ref.size << '\n';
+        std::cout << "out: " << out.size << '\n';
+        std::cout << ref.reshape(1, 1) << '\n';
+        std::cout << '\n';
+        std::cout << out.reshape(1, 1) << '\n';
+
         normAssert(ref, out, "", l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
         if (checkNoFallbacks)
             expectNoFallbacksFromIE(net);
@@ -451,6 +457,11 @@ TEST_P(Test_ONNX_layers, Split_EltwiseMax)
     testONNXModels("split_max");
 }
 
+TEST_P(Test_ONNX_layers, LSTM)
+{
+    testONNXModels("lstm");
+}
+
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_ONNX_layers, dnnBackendsAndTargets());
 
 class Test_ONNX_nets : public Test_ONNX_layers

From 8d69dbdf49f52c3610187753430de293dce823d0 Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev <dmitry.kurtaev+github@gmail.com>
Date: Sun, 15 Mar 2020 23:21:58 +0300
Subject: [PATCH 02/12] LSTM from ONNX works

---
 modules/dnn/src/layers/recurrent_layers.cpp   |  10 -
 .../dnn/src/onnx/onnx_graph_simplifier.cpp    |  25 ---
 modules/dnn/src/onnx/onnx_importer.cpp        | 186 +++++++-----------
 modules/dnn/src/tensorflow/tf_importer.cpp    |   7 -
 modules/dnn/test/test_onnx_importer.cpp       |   6 -
 5 files changed, 66 insertions(+), 168 deletions(-)

diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index a3962db127..3f9a229516 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -215,8 +215,6 @@ public:
         internals.push_back(shape(_numSamples, 1)); // dummyOnes
         internals.push_back(shape(_numSamples, 4*_numOut)); // gates
 
-
-        std::cout << "LSTM out: " << outputs[0] << '\n';
         return false;
     }
 
@@ -303,8 +301,6 @@ public:
             tsEnd = numTimeStamps;
             tsInc = 1;
         }
-        std::cout << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" << '\n';
-        std::cout << tsStart << " " << tsEnd << '\n';
         for (int ts = tsStart; ts != tsEnd; ts += tsInc)
         {
             Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
@@ -318,7 +314,6 @@ public:
             Mat gateF = gates.colRange(1*numOut, 2*numOut);
             Mat gateO = gates.colRange(2*numOut, 3*numOut);
             Mat gateG = gates.colRange(3*numOut, 4*numOut);
-            std::cout << "i " << gateI << '\n';
 
             if (forgetBias)
                 add(gateF, forgetBias, gateF);
@@ -334,7 +329,6 @@ public:
             {
                 Mat gatesIFO = gates.colRange(0, 3*numOut);
                 sigmoid(gatesIFO, gatesIFO);
-                std::cout << "ifo " << gatesIFO << '\n';
             }
 
             tanh(gateG, gateG);
@@ -351,15 +345,12 @@ public:
             }
             if (usePeephole)
             {
-                std::cout << "if (usePeephole)" << '\n';
                 gemm(cInternal, blobs[5], 1, gateO, 1, gateO);
                 sigmoid(gateO, gateO);
             }
 
             //compute h_t
             tanh(cInternal, hInternal);
-            std::cout << "o " << gateO << '\n';
-            std::cout << "tanh(o) " << hInternal << '\n';
             multiply(gateO, hInternal, hInternal);
 
             //save results in output blobs
@@ -367,7 +358,6 @@ public:
             if (produceCellOutput)
                 cInternal.copyTo(cOutTs.rowRange(curRowRange));
         }
-        std::cout << "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" << '\n';
     }
 };
 
diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
index 6693a75ff4..fe96927840 100644
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@@ -290,30 +290,6 @@ public:
     }
 };
 
-// // To remove Squeeze after LSTM for non-bidirectional LSTM
-// class LSTMSqueeze : public Subgraph
-// {
-// public:
-//     LSTMSqueeze()
-//     {
-//         int input = addNodeToMatch("");
-//
-//         std::vector<int> lstmInps(7);
-//         lstmInps[0] = input;
-//
-//         for (int i = 1; i < 4; ++i)
-//             lstmInps[i] = addNodeToMatch("Unsqueeze");
-//         lstmInps[4] = addNodeToMatch("");
-//         for (int i = 5; i < 7; ++i)
-//             lstmInps[i] = addNodeToMatch("ConstantOfShape");
-//
-//         int lstm = addNodeToMatch("LSTM", lstmInps);
-//         addNodeToMatch("Squeeze", lstm);
-//
-//         setFusedNode("LSTM", lstmInps);
-//     }
-// };
-
 void simplifySubgraphs(opencv_onnx::GraphProto& net)
 {
     std::vector<Ptr<Subgraph> > subgraphs;
@@ -323,7 +299,6 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
     subgraphs.push_back(makePtr<ResizeSubgraph1>());
     subgraphs.push_back(makePtr<ResizeSubgraph2>());
     subgraphs.push_back(makePtr<SoftMaxSubgraph>());
-    // subgraphs.push_back(makePtr<LSTMSqueeze>());
 
     simplifySubgraphs(Ptr<ImportGraphWrapper>(new ONNXGraphWrapper(net)), subgraphs);
 }
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index bcf3d28eed..2bcba9e6ad 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -322,7 +322,7 @@ void ONNXImporter::populateNet(Net dstNet)
 
         std::string layer_type = node_proto.op_type();
         layerParams.type = layer_type;
-        std::cout << layerParams.name << " " << layer_type << '\n';
+
 
         if (layer_type == "MaxPool")
         {
@@ -457,19 +457,6 @@ void ONNXImporter::populateNet(Net dstNet)
                 constBlobs.insert(std::make_pair(layerParams.name, sliced[0]));
                 continue;
             }
-
-            layerParams.set("begin", DictValue::arrayInt(&begin[0], begin.size()));
-            layerParams.set("end", DictValue::arrayInt(&end[0], end.size()));
-
-            CV_Assert(node_proto.input_size() == 1);
-            if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
-            {
-                std::vector<Mat> inputs(1, getBlob(node_proto, constBlobs, 0)), sliced;
-                runLayer(layerParams, inputs, sliced);
-                CV_Assert(sliced.size() == 1);
-                constBlobs.insert(std::make_pair(layerParams.name, sliced[0]));
-                continue;
-            }
         }
         else if (layer_type == "Split")
         {
@@ -592,116 +579,43 @@ void ONNXImporter::populateNet(Net dstNet)
             constBlobs.insert(std::make_pair(layerParams.name, layerParams.blobs[0]));
             continue;
         }
-        else if (layer_type == "ConstantFill" || layer_type == "ConstantOfShape")
-        {
-            CV_Assert_N(node_proto.input_size());
-            MatShape inpShape = getBlob(node_proto, constBlobs, 0);
-            float value = layerParams.get("value", 0);
-            Mat fill(inpShape.size(), &inpShape[0], CV_32F, Scalar(value));
-            constBlobs.insert(std::make_pair(layerParams.name, fill));
-            continue;
-        }
         else if (layer_type == "LSTM")
         {
-            std::cout << "~~~~~~" << '\n';
-            std::cout << layerParams << '\n';
-            for (int i = 1; i < node_proto.input_size(); ++i) {
-              std::cout << "i: " << node_proto.input(i) << " " << constBlobs[node_proto.input(i)].size << '\n';
-            }
-
+            // https://pytorch.org/docs/stable/nn.html#lstm
             CV_Assert(node_proto.input_size() == 7);
             Mat Wx = getBlob(node_proto, constBlobs, 1);
             Mat Wh = getBlob(node_proto, constBlobs, 2);
             Mat b = getBlob(node_proto, constBlobs, 3);
 
+            const int numHidden = Wh.size[2];
 
-            std::cout << Wx.size << '\n';
-            std::cout << Wh.size << '\n';
-
-            int Wx_shape[] = {Wx.size[1], Wx.size[2]};
-            int Wh_shape[] = {Wh.size[1], Wh.size[2]};
-            std::cout << "b.size " <<  b.size << '\n';
-            int b_shape[] = {2, b.size[1] / 2};
-
-            Wx = Wx.reshape(1, 2, &Wx_shape[0]);
-            b = b.reshape(1, 2, &b_shape[0]);
-
-            std::cout << "b ----------------" << '\n';
-
-            std::cout << b << '\n';
+            Wx = Wx.reshape(1, Wx.size[1]);
+            Wh = Wh.reshape(1, Wh.size[1]);
+            b = b.reshape(1, 2);
             reduce(b, b, 0, REDUCE_SUM);
-            std::cout << b << '\n';
-
-            // https://pytorch.org/docs/stable/nn.html#lstm
-            // IFGO->IFOG
-            // swap each 3rd and 4th rows
-            // Wx = Wx.t();
-
-            float* weightData = (float*)Wx.data;
-            std::swap(weightData[1], weightData[2]);
 
+            // IFGO->IGFO
+            float* WxData = (float*)Wx.data;
+            float* WhData = (float*)Wh.data;
             float* biasData = (float*)b.data;
-            std::swap(biasData[1], biasData[2]);
-
-            // std::swap(weightData[2], weightData[3]);
-            //
-            // weightData = (float*)Wh.data;
-            // std::swap(weightData[1], weightData[2]);
-            // std::swap(weightData[2], weightData[3]);
-
-
-            // const int outSize = Wx.cols / 4;
-            // for (int i = 0; i < Wx.rows; ++i)
-            //     for (int j = 0; j < outSize; ++j)
-            //     {
-            //         // std::swap(weightData[i * W.cols + 1 * outSize + j],
-            //         //           weightData[i * W.cols + 2 * outSize + j]);
-            //         std::swap(weightData[i * Wx.cols + 2 * outSize + j],
-            //                   weightData[i * Wx.cols + 3 * outSize + j]);
-            //     }
-
-            // float* weightData = Wx.ptr<float>();
-            // for (int j = 0; j < 5; ++j)
-            // {
-            //     std::cout << "swap " << (10 + j) << " " << (15 + j) << '\n';
-            //     for (int i = 0; i < 12; ++i)
-            //         std::swap(weightData[(10 + j) * 12 + i],
-            //                   weightData[(15 + j) * 12 + i]);
-            // }
-
+            for (int j = 0; j < numHidden; ++j)
+            {
+                for (int i = 0; i < Wx.cols; ++i)
+                {
+                    std::swap(WxData[(numHidden + j) * Wx.cols + i],
+                              WxData[(numHidden * 2 + j) * Wx.cols + i]);
+                }
+                for (int i = 0; i < Wh.cols; ++i)
+                {
+                    std::swap(WhData[(numHidden + j) * Wh.cols + i],
+                              WhData[(numHidden * 2 + j) * Wh.cols + i]);
+                }
+                std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]);
+            }
             layerParams.blobs.resize(3);
-            layerParams.blobs[0] = Wh.reshape(1, 2, &Wh_shape[0]);
+            layerParams.blobs[0] = Wh;
             layerParams.blobs[1] = Wx;
             layerParams.blobs[2] = b;
-
-            std::cout << "Wx" << '\n';
-            std::cout << layerParams.blobs[1] << '\n';
-
-            std::cout << "Wh" << '\n';
-            std::cout << layerParams.blobs[0] << '\n';
-
-            // layerParams.set("reverse", true);
-
-
-            // layerParams.set("use_peephole", true);
-            // layerParams.blobs.resize(6);
-            // for (int i = 0; i < 3; ++i)
-            // {
-            //     Mat w = Mat::eye(layerParams.blobs[0].cols, layerParams.blobs[0].cols, CV_32F);
-            //     layerParams.blobs[3 + i] = w;
-            // }
-
-            // std::cout << layerParams.blobs[1] << '\n';
-
-            // int lstmId = dstNet.addLayer(layerParams.name, layerParams.type, layerParams);
-            //
-            // layerParams = LayerParams();
-            //
-            // // Add reshape
-            // int shape[] = {1, 10, 11, 5};
-            // layerParams.name = node_proto.output(0) + "/reshape";
-            // layerParams.type = "Reshape";
-            // layerParams.set("dim", DictValue::arrayInt(&shape[0], 4));
         }
         else if (layer_type == "ImageScaler")
         {
@@ -1005,14 +919,29 @@ void ONNXImporter::populateNet(Net dstNet)
         else if (layer_type == "Squeeze")
         {
             CV_Assert_N(node_proto.input_size() == 1, layerParams.has("axes"));
-            // DictValue axes_dict = layerParams.get("axes");
-            // if (axes_dict.size() != 1)
-            //     CV_Error(Error::StsNotImplemented, "Multidimensional squeeze");
-            //
-            // int axis = axes_dict.getIntValue(0);
-            // layerParams.set("axis", axis - 1);
-            // layerParams.set("end_axis", axis);
-            layerParams.type = "Identity";
+            DictValue axes_dict = layerParams.get("axes");
+            MatShape inpShape = outShapes[node_proto.input(0)];
+
+            std::vector<bool> maskedAxes(inpShape.size(), false);
+            for (int i = 0; i < axes_dict.size(); ++i)
+            {
+                int axis = axes_dict.getIntValue(i);
+                CV_CheckLT(axis, static_cast<int>(inpShape.size()), "Squeeze axis");  // strict bound: axis indexes maskedAxes/inpShape on the next line, so axis == rank must be rejected
+                maskedAxes[axis] = inpShape[axis] == 1;
+            }
+            MatShape outShape;
+            for (int i = 0; i < inpShape.size(); ++i)
+            {
+                if (!maskedAxes[i])
+                    outShape.push_back(inpShape[i]);
+            }
+            if (outShape.size() != inpShape.size())
+            {
+                layerParams.type = "Reshape";
+                layerParams.set("dim", DictValue::arrayInt(&outShape[0], outShape.size()));
+            }
+            else
+                layerParams.type = "Identity";
         }
         else if (layer_type == "Flatten")
         {
@@ -1142,9 +1071,26 @@ void ONNXImporter::populateNet(Net dstNet)
             else
                 layerParams.type = "Identity";
         }
-        else if (layer_type == "ConstantOfShape")
+        else if (layer_type == "ConstantFill" || layer_type == "ConstantOfShape")
         {
-            float fill_value = layerParams.blobs.empty() ? 0 : layerParams.blobs[0].at<float>(0, 0);
+            CV_Assert_N(node_proto.input_size());
+            MatShape inpShape = getBlob(node_proto, constBlobs, 0);
+            float value = layerParams.get("value", 0);
+            Mat fill(inpShape.size(), &inpShape[0], CV_32F, Scalar(value));
+            constBlobs.insert(std::make_pair(layerParams.name, fill));
+            continue;
+        }
+        else if (layer_type == "ConstantOfShape" || layer_type == "ConstantFill")
+        {
+            float fill_value;
+            if (!layerParams.blobs.empty())
+            {
+                CV_Assert(!layerParams.has("value"));
+                fill_value = layerParams.blobs[0].at<float>(0, 0);
+            }
+            else
+                fill_value = layerParams.get("value", 0);
+
             MatShape inpShape = getBlob(node_proto, constBlobs, 0);
             for (int i = 0; i < inpShape.size(); i++)
                 CV_CheckGT(inpShape[i], 0, "");
diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index 60ba6d39c5..fe7e47f7a0 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -1826,12 +1826,10 @@ void TFImporter::populateNet(Net dstNet)
             const int outSize = W.cols / 4;
 
             // IGFO->IFOG
-            std::cout << "(TF) W " << W.size << '\n';
             float* weightData = (float*)W.data;
             for (int i = 0; i < W.rows; ++i)
                 for (int j = 0; j < outSize; ++j)
                 {
-                    // std::cout << "swap " << i * W.cols + 1 * outSize << " " << i * W.cols + 2 * outSize << '\n';
                     std::swap(weightData[i * W.cols + 1 * outSize + j],
                               weightData[i * W.cols + 2 * outSize + j]);
                     std::swap(weightData[i * W.cols + 2 * outSize + j],
@@ -1840,11 +1838,6 @@ void TFImporter::populateNet(Net dstNet)
             Wx = W.rowRange(0, W.rows - outSize).t();
             Wh = W.rowRange(W.rows - outSize, W.rows).t();
 
-            std::cout << "(TF) Wx " << Wx.size << '\n';
-            std::cout << "(TF) Wh " << Wh.size << '\n';
-            std::cout << "(TF) b " << b.size << '\n';
-
-
             layerParams.blobs.resize(3);
             layerParams.blobs[0] = Wh;
             layerParams.blobs[1] = Wx;
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index c5b243b8ab..a2cd2c3a68 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -79,12 +79,6 @@ public:
             netSoftmax.setInput(ref);
             ref = netSoftmax.forward();
         }
-        std::cout << "ref: " << ref.size << '\n';
-        std::cout << "out: " << out.size << '\n';
-        std::cout << ref.reshape(1, 1) << '\n';
-        std::cout << '\n';
-        std::cout << out.reshape(1, 1) << '\n';
-
         normAssert(ref, out, "", l1 ? l1 : default_l1, lInf ? lInf : default_lInf);
         if (checkNoFallbacks)
             expectNoFallbacksFromIE(net);

From 11d565ca629d5b36993752941472a26244600e79 Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev <dmitry.kurtaev+github@gmail.com>
Date: Wed, 18 Mar 2020 00:00:24 +0300
Subject: [PATCH 03/12] Fix LSTM from ONNX with batch==1

---
 modules/dnn/src/layers/recurrent_layers.cpp |  9 +-
 modules/dnn/src/onnx/onnx_importer.cpp      | 97 ++++++++++++++-------
 2 files changed, 69 insertions(+), 37 deletions(-)

diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index 3f9a229516..26d2ea9de5 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -110,10 +110,11 @@ public:
             const Mat& Wh = blobs[0];
             const Mat& Wx = blobs[1];
             const Mat& bias = blobs[2];
-            CV_Assert(Wh.dims == 2 && Wx.dims == 2);
-            CV_Assert(Wh.rows == Wx.rows);
-            CV_Assert(Wh.rows == 4*Wh.cols);
-            CV_Assert(Wh.rows == (int)bias.total());
+            CV_CheckEQ(Wh.dims, 2, "");
+            CV_CheckEQ(Wx.dims, 2, "");
+            CV_CheckEQ(Wh.rows, Wx.rows, "");
+            CV_CheckEQ(Wh.rows, 4*Wh.cols, "");
+            CV_CheckEQ(Wh.rows, (int)bias.total(), "");
             CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type());
 
             // Peephole weights.
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 2bcba9e6ad..b243a986e7 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -49,6 +49,11 @@ class ONNXImporter
     LayerParams getLayerParams(const opencv_onnx::NodeProto& node_proto);
     bool isCeilMode(const LayerParams& layerParams);
 
+    void addLayer(Net& dstNet, LayerParams& layerParams,
+                  const opencv_onnx::NodeProto& node_proto,
+                  std::map<std::string, LayerInfo>& layer_id,
+                  std::map<std::string, MatShape>& outShapes);
+
 public:
 
     ONNXImporter(const char *onnxFile)
@@ -259,6 +264,42 @@ Mat ONNXImporter::getBlob(const opencv_onnx::NodeProto& node_proto,
     return constBlob->second;
 }
 
+void ONNXImporter::addLayer(Net& dstNet, LayerParams& layerParams,
+                            const opencv_onnx::NodeProto& node_proto,
+                            std::map<std::string, LayerInfo>& layer_id,
+                            std::map<std::string, MatShape>& outShapes)
+{
+    std::map<std::string, LayerInfo>::iterator layerId;
+    std::map<std::string, MatShape>::iterator shapeIt;
+
+    int id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams);
+    for (int i = 0; i < node_proto.output_size(); ++i)
+    {
+        layer_id.insert(std::make_pair(node_proto.output(i), LayerInfo(id, i)));
+    }
+
+    std::vector<MatShape> layerInpShapes, layerOutShapes, layerInternalShapes;
+    int inpNum = 0;
+    for (int j = 0; j < node_proto.input_size(); j++) {
+        layerId = layer_id.find(node_proto.input(j));
+        if (layerId != layer_id.end()) {
+            dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, inpNum);
+            ++inpNum;
+            // Collect input shapes.
+            shapeIt = outShapes.find(node_proto.input(j));
+            CV_Assert(shapeIt != outShapes.end());
+            layerInpShapes.push_back(shapeIt->second);
+        }
+    }
+    // Compute shape of output blob for this layer.
+    Ptr<Layer> layer = dstNet.getLayer(id);
+    layer->getMemoryShapes(layerInpShapes, 0, layerOutShapes, layerInternalShapes);
+    for (int i = 0; i < node_proto.output_size() && i < (int)layerOutShapes.size(); ++i)
+    {
+        outShapes[node_proto.output(i)] = layerOutShapes[i];
+    }
+}
+
 void ONNXImporter::populateNet(Net dstNet)
 {
     CV_Assert(model_proto.has_graph());
@@ -581,13 +622,16 @@ void ONNXImporter::populateNet(Net dstNet)
         }
         else if (layer_type == "LSTM")
         {
+            LayerParams lstmParams = layerParams;
+            lstmParams.name += "/lstm";
+
             // https://pytorch.org/docs/stable/nn.html#lstm
             CV_Assert(node_proto.input_size() == 7);
             Mat Wx = getBlob(node_proto, constBlobs, 1);
             Mat Wh = getBlob(node_proto, constBlobs, 2);
             Mat b = getBlob(node_proto, constBlobs, 3);
 
-            const int numHidden = Wh.size[2];
+            const int numHidden = lstmParams.get<int>("hidden_size");
 
             Wx = Wx.reshape(1, Wx.size[1]);
             Wh = Wh.reshape(1, Wh.size[1]);
@@ -612,10 +656,24 @@ void ONNXImporter::populateNet(Net dstNet)
                 }
                 std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]);
             }
-            layerParams.blobs.resize(3);
-            layerParams.blobs[0] = Wh;
-            layerParams.blobs[1] = Wx;
-            layerParams.blobs[2] = b;
+
+            lstmParams.blobs.resize(3);
+            lstmParams.blobs[0] = Wh;
+            lstmParams.blobs[1] = Wx;
+            lstmParams.blobs[2] = b;
+
+            node_proto.set_output(0, lstmParams.name);  // set different name so output shapes will be registered on that name
+            addLayer(dstNet, lstmParams, node_proto, layer_id, outShapes);
+
+            MatShape lstmShape = outShapes[node_proto.output(0)];
+
+            // Add fake 1 as it is done in ONNX
+            lstmShape.insert(lstmShape.begin() + 1, 1);
+
+            layerParams.type = "Reshape";
+            layerParams.set("dim", DictValue::arrayInt(&lstmShape[0], lstmShape.size()));
+            node_proto.set_input(0, lstmParams.name);  // redirect input to LSTM
+            node_proto.set_output(0, layerParams.name);  // keep origin LSTM's name
         }
         else if (layer_type == "ImageScaler")
         {
@@ -1228,34 +1286,7 @@ void ONNXImporter::populateNet(Net dstNet)
                     layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j));
             }
         }
-
-        int id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams);
-        for (int i = 0; i < node_proto.output_size(); ++i)
-        {
-            layer_id.insert(std::make_pair(node_proto.output(i), LayerInfo(id, i)));
-        }
-
-        std::vector<MatShape> layerInpShapes, layerOutShapes, layerInternalShapes;
-        int inpNum = 0;
-        for (int j = 0; j < node_proto.input_size(); j++) {
-            layerId = layer_id.find(node_proto.input(j));
-            if (layerId != layer_id.end()) {
-                dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, inpNum);
-                ++inpNum;
-                // Collect input shapes.
-                shapeIt = outShapes.find(node_proto.input(j));
-                CV_Assert(shapeIt != outShapes.end());
-                layerInpShapes.push_back(shapeIt->second);
-            }
-        }
-
-        // Compute shape of output blob for this layer.
-        Ptr<Layer> layer = dstNet.getLayer(id);
-        layer->getMemoryShapes(layerInpShapes, 0, layerOutShapes, layerInternalShapes);
-        for (int i = 0; i < node_proto.output_size() && i < (int)layerOutShapes.size(); ++i)
-        {
-            outShapes[node_proto.output(i)] = layerOutShapes[i];
-        }
+        addLayer(dstNet, layerParams, node_proto, layer_id, outShapes);
     }
 }
 

From 0fb4f2cc9c31eb5345f72881bd537557cb0b8241 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 20 Mar 2020 21:04:29 +0000
Subject: [PATCH 04/12] imgproc: add src.empty() checks in filter operations

---
 .../imgproc/src/bilateral_filter.dispatch.cpp |  2 ++
 modules/imgproc/src/box_filter.dispatch.cpp   |  4 +++
 modules/imgproc/src/deriv.cpp                 |  6 ++++
 modules/imgproc/src/filter.dispatch.cpp       | 15 +++++++++
 modules/imgproc/src/median_blur.dispatch.cpp  |  2 ++
 modules/imgproc/src/morph.dispatch.cpp        |  8 +++++
 modules/imgproc/src/smooth.dispatch.cpp       |  2 ++
 modules/imgproc/test/test_filter.cpp          | 33 +++++++++++++++++++
 modules/ts/include/opencv2/ts/ts_ext.hpp      | 31 +++++++++++++++++
 9 files changed, 103 insertions(+)

diff --git a/modules/imgproc/src/bilateral_filter.dispatch.cpp b/modules/imgproc/src/bilateral_filter.dispatch.cpp
index a27ebb18f5..ed0e71bbab 100644
--- a/modules/imgproc/src/bilateral_filter.dispatch.cpp
+++ b/modules/imgproc/src/bilateral_filter.dispatch.cpp
@@ -406,6 +406,8 @@ void bilateralFilter( InputArray _src, OutputArray _dst, int d,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src.empty());
+
     _dst.create( _src.size(), _src.type() );
 
     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
diff --git a/modules/imgproc/src/box_filter.dispatch.cpp b/modules/imgproc/src/box_filter.dispatch.cpp
index 054e7474c6..c9ec693385 100644
--- a/modules/imgproc/src/box_filter.dispatch.cpp
+++ b/modules/imgproc/src/box_filter.dispatch.cpp
@@ -443,6 +443,8 @@ void boxFilter(InputArray _src, OutputArray _dst, int ddepth,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src.empty());
+
     CV_OCL_RUN(_dst.isUMat() &&
                (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT ||
                 borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101),
@@ -514,6 +516,8 @@ void sqrBoxFilter(InputArray _src, OutputArray _dst, int ddepth,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src.empty());
+
     int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType);
     Size size = _src.size();
 
diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp
index fa9defb405..1248ea1888 100644
--- a/modules/imgproc/src/deriv.cpp
+++ b/modules/imgproc/src/deriv.cpp
@@ -416,6 +416,8 @@ void cv::Sobel( InputArray _src, OutputArray _dst, int ddepth, int dx, int dy,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src.empty());
+
     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
     if (ddepth < 0)
         ddepth = sdepth;
@@ -468,6 +470,8 @@ void cv::Scharr( InputArray _src, OutputArray _dst, int ddepth, int dx, int dy,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src.empty());
+
     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
     if (ddepth < 0)
         ddepth = sdepth;
@@ -785,6 +789,8 @@ void cv::Laplacian( InputArray _src, OutputArray _dst, int ddepth, int ksize,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src.empty());
+
     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
     if (ddepth < 0)
         ddepth = sdepth;
diff --git a/modules/imgproc/src/filter.dispatch.cpp b/modules/imgproc/src/filter.dispatch.cpp
index 65a066b57b..d39c749121 100644
--- a/modules/imgproc/src/filter.dispatch.cpp
+++ b/modules/imgproc/src/filter.dispatch.cpp
@@ -169,6 +169,9 @@ int FilterEngine::start(const Size& _wholeSize, const Size& sz, const Point& ofs
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!sz.empty());
+    CV_Assert(!_wholeSize.empty());
+
     CV_CPU_DISPATCH(FilterEngine__start, (*this, _wholeSize, sz, ofs),
         CV_CPU_DISPATCH_MODES_ALL);
 }
@@ -176,6 +179,11 @@ int FilterEngine::start(const Size& _wholeSize, const Size& sz, const Point& ofs
 
 int FilterEngine::start(const Mat& src, const Size &wsz, const Point &ofs)
 {
+    CV_INSTRUMENT_REGION();
+
+    CV_Assert(!src.empty());
+    CV_Assert(!wsz.empty());
+
     start( wsz, src.size(), ofs);
     return startY - ofs.y;
 }
@@ -1398,6 +1406,9 @@ void filter2D(InputArray _src, OutputArray _dst, int ddepth,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src.empty());
+    CV_Assert(!_kernel.empty());
+
     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
                ocl_filter2D(_src, _dst, ddepth, _kernel, anchor0, delta, borderType))
 
@@ -1429,6 +1440,10 @@ void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src.empty());
+    CV_Assert(!_kernelX.empty());
+    CV_Assert(!_kernelY.empty());
+
     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > _kernelY.total() && (size_t)_src.cols() > _kernelX.total(),
                ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))
 
diff --git a/modules/imgproc/src/median_blur.dispatch.cpp b/modules/imgproc/src/median_blur.dispatch.cpp
index 79333f5f80..afef09f579 100644
--- a/modules/imgproc/src/median_blur.dispatch.cpp
+++ b/modules/imgproc/src/median_blur.dispatch.cpp
@@ -280,6 +280,8 @@ void medianBlur( InputArray _src0, OutputArray _dst, int ksize )
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src0.empty());
+
     CV_Assert( (ksize % 2 == 1) && (_src0.dims() <= 2 ));
 
     if( ksize <= 1 || _src0.empty() )
diff --git a/modules/imgproc/src/morph.dispatch.cpp b/modules/imgproc/src/morph.dispatch.cpp
index cbb5315f31..45ae3994d9 100644
--- a/modules/imgproc/src/morph.dispatch.cpp
+++ b/modules/imgproc/src/morph.dispatch.cpp
@@ -939,6 +939,8 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src.empty());
+
     Mat kernel = _kernel.getMat();
     Size ksize = !kernel.empty() ? kernel.size() : Size(3,3);
     anchor = normalizeAnchor(anchor, ksize);
@@ -1005,6 +1007,8 @@ void erode( InputArray src, OutputArray dst, InputArray kernel,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!src.empty());
+
     morphOp( MORPH_ERODE, src, dst, kernel, anchor, iterations, borderType, borderValue );
 }
 
@@ -1015,6 +1019,8 @@ void dilate( InputArray src, OutputArray dst, InputArray kernel,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!src.empty());
+
     morphOp( MORPH_DILATE, src, dst, kernel, anchor, iterations, borderType, borderValue );
 }
 
@@ -1154,6 +1160,8 @@ void morphologyEx( InputArray _src, OutputArray _dst, int op,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src.empty());
+
     Mat kernel = _kernel.getMat();
     if (kernel.empty())
     {
diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp
index c90d0828b8..4ac7df8b4d 100644
--- a/modules/imgproc/src/smooth.dispatch.cpp
+++ b/modules/imgproc/src/smooth.dispatch.cpp
@@ -603,6 +603,8 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
 {
     CV_INSTRUMENT_REGION();
 
+    CV_Assert(!_src.empty());
+
     int type = _src.type();
     Size size = _src.size();
     _dst.create( size, type );
diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp
index 947738f16a..11d87a0abe 100644
--- a/modules/imgproc/test/test_filter.cpp
+++ b/modules/imgproc/test/test_filter.cpp
@@ -2323,4 +2323,37 @@ TEST(Imgproc_Pyrdown, issue_12961)
     ASSERT_EQ(0.0, cv::norm(dst));
 }
 
+
+// https://github.com/opencv/opencv/issues/16857
+TEST(Imgproc, filter_empty_src_16857)
+{
+#define CV_TEST_EXPECT_EMPTY_THROW(statement) CV_TEST_EXPECT_EXCEPTION_MESSAGE(statement, ".empty()")
+
+    Mat src, dst, dst2;
+
+    CV_TEST_EXPECT_EMPTY_THROW(bilateralFilter(src, dst, 5, 50, 20));
+    CV_TEST_EXPECT_EMPTY_THROW(blur(src, dst, Size(3, 3)));
+    CV_TEST_EXPECT_EMPTY_THROW(boxFilter(src, dst, CV_8U, Size(3, 3)));
+    CV_TEST_EXPECT_EMPTY_THROW(sqrBoxFilter(src, dst, CV_8U, Size(3, 3)));
+    CV_TEST_EXPECT_EMPTY_THROW(medianBlur(src, dst, 3));
+    CV_TEST_EXPECT_EMPTY_THROW(GaussianBlur(src, dst, Size(3, 3), 0));
+    CV_TEST_EXPECT_EMPTY_THROW(cv::filter2D(src, dst, CV_8U, Mat_<float>::zeros(Size(3, 3))));
+    CV_TEST_EXPECT_EMPTY_THROW(sepFilter2D(src, dst, CV_8U, Mat_<float>::zeros(Size(3, 1)), Mat_<float>::zeros(Size(1, 3))));
+    CV_TEST_EXPECT_EMPTY_THROW(Sobel(src, dst, CV_8U, 1, 1));
+    CV_TEST_EXPECT_EMPTY_THROW(spatialGradient(src, dst, dst2));
+    CV_TEST_EXPECT_EMPTY_THROW(Scharr(src, dst, CV_8U, 1, 1));
+    CV_TEST_EXPECT_EMPTY_THROW(Laplacian(src, dst, CV_8U));
+
+    CV_TEST_EXPECT_EMPTY_THROW(cv::dilate(src, dst, Mat()));  // cvtest:: by default
+    CV_TEST_EXPECT_EMPTY_THROW(cv::erode(src, dst, Mat()));  // cvtest:: by default
+    CV_TEST_EXPECT_EMPTY_THROW(morphologyEx(src, dst, MORPH_OPEN, Mat()));
+
+    //debug: CV_TEST_EXPECT_EMPTY_THROW(blur(Mat_<uchar>(Size(3,3)), dst, Size(3, 3)));
+
+    EXPECT_TRUE(src.empty());
+    EXPECT_TRUE(dst.empty());
+    EXPECT_TRUE(dst2.empty());
+}
+
+
 }} // namespace
diff --git a/modules/ts/include/opencv2/ts/ts_ext.hpp b/modules/ts/include/opencv2/ts/ts_ext.hpp
index f22a9d42eb..b2a4cac241 100644
--- a/modules/ts/include/opencv2/ts/ts_ext.hpp
+++ b/modules/ts/include/opencv2/ts/ts_ext.hpp
@@ -161,4 +161,35 @@ bool checkBigDataTests();
 #undef TEST_P
 #define TEST_P(test_case_name, test_name) CV__TEST_P(test_case_name, test_name, Body, CV__TEST_BODY_IMPL)
 
+
+#define CV_TEST_EXPECT_EXCEPTION_MESSAGE(statement, msg) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    const char* msg_ = msg; \
+    bool hasException = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (const cv::Exception& e) { \
+      if (NULL == strstr(e.what(), msg_)) \
+        ADD_FAILURE() << "Unexpected cv::Exception is raised: " << #statement << "\n  Expected message substring: '" << msg_ << "'. Actual message:\n" << e.what(); \
+      hasException = true; \
+    } \
+    catch (const std::exception& e) { \
+      ADD_FAILURE() << "Unexpected std::exception is raised: " << #statement << "\n" << e.what(); \
+      hasException = true; \
+    } \
+    catch (...) { \
+      ADD_FAILURE() << "Unexpected C++ exception is raised: " << #statement; \
+      hasException = true; \
+    } \
+    if (!hasException) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_test_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_test_, __LINE__): \
+      ADD_FAILURE() << "Failed: Expected: " #statement " throws an '" << msg << "' exception.\n" \
+           "  Actual: it doesn't."
+
+
 #endif  // OPENCV_TS_EXT_HPP

From 2fb1d9d02e069cbb0e0dee6b7803e8a21f9fc894 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sat, 21 Mar 2020 00:25:49 +0000
Subject: [PATCH 05/12] doc: fix misused "see also" doxygen command

---
 .../js_gui/js_image_display/js_image_display.markdown      | 2 +-
 .../py_calib3d/py_calibration/py_calibration.markdown      | 4 ++--
 .../py_gui/py_image_display/py_image_display.markdown      | 2 +-
 .../py_contours_more_functions.markdown                    | 2 +-
 .../py_histogram_begins/py_histogram_begins.markdown       | 2 +-
 .../ml/introduction_to_svm/introduction_to_svm.markdown    | 2 +-
 .../video-input-psnr-ssim/video_input_psnr_ssim.markdown   | 2 +-
 modules/cudaoptflow/include/opencv2/cudaoptflow.hpp        | 4 ++--
 modules/videoio/include/opencv2/videoio.hpp                | 7 ++++---
 9 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/doc/js_tutorials/js_gui/js_image_display/js_image_display.markdown b/doc/js_tutorials/js_gui/js_image_display/js_image_display.markdown
index efe65e320e..9ad4ce2e53 100644
--- a/doc/js_tutorials/js_gui/js_image_display/js_image_display.markdown
+++ b/doc/js_tutorials/js_gui/js_image_display/js_image_display.markdown
@@ -13,7 +13,7 @@ OpenCV.js saves images as cv.Mat type. We use HTML canvas element to transfer cv
 or in reverse. The ImageData interface can represent or set the underlying pixel data of an area of a
 canvas element.
 
-@sa Please refer to canvas docs for more details.
+@note Please refer to canvas docs for more details.
 
 First, create an ImageData obj from canvas:
 @code{.js}
diff --git a/doc/py_tutorials/py_calib3d/py_calibration/py_calibration.markdown b/doc/py_tutorials/py_calib3d/py_calibration/py_calibration.markdown
index f56e639005..e337999efd 100644
--- a/doc/py_tutorials/py_calib3d/py_calibration/py_calibration.markdown
+++ b/doc/py_tutorials/py_calib3d/py_calibration/py_calibration.markdown
@@ -83,7 +83,7 @@ use 7x6 grid. (Normally a chess board has 8x8 squares and 7x7 internal corners).
 corner points and retval which will be True if pattern is obtained. These corners will be placed in
 an order (from left-to-right, top-to-bottom)
 
-@sa This function may not be able to find the required pattern in all the images. So, one good option
+@note This function may not be able to find the required pattern in all the images. So, one good option
 is to write the code such that, it starts the camera and check each frame for required pattern. Once
 the pattern is obtained, find the corners and store it in a list. Also, provide some interval before
 reading next frame so that we can adjust our chess board in different direction. Continue this
@@ -91,7 +91,7 @@ process until the required number of good patterns are obtained. Even in the exa
 are not sure how many images out of the 14 given are good.  Thus, we must read all the images and take only the good
 ones.
 
-@sa Instead of chess board, we can alternatively use a circular grid.  In this case, we must use the function
+@note Instead of a chess board, we can alternatively use a circular grid.  In this case, we must use the function
 **cv.findCirclesGrid()** to find the pattern. Fewer images are sufficient to perform camera calibration using a circular grid.
 
 Once we find the corners, we can increase their accuracy using **cv.cornerSubPix()**. We can also
diff --git a/doc/py_tutorials/py_gui/py_image_display/py_image_display.markdown b/doc/py_tutorials/py_gui/py_image_display/py_image_display.markdown
index 8b8cae0e73..edb957bd95 100644
--- a/doc/py_tutorials/py_gui/py_image_display/py_image_display.markdown
+++ b/doc/py_tutorials/py_gui/py_image_display/py_image_display.markdown
@@ -132,7 +132,7 @@ A screen-shot of the window will look like this :
 
 ![image](images/matplotlib_screenshot.jpg)
 
-@sa Plenty of plotting options are available in Matplotlib. Please refer to Matplotlib docs for more
+@note Plenty of plotting options are available in Matplotlib. Please refer to Matplotlib docs for more
 details. Some, we will see on the way.
 
 __warning__
diff --git a/doc/py_tutorials/py_imgproc/py_contours/py_contours_more_functions/py_contours_more_functions.markdown b/doc/py_tutorials/py_imgproc/py_contours/py_contours_more_functions/py_contours_more_functions.markdown
index 378099a931..fb5f59bef6 100644
--- a/doc/py_tutorials/py_imgproc/py_contours/py_contours_more_functions/py_contours_more_functions.markdown
+++ b/doc/py_tutorials/py_imgproc/py_contours/py_contours_more_functions/py_contours_more_functions.markdown
@@ -113,7 +113,7 @@ I got following results:
 
 See, even image rotation doesn't affect much on this comparison.
 
-@sa [Hu-Moments](http://en.wikipedia.org/wiki/Image_moment#Rotation_invariant_moments) are seven
+@note [Hu-Moments](http://en.wikipedia.org/wiki/Image_moment#Rotation_invariant_moments) are seven
 moments invariant to translation, rotation and scale. Seventh one is skew-invariant. Those values
 can be found using **cv.HuMoments()** function.
 
diff --git a/doc/py_tutorials/py_imgproc/py_histograms/py_histogram_begins/py_histogram_begins.markdown b/doc/py_tutorials/py_imgproc/py_histograms/py_histogram_begins/py_histogram_begins.markdown
index c26449cad4..8cb24139e8 100644
--- a/doc/py_tutorials/py_imgproc/py_histograms/py_histogram_begins/py_histogram_begins.markdown
+++ b/doc/py_tutorials/py_imgproc/py_histograms/py_histogram_begins/py_histogram_begins.markdown
@@ -94,7 +94,7 @@ hist is same as we calculated before. But bins will have 257 elements, because N
 as 0-0.99, 1-1.99, 2-2.99 etc. So final range would be 255-255.99. To represent that, they also add
 256 at end of bins. But we don't need that 256. Upto 255 is sufficient.
 
-@sa Numpy has another function, **np.bincount()** which is much faster than (around 10X)
+@note Numpy has another function, **np.bincount()**, which is much faster (around 10X) than
 np.histogram(). So for one-dimensional histograms, you can better try that. Don't forget to set
 minlength = 256 in np.bincount. For example, hist = np.bincount(img.ravel(),minlength=256)
 
diff --git a/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown b/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown
index e7bf3f4fb6..d8bf40e92f 100644
--- a/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown
+++ b/doc/tutorials/ml/introduction_to_svm/introduction_to_svm.markdown
@@ -51,7 +51,7 @@ Let's introduce the notation used to define formally a hyperplane:
 
 where \f$\beta\f$ is known as the *weight vector* and \f$\beta_{0}\f$ as the *bias*.
 
-@sa A more in depth description of this and hyperplanes you can find in the section 4.5 (*Separating
+@note A more in-depth description of this and of hyperplanes can be found in section 4.5 (*Separating
 Hyperplanes*) of the book: *Elements of Statistical Learning* by T. Hastie, R. Tibshirani and J. H.
 Friedman (@cite HTF01).
 
diff --git a/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown b/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown
index 96c6637c5f..311ffe4120 100644
--- a/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown
+++ b/doc/tutorials/videoio/video-input-psnr-ssim/video_input_psnr_ssim.markdown
@@ -164,7 +164,7 @@ Describing the methods goes well beyond the purpose of this tutorial. For that I
 the article introducing it. Nevertheless, you can get a good image of it by looking at the OpenCV
 implementation below.
 
-@sa
+@note
     SSIM is described more in-depth in the: "Z. Wang, A. C. Bovik, H. R. Sheikh and E. P.
     Simoncelli, "Image quality assessment: From error visibility to structural similarity," IEEE
     Transactions on Image Processing, vol. 13, no. 4, pp. 600-612, Apr. 2004." article.
diff --git a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
index eb8c5ef6e2..67d502337a 100644
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
@@ -256,8 +256,8 @@ public:
 
 /** @brief Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method.
  *
- * @sa C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
- * @sa Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
+ * @note C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
+ * @note Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
  */
 class CV_EXPORTS OpticalFlowDual_TVL1 : public DenseOpticalFlow
 {
diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp
index d09e62c15d..38a451b244 100644
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@@ -642,7 +642,8 @@ public:
       documentation of source stream to know the right URL.
     @param apiPreference preferred Capture API backends to use. Can be used to enforce a specific reader
     implementation if multiple are available: e.g. cv::CAP_FFMPEG or cv::CAP_IMAGES or cv::CAP_DSHOW.
-    @sa The list of supported API backends cv::VideoCaptureAPIs
+
+    @sa cv::VideoCaptureAPIs
     */
     CV_WRAP VideoCapture(const String& filename, int apiPreference);
 
@@ -653,7 +654,7 @@ public:
     Use a `domain_offset` to enforce a specific reader implementation if multiple are available like cv::CAP_FFMPEG or cv::CAP_IMAGES or cv::CAP_DSHOW.
     e.g. to open Camera 1 using the MS Media Foundation API use `index = 1 + cv::CAP_MSMF`
 
-    @sa The list of supported API backends cv::VideoCaptureAPIs
+    @sa cv::VideoCaptureAPIs
     */
     CV_WRAP VideoCapture(int index);
 
@@ -665,7 +666,7 @@ public:
     @param apiPreference preferred Capture API backends to use. Can be used to enforce a specific reader
     implementation if multiple are available: e.g. cv::CAP_DSHOW or cv::CAP_MSMF or cv::CAP_V4L2.
 
-    @sa The list of supported API backends cv::VideoCaptureAPIs
+    @sa cv::VideoCaptureAPIs
     */
     CV_WRAP VideoCapture(int index, int apiPreference);
 

From d7e839b8c5b24e2d8eb62fd74e5a7e6371c2c483 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sat, 21 Mar 2020 00:51:34 +0000
Subject: [PATCH 06/12] objdetect(QR): avoid bug with empty input

---
 modules/objdetect/src/qrcode.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp
index 4990dfd388..12d89919f9 100644
--- a/modules/objdetect/src/qrcode.cpp
+++ b/modules/objdetect/src/qrcode.cpp
@@ -122,9 +122,16 @@ void QRDetect::init(const Mat& src, double eps_vertical_, double eps_horizontal_
 
     eps_vertical   = eps_vertical_;
     eps_horizontal = eps_horizontal_;
-    adaptiveThreshold(barcode, bin_barcode, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 83, 2);
-    adaptiveThreshold(resized_barcode, resized_bin_barcode, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 83, 2);
 
+    if (!barcode.empty())
+        adaptiveThreshold(barcode, bin_barcode, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 83, 2);
+    else
+        bin_barcode.release();
+
+    if (!resized_barcode.empty())
+        adaptiveThreshold(resized_barcode, resized_bin_barcode, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 83, 2);
+    else
+        resized_bin_barcode.release();
 }
 
 vector<Vec3d> QRDetect::searchHorizontalLines()

From 8253562794b3211486118deb51ee0e8d05969708 Mon Sep 17 00:00:00 2001
From: Andrei-Florin BENCSIK <andrei.bencsik@gmail.com>
Date: Sat, 21 Mar 2020 11:15:07 +0200
Subject: [PATCH 07/12] fix: minor typo in CMakeCVDetectPython

---
 cmake/OpenCVDetectPython.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/OpenCVDetectPython.cmake b/cmake/OpenCVDetectPython.cmake
index 5d0ee4a96b..4ff02a77d3 100644
--- a/cmake/OpenCVDetectPython.cmake
+++ b/cmake/OpenCVDetectPython.cmake
@@ -78,10 +78,10 @@ if(NOT ${found})
         AND NOT DEFINED ${executable}
     )
       if(NOT OPENCV_SKIP_PYTHON_WARNING)
-        message(WARNING "CMake's 'find_host_package(PythonInterp ${__python_package_version})' founds wrong Python version:\n"
+        message(WARNING "CMake's 'find_host_package(PythonInterp ${__python_package_version})' found wrong Python version:\n"
                         "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}\n"
                         "PYTHON_VERSION_STRING=${PYTHON_VERSION_STRING}\n"
-                        "Consider specify '${executable}' variable via CMake command line or environment variables\n")
+                        "Consider providing the '${executable}' variable via CMake command line or environment variables\n")
       endif()
       ocv_clear_vars(PYTHONINTERP_FOUND PYTHON_EXECUTABLE PYTHON_VERSION_STRING PYTHON_VERSION_MAJOR PYTHON_VERSION_MINOR PYTHON_VERSION_PATCH)
       if(NOT CMAKE_VERSION VERSION_LESS "3.12")

From 8433620295891c184ce4edd86bbd5ad6440eda45 Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev <dmitry.kurtaev+github@gmail.com>
Date: Sun, 22 Mar 2020 00:20:36 +0300
Subject: [PATCH 08/12] Bidirectional LSTM

---
 modules/dnn/src/layers/recurrent_layers.cpp | 162 +++++++++++---------
 modules/dnn/src/onnx/onnx_importer.cpp      |  43 +++---
 modules/dnn/test/test_onnx_importer.cpp     |   5 +
 3 files changed, 116 insertions(+), 94 deletions(-)

diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index 26d2ea9de5..69606a6b4e 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -93,6 +93,7 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
     float forgetBias, cellClip;
     bool useCellClip, usePeephole;
     bool reverse;   // If true, go in negative direction along the time axis
+    bool bidirectional;  // If true, produces both forward and reversed directions along time axis
 
 public:
 
@@ -101,6 +102,7 @@ public:
     {
         setParamsFrom(params);
 
+        bidirectional = params.get<bool>("bidirectional", false);
         if (!blobs.empty())
         {
             CV_Assert(blobs.size() >= 3);
@@ -113,7 +115,7 @@ public:
             CV_CheckEQ(Wh.dims, 2, "");
             CV_CheckEQ(Wx.dims, 2, "");
             CV_CheckEQ(Wh.rows, Wx.rows, "");
-            CV_CheckEQ(Wh.rows, 4*Wh.cols, "");
+            CV_CheckEQ(Wh.rows, (1 + static_cast<int>(bidirectional))*4*Wh.cols, "");
             CV_CheckEQ(Wh.rows, (int)bias.total(), "");
             CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type());
 
@@ -136,6 +138,7 @@ public:
         useCellClip = params.get<bool>("use_cell_clip", false);
         usePeephole = params.get<bool>("use_peephole", false);
         reverse = params.get<bool>("reverse", false);
+        CV_Assert(!reverse || !bidirectional);
 
         allocated = false;
         outTailShape.clear();
@@ -207,6 +210,7 @@ public:
 
         outResShape.push_back(_numSamples);
         outResShape.insert(outResShape.end(), outTailShape_.begin(), outTailShape_.end());
+        outResShape.back() *= (1 + static_cast<int>(bidirectional));
 
         size_t noutputs = produceCellOutput ? 2 : 1;
         outputs.assign(noutputs, outResShape);
@@ -253,6 +257,7 @@ public:
         outTsShape.clear();
         outTsShape.push_back(numSamples);
         outTsShape.insert(outTsShape.end(), outTailShape.begin(), outTailShape.end());
+        outTsShape.back() *= (1 + static_cast<int>(bidirectional));
 
         allocated = true;
     }
@@ -273,91 +278,96 @@ public:
         outputs_arr.getMatVector(output);
         internals_arr.getMatVector(internals);
 
-        const Mat &Wh = blobs[0];
-        const Mat &Wx = blobs[1];
-        const Mat &bias = blobs[2];
-
-        int numOut = Wh.size[1];
-
-        Mat hInternal = internals[0], cInternal = internals[1],
-                dummyOnes = internals[2], gates = internals[3];
-        hInternal.setTo(0.);
-        cInternal.setTo(0.);
-        dummyOnes.setTo(1.);
-
-        int numSamplesTotal = numTimeStamps*numSamples;
-        Mat xTs = input[0].reshape(1, numSamplesTotal);
-
-        Mat hOutTs = output[0].reshape(1, numSamplesTotal);
-        Mat cOutTs = produceCellOutput ? output[1].reshape(1, numSamplesTotal) : Mat();
-
-        int tsStart, tsEnd, tsInc;
-        if (reverse) {
-            tsStart = numTimeStamps - 1;
-            tsEnd = -1;
-            tsInc = -1;
-        }
-        else {
-            tsStart = 0;
-            tsEnd = numTimeStamps;
-            tsInc = 1;
-        }
-        for (int ts = tsStart; ts != tsEnd; ts += tsInc)
+        const int numDirs = 1 + static_cast<int>(bidirectional);
+        for (int i = 0; i < numDirs; ++i)
         {
-            Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
-            Mat xCurr = xTs.rowRange(curRowRange);
+            const Mat &Wh = blobs[0].rowRange(i * blobs[0].rows / numDirs, (i + 1) * blobs[0].rows / numDirs);
+            const Mat &Wx = blobs[1].rowRange(i * blobs[1].rows / numDirs, (i + 1) * blobs[1].rows / numDirs);
+            const Mat &bias = blobs[2].colRange(i * blobs[2].cols / numDirs, (i + 1) * blobs[2].cols / numDirs);
+
+            int numOut = Wh.size[1];
+
+            Mat hInternal = internals[0], cInternal = internals[1],
+                    dummyOnes = internals[2], gates = internals[3];
+            hInternal.setTo(0.);
+            cInternal.setTo(0.);
+            dummyOnes.setTo(1.);
+
+            int numSamplesTotal = numTimeStamps*numSamples;
+            Mat xTs = input[0].reshape(1, numSamplesTotal);
+
+            Mat hOutTs = output[0].reshape(1, numSamplesTotal);
+            hOutTs = hOutTs.colRange(i * hOutTs.cols / numDirs, (i + 1) * hOutTs.cols / numDirs);
+            Mat cOutTs = produceCellOutput ? output[1].reshape(1, numSamplesTotal) : Mat();
+
+            int tsStart, tsEnd, tsInc;
+            if (reverse || i == 1) {
+                tsStart = numTimeStamps - 1;
+                tsEnd = -1;
+                tsInc = -1;
+            }
+            else {
+                tsStart = 0;
+                tsEnd = numTimeStamps;
+                tsInc = 1;
+            }
+            for (int ts = tsStart; ts != tsEnd; ts += tsInc)
+            {
+                Range curRowRange(ts*numSamples, (ts + 1)*numSamples);
+                Mat xCurr = xTs.rowRange(curRowRange);
 
-            gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T);      // Wx * x_t
-            gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T);  //+Wh * h_{t-1}
-            gemm(dummyOnes, bias, 1, gates, 1, gates);          //+b
+                gemm(xCurr, Wx, 1, gates, 0, gates, GEMM_2_T);      // Wx * x_t
+                gemm(hInternal, Wh, 1, gates, 1, gates, GEMM_2_T);  //+Wh * h_{t-1}
+                gemm(dummyOnes, bias, 1, gates, 1, gates);          //+b
 
-            Mat gateI = gates.colRange(0*numOut, 1*numOut);
-            Mat gateF = gates.colRange(1*numOut, 2*numOut);
-            Mat gateO = gates.colRange(2*numOut, 3*numOut);
-            Mat gateG = gates.colRange(3*numOut, 4*numOut);
+                Mat gateI = gates.colRange(0*numOut, 1*numOut);
+                Mat gateF = gates.colRange(1*numOut, 2*numOut);
+                Mat gateO = gates.colRange(2*numOut, 3*numOut);
+                Mat gateG = gates.colRange(3*numOut, 4*numOut);
 
-            if (forgetBias)
-                add(gateF, forgetBias, gateF);
+                if (forgetBias)
+                    add(gateF, forgetBias, gateF);
 
-            if (usePeephole)
-            {
-                Mat gatesIF = gates.colRange(0, 2*numOut);
-                gemm(cInternal, blobs[3], 1, gateI, 1, gateI);
-                gemm(cInternal, blobs[4], 1, gateF, 1, gateF);
-                sigmoid(gatesIF, gatesIF);
-            }
-            else
-            {
-                Mat gatesIFO = gates.colRange(0, 3*numOut);
-                sigmoid(gatesIFO, gatesIFO);
-            }
+                if (usePeephole)
+                {
+                    Mat gatesIF = gates.colRange(0, 2*numOut);
+                    gemm(cInternal, blobs[3], 1, gateI, 1, gateI);
+                    gemm(cInternal, blobs[4], 1, gateF, 1, gateF);
+                    sigmoid(gatesIF, gatesIF);
+                }
+                else
+                {
+                    Mat gatesIFO = gates.colRange(0, 3*numOut);
+                    sigmoid(gatesIFO, gatesIFO);
+                }
 
-            tanh(gateG, gateG);
+                tanh(gateG, gateG);
 
-            //compute c_t
-            multiply(gateF, cInternal, gateF);  // f_t (*) c_{t-1}
-            multiply(gateI, gateG, gateI);      // i_t (*) g_t
-            add(gateF, gateI, cInternal);       // c_t = f_t (*) c_{t-1} + i_t (*) g_t
+                //compute c_t
+                multiply(gateF, cInternal, gateF);  // f_t (*) c_{t-1}
+                multiply(gateI, gateG, gateI);      // i_t (*) g_t
+                add(gateF, gateI, cInternal);       // c_t = f_t (*) c_{t-1} + i_t (*) g_t
 
-            if (useCellClip)
-            {
-                min(cInternal, cellClip, cInternal);
-                max(cInternal, -cellClip, cInternal);
-            }
-            if (usePeephole)
-            {
-                gemm(cInternal, blobs[5], 1, gateO, 1, gateO);
-                sigmoid(gateO, gateO);
-            }
+                if (useCellClip)
+                {
+                    min(cInternal, cellClip, cInternal);
+                    max(cInternal, -cellClip, cInternal);
+                }
+                if (usePeephole)
+                {
+                    gemm(cInternal, blobs[5], 1, gateO, 1, gateO);
+                    sigmoid(gateO, gateO);
+                }
 
-            //compute h_t
-            tanh(cInternal, hInternal);
-            multiply(gateO, hInternal, hInternal);
+                //compute h_t
+                tanh(cInternal, hInternal);
+                multiply(gateO, hInternal, hInternal);
 
-            //save results in output blobs
-            hInternal.copyTo(hOutTs.rowRange(curRowRange));
-            if (produceCellOutput)
-                cInternal.copyTo(cOutTs.rowRange(curRowRange));
+                //save results in output blobs
+                hInternal.copyTo(hOutTs.rowRange(curRowRange));
+                if (produceCellOutput)
+                    cInternal.copyTo(cOutTs.rowRange(curRowRange));
+            }
         }
     }
 };
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index b243a986e7..79386e6615 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -630,37 +630,44 @@ void ONNXImporter::populateNet(Net dstNet)
             Mat Wx = getBlob(node_proto, constBlobs, 1);
             Mat Wh = getBlob(node_proto, constBlobs, 2);
             Mat b = getBlob(node_proto, constBlobs, 3);
+            b = b.reshape(1, b.size[0]);
 
             const int numHidden = lstmParams.get<int>("hidden_size");
-
-            Wx = Wx.reshape(1, Wx.size[1]);
-            Wh = Wh.reshape(1, Wh.size[1]);
-            b = b.reshape(1, 2);
-            reduce(b, b, 0, REDUCE_SUM);
+            const int numDirs = Wx.size[0];  // Is 1 for forward only and 2 for bidirectional LSTM.
+            const int numFeatures = Wx.size[2];
+            Mat bx = b.colRange(0, b.cols / 2);
+            Mat bh = b.colRange(b.cols / 2, b.cols);
+            b = bx + bh;
 
             // IFGO->IGFO
-            float* WxData = (float*)Wx.data;
-            float* WhData = (float*)Wh.data;
-            float* biasData = (float*)b.data;
-            for (int j = 0; j < numHidden; ++j)
+            for (int k = 0; k < numDirs; ++k)
             {
-                for (int i = 0; i < Wx.cols; ++i)
-                {
-                    std::swap(WxData[(numHidden + j) * Wx.cols + i],
-                              WxData[(numHidden * 2 + j) * Wx.cols + i]);
-                }
-                for (int i = 0; i < Wh.cols; ++i)
+                float* WxData = Wx.ptr<float>(k);
+                float* WhData = Wh.ptr<float>(k);
+                float* biasData = b.ptr<float>(k);
+                for (int j = 0; j < numHidden; ++j)
                 {
-                    std::swap(WhData[(numHidden + j) * Wh.cols + i],
-                              WhData[(numHidden * 2 + j) * Wh.cols + i]);
+                    for (int i = 0; i < numFeatures; ++i)
+                    {
+                        std::swap(WxData[(numHidden + j) * numFeatures + i],
+                                  WxData[(numHidden * 2 + j) * numFeatures + i]);
+                    }
+                    for (int i = 0; i < numHidden; ++i)
+                    {
+                        std::swap(WhData[(numHidden + j) * numHidden + i],
+                                  WhData[(numHidden * 2 + j) * numHidden + i]);
+                    }
+                    std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]);
                 }
-                std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]);
             }
+            Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]);
+            Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]);
 
             lstmParams.blobs.resize(3);
             lstmParams.blobs[0] = Wh;
             lstmParams.blobs[1] = Wx;
             lstmParams.blobs[2] = b;
+            lstmParams.set("bidirectional", lstmParams.get<String>("direction", "") == "bidirectional");
 
             node_proto.set_output(0, lstmParams.name);  // set different name so output shapes will be registered on that name
             addLayer(dstNet, lstmParams, node_proto, layer_id, outShapes);
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index a2cd2c3a68..f741319959 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -456,6 +456,11 @@ TEST_P(Test_ONNX_layers, LSTM)
     testONNXModels("lstm");
 }
 
+TEST_P(Test_ONNX_layers, LSTM_bidirectional)
+{
+    testONNXModels("lstm_bidirectional");
+}
+
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_ONNX_layers, dnnBackendsAndTargets());
 
 class Test_ONNX_nets : public Test_ONNX_layers

From 6db9f00fd1eb58e02c8f76164f31b19d0de1fb27 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sun, 22 Mar 2020 23:55:11 +0000
Subject: [PATCH 09/12] cmake(apps): support OPENCV_INSTALL_APPS_LIST

Usage:
- cmake -DOPENCV_INSTALL_APPS_LIST=opencv_version ...
---
 apps/CMakeLists.txt | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
index 260a08fab6..ad6a919689 100644
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -1,6 +1,8 @@
 add_definitions(-D__OPENCV_BUILD=1)
 add_definitions(-D__OPENCV_APPS=1)
 
+string(REPLACE "," ";" OPENCV_INSTALL_APPS_LIST "${OPENCV_INSTALL_APPS_LIST}")  # support comma-separated list (,) too
+
 # Unified function for creating OpenCV applications:
 #   ocv_add_application(tgt [MODULES <m1> [<m2> ...]] SRCS <src1> [<src2> ...])
 function(ocv_add_application the_target)
@@ -25,12 +27,14 @@ function(ocv_add_application the_target)
     set_target_properties(${the_target} PROPERTIES FOLDER "applications")
   endif()
 
-  if(INSTALL_CREATE_DISTRIB)
+  if(NOT INSTALL_CREATE_DISTRIB
+      OR (OPENCV_INSTALL_APPS_LIST STREQUAL "all" OR ";${OPENCV_INSTALL_APPS_LIST};" MATCHES ";${the_target};")
+  )
+    install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
+  elseif(INSTALL_CREATE_DISTRIB)
     if(BUILD_SHARED_LIBS)
       install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
     endif()
-  else()
-    install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
   endif()
 endfunction()
 

From 801f26c35d6a8f1ab45da84309a0bfe308f79a13 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Mon, 23 Mar 2020 01:15:49 +0000
Subject: [PATCH 10/12] cmake: set CMP0066 => NEW

---
 CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 831602a4c1..6e887dc5a2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,10 @@ if(POLICY CMP0056)
   cmake_policy(SET CMP0056 NEW)  # try_compile(): link flags
 endif()
 
+if(POLICY CMP0066)
+  cmake_policy(SET CMP0066 NEW)  # CMake 3.7: try_compile(): use per-config flags, like CMAKE_CXX_FLAGS_RELEASE
+endif()
+
 if(POLICY CMP0067)
   cmake_policy(SET CMP0067 NEW)  # CMake 3.8: try_compile(): honor language standard variables (like C++11)
 endif()

From 2f665ec5894cc91daeafc79899cb30c278e3670a Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@gmail.com>
Date: Mon, 23 Mar 2020 14:46:21 +0300
Subject: [PATCH 11/12] calib3d: fixed VS2019 warning C4756

---
 modules/calib3d/src/quadsubpix.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/calib3d/src/quadsubpix.cpp b/modules/calib3d/src/quadsubpix.cpp
index b4100a22f9..24dfdc14a4 100644
--- a/modules/calib3d/src/quadsubpix.cpp
+++ b/modules/calib3d/src/quadsubpix.cpp
@@ -61,13 +61,13 @@ static void orderContours(const std::vector<std::vector<Point> >& contours, Poin
     for(i = 0; i < n; i++)
     {
         size_t ni = contours[i].size();
-        double min_dist = std::numeric_limits<double>::max();
+        float min_dist = std::numeric_limits<float>::max();
         for(j = 0; j < ni; j++)
         {
             double dist = norm(Point2f((float)contours[i][j].x, (float)contours[i][j].y) - point);
-            min_dist = MIN(min_dist, dist);
+            min_dist = (float)MIN((double)min_dist, dist);
         }
-        order.push_back(std::pair<int, float>((int)i, (float)min_dist));
+        order.push_back(std::pair<int, float>((int)i, min_dist));
     }
 
     std::sort(order.begin(), order.end(), is_smaller);

From 467c3ef0ac621b2cbc296bbabe286bc9cc476696 Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev <dmitry.kurtaev+github@gmail.com>
Date: Sun, 22 Mar 2020 16:04:30 +0300
Subject: [PATCH 12/12] Add checks for LSTM initial h and c

---
 modules/dnn/src/onnx/onnx_importer.cpp  | 22 +++++++++++++---------
 modules/dnn/test/test_onnx_importer.cpp |  6 ++++--
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 79386e6615..47b5aff674 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -496,6 +496,7 @@ void ONNXImporter::populateNet(Net dstNet)
                 runLayer(layerParams, inputs, sliced);
                 CV_Assert(sliced.size() == 1);
                 constBlobs.insert(std::make_pair(layerParams.name, sliced[0]));
+                outShapes[layerParams.name] = shape(sliced[0]);
                 continue;
             }
         }
@@ -630,6 +631,8 @@ void ONNXImporter::populateNet(Net dstNet)
             Mat Wx = getBlob(node_proto, constBlobs, 1);
             Mat Wh = getBlob(node_proto, constBlobs, 2);
             Mat b = getBlob(node_proto, constBlobs, 3);
+            CV_CheckEQ(countNonZero(getBlob(node_proto, constBlobs, 5)), 0, "Unsupported non zero initial_h");
+            CV_CheckEQ(countNonZero(getBlob(node_proto, constBlobs, 6)), 0, "Unsupported non zero initial_c");
             b = b.reshape(1, b.size[0]);
 
             const int numHidden = lstmParams.get<int>("hidden_size");
@@ -1007,6 +1010,16 @@ void ONNXImporter::populateNet(Net dstNet)
             }
             else
                 layerParams.type = "Identity";
+
+            if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
+            {
+                Mat inp = getBlob(node_proto, constBlobs, 0);
+                Mat out = inp.reshape(1, outShape);
+                out.dims = outShape.size();  // to work around dims == 1
+                constBlobs.insert(std::make_pair(layerParams.name, out));
+                outShapes[layerParams.name] = shape(out);
+                continue;
+            }
         }
         else if (layer_type == "Flatten")
         {
@@ -1136,15 +1149,6 @@ void ONNXImporter::populateNet(Net dstNet)
             else
                 layerParams.type = "Identity";
         }
-        else if (layer_type == "ConstantFill" || layer_type == "ConstantOfShape")
-        {
-            CV_Assert_N(node_proto.input_size());
-            MatShape inpShape = getBlob(node_proto, constBlobs, 0);
-            float value = layerParams.get("value", 0);
-            Mat fill(inpShape.size(), &inpShape[0], CV_32F, Scalar(value));
-            constBlobs.insert(std::make_pair(layerParams.name, fill));
-            continue;
-        }
         else if (layer_type == "ConstantOfShape" || layer_type == "ConstantFill")
         {
             float fill_value;
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index f741319959..6932e83a4e 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -405,6 +405,8 @@ TEST_P(Test_ONNX_layers, Reshape)
 
 TEST_P(Test_ONNX_layers, Squeeze)
 {
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
     testONNXModels("squeeze");
 }
 
@@ -453,12 +455,12 @@ TEST_P(Test_ONNX_layers, Split_EltwiseMax)
 
 TEST_P(Test_ONNX_layers, LSTM)
 {
-    testONNXModels("lstm");
+    testONNXModels("lstm", npy, 0, 0, false, false);
 }
 
 TEST_P(Test_ONNX_layers, LSTM_bidirectional)
 {
-    testONNXModels("lstm_bidirectional");
+    testONNXModels("lstm_bidirectional", npy, 0, 0, false, false);
 }
 
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_ONNX_layers, dnnBackendsAndTargets());