From 4e5699fa716a3e5b0faddf1d6a00213aeb5c60cc Mon Sep 17 00:00:00 2001
From: Julia Bareeva <34717687+JulieBar@users.noreply.github.com>
Date: Fri, 23 Jul 2021 17:11:50 +0300
Subject: [PATCH] Merge pull request #20450 from JulieBar:lstm_inside

Support non-zero hidden state for LSTM

* fully support non-zero hidden state for LSTM

* check dims of hidden state for LSTM

* fix failed test Test_Model.TextRecognition

* add new tests for LSTM w/ non-zero hidden params

Co-authored-by: Julie Bareeva
---
 modules/dnn/src/layers/recurrent_layers.cpp | 28 +++++---
 modules/dnn/src/onnx/onnx_importer.cpp      | 11 ++-
 modules/dnn/src/tensorflow/tf_importer.cpp  | 16 +++--
 modules/dnn/test/test_layers.cpp            | 80 ++++++++++++++++++++-
 modules/dnn/test/test_onnx_importer.cpp     | 10 +++
 5 files changed, 122 insertions(+), 23 deletions(-)

diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index 69606a6b4e..a6715aefca 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -112,19 +112,24 @@ public:
         const Mat& Wh = blobs[0];
         const Mat& Wx = blobs[1];
         const Mat& bias = blobs[2];
+        const Mat& hInternal = blobs[3];
+        const Mat& cInternal = blobs[4];
         CV_CheckEQ(Wh.dims, 2, "");
         CV_CheckEQ(Wx.dims, 2, "");
         CV_CheckEQ(Wh.rows, Wx.rows, "");
         CV_CheckEQ(Wh.rows, (1 + static_cast<int>(bidirectional))*4*Wh.cols, "");
         CV_CheckEQ(Wh.rows, (int)bias.total(), "");
+        CV_CheckEQ(hInternal.cols, Wh.cols, "");
+        CV_CheckEQ(hInternal.cols, cInternal.cols, "");
+        CV_CheckEQ(hInternal.rows, cInternal.rows, "");
         CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type());

         // Peephole weights.
-        if (blobs.size() > 3)
+        if (blobs.size() > 5)
         {
-            CV_Assert(blobs.size() == 6);
+            CV_Assert(blobs.size() == 8);
             const int N = Wh.cols;
-            for (int i = 3; i < 6; ++i)
+            for (int i = 5; i < 8; ++i)
             {
                 CV_Assert(blobs[i].rows == N && blobs[i].cols == N);
                 CV_Assert(blobs[i].type() == bias.type());
@@ -181,7 +186,7 @@ public:
                          std::vector<MatShape> &outputs,
                          std::vector<MatShape> &internals) const CV_OVERRIDE
     {
-        CV_Assert((!usePeephole && blobs.size() == 3) || (usePeephole && blobs.size() == 6));
+        CV_Assert((!usePeephole && blobs.size() == 5) || (usePeephole && blobs.size() == 8));
         CV_Assert(inputs.size() == 1);
         const MatShape& inp0 = inputs[0];
@@ -228,7 +233,7 @@ public:
         std::vector<Mat> input;
         inputs_arr.getMatVector(input);

-        CV_Assert((!usePeephole && blobs.size() == 3) || (usePeephole && blobs.size() == 6));
+        CV_Assert((!usePeephole && blobs.size() == 5) || (usePeephole && blobs.size() == 8));
         CV_Assert(input.size() == 1);
         const Mat& inp0 = input[0];
@@ -284,13 +289,14 @@ public:
             const Mat &Wh = blobs[0].rowRange(i * blobs[0].rows / numDirs, (i + 1) * blobs[0].rows / numDirs);
             const Mat &Wx = blobs[1].rowRange(i * blobs[1].rows / numDirs, (i + 1) * blobs[1].rows / numDirs);
             const Mat &bias = blobs[2].colRange(i * blobs[2].cols / numDirs, (i + 1) * blobs[2].cols / numDirs);
+            const Mat &h_0 = blobs[3].rowRange(i * blobs[3].rows / numDirs, (i + 1) * blobs[3].rows / numDirs);
+            const Mat &c_0 = blobs[4].rowRange(i * blobs[4].rows / numDirs, (i + 1) * blobs[4].rows / numDirs);

             int numOut = Wh.size[1];
-
             Mat hInternal = internals[0], cInternal = internals[1],
                 dummyOnes = internals[2], gates = internals[3];
-            hInternal.setTo(0.);
-            cInternal.setTo(0.);
+            h_0.copyTo(hInternal);
+            c_0.copyTo(cInternal);
             dummyOnes.setTo(1.);

             int numSamplesTotal = numTimeStamps*numSamples;
@@ -331,8 +337,8 @@ public:
                 if (usePeephole)
                 {
                     Mat gatesIF = gates.colRange(0, 2*numOut);
-                    gemm(cInternal, blobs[3], 1, gateI, 1, gateI);
-                    gemm(cInternal, blobs[4], 1, gateF, 1, gateF);
+                    gemm(cInternal, blobs[5], 1, gateI, 1, gateI);
+                    gemm(cInternal, blobs[6], 1, gateF, 1, gateF);
                     sigmoid(gatesIF, gatesIF);
                 }
                 else
@@ -355,7 +361,7 @@ public:
                 }
                 if (usePeephole)
                 {
-                    gemm(cInternal, blobs[5], 1, gateO, 1, gateO);
+                    gemm(cInternal, blobs[7], 1, gateO, 1, gateO);
                     sigmoid(gateO, gateO);
                 }
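Note: after this change the layer expects five mandatory blobs instead of three, and peephole weights, when present, shift from blobs[3..5] to blobs[5..7]. A minimal construction sketch (toy sizes and zero weights; the helper name and dimensions are illustrative, derived from the setWeights() checks above):

    #include <opencv2/dnn.hpp>
    #include <opencv2/dnn/all_layers.hpp>
    using namespace cv;
    using namespace cv::dnn;

    LayerParams makeLstmParams(int numInp, int numOut, int numSamples)
    {
        LayerParams lp;
        lp.blobs.resize(5);
        lp.blobs[0] = Mat::zeros(4 * numOut, numOut, CV_32F);  // Wh: recurrent weights
        lp.blobs[1] = Mat::zeros(4 * numOut, numInp, CV_32F);  // Wx: input weights
        lp.blobs[2] = Mat::zeros(4 * numOut, 1, CV_32F);       // bias: 4*numOut elements
        lp.blobs[3] = Mat::ones(numSamples, numOut, CV_32F);   // h_0: may now be non-zero;
                                                               // rows must match the batch size
        lp.blobs[4] = Mat::ones(numSamples, numOut, CV_32F);   // c_0: same shape as h_0
        return lp;  // LSTMLayer::create(lp) consumes this layout
    }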
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index ec61a9707e..4ad0fd496e 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -900,8 +900,9 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_)
         Mat Wx = getBlob(node_proto, 1);
         Mat Wh = getBlob(node_proto, 2);
         Mat b = getBlob(node_proto, 3);
-        CV_CheckEQ(countNonZero(getBlob(node_proto, 5)), 0, "Unsupported non zero initial_h");
-        CV_CheckEQ(countNonZero(getBlob(node_proto, 6)), 0, "Unsupported non zero initial_c");
+        Mat h0 = getBlob(node_proto, 5);
+        Mat c0 = getBlob(node_proto, 6);
+
         b = b.reshape(1, b.size[0]);

         const int numHidden = lstmParams.get<int>("hidden_size");
@@ -934,11 +935,15 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_)
         }
         Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]);
         Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]);
+        h0 = h0.reshape(1, h0.size[0] * h0.size[1]);
+        c0 = c0.reshape(1, c0.size[0] * c0.size[1]);

-        lstmParams.blobs.resize(3);
+        lstmParams.blobs.resize(5);
        lstmParams.blobs[0] = Wh;
         lstmParams.blobs[1] = Wx;
         lstmParams.blobs[2] = b;
+        lstmParams.blobs[3] = h0;
+        lstmParams.blobs[4] = c0;
         lstmParams.set("bidirectional", lstmParams.get<String>("direction", "") == "bidirectional");

         node_proto.set_output(0, lstmParams.name);  // set different name so output shapes will be registered on that name
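Note: with the countNonZero checks removed, ONNX models carrying non-zero initial_h / initial_c initializers (shape [num_directions, batch, hidden_size], folded to 2-D above) now import instead of being rejected. A hedged usage sketch; the file name and input shape are assumptions (the new tests below use the hidden_lstm / hidden_lstm_bi test data):

    #include <opencv2/dnn.hpp>
    using namespace cv;
    using namespace cv::dnn;

    Mat runHiddenLstm()
    {
        Net net = readNetFromONNX("hidden_lstm.onnx");  // hypothetical model file
        const int sz[] = {5, 1, 4};                     // assumed [seq_length, batch, features]
        Mat inp(3, sz, CV_32F, Scalar(0.1));
        net.setInput(inp);
        return net.forward();  // h_t sequence, computed from the stored h_0/c_0
    }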
layerParams.set("use_peephole", true); - layerParams.blobs.resize(6); + layerParams.blobs.resize(8); for (int i = 0; i < 3; ++i) { Mat w; blobFromTensor(getConstBlob(layer, value_id, 5 + i), w); w = w.reshape(1, w.total()); // Single column. w = Mat::diag(w); // Make a diagonal matrix. - layerParams.blobs[3 + i] = w; + layerParams.blobs[5 + i] = w; } } } diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 897603d274..fbe9605e7f 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -434,7 +434,7 @@ class Layer_LSTM_Test : public ::testing::Test { public: int numInp, numOut; - Mat Wh, Wx, b; + Mat Wh, Wx, b, h, c; Ptr layer; std::vector inputs, outputs; @@ -449,12 +449,17 @@ public: Wh = Mat::ones(4 * numOut, numOut, CV_32F); Wx = Mat::ones(4 * numOut, numInp, CV_32F); b = Mat::ones(4 * numOut, 1, CV_32F); + h = Mat::ones(4, numOut, CV_32F); + c = Mat::ones(4, numOut, CV_32F); LayerParams lp; - lp.blobs.resize(3); + lp.blobs.resize(5); lp.blobs[0] = Wh; lp.blobs[1] = Wx; lp.blobs[2] = b; + lp.blobs[3] = h; + lp.blobs[4] = c; + lp.set("produce_cell_output", produceCellOutput); lp.set("use_timestamp_dim", useTimestampDim); @@ -502,10 +507,12 @@ TEST_F(Layer_LSTM_Test, get_set_test) TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent) { LayerParams lp; - lp.blobs.resize(3); + lp.blobs.resize(5); lp.blobs[0] = blobFromNPY(_tf("lstm.prototxt.w_2.npy")); // Wh lp.blobs[1] = blobFromNPY(_tf("lstm.prototxt.w_0.npy")); // Wx lp.blobs[2] = blobFromNPY(_tf("lstm.prototxt.w_1.npy")); // bias + lp.blobs[3] = Mat::zeros(2, 17, CV_32F); // h_0 + lp.blobs[4] = Mat::zeros(2, 17, CV_32F); // c_0 Ptr layer = LSTMLayer::create(lp); Mat inp = blobFromNPY(_tf("recurrent.input.npy")); @@ -516,6 +523,68 @@ TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent) normAssert(h_t_reference, outputs[0]); } +TEST(Layer_LSTM_Test_Accuracy_with_, HiddenParams) +{ + Mat Wx = blobFromNPY(_tf("lstm.hidden.W.npy")); + Mat Wh = blobFromNPY(_tf("lstm.hidden.R.npy")); + Mat b = blobFromNPY(_tf("lstm.hidden.B.npy")); + Mat h0 = blobFromNPY(_tf("lstm.hidden.h0.npy")); + Mat c0 = blobFromNPY(_tf("lstm.hidden.c0.npy")); + + const int numHidden = 3; + const int numDirs = Wx.size[0]; + const int numFeatures = Wx.size[2]; + + b = b.reshape(1, b.size[0]); + Mat bx = b.colRange(0, b.cols / 2); + Mat bh = b.colRange(b.cols / 2, b.cols); + b = bx + bh; + + // IFGO->IGFO + for (int k = 0; k < numDirs; ++k) + { + float* WxData = Wx.ptr(k); + float* WhData = Wh.ptr(k); + float* biasData = b.ptr(k); + for (int j = 0; j < numHidden; ++j) + { + for (int i = 0; i < numFeatures; ++i) + { + std::swap(WxData[(numHidden + j) * numFeatures + i], + WxData[(numHidden * 2 + j) * numFeatures + i]); + } + for (int i = 0; i < numHidden; ++i) + { + std::swap(WhData[(numHidden + j) * numHidden + i], + WhData[(numHidden * 2 + j) * numHidden + i]); + } + std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]); + } + } + + Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]); + Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]); + h0 = h0.reshape(1, h0.size[0] * h0.size[1]); + c0 = c0.reshape(1, c0.size[0] * c0.size[1]); + + LayerParams lstmParams; + lstmParams.blobs.resize(5); + lstmParams.blobs[0] = Wh; + lstmParams.blobs[1] = Wx; + lstmParams.blobs[2] = b; + lstmParams.blobs[3] = h0; + lstmParams.blobs[4] = c0; + lstmParams.set("bidirectional", false); + Ptr layer = LSTMLayer::create(lstmParams); + + Mat inp = blobFromNPY(_tf("lstm.hidden.input.npy")); + std::vector inputs(1, inp), outputs; + 
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 897603d274..fbe9605e7f 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -434,7 +434,7 @@ class Layer_LSTM_Test : public ::testing::Test
 {
 public:
     int numInp, numOut;
-    Mat Wh, Wx, b;
+    Mat Wh, Wx, b, h, c;
     Ptr<LSTMLayer> layer;
     std::vector<Mat> inputs, outputs;

@@ -449,12 +449,17 @@ public:
         Wh = Mat::ones(4 * numOut, numOut, CV_32F);
         Wx = Mat::ones(4 * numOut, numInp, CV_32F);
         b = Mat::ones(4 * numOut, 1, CV_32F);
+        h = Mat::ones(4, numOut, CV_32F);
+        c = Mat::ones(4, numOut, CV_32F);

         LayerParams lp;
-        lp.blobs.resize(3);
+        lp.blobs.resize(5);
         lp.blobs[0] = Wh;
         lp.blobs[1] = Wx;
         lp.blobs[2] = b;
+        lp.blobs[3] = h;
+        lp.blobs[4] = c;
+
         lp.set("produce_cell_output", produceCellOutput);
         lp.set("use_timestamp_dim", useTimestampDim);

@@ -502,10 +507,12 @@ TEST_F(Layer_LSTM_Test, get_set_test)
 TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent)
 {
     LayerParams lp;
-    lp.blobs.resize(3);
+    lp.blobs.resize(5);
     lp.blobs[0] = blobFromNPY(_tf("lstm.prototxt.w_2.npy"));  // Wh
     lp.blobs[1] = blobFromNPY(_tf("lstm.prototxt.w_0.npy"));  // Wx
     lp.blobs[2] = blobFromNPY(_tf("lstm.prototxt.w_1.npy"));  // bias
+    lp.blobs[3] = Mat::zeros(2, 17, CV_32F);                  // h_0
+    lp.blobs[4] = Mat::zeros(2, 17, CV_32F);                  // c_0
     Ptr<LSTMLayer> layer = LSTMLayer::create(lp);

     Mat inp = blobFromNPY(_tf("recurrent.input.npy"));
@@ -516,6 +523,68 @@ TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent)
     normAssert(h_t_reference, outputs[0]);
 }

+TEST(Layer_LSTM_Test_Accuracy_with_, HiddenParams)
+{
+    Mat Wx = blobFromNPY(_tf("lstm.hidden.W.npy"));
+    Mat Wh = blobFromNPY(_tf("lstm.hidden.R.npy"));
+    Mat b = blobFromNPY(_tf("lstm.hidden.B.npy"));
+    Mat h0 = blobFromNPY(_tf("lstm.hidden.h0.npy"));
+    Mat c0 = blobFromNPY(_tf("lstm.hidden.c0.npy"));
+
+    const int numHidden = 3;
+    const int numDirs = Wx.size[0];
+    const int numFeatures = Wx.size[2];
+
+    b = b.reshape(1, b.size[0]);
+    Mat bx = b.colRange(0, b.cols / 2);
+    Mat bh = b.colRange(b.cols / 2, b.cols);
+    b = bx + bh;
+
+    // IFGO->IGFO
+    for (int k = 0; k < numDirs; ++k)
+    {
+        float* WxData = Wx.ptr<float>(k);
+        float* WhData = Wh.ptr<float>(k);
+        float* biasData = b.ptr<float>(k);
+        for (int j = 0; j < numHidden; ++j)
+        {
+            for (int i = 0; i < numFeatures; ++i)
+            {
+                std::swap(WxData[(numHidden + j) * numFeatures + i],
+                          WxData[(numHidden * 2 + j) * numFeatures + i]);
+            }
+            for (int i = 0; i < numHidden; ++i)
+            {
+                std::swap(WhData[(numHidden + j) * numHidden + i],
+                          WhData[(numHidden * 2 + j) * numHidden + i]);
+            }
+            std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]);
+        }
+    }
+
+    Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]);
+    Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]);
+    h0 = h0.reshape(1, h0.size[0] * h0.size[1]);
+    c0 = c0.reshape(1, c0.size[0] * c0.size[1]);
+
+    LayerParams lstmParams;
+    lstmParams.blobs.resize(5);
+    lstmParams.blobs[0] = Wh;
+    lstmParams.blobs[1] = Wx;
+    lstmParams.blobs[2] = b;
+    lstmParams.blobs[3] = h0;
+    lstmParams.blobs[4] = c0;
+    lstmParams.set("bidirectional", false);
+    Ptr<LSTMLayer> layer = LSTMLayer::create(lstmParams);
+
+    Mat inp = blobFromNPY(_tf("lstm.hidden.input.npy"));
+    std::vector<Mat> inputs(1, inp), outputs;
+    runLayer(layer, inputs, outputs);
+
+    Mat h_t_reference = blobFromNPY(_tf("lstm.hidden.output.npy"));
+    normAssert(h_t_reference, outputs[0]);
+}
+
 TEST(Layer_RNN_Test_Accuracy_with_, CaffeRecurrent)
 {
     Ptr<RNNLayer> layer = RNNLayer::create(LayerParams());
@@ -560,6 +629,9 @@ TEST(Layer_LSTM_Test_Accuracy_, Reverse)
     bias.at<float>(2, 0) = 1e10f;  // Output gate - always output everything
     bias.at<float>(3, 0) = 0.f;    // Update signal

+    cv::Mat hInternal = cv::Mat::zeros(1, 1, CV_32FC1);
+    cv::Mat cInternal = cv::Mat::zeros(1, 1, CV_32FC1);
+
     LayerParams lp;
     lp.set("reverse", true);
     lp.set("use_timestamp_dim", true);
@@ -567,6 +639,8 @@ TEST(Layer_LSTM_Test_Accuracy_, Reverse)
     lp.blobs.push_back(Wh);
     lp.blobs.push_back(Wx);
     lp.blobs.push_back(bias);
+    lp.blobs.push_back(hInternal);
+    lp.blobs.push_back(cInternal);

     cv::Ptr<LSTMLayer> layer = LSTMLayer::create(lp);

     std::vector<Mat> outputs;
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 3923068dbf..05f77730af 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -675,6 +675,16 @@ TEST_P(Test_ONNX_layers, LSTM_bidirectional)
     testONNXModels("lstm_bidirectional", npy, 0, 0, false, false);
 }

+TEST_P(Test_ONNX_layers, LSTM_hidden)
+{
+    testONNXModels("hidden_lstm", npy, 0, 0, false, false);
+}
+
+TEST_P(Test_ONNX_layers, LSTM_hidden_bidirectional)
+{
+    testONNXModels("hidden_lstm_bi", npy, 0, 0, false, false);
+}
+
 TEST_P(Test_ONNX_layers, Pad2d_Unfused)
 {
     testONNXModels("ReflectionPad2d");
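Note: the new cases can be exercised in isolation with GoogleTest filters; the binary name below assumes a standard OpenCV build, and the ONNX cases additionally need the hidden_lstm* files from the dnn test data:

    ./bin/opencv_test_dnn --gtest_filter=Layer_LSTM_Test_Accuracy_with_.HiddenParams
    ./bin/opencv_test_dnn --gtest_filter=*LSTM_hidden*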