From 4e5699fa716a3e5b0faddf1d6a00213aeb5c60cc Mon Sep 17 00:00:00 2001
From: Julia Bareeva <34717687+JulieBar@users.noreply.github.com>
Date: Fri, 23 Jul 2021 17:11:50 +0300
Subject: [PATCH] Merge pull request #20450 from JulieBar:lstm_inside

Support non-zero hidden state for LSTM

* fully support non-zero hidden state for LSTM

* check dims of hidden state for LSTM

* fix failed test Test_Model.TextRecognition

* add new tests for LSTM w/ non-zero hidden params

Co-authored-by: Julie Bareeva
---
 modules/dnn/src/layers/recurrent_layers.cpp | 28 +++++---
 modules/dnn/src/onnx/onnx_importer.cpp      | 11 ++-
 modules/dnn/src/tensorflow/tf_importer.cpp  | 16 +++--
 modules/dnn/test/test_layers.cpp            | 80 ++++++++++++++++++++-
 modules/dnn/test/test_onnx_importer.cpp     | 10 +++
 5 files changed, 122 insertions(+), 23 deletions(-)

diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index 69606a6b4e..a6715aefca 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -112,19 +112,24 @@ public:
         const Mat& Wh = blobs[0];
         const Mat& Wx = blobs[1];
         const Mat& bias = blobs[2];
+        const Mat& hInternal = blobs[3];
+        const Mat& cInternal = blobs[4];
         CV_CheckEQ(Wh.dims, 2, "");
         CV_CheckEQ(Wx.dims, 2, "");
         CV_CheckEQ(Wh.rows, Wx.rows, "");
         CV_CheckEQ(Wh.rows, (1 + static_cast<int>(bidirectional))*4*Wh.cols, "");
         CV_CheckEQ(Wh.rows, (int)bias.total(), "");
+        CV_CheckEQ(hInternal.cols, Wh.cols, "");
+        CV_CheckEQ(hInternal.cols, cInternal.cols, "");
+        CV_CheckEQ(hInternal.rows, cInternal.rows, "");
         CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type());

         // Peephole weights.
-        if (blobs.size() > 3)
+        if (blobs.size() > 5)
         {
-            CV_Assert(blobs.size() == 6);
+            CV_Assert(blobs.size() == 8);
             const int N = Wh.cols;
-            for (int i = 3; i < 6; ++i)
+            for (int i = 5; i < 8; ++i)
             {
                 CV_Assert(blobs[i].rows == N && blobs[i].cols == N);
                 CV_Assert(blobs[i].type() == bias.type());
@@ -181,7 +186,7 @@ public:
                          std::vector<MatShape> &outputs,
                          std::vector<MatShape> &internals) const CV_OVERRIDE
     {
-        CV_Assert((!usePeephole && blobs.size() == 3) || (usePeephole && blobs.size() == 6));
+        CV_Assert((!usePeephole && blobs.size() == 5) || (usePeephole && blobs.size() == 8));
         CV_Assert(inputs.size() == 1);
         const MatShape& inp0 = inputs[0];
@@ -228,7 +233,7 @@ public:
         std::vector<Mat> input;
         inputs_arr.getMatVector(input);

-        CV_Assert((!usePeephole && blobs.size() == 3) || (usePeephole && blobs.size() == 6));
+        CV_Assert((!usePeephole && blobs.size() == 5) || (usePeephole && blobs.size() == 8));
         CV_Assert(input.size() == 1);
         const Mat& inp0 = input[0];
@@ -284,13 +289,14 @@ public:
             const Mat &Wh = blobs[0].rowRange(i * blobs[0].rows / numDirs, (i + 1) * blobs[0].rows / numDirs);
             const Mat &Wx = blobs[1].rowRange(i * blobs[1].rows / numDirs, (i + 1) * blobs[1].rows / numDirs);
             const Mat &bias = blobs[2].colRange(i * blobs[2].cols / numDirs, (i + 1) * blobs[2].cols / numDirs);
+            const Mat &h_0 = blobs[3].rowRange(i * blobs[3].rows / numDirs, (i + 1) * blobs[3].rows / numDirs);
+            const Mat &c_0 = blobs[4].rowRange(i * blobs[4].rows / numDirs, (i + 1) * blobs[4].rows / numDirs);

             int numOut = Wh.size[1];
-
             Mat hInternal = internals[0], cInternal = internals[1],
                 dummyOnes = internals[2], gates = internals[3];
-            hInternal.setTo(0.);
-            cInternal.setTo(0.);
+            h_0.copyTo(hInternal);
+            c_0.copyTo(cInternal);
             dummyOnes.setTo(1.);

             int numSamplesTotal = numTimeStamps*numSamples;
@@ -331,8 +337,8 @@ public:
                 if (usePeephole)
                 {
                     Mat gatesIF = gates.colRange(0, 2*numOut);
-                    gemm(cInternal, blobs[3], 1, gateI, 1, gateI);
-                    gemm(cInternal, blobs[4], 1, gateF, 1, gateF);
+                    gemm(cInternal, blobs[5], 1, gateI, 1, gateI);
+                    gemm(cInternal, blobs[6], 1, gateF, 1, gateF);
                     sigmoid(gatesIF, gatesIF);
                 }
                 else
@@ -355,7 +361,7 @@ public:
                 }
                 if (usePeephole)
                 {
-                    gemm(cInternal, blobs[5], 1, gateO, 1, gateO);
+                    gemm(cInternal, blobs[7], 1, gateO, 1, gateO);
                     sigmoid(gateO, gateO);
                 }
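Note: after this change the layer expects five mandatory blobs instead of three, and peephole weights, when present, shift from blobs[3..5] to blobs[5..7]. A minimal construction sketch (toy sizes and zero weights; the helper name and dimensions are illustrative, derived from the setWeights() checks above):

    #include <opencv2/dnn.hpp>
    #include <opencv2/dnn/all_layers.hpp>
    using namespace cv;
    using namespace cv::dnn;

    LayerParams makeLstmParams(int numInp, int numOut, int numSamples)
    {
        LayerParams lp;
        lp.blobs.resize(5);
        lp.blobs[0] = Mat::zeros(4 * numOut, numOut, CV_32F);  // Wh: recurrent weights
        lp.blobs[1] = Mat::zeros(4 * numOut, numInp, CV_32F);  // Wx: input weights
        lp.blobs[2] = Mat::zeros(4 * numOut, 1, CV_32F);       // bias: 4*numOut elements
        lp.blobs[3] = Mat::ones(numSamples, numOut, CV_32F);   // h_0: may now be non-zero;
                                                               // rows must match the batch size
        lp.blobs[4] = Mat::ones(numSamples, numOut, CV_32F);   // c_0: same shape as h_0
        return lp;  // LSTMLayer::create(lp) consumes this layout
    }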
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index ec61a9707e..4ad0fd496e 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -900,8 +900,9 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_)
         Mat Wx = getBlob(node_proto, 1);
         Mat Wh = getBlob(node_proto, 2);
         Mat b = getBlob(node_proto, 3);
-        CV_CheckEQ(countNonZero(getBlob(node_proto, 5)), 0, "Unsupported non zero initial_h");
-        CV_CheckEQ(countNonZero(getBlob(node_proto, 6)), 0, "Unsupported non zero initial_c");
+        Mat h0 = getBlob(node_proto, 5);
+        Mat c0 = getBlob(node_proto, 6);
+
         b = b.reshape(1, b.size[0]);

         const int numHidden = lstmParams.get<int>("hidden_size");
@@ -934,11 +935,15 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_)
         }
         Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]);
         Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]);
+        h0 = h0.reshape(1, h0.size[0] * h0.size[1]);
+        c0 = c0.reshape(1, c0.size[0] * c0.size[1]);

-        lstmParams.blobs.resize(3);
+        lstmParams.blobs.resize(5);
        lstmParams.blobs[0] = Wh;
         lstmParams.blobs[1] = Wx;
         lstmParams.blobs[2] = b;
+        lstmParams.blobs[3] = h0;
+        lstmParams.blobs[4] = c0;
         lstmParams.set("bidirectional", lstmParams.get<String>("direction", "") == "bidirectional");

         node_proto.set_output(0, lstmParams.name);  // set different name so output shapes will be registered on that name
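Note: with the countNonZero checks removed, ONNX models carrying non-zero initial_h / initial_c initializers (shape [num_directions, batch, hidden_size], folded to 2-D above) now import instead of being rejected. A hedged usage sketch; the file name and input shape are assumptions (the new tests below use the hidden_lstm / hidden_lstm_bi test data):

    #include <opencv2/dnn.hpp>
    using namespace cv;
    using namespace cv::dnn;

    Mat runHiddenLstm()
    {
        Net net = readNetFromONNX("hidden_lstm.onnx");  // hypothetical model file
        const int sz[] = {5, 1, 4};                     // assumed [seq_length, batch, features]
        Mat inp(3, sz, CV_32F, Scalar(0.1));
        net.setInput(inp);
        return net.forward();  // h_t sequence, computed from the stored h_0/c_0
    }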
layerParams.set("use_peephole", true); - layerParams.blobs.resize(6); + layerParams.blobs.resize(8); for (int i = 0; i < 3; ++i) { Mat w; blobFromTensor(getConstBlob(layer, value_id, 5 + i), w); w = w.reshape(1, w.total()); // Single column. w = Mat::diag(w); // Make a diagonal matrix. - layerParams.blobs[3 + i] = w; + layerParams.blobs[5 + i] = w; } } } diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 897603d274..fbe9605e7f 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -434,7 +434,7 @@ class Layer_LSTM_Test : public ::testing::Test { public: int numInp, numOut; - Mat Wh, Wx, b; + Mat Wh, Wx, b, h, c; Ptr layer; std::vector inputs, outputs; @@ -449,12 +449,17 @@ public: Wh = Mat::ones(4 * numOut, numOut, CV_32F); Wx = Mat::ones(4 * numOut, numInp, CV_32F); b = Mat::ones(4 * numOut, 1, CV_32F); + h = Mat::ones(4, numOut, CV_32F); + c = Mat::ones(4, numOut, CV_32F); LayerParams lp; - lp.blobs.resize(3); + lp.blobs.resize(5); lp.blobs[0] = Wh; lp.blobs[1] = Wx; lp.blobs[2] = b; + lp.blobs[3] = h; + lp.blobs[4] = c; + lp.set("produce_cell_output", produceCellOutput); lp.set("use_timestamp_dim", useTimestampDim); @@ -502,10 +507,12 @@ TEST_F(Layer_LSTM_Test, get_set_test) TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent) { LayerParams lp; - lp.blobs.resize(3); + lp.blobs.resize(5); lp.blobs[0] = blobFromNPY(_tf("lstm.prototxt.w_2.npy")); // Wh lp.blobs[1] = blobFromNPY(_tf("lstm.prototxt.w_0.npy")); // Wx lp.blobs[2] = blobFromNPY(_tf("lstm.prototxt.w_1.npy")); // bias + lp.blobs[3] = Mat::zeros(2, 17, CV_32F); // h_0 + lp.blobs[4] = Mat::zeros(2, 17, CV_32F); // c_0 Ptr layer = LSTMLayer::create(lp); Mat inp = blobFromNPY(_tf("recurrent.input.npy")); @@ -516,6 +523,68 @@ TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent) normAssert(h_t_reference, outputs[0]); } +TEST(Layer_LSTM_Test_Accuracy_with_, HiddenParams) +{ + Mat Wx = blobFromNPY(_tf("lstm.hidden.W.npy")); + Mat Wh = blobFromNPY(_tf("lstm.hidden.R.npy")); + Mat b = blobFromNPY(_tf("lstm.hidden.B.npy")); + Mat h0 = blobFromNPY(_tf("lstm.hidden.h0.npy")); + Mat c0 = blobFromNPY(_tf("lstm.hidden.c0.npy")); + + const int numHidden = 3; + const int numDirs = Wx.size[0]; + const int numFeatures = Wx.size[2]; + + b = b.reshape(1, b.size[0]); + Mat bx = b.colRange(0, b.cols / 2); + Mat bh = b.colRange(b.cols / 2, b.cols); + b = bx + bh; + + // IFGO->IGFO + for (int k = 0; k < numDirs; ++k) + { + float* WxData = Wx.ptr(k); + float* WhData = Wh.ptr(k); + float* biasData = b.ptr(k); + for (int j = 0; j < numHidden; ++j) + { + for (int i = 0; i < numFeatures; ++i) + { + std::swap(WxData[(numHidden + j) * numFeatures + i], + WxData[(numHidden * 2 + j) * numFeatures + i]); + } + for (int i = 0; i < numHidden; ++i) + { + std::swap(WhData[(numHidden + j) * numHidden + i], + WhData[(numHidden * 2 + j) * numHidden + i]); + } + std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]); + } + } + + Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]); + Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]); + h0 = h0.reshape(1, h0.size[0] * h0.size[1]); + c0 = c0.reshape(1, c0.size[0] * c0.size[1]); + + LayerParams lstmParams; + lstmParams.blobs.resize(5); + lstmParams.blobs[0] = Wh; + lstmParams.blobs[1] = Wx; + lstmParams.blobs[2] = b; + lstmParams.blobs[3] = h0; + lstmParams.blobs[4] = c0; + lstmParams.set("bidirectional", false); + Ptr layer = LSTMLayer::create(lstmParams); + + Mat inp = blobFromNPY(_tf("lstm.hidden.input.npy")); + std::vector inputs(1, inp), outputs; + 
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 897603d274..fbe9605e7f 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -434,7 +434,7 @@ class Layer_LSTM_Test : public ::testing::Test
 {
 public:
     int numInp, numOut;
-    Mat Wh, Wx, b;
+    Mat Wh, Wx, b, h, c;
     Ptr<LSTMLayer> layer;
     std::vector<Mat> inputs, outputs;

@@ -449,12 +449,17 @@ public:
         Wh = Mat::ones(4 * numOut, numOut, CV_32F);
         Wx = Mat::ones(4 * numOut, numInp, CV_32F);
         b = Mat::ones(4 * numOut, 1, CV_32F);
+        h = Mat::ones(4, numOut, CV_32F);
+        c = Mat::ones(4, numOut, CV_32F);

         LayerParams lp;
-        lp.blobs.resize(3);
+        lp.blobs.resize(5);
         lp.blobs[0] = Wh;
         lp.blobs[1] = Wx;
         lp.blobs[2] = b;
+        lp.blobs[3] = h;
+        lp.blobs[4] = c;
+
         lp.set("produce_cell_output", produceCellOutput);
         lp.set("use_timestamp_dim", useTimestampDim);

@@ -502,10 +507,12 @@ TEST_F(Layer_LSTM_Test, get_set_test)
 TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent)
 {
     LayerParams lp;
-    lp.blobs.resize(3);
+    lp.blobs.resize(5);
     lp.blobs[0] = blobFromNPY(_tf("lstm.prototxt.w_2.npy"));  // Wh
     lp.blobs[1] = blobFromNPY(_tf("lstm.prototxt.w_0.npy"));  // Wx
     lp.blobs[2] = blobFromNPY(_tf("lstm.prototxt.w_1.npy"));  // bias
+    lp.blobs[3] = Mat::zeros(2, 17, CV_32F);                  // h_0
+    lp.blobs[4] = Mat::zeros(2, 17, CV_32F);                  // c_0
     Ptr<LSTMLayer> layer = LSTMLayer::create(lp);

     Mat inp = blobFromNPY(_tf("recurrent.input.npy"));
@@ -516,6 +523,68 @@ TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent)
     normAssert(h_t_reference, outputs[0]);
 }

+TEST(Layer_LSTM_Test_Accuracy_with_, HiddenParams)
+{
+    Mat Wx = blobFromNPY(_tf("lstm.hidden.W.npy"));
+    Mat Wh = blobFromNPY(_tf("lstm.hidden.R.npy"));
+    Mat b = blobFromNPY(_tf("lstm.hidden.B.npy"));
+    Mat h0 = blobFromNPY(_tf("lstm.hidden.h0.npy"));
+    Mat c0 = blobFromNPY(_tf("lstm.hidden.c0.npy"));
+
+    const int numHidden = 3;
+    const int numDirs = Wx.size[0];
+    const int numFeatures = Wx.size[2];
+
+    b = b.reshape(1, b.size[0]);
+    Mat bx = b.colRange(0, b.cols / 2);
+    Mat bh = b.colRange(b.cols / 2, b.cols);
+    b = bx + bh;
+
+    // IFGO->IGFO
+    for (int k = 0; k < numDirs; ++k)
+    {
+        float* WxData = Wx.ptr<float>(k);
+        float* WhData = Wh.ptr<float>(k);
+        float* biasData = b.ptr<float>(k);
+        for (int j = 0; j < numHidden; ++j)
+        {
+            for (int i = 0; i < numFeatures; ++i)
+            {
+                std::swap(WxData[(numHidden + j) * numFeatures + i],
+                          WxData[(numHidden * 2 + j) * numFeatures + i]);
+            }
+            for (int i = 0; i < numHidden; ++i)
+            {
+                std::swap(WhData[(numHidden + j) * numHidden + i],
+                          WhData[(numHidden * 2 + j) * numHidden + i]);
+            }
+            std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]);
+        }
+    }
+
+    Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]);
+    Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]);
+    h0 = h0.reshape(1, h0.size[0] * h0.size[1]);
+    c0 = c0.reshape(1, c0.size[0] * c0.size[1]);
+
+    LayerParams lstmParams;
+    lstmParams.blobs.resize(5);
+    lstmParams.blobs[0] = Wh;
+    lstmParams.blobs[1] = Wx;
+    lstmParams.blobs[2] = b;
+    lstmParams.blobs[3] = h0;
+    lstmParams.blobs[4] = c0;
+    lstmParams.set("bidirectional", false);
+    Ptr<LSTMLayer> layer = LSTMLayer::create(lstmParams);
+
+    Mat inp = blobFromNPY(_tf("lstm.hidden.input.npy"));
+    std::vector<Mat> inputs(1, inp), outputs;
+    runLayer(layer, inputs, outputs);
+
+    Mat h_t_reference = blobFromNPY(_tf("lstm.hidden.output.npy"));
+    normAssert(h_t_reference, outputs[0]);
+}
+
 TEST(Layer_RNN_Test_Accuracy_with_, CaffeRecurrent)
 {
     Ptr<RNNLayer> layer = RNNLayer::create(LayerParams());
@@ -560,6 +629,9 @@ TEST(Layer_LSTM_Test_Accuracy_, Reverse)
     bias.at<float>(2, 0) = 1e10f;  // Output gate - always output everything
     bias.at<float>(3, 0) = 0.f;    // Update signal

+    cv::Mat hInternal = cv::Mat::zeros(1, 1, CV_32FC1);
+    cv::Mat cInternal = cv::Mat::zeros(1, 1, CV_32FC1);
+
     LayerParams lp;
     lp.set("reverse", true);
     lp.set("use_timestamp_dim", true);
@@ -567,6 +639,8 @@ TEST(Layer_LSTM_Test_Accuracy_, Reverse)
     lp.blobs.push_back(Wh);
     lp.blobs.push_back(Wx);
     lp.blobs.push_back(bias);
+    lp.blobs.push_back(hInternal);
+    lp.blobs.push_back(cInternal);

     cv::Ptr<LSTMLayer> layer = LSTMLayer::create(lp);

     std::vector<Mat> outputs;
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 3923068dbf..05f77730af 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -675,6 +675,16 @@ TEST_P(Test_ONNX_layers, LSTM_bidirectional)
     testONNXModels("lstm_bidirectional", npy, 0, 0, false, false);
 }

+TEST_P(Test_ONNX_layers, LSTM_hidden)
+{
+    testONNXModels("hidden_lstm", npy, 0, 0, false, false);
+}
+
+TEST_P(Test_ONNX_layers, LSTM_hidden_bidirectional)
+{
+    testONNXModels("hidden_lstm_bi", npy, 0, 0, false, false);
+}
+
 TEST_P(Test_ONNX_layers, Pad2d_Unfused)
 {
     testONNXModels("ReflectionPad2d");
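Note: the new cases can be exercised in isolation with GoogleTest filters; the binary name below assumes a standard OpenCV build, and the ONNX cases additionally need the hidden_lstm* files from the dnn test data:

    ./bin/opencv_test_dnn --gtest_filter=Layer_LSTM_Test_Accuracy_with_.HiddenParams
    ./bin/opencv_test_dnn --gtest_filter=*LSTM_hidden*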