diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index 67f71e8b0..4d722b615 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -172,28 +172,35 @@ namespace dnn
 
     /** Setups learned weights.
 
-    Recurrent-layer behavior on each step is defined by current input x_t, previous state h_t and learned weights as follows:
+    Recurrent-layer behavior on each step is defined by current input @f$ x_t @f$, previous state @f$ h_t @f$ and learned weights as follows:
     @f{eqnarray*}{
     h_t &= tanh&(W_{hh} h_{t-1} + W_{xh} x_t + b_h),  \\
     o_t &= tanh&(W_{ho} h_t + b_o),
     @f}
 
-    @param Whh is @f$ W_{hh} @f$ matrix
     @param Wxh is @f$ W_{xh} @f$ matrix
     @param bh is @f$ b_{h} @f$ vector
+    @param Whh is @f$ W_{hh} @f$ matrix
     @param Who is @f$ W_{ho} @f$ matrix
     @param bo is @f$ b_{o} @f$ vector
     */
-    virtual void setWeights(const Blob &Whh, const Blob &Wxh, const Blob &bh, const Blob &Who, const Blob &bo) = 0;
+    virtual void setWeights(const Blob &Wxh, const Blob &bh, const Blob &Whh, const Blob &Who, const Blob &bo) = 0;
+
+    /** @brief If this flag is set to true then the layer will produce @f$ h_t @f$ as its second output.
+     *  @details Shape of the second output is the same as the shape of the first output.
+     */
+    virtual void setProduceHiddenOutput(bool produce = false) = 0;
 
     /** Accepts two inputs @f$x_t@f$ and @f$h_{t-1}@f$ and compute two outputs @f$o_t@f$ and @f$h_t@f$.
 
-    @param input could contain inputs @f$x_t@f$ and @f$h_{t-1}@f$.
-    @param output should contain outputs @f$o_t@f$ and @f$h_t@f$.
+    @param input should contain packed input @f$x_t@f$.
+    @param output should contain output @f$o_t@f$ (and @f$h_t@f$ if setProduceHiddenOutput() is set to true).
+
+    @p input[0] should have shape [`T`, `N`, `data_dims`], where `T` and `N` are the number of timestamps and the number of independent samples of @f$x_t@f$ respectively.
 
-    The first input @f$x_t@f$ is required whereas @f$h_{t-1}@f$ is optional.
-    If the second input @f$h_{t-1}@f$ isn't specified a layer will use internal @f$h_{t-1}@f$ from the previous calls, at the first call @f$h_{t-1}@f$ will be filled by zeros.
+    @p output[0] will have shape [`T`, `N`, @f$N_o@f$], where @f$N_o@f$ is the number of rows in the @f$ W_{ho} @f$ matrix.
+    If setProduceHiddenOutput() is set to true then @p output[1] will contain a Blob with shape [`T`, `N`, @f$N_h@f$], where @f$N_h@f$ is the number of rows in the @f$ W_{hh} @f$ matrix.
     */
     void forward(std::vector<Blob*> &input, std::vector<Blob> &output);
 };
diff --git a/modules/dnn/src/layers/op_blas.cpp b/modules/dnn/src/layers/op_blas.cpp
index 90a3c7a18..8b09750d5 100644
--- a/modules/dnn/src/layers/op_blas.cpp
+++ b/modules/dnn/src/layers/op_blas.cpp
@@ -40,6 +40,7 @@ void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int
     CV_Assert(A.isContinuous() && B.isContinuous() && C.isContinuous());
     CV_Assert(A.type() == CV_32F || A.type() == CV_64F);
     CV_Assert(A.type() == B.type() && B.type() == C.type());
+    CV_Assert(A.data != C.data && B.data != C.data);
 
     if (C.type() == CV_32F)
     {
diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index 9ef93abef..0cd68e2b5 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -321,20 +321,28 @@ int LSTMLayer::outputNameToIndex(String outputName)
 
 class RNNLayerImpl : public RNNLayer
 {
-    int nX, nH, nO, nSamples;
+    int numX, numH, numO;
+    int numSamples, numTimestamps, numSamplesTotal;
     int dtype;
     Mat Whh, Wxh, bh;
    Mat Who, bo;
-    Mat hPrevInternal, dummyBiasOnes;
+    Mat hCurr, hPrev, dummyBiasOnes;
+    bool produceH;
 
public:
 
     RNNLayerImpl()
     {
         type = "RNN";
+        produceH = false;
     }
 
-    void setWeights(const Blob &W_hh, const Blob &W_xh, const Blob &b_h, const Blob &W_ho, const Blob &b_o)
+    void setProduceHiddenOutput(bool produce = false)
+    {
+        produceH = produce;
+    }
+
+    void setWeights(const Blob &W_xh, const Blob &b_h, const Blob &W_hh, const Blob &W_ho, const Blob &b_o)
     {
         CV_Assert(W_hh.dims() == 2 && W_xh.dims() == 2);
         CV_Assert(W_hh.size(0) == W_xh.size(0) && W_hh.size(0) == W_hh.size(1) && (int)b_h.total() == W_xh.size(0));
@@ -342,9 +350,9 @@ public:
         CV_Assert(W_ho.size(1) == W_hh.size(1));
 
         blobs.resize(5);
-        blobs[0] = W_hh;
-        blobs[1] = W_xh;
-        blobs[2] = b_h;
+        blobs[0] = W_xh;
+        blobs[1] = b_h;
+        blobs[2] = W_hh;
         blobs[3] = W_ho;
         blobs[4] = b_o;
     }
 
@@ -353,72 +361,68 @@ public:
     void allocate(const std::vector<Blob*> &input, std::vector<Blob> &output)
     {
         CV_Assert(input.size() >= 1 && input.size() <= 2);
 
-        Whh = blobs[0].matRefConst();
-        Wxh = blobs[1].matRefConst();
-        bh = blobs[2].matRefConst();
+        Wxh = blobs[0].matRefConst();
+        bh = blobs[1].matRefConst();
+        Whh = blobs[2].matRefConst();
         Who = blobs[3].matRefConst();
         bo = blobs[4].matRefConst();
 
-        nH = Wxh.rows;
-        nX = Wxh.cols;
-        nO = Who.rows;
+        numH = Wxh.rows;
+        numX = Wxh.cols;
+        numO = Who.rows;
 
-        CV_Assert(input[0]->size(-1) == Wxh.cols);
-        nSamples = input[0]->total(0, input[0]->dims() - 1);
-        BlobShape xShape = input[0]->shape();
-        BlobShape hShape = xShape;
-        BlobShape oShape = xShape;
-        hShape[-1] = nH;
-        oShape[-1] = nO;
-
-        if (input.size() == 2)
-        {
-            CV_Assert(input[1]->shape() == hShape);
-        }
-        else
-        {
-            hPrevInternal.create(nSamples, nH, input[0]->type());
-            hPrevInternal.setTo(0);
-        }
+        CV_Assert(input[0]->dims() >= 2);
+        CV_Assert((int)input[0]->total(2) == numX);
+        CV_Assert(input[0]->type() == CV_32F || input[0]->type() == CV_64F);
+        dtype = input[0]->type();
+        numTimestamps = input[0]->size(0);
+        numSamples = input[0]->size(1);
+        numSamplesTotal = numTimestamps * numSamples;
 
-        output.resize(2);
-        output[0].create(oShape, input[0]->type());
-        output[1].create(hShape, input[0]->type());
+        hCurr.create(numSamples, numH, dtype);
+        hPrev.create(numSamples, numH, dtype);
+        hPrev.setTo(0);
 
-        dummyBiasOnes.create(nSamples, 1, bh.type());
+        dummyBiasOnes.create(numSamples, 1, dtype);
         dummyBiasOnes.setTo(1);
 
-        bh = bh.reshape(1, 1); //is 1 x nH mat
-        bo = bo.reshape(1, 1); //is 1 x nO mat
+        bh = bh.reshape(1, 1); //is 1 x numH Mat
+        bo = bo.reshape(1, 1); //is 1 x numO Mat
+
+        reshapeOutput(output);
+    }
+
+    void reshapeOutput(std::vector<Blob> &output)
+    {
+        output.resize((produceH) ? 2 : 1);
+        output[0].create(BlobShape(numTimestamps, numSamples, numO), dtype);
+        if (produceH)
+            output[1].create(BlobShape(numTimestamps, numSamples, numH), dtype);
     }
 
     void forward(std::vector<Blob*> &input, std::vector<Blob> &output)
     {
-        Mat xCurr = input[0]->matRefConst();
-        Mat hPrev = (input.size() >= 2) ? input[1]->matRefConst() : hPrevInternal;
-        Mat oCurr = output[0].matRef();
-        Mat hCurr = output[1].matRef();
-
-        //TODO: Check types
-
-        int xsz[] = {nSamples, nX};
-        int hsz[] = {nSamples, nH};
-        int osz[] = {nSamples, nO};
-        if (xCurr.dims != 2) xCurr = xCurr.reshape(1, 2, xsz);
-        if (hPrev.dims != 2) hPrev = hPrev.reshape(1, 2, hsz);
-        if (oCurr.dims != 2) oCurr = oCurr.reshape(1, 2, osz);
-        if (hCurr.dims != 2) hCurr = hCurr.reshape(1, 2, hsz);
-
-        gemmCPU(hPrev, Whh, 1, hCurr, 0, GEMM_2_T); // W_{hh} * h_{prev}
-        gemmCPU(xCurr, Wxh, 1, hCurr, 1, GEMM_2_T); //+W_{xh} * x_{curr}
-        gemmCPU(dummyBiasOnes, bh, 1, hCurr, 1); //+bh
-        tanh(hCurr, hCurr);
-
-        gemmCPU(hPrev, Who, 1, oCurr, 0, GEMM_2_T); // W_{ho} * h_{prev}
-        gemmCPU(dummyBiasOnes, bo, 1, oCurr, 1); //+b_o
-        tanh(oCurr, oCurr);
-
-        if (input.size() < 2) //save h_{prev}
-            hCurr.copyTo(hPrevInternal);
+        Mat xTs = input[0]->reshaped(BlobShape(numSamplesTotal, numX)).matRefConst();
+        Mat oTs = output[0].reshaped(BlobShape(numSamplesTotal, numO)).matRef();
+        Mat hTs = (produceH) ? output[1].reshaped(BlobShape(numSamplesTotal, numH)).matRef() : Mat();
+
+        for (int ts = 0; ts < numTimestamps; ts++)
+        {
+            Range curRowRange = Range(ts * numSamples, (ts + 1) * numSamples);
+            Mat xCurr = xTs.rowRange(curRowRange);
+
+            gemmCPU(hPrev, Whh, 1, hCurr, 0, GEMM_2_T); // W_{hh} * h_{prev}
+            gemmCPU(xCurr, Wxh, 1, hCurr, 1, GEMM_2_T); //+W_{xh} * x_{curr}
+            gemmCPU(dummyBiasOnes, bh, 1, hCurr, 1); //+bh
+            tanh(hCurr, hPrev); //hPrev now holds h_{curr} for this timestamp and the next iteration
+
+            Mat oCurr = oTs.rowRange(curRowRange);
+            gemmCPU(hPrev, Who, 1, oCurr, 0, GEMM_2_T); // W_{ho} * h_{curr}
+            gemmCPU(dummyBiasOnes, bo, 1, oCurr, 1); //+b_o
+            tanh(oCurr, oCurr);
+
+            if (produceH)
+                hPrev.copyTo(hTs.rowRange(curRowRange));
+        }
     }
 };
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 68f472be2..5e9171265 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -254,7 +254,7 @@ TEST_F(Layer_LSTM_Test, get_set_test)
     EXPECT_EQ(1, layer->outputNameToIndex("c"));
 }
 
-TEST(Layer_LSTM_Test_Accuracy_Reference_with_, CaffeRecurrent)
+TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent)
 {
     Ptr<LSTMLayer> layer = LSTMLayer::create();
 
@@ -263,73 +263,70 @@ TEST(Layer_LSTM_Test_Accuracy_Reference_with_, CaffeRecurrent)
     Blob b = blobFromNPY(_tf("lstm.prototxt.w_1.npy"));
     layer->setWeights(Wh, Wx, b);
 
-    Blob inp = blobFromNPY(_tf("blob.npy"));
+    Blob inp = blobFromNPY(_tf("recurrent.input.npy"));
     std::vector<Blob> inputs(1, inp), outputs;
     runLayer(layer, inputs, outputs);
 
-    Blob &h_t_gathered = outputs[0];
     Blob h_t_reference = blobFromNPY(_tf("lstm.prototxt.h_1.npy"));
+    normAssert(h_t_reference, outputs[0]);
+}
+
+TEST(Layer_RNN_Test_Accuracy_with_, CaffeRecurrent)
+{
+    Ptr<RNNLayer> layer = RNNLayer::create();
+
+    layer->setWeights(
+        blobFromNPY(_tf("rnn.prototxt.w_0.npy")),
+        blobFromNPY(_tf("rnn.prototxt.w_1.npy")),
+        blobFromNPY(_tf("rnn.prototxt.w_2.npy")),
+        blobFromNPY(_tf("rnn.prototxt.w_3.npy")),
+        blobFromNPY(_tf("rnn.prototxt.w_4.npy")) );
 
-    normAssert(h_t_reference, h_t_gathered);
+    std::vector<Blob> output, input(1, blobFromNPY(_tf("recurrent.input.npy")));
blobFromNPY(_tf("recurrent.input.npy"))); + runLayer(layer, input, output); + + Blob h_ref = blobFromNPY(_tf("rnn.prototxt.h_1.npy")); + normAssert(h_ref, output[0]); } class Layer_RNN_Test : public ::testing::Test { public: - int Nx, Nh, No; + int nX, nH, nO, nT, nS; Blob Whh, Wxh, bh, Who, bo; Ptr layer; std::vector inputs, outputs; - std::vector inputsPtr; - Layer_RNN_Test(int _Nx = 31, int _Nh = 64, int _No = 100) + Layer_RNN_Test() { - Nx = _Nx; - Nh = _Nh; - No = _No; - - Whh = Blob(BlobShape(Nh, Nh)); - Wxh = Blob(BlobShape(Nh, Nx)); - bh = Blob(BlobShape(Nh, 1)); - Who = Blob(BlobShape(No, Nh)); - bo = Blob(BlobShape(No, 1)); + nT = 3; + nS = 5; + nX = 31; + nH = 64; + nO = 100; + + Whh = Blob(BlobShape(nH, nH)); + Wxh = Blob(BlobShape(nH, nX)); + bh = Blob(BlobShape(nH, 1)); + Who = Blob(BlobShape(nO, nH)); + bo = Blob(BlobShape(nO, 1)); layer = RNNLayer::create(); - layer->setWeights(Whh, Wxh, bh, Who, bo); - } - - void allocateAndForward() - { - inputsPtr.clear(); - for (size_t i = 0; i < inputs.size(); i++) - inputsPtr.push_back(&inputs[i]); - - layer->allocate(inputsPtr, outputs); - layer->forward(inputsPtr, outputs); + layer->setProduceHiddenOutput(true); + layer->setWeights(Wxh, bh, Whh, Who, bo); } }; -TEST_F(Layer_RNN_Test, BasicTest_1) -{ - inputs.push_back(Blob(BlobShape(1, 2, 3, Nx))); - allocateAndForward(); - - EXPECT_EQ(outputs.size(), 2); - EXPECT_EQ(outputs[0].shape(), BlobShape(1, 2, 3, No)); - EXPECT_EQ(outputs[1].shape(), BlobShape(1, 2, 3, Nh)); -} - -TEST_F(Layer_RNN_Test, BasicTest_2) +TEST_F(Layer_RNN_Test, get_set_test) { - inputs.push_back(Blob(BlobShape(1, 2, 3, Nx))); - inputs.push_back(Blob(BlobShape(1, 2, 3, Nh))); - allocateAndForward(); + inputs.push_back(Blob(BlobShape(nT, nS, 1, nX))); + runLayer(layer, inputs, outputs); EXPECT_EQ(outputs.size(), 2); - EXPECT_EQ(outputs[0].shape(), BlobShape(1, 2, 3, No)); - EXPECT_EQ(outputs[1].shape(), BlobShape(1, 2, 3, Nh)); + EXPECT_EQ(outputs[0].shape(), BlobShape(nT, nS, nO)); + EXPECT_EQ(outputs[1].shape(), BlobShape(nT, nS, nH)); } }
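Usage sketch (illustrative only, not part of the patch): the snippet below mirrors the updated test fixture and shows the reordered setWeights() call, the new setProduceHiddenOutput() flag, and one packed [`T`, `N`, nX] input driven through the Layer allocate()/forward() interface used above. It assumes the experimental dnn Blob/BlobShape API from this module; the function name is a placeholder and the weight blobs are left uninitialized, as in the test.

    #include <opencv2/dnn.hpp>
    #include <opencv2/dnn/all_layers.hpp>
    #include <vector>
    using namespace cv::dnn;

    void rnnUsageSketch() // illustrative name
    {
        const int T = 3, N = 5, nX = 31, nH = 64, nO = 100;

        Ptr<RNNLayer> rnn = RNNLayer::create();
        rnn->setProduceHiddenOutput(true);        // request h_t as outputs[1]
        rnn->setWeights(Blob(BlobShape(nH, nX)),  // Wxh
                        Blob(BlobShape(nH, 1)),   // bh
                        Blob(BlobShape(nH, nH)),  // Whh
                        Blob(BlobShape(nO, nH)),  // Who
                        Blob(BlobShape(nO, 1)));  // bo

        // One packed input of shape [T, N, nX]; output blobs are shaped by allocate().
        std::vector<Blob> inputs(1, Blob(BlobShape(T, N, nX))), outputs;
        std::vector<Blob*> inputsPtr(1, &inputs[0]);

        rnn->allocate(inputsPtr, outputs); // outputs[0]: [T, N, nO], outputs[1]: [T, N, nH]
        rnn->forward(inputsPtr, outputs);
    }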