diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index 1e3d1fc7b..3db00b593 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -267,6 +267,14 @@ namespace dnn
         static Ptr<SoftmaxLayer> create(int axis = 1);
     };
 
+    class CV_EXPORTS_W InnerProductLayer : public Layer
+    {
+    public:
+        int axis;
+
+        static Ptr<InnerProductLayer> create(int axis = 1);
+    };
+
 //! @}
 //! @}
 
diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp
index eac3eb8a5..e7e9ba3d2 100644
--- a/modules/dnn/src/init.cpp
+++ b/modules/dnn/src/init.cpp
@@ -84,7 +84,7 @@ void initModule()
     REG_RUNTIME_LAYER_FUNC(Pooling, createPoolingLayerFromCaffe)
     REG_RUNTIME_LAYER_CLASS(MVN, MVNLayer)
     REG_RUNTIME_LAYER_FUNC(LRN, createLRNLayerFromCaffe)
-    REG_RUNTIME_LAYER_CLASS(InnerProduct, FullyConnectedLayer)
+    REG_RUNTIME_LAYER_FUNC(InnerProduct, createInnerProductLayerFromCaffe)
 
     REG_RUNTIME_LAYER_CLASS(ReLU, ElementWiseLayer<ReLUFunctor>)
     REG_RUNTIME_LAYER_CLASS(TanH, ElementWiseLayer<TanHFunctor>)
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index 80b91be5d..3e6668016 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -43,73 +43,110 @@
 #include "layers_common.hpp"
 #include "fully_connected_layer.hpp"
 #include "op_blas.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+#include <opencv2/core/ocl.hpp>
 
 namespace cv
 {
 namespace dnn
 {
-    FullyConnectedLayer::FullyConnectedLayer(LayerParams &params) : Layer(params)
-    {
-        numOutputs = params.get<int>("num_output");
-        bias = params.get<bool>("bias_term", true);
-        axis_ = params.get<int>("axis", 1);
 
-        CV_Assert(blobs.size() == (bias ? 2U : 1U));
-        CV_Assert(blobs[0].dims() >= 2 && blobs[0].total() >= (size_t)numOutputs);
-        CV_Assert(!bias || blobs[1].total() == (size_t)numOutputs);
-    }
+FullyConnectedLayerImpl::FullyConnectedLayerImpl(int axis_)
+{
+    axis = axis_;
+}
 
-    void FullyConnectedLayer::allocate(const std::vector<Blob*> &input, std::vector<Blob> &output)
-    {
-        CV_Assert(input.size() > 0);
+void FullyConnectedLayerImpl::allocate(const std::vector<Blob*> &input, std::vector<Blob> &output)
+{
+    CV_Assert(input.size() > 0);
+    CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
+    CV_Assert(blobs[0].dims() == 2);
 
-        axis = input[0]->canonicalAxis(axis_);
-        innerSize = (int)input[0]->total(axis);
+    bias = (blobs.size() >= 2);
+    axisCan = input[0]->canonicalAxis(axis);
+    dtype = input[0]->type();
+    numOutput = blobs[0].size(0);
+    innerSize = blobs[0].size(1);
+    outerSize = input[0]->total(0, axisCan);
 
-        CV_Assert((size_t)innerSize * (size_t)numOutputs == blobs[0].total());
-        CV_Assert(blobs[0].size(-2) == numOutputs && blobs[0].size(-1) == innerSize);
+    CV_Assert((size_t)innerSize == input[0]->total(axisCan));
+    CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
 
-        output.resize(input.size());
-        for (size_t i = 0; i < input.size(); i++)
-        {
-            if (i != 0)
-                CV_Assert(input[i]->equalShape(*input[0]));
+    useOpenCL = ocl::useOpenCL();
+    int allocFlags = useOpenCL ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT;
 
-            this->reshape(*input[i], output[i]);
-        }
-    }
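+    // biasOnesBlob is a column of ones (outerSize x 1); multiplying it by the bias row
+    // broadcasts the bias to every sample in the GEMM call below.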
+    biasOnesBlob.create(Shape(outerSize, 1), dtype, allocFlags);
+    if (useOpenCL)
+        biasOnesBlob.getRef<UMat>().setTo(1);
+    else
+        biasOnesBlob.getRef<Mat>().setTo(1);
 
-    void FullyConnectedLayer::reshape(const Blob &inp, Blob &out)
+    output.resize(input.size());
+    for (size_t i = 0; i < input.size(); i++)
     {
-        BlobShape inpShape = inp.shape();
-        BlobShape outShape(axis+1, inpShape.ptr());
-        outShape[axis] = numOutputs;
+        CV_Assert(i == 0 || (input[i]->equalShape(*input[0]) && input[i]->type() == dtype));
+        Shape outShape = input[i]->shape().slice(0, axis) + Shape(numOutput);
+        output[i].create(outShape, dtype, allocFlags);
+    }
+}
 
-        out.create(outShape, inp.type());
+void FullyConnectedLayerImpl::forward(std::vector<Blob*> &input, std::vector<Blob> &output)
+{
+    if (!useOpenCL)
+        forward_<Mat>(input, output);
+    else
+        forward_<UMat>(input, output);
+}
+
+template<typename XMat>
+void FullyConnectedLayerImpl::forward_(std::vector<Blob *> &input, std::vector<Blob> &output)
+{
+    const XMat &weight = blobs[0].getRefConst<XMat>();
+    const XMat *biasMat = NULL, *biasOnesMat = NULL;
+    if (bias)
+    {
+        biasOnesMat = &biasOnesBlob.getRefConst<XMat>();
+        biasMat = &blobs[1].getRefConst<XMat>();
     }
 
-    void FullyConnectedLayer::forward(std::vector<Blob*> &input, std::vector<Blob> &output)
+    for (size_t i = 0; i < input.size(); i++)
     {
-        for (size_t i = 0; i < input.size(); i++)
-        {
-            int M = (int)input[i]->total(0, axis);
-            int N = numOutputs;
-            int K = innerSize;
-
-            Mat srcMat(M, K, input[i]->type(), input[i]->ptrf());
-            Mat weight(N, K, blobs[0].type(), blobs[0].ptrf());
-            Mat dstMat(M, N, output[i].type(), output[i].ptrf());
-
-            //important: for perfomance purposes Caffe stores weights as transposed array
-            gemmCPU(srcMat, weight, 1, dstMat, 0, GEMM_2_T);
-
-            if (bias)
-            {
-                Mat biasOnesMat = Mat::ones(M, 1, CV_32F);
-                Mat biasMat(1, N, CV_32F, blobs[1].ptrf());
-                gemmCPU(biasOnesMat, biasMat, 1, dstMat, 1);
-            }
-        }
+        const XMat srcMat = reshaped(input[i]->getRefConst<XMat>(), Shape(outerSize, innerSize));
+        XMat dstMat = reshaped(output[i].getRef<XMat>(), Shape(outerSize, numOutput));
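+        // Caffe stores the weight matrix transposed (numOutput x innerSize) for performance, hence GEMM_2_T.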
+        dnn::gemm(srcMat, weight, 1, dstMat, 0, GEMM_2_T);
+
+        if (bias)
+            dnn::gemm(*biasOnesMat, *biasMat, 1, dstMat, 1);
     }
 }
+
+
+Ptr<InnerProductLayer> InnerProductLayer::create(int axis)
+{
+    return Ptr<InnerProductLayer>(new FullyConnectedLayerImpl(axis));
+}
+
+Ptr<Layer> createInnerProductLayerFromCaffe(LayerParams &params)
+{
+    const std::vector<Blob> &blobs = params.blobs;
+    CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
+
+    int numOutputs = params.get<int>("num_output");
+    int innerSize = (int)blobs[0].total() / numOutputs;
+    bool bias = params.get<bool>("bias_term", true);
+    int axis = params.get<int>("axis", 1);
+
+    CV_Assert(blobs[0].dims() >= 2 && (size_t)(innerSize * numOutputs) == blobs[0].total());
+    CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutputs == blobs[1].total()));
+
+    Ptr<InnerProductLayer> l = InnerProductLayer::create(axis);
+    l->setParamsFrom(params);
+    l->blobs[0].reshape(Shape(numOutputs, innerSize));
+    if (bias)
+        l->blobs[1].reshape(Shape(1, numOutputs));
+
+    return Ptr<Layer>(l);
+}
+
+}
 }
diff --git a/modules/dnn/src/layers/fully_connected_layer.hpp b/modules/dnn/src/layers/fully_connected_layer.hpp
index 5213b98d8..714593e4c 100644
--- a/modules/dnn/src/layers/fully_connected_layer.hpp
+++ b/modules/dnn/src/layers/fully_connected_layer.hpp
@@ -42,26 +42,32 @@
 #ifndef __OPENCV_DNN_LAYERS_FULLY_CONNECTED_LAYER_HPP__
 #define __OPENCV_DNN_LAYERS_FULLY_CONNECTED_LAYER_HPP__
 #include "../precomp.hpp"
+#include <opencv2/dnn/all_layers.hpp>
 
 namespace cv
 {
 namespace dnn
 {
-    class FullyConnectedLayer : public Layer
-    {
-        bool bias;
-        int numOutputs;
-        int axis_, axis;
 
-        int innerSize;
+class FullyConnectedLayerImpl : public InnerProductLayer
+{
+    int axisCan, dtype;
+    int numOutput, innerSize, outerSize;
+    bool bias, useOpenCL;
+    Blob biasOnesBlob;
+
+    template<typename XMat>
+    void forward_(std::vector<Blob*> &input, std::vector<Blob> &output);
+
+public:
+
+    FullyConnectedLayerImpl(int axisCan = 1);
+    void allocate(const std::vector<Blob*> &input, std::vector<Blob> &output);
+    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+};
 
-        void reshape(const Blob &inp, Blob &out);
+Ptr<Layer> createInnerProductLayerFromCaffe(LayerParams &params);
 
-    public:
-        FullyConnectedLayer(LayerParams &params);
-        void allocate(const std::vector<Blob*> &input, std::vector<Blob> &output);
-        void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    };
 }
 }
 #endif
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 94d8945cf..009a32c57 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -181,7 +181,7 @@ TEST(Layer_Test_Reshape, squeeze)
     rl->allocate(inpVec, outVec);
     rl->forward(inpVec, outVec);
 
-    EXPECT_EQ(outVec[0].shape(), BlobShape(Vec3i(4, 3, 2)));
+    EXPECT_EQ(outVec[0].shape(), BlobShape(4, 3, 2));
 }
 
 TEST(Layer_Test_Reshape_Split_Slice, Accuracy)