diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 72b7b9fb0..f8468ac43 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -1,5 +1,6 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
+#include "im2col.hpp"
 
 namespace cv
 {
@@ -8,19 +9,24 @@ namespace dnn
     //TODO: simultaneously convolution and bias addition for cache optimization
     class ConvolutionLayer : public Layer
     {
+    protected:
         bool bias;
         int numOutput, group;
         int padH, padW;
+        int kerH, kerW;
         int strideH, strideW;
-        int kernelH, kernelW;
 
-        int inH, inW, inCn, kerSize;
-        int outH, outW;
-        int groupCn, groupCnOut;
+        int inpH, inpW, inpCn;
+        int outH, outW, outCn;
+        int topH, topW, topCn; //switched between inp/out on deconv/conv
+        int inpGroupCn, outGroupCn;
+        int ksize;
 
-        Mat srcColsMat, biasOnesMat;
+        Mat colMat, biasOnesMat;
 
-        void computeOutputShape(int inH, int inW);
+        inline bool is1x1() const;
+        virtual void computeInpOutShape(const Blob &inpBlob);
+        void im2col(Blob &inpBlob, int imNum, int cnGroup);
 
     public:
         ConvolutionLayer(LayerParams &params);
@@ -28,13 +34,25 @@ namespace dnn
         void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
     };
 
+    class DeConvolutionLayer : public ConvolutionLayer // reuses the parameters, weights and work buffers of ConvolutionLayer
+    {
+    protected:
+        void computeInpOutShape(const Blob &inpBlob); // overrides the base: the layer input supplies out*, and top* is set to inp*
+        void col2im(Mat &dstMat); // accumulates colMat into dstMat; returns immediately when is1x1()
+
+    public:
+        DeConvolutionLayer(LayerParams &params) : ConvolutionLayer(params) {}
+        void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
+    };
+
 
     REGISTER_LAYER_CLASS(Convolution, ConvolutionLayer)
+    REGISTER_LAYER_CLASS(Deconvolution, DeConvolutionLayer)
 
 
     ConvolutionLayer::ConvolutionLayer(LayerParams &params)
     {
-        getKernelParams(params, kernelH, kernelW, padH, padW, strideH, strideW);
+        getKernelParams(params, kerH, kerW, padH, padW, strideH, strideW);
 
         numOutput = params.get<int>("num_output");
         bias = params.get<bool>("bias_term", true);
@@ -44,8 +62,8 @@ namespace dnn
         CV_Assert(params.learnedBlobs.size() >= 1 && (!bias || params.learnedBlobs.size() >= 2));
         learnedParams.assign(params.learnedBlobs.begin(), params.learnedBlobs.begin() + (bias ? 2 : 1));
 
-        Blob &weightBlob = learnedParams[0];
-        CV_Assert(weightBlob.cols() == kernelW && weightBlob.rows() == kernelH && weightBlob.num() == numOutput);
+        const Blob &wgtBlob = learnedParams[0];
+        CV_Assert(wgtBlob.dims() == 4 && wgtBlob.cols() == kerW && wgtBlob.rows() == kerH && wgtBlob.num() == numOutput);
 
         if (bias)
         {
@@ -58,92 +76,141 @@ namespace dnn
     {
         CV_Assert(inputs.size() > 0);
 
-        Blob &weightBlob = learnedParams[0];
+        const Blob &inpBlob = *inputs[0];
+        CV_Assert(inpBlob.dims() == 4 && inpBlob.type() == CV_32F);
+        computeInpOutShape(inpBlob);
 
-        inCn = inputs[0]->channels();
-        CV_Assert(inCn % group == 0 && numOutput % group == 0 && weightBlob.channels() == inCn/group);
-        groupCnOut = numOutput / group;
-        groupCn = inCn / group;
+        CV_Assert(inpCn % group == 0 && outCn % group == 0);
+        CV_Assert(learnedParams[0].channels() == inpCn / group);
+        CV_Assert(learnedParams[0].num() == outCn);
 
-        inH = inputs[0]->rows();
-        inW = inputs[0]->cols();
-        computeOutputShape(inH, inW);
+        outGroupCn = outCn / group;
+        inpGroupCn = inpCn / group;
+        ksize = inpGroupCn * kerH * kerW;
 
         outputs.resize(inputs.size());
         for (size_t i = 0; i < inputs.size(); i++)
         {
-            CV_Assert(inputs[i]->rows() == inH && inputs[i]->cols() == inW && inputs[i]->channels() == inCn);
-            outputs[i].create(BlobShape(inputs[i]->num(), numOutput, outH, outW));
+            CV_Assert(inputs[i]->type() == inpBlob.type());
+            CV_Assert(inputs[i]->dims() == 4 && inputs[i]->channels() == inpBlob.channels());
+            CV_Assert(inputs[i]->rows() == inpBlob.rows() && inputs[i]->cols() == inpBlob.cols());
+
+            outputs[i].create(BlobShape(inputs[i]->num(), topCn, topH, topW));
         }
 
-        kerSize = kernelH * kernelW * groupCn;
-        srcColsMat.create(kerSize, outH * outW, CV_32F);
+        if (!is1x1())
+            colMat.create(ksize, outH * outW, inpBlob.type());
 
         if (bias)
-        {
-            biasOnesMat = Mat::ones(1, outH * outW, CV_32F);
-        }
+            biasOnesMat = Mat::ones(1, topH * topW, inpBlob.type());
     }
 
-    template <typename Dtype>
-    void im2col_cpu(const Dtype* data_im, const int channels,
-        const int height, const int width, const int kernel_h, const int kernel_w,
-        const int pad_h, const int pad_w,
-        const int stride_h, const int stride_w,
-        Dtype* data_col)
+    inline bool ConvolutionLayer::is1x1() const // true only when im2col/col2im is an identity, so colMat may alias blob memory
     {
-        int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-        int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-        int channels_col = channels * kernel_h * kernel_w;
-        for (int c = 0; c < channels_col; ++c) {
-            int w_offset = c % kernel_w;
-            int h_offset = (c / kernel_w) % kernel_h;
-            int c_im = c / kernel_h / kernel_w;
-            for (int h = 0; h < height_col; ++h) {
-                for (int w = 0; w < width_col; ++w) {
-                    int h_pad = h * stride_h - pad_h + h_offset;
-                    int w_pad = w * stride_w - pad_w + w_offset;
-                    if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-                        data_col[(c * height_col + h) * width_col + w] =
-                        data_im[(c_im * height + h_pad) * width + w_pad];
-                    else
-                        data_col[(c * height_col + h) * width_col + w] = 0;
+        return (kerH == 1 && kerW == 1) && (strideH == 1 && strideW == 1) && (padH == 0 && padW == 0); // stride/pad change the spatial size
+    }
+
+    void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs) // per image and group: dst = weights * colMat (+ bias)
+    {
+        Blob &wgtBlob = learnedParams[0];
+
+        for (size_t ii = 0; ii < outputs.size(); ii++)
+        {
+            Blob &inpBlob = *inputs[ii];
+            Blob &outBlob = outputs[ii];
+
+            for (int n = 0; n < inpBlob.num(); n++)
+            {
+                for (int g = 0; g < group; g++)
+                {
+                    im2col(inpBlob, n, g); // fills (or re-points) colMat for image n, channel group g
+
+                    Mat kerMat(outGroupCn, ksize, wgtBlob.type(), wgtBlob.ptrRaw(g*outGroupCn)); // header over the filters of group g
+                    Mat dstMat(outGroupCn, outH*outW, outBlob.type(), outBlob.ptrRaw(n, g*outGroupCn)); // header over the output planes of group g
+
+                    cv::gemm(kerMat, colMat, 1, noArray(), 0, dstMat);
+
+                    if (bias)
+                    {
+                        float *biasPtr = learnedParams[1].ptrf() + g*outGroupCn;
+                        Mat biasMat(outGroupCn, 1, CV_32F, biasPtr);
+                        cv::gemm(biasMat, biasOnesMat, 1, dstMat, 1, dstMat); // dstMat += bias column * row of ones
+                    }
                 }
             }
         }
     }
 
-    void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+    // Prepares colMat for channel group cnGroup of image imNum: wraps the data when is1x1(), unrolls it otherwise.
+    void ConvolutionLayer::im2col(Blob &inpBlob, int imNum, int cnGroup)
+    {
+        uchar *groupData = inpBlob.ptrRaw(imNum, cnGroup*inpGroupCn);
+        const int dataType = inpBlob.type();
+        if (is1x1())
+        {
+            // No copy: colMat becomes a header over the blob's own memory.
+            colMat = Mat(ksize, inpBlob.rows()*inpBlob.cols(), dataType, groupData);
+        }
+        else if (dataType == CV_32F)
+            im2col_cpu((float *)groupData, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr());
+        else if (dataType == CV_64F)
+            im2col_cpu((double*)groupData, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr());
+    }
+
+    void ConvolutionLayer::computeInpOutShape(const Blob &inpBlob) // caches the input dims and derives the output dims from them
     {
-        CV_Assert(inputs.size() == outputs.size());
+        inpH = inpBlob.rows();
+        inpW = inpBlob.cols();
+        inpCn = inpBlob.channels();
 
-        float *srcColPtr = srcColsMat.ptr<float>();
+        outH = (inpH + 2 * padH - kerH) / strideH + 1; // integer division: a trailing partial window is dropped
+        outW = (inpW + 2 * padW - kerW) / strideW + 1;
+        outCn = learnedParams[0].num(); // taken from the weight blob's num()
+
+        topH = outH; topW = outW; topCn = outCn; // for convolution the produced blob has the out* shape
+    }
+
+    void DeConvolutionLayer::computeInpOutShape(const Blob &inpBlob) // the layer input plays the role of a convolution output
+    {
+        outH = inpBlob.rows();
+        outW = inpBlob.cols();
+        outCn = inpBlob.channels();
+        // inp* describe the blob this layer produces: the convolution output-size formula solved for the input size
+        inpH = strideH * (outH - 1) + kerH - 2 * padH;
+        inpW = strideW * (outW - 1) + kerW - 2 * padW;
+        inpCn = learnedParams[0].channels() * group; // each filter stores only the inpCn / group channels of its own group
+
+        topH = inpH; topW = inpW; topCn = inpCn;
+    }
+
+    void DeConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs) // per image and group: dst = col2im(weights^T * input) (+ bias)
+    {
+        Blob &wghtBlob = learnedParams[0];
 
         for (size_t ii = 0; ii < outputs.size(); ii++)
         {
-            Blob &input = *inputs[ii];
-            Blob &output = outputs[ii];
-            int num = input.num();
+            Blob &convBlob = *inputs[ii];
+            Blob &decnBlob = outputs[ii];
 
-            for (int n = 0; n < num; n++)
+            for (int n = 0; n < convBlob.num(); n++)
             {
                 for (int g = 0; g < group; g++)
                 {
-                    float *srcPtr = input.ptrf(n, g*groupCn);
-                    im2col_cpu(srcPtr, groupCn, inH, inW, kernelH, kernelW, padH, padW, strideH, strideW, srcColPtr);
+                    Mat dstMat(inpGroupCn, inpH*inpW, decnBlob.type(), decnBlob.ptrRaw(n, g*inpGroupCn));
+                    // when no col2im step is needed the GEMM result can be written straight into the output planes
+                    if (is1x1())
+                        colMat = dstMat;
 
-                    float *kerPtr = learnedParams[0].ptrf(g*groupCnOut);
-                    float *dstPtr = output.ptrf(n, g*groupCnOut);
+                    Mat convMat(outGroupCn, outH*outW, convBlob.type(), convBlob.ptrRaw(n, g*outGroupCn)); // input channels are split in chunks of outGroupCn
+                    Mat wghtMat(outGroupCn, ksize, wghtBlob.type(), wghtBlob.ptrRaw(g*outGroupCn)); // so are the filters (same offset as in ConvolutionLayer::forward)
+                    cv::gemm(wghtMat, convMat, 1, noArray(), 0, colMat, GEMM_1_T);
 
-                    Mat kerMat(groupCnOut, kerSize, CV_32F, kerPtr);
-                    Mat dstMat(groupCnOut, outH*outW, CV_32F, dstPtr);
-
-                    cv::gemm(kerMat, srcColsMat, 1, noArray(), 0, dstMat);
+                    col2im(dstMat);
 
                     if (bias)
                     {
-                        float *biasPtr = learnedParams[1].ptrf() + g*groupCnOut;
-                        Mat biasMat(groupCnOut, 1, CV_32F, biasPtr);
+                        float *biasPtr = learnedParams[1].ptrf() + g*outGroupCn; // NOTE(review): dstMat has inpGroupCn rows; this lines up only while inpGroupCn == outGroupCn
+                        Mat biasMat(outGroupCn, 1, CV_32F, biasPtr); // TODO confirm the bias blob length before switching to inpGroupCn
                         cv::gemm(biasMat, biasOnesMat, 1, dstMat, 1, dstMat);
                     }
                 }
@@ -151,10 +218,14 @@ namespace dnn
         }
     }
 
-    void ConvolutionLayer::computeOutputShape(int inH, int inW)
+    void DeConvolutionLayer::col2im(Mat &dstMat)
     {
-        outH = (inH + 2 * padH - kernelH) / strideH + 1;
-        outW = (inW + 2 * padW - kernelW) / strideW + 1;
+        if (is1x1()) return;
+        // colMat and dstMat cover a single channel group, hence inpGroupCn channels rather than inpCn
+        if (dstMat.type() == CV_32F)
+            col2im_cpu((float*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float*)dstMat.ptr());
+        if (dstMat.type() == CV_64F)
+            col2im_cpu((double*)colMat.ptr(), inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)dstMat.ptr());
+    }
 }
 }
diff --git a/modules/dnn/src/layers/im2col.hpp b/modules/dnn/src/layers/im2col.hpp
new file mode 100644
index 000000000..b356a0550
--- /dev/null
+++ b/modules/dnn/src/layers/im2col.hpp
@@ -0,0 +1,74 @@
+#ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__
+#define __OPENCV_DNN_LAYERS_IM2COL_HPP__
+
+namespace cv
+{
+namespace dnn
+{
+
+// Unrolls each kernel_h x kernel_w window of a channels x height x width image into one column of data_col.
+template <typename Dtype>
+void im2col_cpu(const Dtype* data_im, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_col)
+{
+    const int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+    const int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    const int kernel_area = kernel_h * kernel_w;
+    // Row `row` of data_col holds one (channel, kernel row, kernel column) combination.
+    for (int row = 0; row < channels * kernel_area; ++row) {
+        const int c_im = row / kernel_area;
+        const int ky = (row / kernel_w) % kernel_h;
+        const int kx = row % kernel_w;
+        for (int y = 0; y < height_col; ++y) {
+            const int src_y = y * stride_h - pad_h + ky;
+            for (int x = 0; x < width_col; ++x) {
+                const int src_x = x * stride_w - pad_w + kx;
+                const bool inside = src_y >= 0 && src_y < height && src_x >= 0 && src_x < width;
+                // samples that fall into the padding border are written as zero
+                data_col[(row * height_col + y) * width_col + x] = inside ? data_im[(c_im * height + src_y) * width + src_x] : Dtype(0);
+            }
+        }
+    }
+}
+
+// Inverse of im2col_cpu: adds every entry of data_col back into the channels x height x width image data_im.
+template <typename Dtype>
+void col2im_cpu(const Dtype* data_col, const int channels,
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_im)
+{
+    // data_im is an accumulator and must start zeroed; sizeof comes first so the byte count is computed in size_t, not int.
+    memset(data_im, 0, sizeof(Dtype) * height * width * channels);
+
+    int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+    int channels_col = channels * patch_h * patch_w;
+    for (int c = 0; c < channels_col; ++c)
+    {
+        int w_offset = c % patch_w;
+        int h_offset = (c / patch_w) % patch_h;
+        int c_im = c / patch_h / patch_w;
+        for (int h = 0; h < height_col; ++h)
+        {
+            for (int w = 0; w < width_col; ++w)
+            {
+                int h_pad = h * stride_h - pad_h + h_offset;
+                int w_pad = w * stride_w - pad_w + w_offset;
+                // entries that map into the padding border have no image pixel and are skipped
+                if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+                    data_im[(c_im * height + h_pad) * width + w_pad] +=
+                    data_col[(c * height_col + h) * width_col + w];
+            }
+        }
+    }
+}
+
+}
+}
+
+#endif
\ No newline at end of file