diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 72b7b9fb0..f8468ac43 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -1,5 +1,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "im2col.hpp" namespace cv { @@ -8,19 +9,24 @@ namespace dnn //TODO: simultaneously convolution and bias addition for cache optimization class ConvolutionLayer : public Layer { + protected: bool bias; int numOutput, group; int padH, padW; + int kerH, kerW; int strideH, strideW; - int kernelH, kernelW; - int inH, inW, inCn, kerSize; - int outH, outW; - int groupCn, groupCnOut; + int inpH, inpW, inpCn; + int outH, outW, outCn; + int topH, topW, topCn; //switched between inp/out on deconv/conv + int inpGroupCn, outGroupCn; + int ksize; - Mat srcColsMat, biasOnesMat; + Mat colMat, biasOnesMat; - void computeOutputShape(int inH, int inW); + inline bool is1x1() const; + virtual void computeInpOutShape(const Blob &inpBlob); + void im2col(Blob &inpBlob, int imNum, int cnGroup); public: ConvolutionLayer(LayerParams &params); @@ -28,13 +34,25 @@ namespace dnn void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs); }; + class DeConvolutionLayer : public ConvolutionLayer + { + protected: + void computeInpOutShape(const Blob &inpBlob); + void col2im(Mat &dstMat); + + public: + DeConvolutionLayer(LayerParams &params) : ConvolutionLayer(params) {} + void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs); + }; + REGISTER_LAYER_CLASS(Convolution, ConvolutionLayer) + REGISTER_LAYER_CLASS(Deconvolution, DeConvolutionLayer) ConvolutionLayer::ConvolutionLayer(LayerParams &params) { - getKernelParams(params, kernelH, kernelW, padH, padW, strideH, strideW); + getKernelParams(params, kerH, kerW, padH, padW, strideH, strideW); numOutput = params.get<int>("num_output"); bias = params.get<bool>("bias_term", true); @@ -44,8 +62,8 @@ namespace dnn CV_Assert(params.learnedBlobs.size() >= 1 && (!bias 
|| params.learnedBlobs.size() >= 2)); learnedParams.assign(params.learnedBlobs.begin(), params.learnedBlobs.begin() + (bias ? 2 : 1)); - Blob &weightBlob = learnedParams[0]; - CV_Assert(weightBlob.cols() == kernelW && weightBlob.rows() == kernelH && weightBlob.num() == numOutput); + const Blob &wgtBlob = learnedParams[0]; + CV_Assert(wgtBlob.dims() == 4 && wgtBlob.cols() == kerW && wgtBlob.rows() == kerH && wgtBlob.num() == numOutput); if (bias) { @@ -58,92 +76,141 @@ namespace dnn { CV_Assert(inputs.size() > 0); - Blob &weightBlob = learnedParams[0]; + const Blob &inpBlob = *inputs[0]; + CV_Assert(inpBlob.dims() == 4 && inpBlob.type() == CV_32F); + computeInpOutShape(inpBlob); - inCn = inputs[0]->channels(); - CV_Assert(inCn % group == 0 && numOutput % group == 0 && weightBlob.channels() == inCn/group); - groupCnOut = numOutput / group; - groupCn = inCn / group; + CV_Assert(inpCn % group == 0 && outCn % group == 0); + CV_Assert(learnedParams[0].channels() == inpCn / group); + CV_Assert(learnedParams[0].num() == outCn); - inH = inputs[0]->rows(); - inW = inputs[0]->cols(); - computeOutputShape(inH, inW); + outGroupCn = outCn / group; + inpGroupCn = inpCn / group; + ksize = inpGroupCn * kerH * kerW; outputs.resize(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { - CV_Assert(inputs[i]->rows() == inH && inputs[i]->cols() == inW && inputs[i]->channels() == inCn); - outputs[i].create(BlobShape(inputs[i]->num(), numOutput, outH, outW)); + CV_Assert(inputs[i]->type() == inpBlob.type()); + CV_Assert(inputs[i]->dims() == 4 && inputs[i]->channels() == inpBlob.channels()); + CV_Assert(inputs[i]->rows() == inpBlob.rows() && inputs[i]->cols() == inpBlob.cols()); + + outputs[i].create(BlobShape(inputs[i]->num(), topCn, topH, topW)); } - kerSize = kernelH * kernelW * groupCn; - srcColsMat.create(kerSize, outH * outW, CV_32F); + if (!is1x1()) + colMat.create(ksize, outH * outW, inpBlob.type()); if (bias) - { - biasOnesMat = Mat::ones(1, outH * outW, CV_32F); - } + 
biasOnesMat = Mat::ones(1, topH * topW, inpBlob.type()); } - template <typename Dtype> - void im2col_cpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col) + inline bool ConvolutionLayer::is1x1() const { - int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; - int channels_col = channels * kernel_h * kernel_w; - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % kernel_w; - int h_offset = (c / kernel_w) % kernel_h; - int c_im = c / kernel_h / kernel_w; - for (int h = 0; h < height_col; ++h) { - for (int w = 0; w < width_col; ++w) { - int h_pad = h * stride_h - pad_h + h_offset; - int w_pad = w * stride_w - pad_w + w_offset; - if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_col[(c * height_col + h) * width_col + w] = - data_im[(c_im * height + h_pad) * width + w_pad]; - else - data_col[(c * height_col + h) * width_col + w] = 0; + return (kerH == 1 && kerW == 1); + } + + void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs) + { + Blob &wgtBlob = learnedParams[0]; + + for (size_t ii = 0; ii < outputs.size(); ii++) + { + Blob &inpBlob = *inputs[ii]; + Blob &outBlob = outputs[ii]; + + for (int n = 0; n < inpBlob.num(); n++) + { + for (int g = 0; g < group; g++) + { + im2col(inpBlob, n, g); + + Mat kerMat(outGroupCn, ksize, wgtBlob.type(), wgtBlob.ptrRaw(g*outGroupCn)); + Mat dstMat(outGroupCn, outH*outW, outBlob.type(), outBlob.ptrRaw(n, g*outGroupCn)); + + cv::gemm(kerMat, colMat, 1, noArray(), 0, dstMat); + + if (bias) + { + float *biasPtr = learnedParams[1].ptrf() + g*outGroupCn; + Mat biasMat(outGroupCn, 1, CV_32F, biasPtr); + cv::gemm(biasMat, biasOnesMat, 1, dstMat, 1, dstMat); + } } } } } - void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs) + void 
ConvolutionLayer::im2col(Blob &inpBlob, int imNum, int cnGroup) + { + uchar *srcPtr = inpBlob.ptrRaw(imNum, cnGroup*inpGroupCn); + + if (is1x1()) + { + colMat = Mat(ksize, inpBlob.rows()*inpBlob.cols(), inpBlob.type(), srcPtr); + return; + } + + if (inpBlob.type() == CV_32F) + im2col_cpu((float *)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float *)colMat.ptr()); + if (inpBlob.type() == CV_64F) + im2col_cpu((double*)srcPtr, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)colMat.ptr()); + } + + void ConvolutionLayer::computeInpOutShape(const Blob &inpBlob) { - CV_Assert(inputs.size() == outputs.size()); + inpH = inpBlob.rows(); + inpW = inpBlob.cols(); + inpCn = inpBlob.channels(); - float *srcColPtr = srcColsMat.ptr<float>(); + outH = (inpH + 2 * padH - kerH) / strideH + 1; + outW = (inpW + 2 * padW - kerW) / strideW + 1; + outCn = learnedParams[0].num(); + + topH = outH; topW = outW; topCn = outCn; + } + + void DeConvolutionLayer::computeInpOutShape(const Blob &inpBlob) + { + outH = inpBlob.rows(); + outW = inpBlob.cols(); + outCn = inpBlob.channels(); + + inpH = strideH * (outH - 1) + kerH - 2 * padH; + inpW = strideW * (outW - 1) + kerW - 2 * padW; + inpCn = learnedParams[0].channels(); + + topH = inpH; topW = inpW; topCn = inpCn; + } + + void DeConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs) + { + Blob &wghtBlob = learnedParams[0]; for (size_t ii = 0; ii < outputs.size(); ii++) { - Blob &input = *inputs[ii]; - Blob &output = outputs[ii]; - int num = input.num(); + Blob &convBlob = *inputs[ii]; + Blob &decnBlob = outputs[ii]; - for (int n = 0; n < num; n++) + for (int n = 0; n < convBlob.num(); n++) { for (int g = 0; g < group; g++) { - float *srcPtr = input.ptrf(n, g*groupCn); - im2col_cpu(srcPtr, groupCn, inH, inW, kernelH, kernelW, padH, padW, strideH, strideW, srcColPtr); + Mat dstMat(inpGroupCn, inpH*inpW, decnBlob.type(), decnBlob.ptrRaw(n, g*inpGroupCn)); + + if (is1x1()) + colMat = 
dstMat; - float *kerPtr = learnedParams[0].ptrf(g*groupCnOut); - float *dstPtr = output.ptrf(n, g*groupCnOut); + Mat convMat(outGroupCn, outH*outW, convBlob.type(), convBlob.ptrRaw(n, g*inpGroupCn)); + Mat wghtMat(outGroupCn, ksize, wghtBlob.type(), wghtBlob.ptrRaw(g*inpGroupCn)); + cv::gemm(wghtMat, convMat, 1, noArray(), 0, colMat, GEMM_1_T); - Mat kerMat(groupCnOut, kerSize, CV_32F, kerPtr); - Mat dstMat(groupCnOut, outH*outW, CV_32F, dstPtr); - - cv::gemm(kerMat, srcColsMat, 1, noArray(), 0, dstMat); + col2im(dstMat); if (bias) { - float *biasPtr = learnedParams[1].ptrf() + g*groupCnOut; - Mat biasMat(groupCnOut, 1, CV_32F, biasPtr); + float *biasPtr = learnedParams[1].ptrf() + g*outGroupCn; + Mat biasMat(outGroupCn, 1, CV_32F, biasPtr); cv::gemm(biasMat, biasOnesMat, 1, dstMat, 1, dstMat); } } @@ -151,10 +218,14 @@ namespace dnn } } - void ConvolutionLayer::computeOutputShape(int inH, int inW) + void DeConvolutionLayer::col2im(Mat &dstMat) { - outH = (inH + 2 * padH - kernelH) / strideH + 1; - outW = (inW + 2 * padW - kernelW) / strideW + 1; + if (is1x1()) return; + + if (dstMat.type() == CV_32F) + col2im_cpu((float*)colMat.ptr(), inpCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (float*)dstMat.ptr()); + if (dstMat.type() == CV_64F) + col2im_cpu((double*)colMat.ptr(), inpCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, (double*)dstMat.ptr()); } } } diff --git a/modules/dnn/src/layers/im2col.hpp b/modules/dnn/src/layers/im2col.hpp new file mode 100644 index 000000000..b356a0550 --- /dev/null +++ b/modules/dnn/src/layers/im2col.hpp @@ -0,0 +1,74 @@ +#ifndef __OPENCV_DNN_LAYERS_IM2COL_HPP__ +#define __OPENCV_DNN_LAYERS_IM2COL_HPP__ + +namespace cv +{ +namespace dnn +{ + +template <typename Dtype> +void im2col_cpu(const Dtype* data_im, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_col) +{ + int height_col = (height + 2 * 
pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int channels_col = channels * kernel_h * kernel_w; + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % kernel_w; + int h_offset = (c / kernel_w) % kernel_h; + int c_im = c / kernel_h / kernel_w; + for (int h = 0; h < height_col; ++h) { + for (int w = 0; w < width_col; ++w) { + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_col[(c * height_col + h) * width_col + w] = + data_im[(c_im * height + h_pad) * width + w_pad]; + else + data_col[(c * height_col + h) * width_col + w] = 0; + } + } + } +} + +template <typename Dtype> +void col2im_cpu(const Dtype* data_col, const int channels, + const int height, const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Dtype* data_im) +{ + memset(data_im, 0, height * width * channels * sizeof(Dtype)); + + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int channels_col = channels * patch_h * patch_w; + + for (int c = 0; c < channels_col; ++c) + { + int w_offset = c % patch_w; + int h_offset = (c / patch_w) % patch_h; + int c_im = c / patch_h / patch_w; + + for (int h = 0; h < height_col; ++h) + { + for (int w = 0; w < width_col; ++w) + { + int h_pad = h * stride_h - pad_h + h_offset; + int w_pad = w * stride_w - pad_w + w_offset; + + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_im[(c_im * height + h_pad) * width + w_pad] += + data_col[(c * height_col + h) * width_col + w]; + } + } + } +} + +} +} + +#endif \ No newline at end of file