Fixed im2col_ocl bug caused non-zero UMat offset

10 years ago · 7acfda2c11
parent 23d3ede6ab
commit 7acfda2c11
6 changed files with 37 additions and 21 deletions
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@ -173,7 +173,7 @@ struct Net::Impl
        MapIdToLayerData::iterator it = layers.find(id);

        if (it == layers.end())
-            CV_Error(Error::StsError, "Layer with requested id=" + toString(id) + " not found");
+            CV_Error(Error::StsObjectNotFound, format("Layer with requested id=%d not found", id));

        return it->second;
    }
@ -536,5 +536,10 @@ int Net::getLayerId(LayerId)
    return -1;
 }

+void Net::deleteLayer(LayerId)
+{
+    CV_Error(Error::StsNotImplemented, "");
+}
+
 }
 }
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@ -3,6 +3,7 @@
 #include "layers_common.hpp"
 #include "convolution_layer.hpp"
 #include "im2col.hpp"
+#include <iostream>

 namespace cv
 {
@ -28,6 +29,9 @@ namespace dnn
            Blob &biasBlob = learnedParams[1];
            CV_Assert(biasBlob.total() == (size_t)numOutput);
        }
+
+        //TBD
+        useOpenCL = params.has("use_opencl");
    }

    void ConvolutionLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
@ -108,12 +112,17 @@ namespace dnn
            return;
        }

-        if (ocl::useOpenCL() && inpBlob.type() == CV_32F)
+        if (useOpenCL && ocl::useOpenCL() && inpBlob.type() == CV_32F && !is1x1())
        {
-            UMat src = inpBlob.getMatRef().getUMat(ACCESS_READ);
+            std::vector<Range> ranges(4, Range::all());
+            ranges[0] = Range(imNum, imNum+1);
+            ranges[1] = Range(cnGroup*inpGroupCn, (cnGroup + 1)*inpGroupCn);
+
+            UMat src = inpBlob.getMatRef()(&ranges[0]).getUMat(ACCESS_READ);
            UMat dst(colMat.size(), colMat.type());
            im2col_ocl(src, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dst);
            dst.copyTo(colMat);
+            return;
        }

        if (inpBlob.type() == CV_32F)
--- a/modules/dnn/src/layers/convolution_layer.hpp
+++ b/modules/dnn/src/layers/convolution_layer.hpp
@ -22,6 +22,7 @@ namespace dnn
        int inpGroupCn, outGroupCn;
        int ksize;

+        bool useOpenCL;
        Mat colMat, biasOnesMat;

        inline bool is1x1() const;
@ -47,4 +48,4 @@ namespace dnn
    };
 }
 }
-#endif
+#endif
--- a/modules/dnn/src/layers/im2col.cpp
+++ b/modules/dnn/src/layers/im2col.cpp
@ -20,22 +20,23 @@ void im2col_ocl(UMat &img,

    CV_Assert(img.isContinuous() && col.isContinuous());
    CV_Assert(img.total() == (size_t)channels * height * width);
-    CV_Assert(col.total() == (size_t)h_out * w_out * kernel_h * kernel_w);
+    CV_Assert(col.total() == (size_t)channels * kernel_h * kernel_w * h_out * w_out);

    ocl::Kernel im2col_ker("im2col", ocl::dnn::im2col_oclsrc);
+    CV_Assert(!im2col_ker.empty());

-    im2col_ker.args(ocl::KernelArg::PtrReadOnly(img), 
+    im2col_ker.args(ocl::KernelArg::PtrReadOnly(img), (int)img.offset,
             channels, height, width,
             kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
             h_out, w_out,
-             ocl::KernelArg::PtrWriteOnly(col)
+             ocl::KernelArg::PtrWriteOnly(col), (int)col.offset
        );

-    size_t globalSize[] = { (size_t)channels * h_out * w_out };
-    size_t localSize[] = { ocl::Device::getDefault().maxWorkGroupSize() };
+    size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
+    size_t globalSize = (size_t)channels * h_out * w_out;

-    CV_Assert(im2col_ker.run(1, globalSize, localSize, false));
+    CV_Assert(im2col_ker.run(1, &globalSize, &localSize, true));
 }

 }
-}
+}
--- a/modules/dnn/src/layers/im2col.hpp
+++ b/modules/dnn/src/layers/im2col.hpp
@ -80,4 +80,4 @@ void im2col_ocl(UMat &img,
 }
 }

-#endif
+#endif
--- a/modules/dnn/src/opencl/im2col.cl
+++ b/modules/dnn/src/opencl/im2col.cl
@ -1,11 +1,13 @@
-__kernel void im2col(__global const float *im_src,
-                     int channels, int height_inp, int width_inp, 
+__kernel void im2col(__global const float *im_src, int im_src_offset,
+                     int channels, int height_inp, int width_inp,
                     int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, int stride_w,
                     int height_out, int width_out,
-                     __global float *im_col
+                     __global float *im_col, int im_col_offset
                    )
 {
    int index = get_global_id(0);
+    if (index >= height_out * width_out * channels)
+        return;
    int j_out = index % width_out;
    int i_out = (index / width_out) % height_out;
    int c_inp = (index / width_out) / height_out;
@ -14,17 +16,15 @@ __kernel void im2col(__global const float *im_src,
    int i_inp = i_out * stride_h - pad_h;
    int j_inp = j_out * stride_w - pad_w;

-    im_col += (c_out * height_out + i_out) * width_out + j_out;
-    im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp;
+    im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset / sizeof(float);
+    im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset / sizeof(float);

    for (int ki = 0; ki < kernel_h; ++ki)
        for (int kj = 0; kj < kernel_w; ++kj) {
            int i = i_inp + ki;
            int j = j_inp + kj;
-            *im_col = (h >= 0 && w >= 0 && h < height_inp && w < width_inp) ?
-                im_src[i * width_inp + j] : 0;
+            *im_col = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ?
+                im_src[ki * width_inp + kj] : 0;
            im_col += height_out * width_out;
      }
 }
-
-}