Fixed im2col_ocl bug caused non-zero UMat offset

pull/265/head
Vitaliy Lyudvichenko 10 years ago
parent 23d3ede6ab
commit 7acfda2c11
  1. 7
      modules/dnn/src/dnn.cpp
  2. 13
      modules/dnn/src/layers/convolution_layer.cpp
  3. 3
      modules/dnn/src/layers/convolution_layer.hpp
  4. 15
      modules/dnn/src/layers/im2col.cpp
  5. 2
      modules/dnn/src/layers/im2col.hpp
  6. 18
      modules/dnn/src/opencl/im2col.cl

@ -173,7 +173,7 @@ struct Net::Impl
MapIdToLayerData::iterator it = layers.find(id);
if (it == layers.end())
CV_Error(Error::StsError, "Layer with requested id=" + toString(id) + " not found");
CV_Error(Error::StsObjectNotFound, format("Layer with requested id=%d not found", id));
return it->second;
}
@ -536,5 +536,10 @@ int Net::getLayerId(LayerId)
return -1;
}
void Net::deleteLayer(LayerId)
{
CV_Error(Error::StsNotImplemented, "");
}
}
}

@ -3,6 +3,7 @@
#include "layers_common.hpp"
#include "convolution_layer.hpp"
#include "im2col.hpp"
#include <iostream>
namespace cv
{
@ -28,6 +29,9 @@ namespace dnn
Blob &biasBlob = learnedParams[1];
CV_Assert(biasBlob.total() == (size_t)numOutput);
}
//TBD
useOpenCL = params.has("use_opencl");
}
void ConvolutionLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
@ -108,12 +112,17 @@ namespace dnn
return;
}
if (ocl::useOpenCL() && inpBlob.type() == CV_32F)
if (useOpenCL && ocl::useOpenCL() && inpBlob.type() == CV_32F && !is1x1())
{
UMat src = inpBlob.getMatRef().getUMat(ACCESS_READ);
std::vector<Range> ranges(4, Range::all());
ranges[0] = Range(imNum, imNum+1);
ranges[1] = Range(cnGroup*inpGroupCn, (cnGroup + 1)*inpGroupCn);
UMat src = inpBlob.getMatRef()(&ranges[0]).getUMat(ACCESS_READ);
UMat dst(colMat.size(), colMat.type());
im2col_ocl(src, inpGroupCn, inpH, inpW, kerH, kerW, padH, padW, strideH, strideW, dst);
dst.copyTo(colMat);
return;
}
if (inpBlob.type() == CV_32F)

@ -22,6 +22,7 @@ namespace dnn
int inpGroupCn, outGroupCn;
int ksize;
bool useOpenCL;
Mat colMat, biasOnesMat;
inline bool is1x1() const;
@ -47,4 +48,4 @@ namespace dnn
};
}
}
#endif
#endif

@ -20,22 +20,23 @@ void im2col_ocl(UMat &img,
CV_Assert(img.isContinuous() && col.isContinuous());
CV_Assert(img.total() == (size_t)channels * height * width);
CV_Assert(col.total() == (size_t)h_out * w_out * kernel_h * kernel_w);
CV_Assert(col.total() == (size_t)channels * kernel_h * kernel_w * h_out * w_out);
ocl::Kernel im2col_ker("im2col", ocl::dnn::im2col_oclsrc);
CV_Assert(!im2col_ker.empty());
im2col_ker.args(ocl::KernelArg::PtrReadOnly(img),
im2col_ker.args(ocl::KernelArg::PtrReadOnly(img), (int)img.offset,
channels, height, width,
kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
h_out, w_out,
ocl::KernelArg::PtrWriteOnly(col)
ocl::KernelArg::PtrWriteOnly(col), (int)col.offset
);
size_t globalSize[] = { (size_t)channels * h_out * w_out };
size_t localSize[] = { ocl::Device::getDefault().maxWorkGroupSize() };
size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
size_t globalSize = (size_t)channels * h_out * w_out;
CV_Assert(im2col_ker.run(1, globalSize, localSize, false));
CV_Assert(im2col_ker.run(1, &globalSize, &localSize, true));
}
}
}
}

@ -80,4 +80,4 @@ void im2col_ocl(UMat &img,
}
}
#endif
#endif

@ -1,11 +1,13 @@
__kernel void im2col(__global const float *im_src,
int channels, int height_inp, int width_inp,
__kernel void im2col(__global const float *im_src, int im_src_offset,
int channels, int height_inp, int width_inp,
int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, int stride_w,
int height_out, int width_out,
__global float *im_col
__global float *im_col, int im_col_offset
)
{
int index = get_global_id(0);
if (index >= height_out * width_out * channels)
return;
int j_out = index % width_out;
int i_out = (index / width_out) % height_out;
int c_inp = (index / width_out) / height_out;
@ -14,17 +16,15 @@ __kernel void im2col(__global const float *im_src,
int i_inp = i_out * stride_h - pad_h;
int j_inp = j_out * stride_w - pad_w;
im_col += (c_out * height_out + i_out) * width_out + j_out;
im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp;
im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset / sizeof(float);
im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset / sizeof(float);
for (int ki = 0; ki < kernel_h; ++ki)
for (int kj = 0; kj < kernel_w; ++kj) {
int i = i_inp + ki;
int j = j_inp + kj;
*im_col = (h >= 0 && w >= 0 && h < height_inp && w < width_inp) ?
im_src[i * width_inp + j] : 0;
*im_col = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ?
im_src[ki * width_inp + kj] : 0;
im_col += height_out * width_out;
}
}
}
Loading…
Cancel
Save