dnn(ocl4dnn): refactor pooling OpenCL calls

pull/10150/head
Alexander Alekhin 7 years ago
parent f071a48ec7
commit e34b64c979
  1. 2
      modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
  2. 128
      modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
  3. 88
      modules/dnn/src/opencl/ocl4dnn_pooling.cl

@ -351,8 +351,6 @@ class OCL4DNNPool
UMat& top_data, UMat& top_data,
UMat& top_mask); UMat& top_mask);
private: private:
UMat mask_idx_;
// Pooling parameters // Pooling parameters
std::vector<int32_t> pad_; std::vector<int32_t> pad_;
std::vector<int32_t> stride_; std::vector<int32_t> stride_;

@ -88,7 +88,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
template<typename Dtype> template<typename Dtype>
OCL4DNNPool<Dtype>::~OCL4DNNPool() OCL4DNNPool<Dtype>::~OCL4DNNPool()
{ {
mask_idx_.release(); // nothing
} }
template<typename Dtype> template<typename Dtype>
@ -99,99 +99,103 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
bool ret = true; bool ret = true;
size_t global[] = { 128 * 128 }; size_t global[] = { 128 * 128 };
size_t local[] = { 128 }; size_t local[] = { 128 };
cl_uint argIdx = 0;
// support 2D case // support 2D case
switch (pool_method_) switch (pool_method_)
{ {
case LIBDNN_POOLING_METHOD_MAX: case LIBDNN_POOLING_METHOD_MAX:
{ {
if (top_mask.empty() && mask_idx_.empty()) bool haveMask = !top_mask.empty();
{ ocl::Kernel oclk_max_pool_forward(
mask_idx_.create(1, count_, CV_32FC1); haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
} ocl::dnn::ocl4dnn_pooling_oclsrc,
ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"), format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
cv::ocl::dnn::ocl4dnn_pooling_oclsrc); " -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d%s",
kernel_w_, kernel_h_,
stride_w_, stride_h_,
pad_w_, pad_h_,
haveMask ? " -D HAVE_MASK=1" : ""
));
if (oclk_max_pool_forward.empty()) if (oclk_max_pool_forward.empty())
return false; return false;
argIdx = 0; oclk_max_pool_forward.args(
oclk_max_pool_forward.set(argIdx++, count_); count_,
oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); ocl::KernelArg::PtrReadOnly(bottom),
oclk_max_pool_forward.set(argIdx++, batch_size_); batch_size_,
oclk_max_pool_forward.set(argIdx++, channels_); channels_,
oclk_max_pool_forward.set(argIdx++, height_); height_,
oclk_max_pool_forward.set(argIdx++, width_); width_,
oclk_max_pool_forward.set(argIdx++, pooled_height_); pooled_height_,
oclk_max_pool_forward.set(argIdx++, pooled_width_); pooled_width_,
oclk_max_pool_forward.set(argIdx++, kernel_h_); ocl::KernelArg::PtrWriteOnly(top),
oclk_max_pool_forward.set(argIdx++, kernel_w_); ocl::KernelArg::PtrWriteOnly(top_mask)
oclk_max_pool_forward.set(argIdx++, stride_h_); );
oclk_max_pool_forward.set(argIdx++, stride_w_);
oclk_max_pool_forward.set(argIdx++, pad_h_);
oclk_max_pool_forward.set(argIdx++, pad_w_);
oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1);
if (mask_idx_.empty())
oclk_max_pool_forward.set(argIdx++, (void *)NULL);
else
oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_));
oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask));
ret = oclk_max_pool_forward.run(1, global, local, false); ret = oclk_max_pool_forward.run(1, global, local, false);
} }
break; break;
case LIBDNN_POOLING_METHOD_AVE: case LIBDNN_POOLING_METHOD_AVE:
{ {
CV_Assert(top_mask.empty());
ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"), ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
cv::ocl::dnn::ocl4dnn_pooling_oclsrc); ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d"
" -D PAD_W=%d -D PAD_H=%d",
kernel_w_, kernel_h_,
stride_w_, stride_h_,
pad_w_, pad_h_
));
if (oclk_ave_pool_forward.empty()) if (oclk_ave_pool_forward.empty())
return false; return false;
argIdx = 0; oclk_ave_pool_forward.args(
oclk_ave_pool_forward.set(argIdx++, count_); count_,
oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); ocl::KernelArg::PtrReadOnly(bottom),
oclk_ave_pool_forward.set(argIdx++, batch_size_); batch_size_,
oclk_ave_pool_forward.set(argIdx++, channels_); channels_,
oclk_ave_pool_forward.set(argIdx++, height_); height_,
oclk_ave_pool_forward.set(argIdx++, width_); width_,
oclk_ave_pool_forward.set(argIdx++, pooled_height_); pooled_height_,
oclk_ave_pool_forward.set(argIdx++, pooled_width_); pooled_width_,
oclk_ave_pool_forward.set(argIdx++, kernel_h_); ocl::KernelArg::PtrWriteOnly(top)
oclk_ave_pool_forward.set(argIdx++, kernel_w_); );
oclk_ave_pool_forward.set(argIdx++, stride_h_);
oclk_ave_pool_forward.set(argIdx++, stride_w_);
oclk_ave_pool_forward.set(argIdx++, pad_h_);
oclk_ave_pool_forward.set(argIdx++, pad_w_);
oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
ret = oclk_ave_pool_forward.run(1, global, local, false); ret = oclk_ave_pool_forward.run(1, global, local, false);
} }
break; break;
case LIBDNN_POOLING_METHOD_STO: case LIBDNN_POOLING_METHOD_STO:
{ {
CV_Assert(top_mask.empty());
ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"), ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
cv::ocl::dnn::ocl4dnn_pooling_oclsrc); ocl::dnn::ocl4dnn_pooling_oclsrc,
format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
" -D STRIDE_W=%d -D STRIDE_H=%d",
kernel_w_, kernel_h_,
stride_w_, stride_h_
));
if (oclk_sto_pool_forward.empty()) if (oclk_sto_pool_forward.empty())
return false; return false;
argIdx = 0; oclk_sto_pool_forward.args(
oclk_sto_pool_forward.set(argIdx++, count_); count_,
oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); ocl::KernelArg::PtrReadOnly(bottom),
oclk_sto_pool_forward.set(argIdx++, batch_size_); batch_size_,
oclk_sto_pool_forward.set(argIdx++, channels_); channels_,
oclk_sto_pool_forward.set(argIdx++, height_); height_,
oclk_sto_pool_forward.set(argIdx++, width_); width_,
oclk_sto_pool_forward.set(argIdx++, pooled_height_); pooled_height_,
oclk_sto_pool_forward.set(argIdx++, pooled_width_); pooled_width_,
oclk_sto_pool_forward.set(argIdx++, kernel_h_); ocl::KernelArg::PtrWriteOnly(top)
oclk_sto_pool_forward.set(argIdx++, kernel_w_); );
oclk_sto_pool_forward.set(argIdx++, stride_h_);
oclk_sto_pool_forward.set(argIdx++, stride_w_);
oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
ret = oclk_sto_pool_forward.run(1, global, local, false); ret = oclk_sto_pool_forward.run(1, global, local, false);
} }

@ -44,14 +44,23 @@
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float #define Dtype float
void TEMPLATE(max_pool_forward_impl, Dtype)( #if defined KERNEL_MAX_POOL
__kernel void
#ifdef HAVE_MASK
TEMPLATE(max_pool_forward_mask, Dtype)
#else
TEMPLATE(max_pool_forward, Dtype)
#endif
(
const int nthreads, __global const Dtype* bottom_data, const int num, const int nthreads, __global const Dtype* bottom_data, const int num,
const int channels, const int height, const int width, const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, const int kernel_h, const int pooled_height, const int pooled_width,
const int kernel_w, const int stride_h, const int stride_w, const int pad_h, __global Dtype* top_data
const int pad_w, #ifdef HAVE_MASK
__global Dtype* top_data, , __global Dtype* mask
const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask) #endif
)
{ {
for (int index = get_global_id(0); index < nthreads; for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0)) index += get_global_size(0))
@ -60,10 +69,10 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
const int ph = (index / pooled_width) % pooled_height; const int ph = (index / pooled_width) % pooled_height;
const int c = (index / pooled_width / pooled_height) % channels; const int c = (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels; const int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_h; int hstart = ph * STRIDE_H - PAD_H;
int wstart = pw * stride_w - pad_w; int wstart = pw * STRIDE_W - PAD_W;
const int hend = min(hstart + kernel_h, height); const int hend = min(hstart + KERNEL_H, height);
const int wend = min(wstart + kernel_w, width); const int wend = min(wstart + KERNEL_W, width);
hstart = max(hstart, (int)0); hstart = max(hstart, (int)0);
wstart = max(wstart, (int)0); wstart = max(wstart, (int)0);
Dtype maxval = -FLT_MAX; Dtype maxval = -FLT_MAX;
@ -79,38 +88,19 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
} }
} }
top_data[index] = maxval; top_data[index] = maxval;
if (!no_mask) { #ifdef HAVE_MASK
if (use_mask == 1) { mask[index] = maxidx;
mask[index] = maxidx; #endif
} else {
top_mask[index] = maxidx;
}
}
} }
} }
__kernel void TEMPLATE(max_pool_forward, Dtype)( #elif defined KERNEL_AVE_POOL
const int nthreads, __global const Dtype* bottom_data, const int num,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
const int pad_w,
__global Dtype* top_data,
const int use_mask, __global int* mask, __global Dtype* top_mask)
{
TEMPLATE(max_pool_forward_impl, Dtype)(
nthreads, bottom_data, num, channels, height, width,
pooled_height, pooled_width, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false
);
}
__kernel void TEMPLATE(ave_pool_forward, Dtype)( __kernel void TEMPLATE(ave_pool_forward, Dtype)(
const int nthreads, __global const Dtype* const bottom_data, const int num, const int nthreads, __global const Dtype* const bottom_data, const int num,
const int channels, const int height, const int width, const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, const int kernel_h, const int pooled_height, const int pooled_width,
const int kernel_w, const int stride_h, const int stride_w, const int pad_h, __global Dtype* top_data)
const int pad_w, __global Dtype* top_data)
{ {
for (int index = get_global_id(0); index < nthreads; for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0)) index += get_global_size(0))
@ -120,10 +110,10 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
const int ph = (index / pooled_width) % pooled_height; const int ph = (index / pooled_width) % pooled_height;
const int c = (index / pooled_width / pooled_height) % channels; const int c = (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels; const int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_h; int hstart = ph * STRIDE_H - PAD_H;
int wstart = pw * stride_w - pad_w; int wstart = pw * STRIDE_W - PAD_W;
int hend = min(hstart + kernel_h, height + pad_h); int hend = min(hstart + KERNEL_H, height + PAD_H);
int wend = min(wstart + kernel_w, width + pad_w); int wend = min(wstart + KERNEL_W, width + PAD_W);
const int pool_size = (hend - hstart) * (wend - wstart); const int pool_size = (hend - hstart) * (wend - wstart);
hstart = max(hstart, (int)0); hstart = max(hstart, (int)0);
wstart = max(wstart, (int)0); wstart = max(wstart, (int)0);
@ -142,11 +132,12 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
} }
} }
#elif defined KERNEL_STO_POOL
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)( __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
const int nthreads, __global const Dtype* const bottom_data, const int num, const int nthreads, __global const Dtype* const bottom_data, const int num,
const int channels, const int height, const int width, const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, const int kernel_h, const int pooled_height, const int pooled_width,
const int kernel_w, const int stride_h, const int stride_w,
__global Dtype* top_data) __global Dtype* top_data)
{ {
for (int index = get_global_id(0); index < nthreads; for (int index = get_global_id(0); index < nthreads;
@ -156,10 +147,10 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
const int ph = (index / pooled_width) % pooled_height; const int ph = (index / pooled_width) % pooled_height;
const int c = (index / pooled_width / pooled_height) % channels; const int c = (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels; const int n = index / pooled_width / pooled_height / channels;
const int hstart = ph * stride_h; const int hstart = ph * STRIDE_H;
const int hend = min(hstart + kernel_h, height); const int hend = min(hstart + KERNEL_H, height);
const int wstart = pw * stride_w; const int wstart = pw * STRIDE_W;
const int wend = min(wstart + kernel_w, width); const int wend = min(wstart + KERNEL_W, width);
// We set cumsum to be 0 to avoid divide-by-zero problems // We set cumsum to be 0 to avoid divide-by-zero problems
Dtype cumsum = FLT_MIN; Dtype cumsum = FLT_MIN;
Dtype cumvalues = 0.; Dtype cumvalues = 0.;
@ -168,10 +159,13 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
// First pass: get sum // First pass: get sum
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
cumsum += bottom_slice[h * width + w]; Dtype v = bottom_slice[h * width + w];
cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; cumsum += v;
cumvalues += v * v;
} }
} }
top_data[index] = cumvalues / cumsum; top_data[index] = cumvalues / cumsum;
} }
} }
#endif // KERNEL_*

Loading…
Cancel
Save