dnn(ocl4dnn): refactor pooling OpenCL calls

7 years ago · e34b64c979
parent f071a48ec7
commit e34b64c979
3 changed files with 107 additions and 111 deletions
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@ -351,8 +351,6 @@ class OCL4DNNPool
                     UMat& top_data,
                     UMat& top_mask);
    private:
        UMat mask_idx_;
        // Pooling parameters
        std::vector<int32_t> pad_;
        std::vector<int32_t> stride_;
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
@ -88,7 +88,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
 template<typename Dtype>
 OCL4DNNPool<Dtype>::~OCL4DNNPool()
 {
-    mask_idx_.release();
+    // nothing
 }
 template<typename Dtype>
@ -99,99 +99,103 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
    bool ret = true;
    size_t global[] = { 128 * 128 };
    size_t local[] = { 128 };
    cl_uint argIdx = 0;
    // support 2D case
    switch (pool_method_)
    {
    case LIBDNN_POOLING_METHOD_MAX:
        {
-            if (top_mask.empty() && mask_idx_.empty())
+            bool haveMask = !top_mask.empty();
-            {
+            ocl::Kernel oclk_max_pool_forward(
-                mask_idx_.create(1, count_, CV_32FC1);
+                haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
-            }
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
-            ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"),
+                format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                       " -D STRIDE_W=%d -D STRIDE_H=%d"
                       " -D PAD_W=%d -D PAD_H=%d%s",
                       kernel_w_, kernel_h_,
                       stride_w_, stride_h_,
                       pad_w_, pad_h_,
                       haveMask ? " -D HAVE_MASK=1" : ""
                ));
            if (oclk_max_pool_forward.empty())
                return false;
-            argIdx = 0;
+            oclk_max_pool_forward.args(
-            oclk_max_pool_forward.set(argIdx++, count_);
+                count_,
-            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+                ocl::KernelArg::PtrReadOnly(bottom),
-            oclk_max_pool_forward.set(argIdx++, batch_size_);
+                batch_size_,
-            oclk_max_pool_forward.set(argIdx++, channels_);
+                channels_,
-            oclk_max_pool_forward.set(argIdx++, height_);
+                height_,
-            oclk_max_pool_forward.set(argIdx++, width_);
+                width_,
-            oclk_max_pool_forward.set(argIdx++, pooled_height_);
+                pooled_height_,
-            oclk_max_pool_forward.set(argIdx++, pooled_width_);
+                pooled_width_,
-            oclk_max_pool_forward.set(argIdx++, kernel_h_);
+                ocl::KernelArg::PtrWriteOnly(top),
-            oclk_max_pool_forward.set(argIdx++, kernel_w_);
+                ocl::KernelArg::PtrWriteOnly(top_mask)
-            oclk_max_pool_forward.set(argIdx++, stride_h_);
+            );
            oclk_max_pool_forward.set(argIdx++, stride_w_);
            oclk_max_pool_forward.set(argIdx++, pad_h_);
            oclk_max_pool_forward.set(argIdx++, pad_w_);
            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
            oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1);
            if (mask_idx_.empty())
                oclk_max_pool_forward.set(argIdx++, (void *)NULL);
            else
                oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_));
            oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask));
            ret = oclk_max_pool_forward.run(1, global, local, false);
        }
        break;
    case LIBDNN_POOLING_METHOD_AVE:
        {
            CV_Assert(top_mask.empty());
            ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
                format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
                       " -D STRIDE_W=%d -D STRIDE_H=%d"
                       " -D PAD_W=%d -D PAD_H=%d",
                       kernel_w_, kernel_h_,
                       stride_w_, stride_h_,
                       pad_w_, pad_h_
                ));
            if (oclk_ave_pool_forward.empty())
                return false;
-            argIdx = 0;
+            oclk_ave_pool_forward.args(
-            oclk_ave_pool_forward.set(argIdx++, count_);
+                count_,
-            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+                ocl::KernelArg::PtrReadOnly(bottom),
-            oclk_ave_pool_forward.set(argIdx++, batch_size_);
+                batch_size_,
-            oclk_ave_pool_forward.set(argIdx++, channels_);
+                channels_,
-            oclk_ave_pool_forward.set(argIdx++, height_);
+                height_,
-            oclk_ave_pool_forward.set(argIdx++, width_);
+                width_,
-            oclk_ave_pool_forward.set(argIdx++, pooled_height_);
+                pooled_height_,
-            oclk_ave_pool_forward.set(argIdx++, pooled_width_);
+                pooled_width_,
-            oclk_ave_pool_forward.set(argIdx++, kernel_h_);
+                ocl::KernelArg::PtrWriteOnly(top)
-            oclk_ave_pool_forward.set(argIdx++, kernel_w_);
+            );
            oclk_ave_pool_forward.set(argIdx++, stride_h_);
            oclk_ave_pool_forward.set(argIdx++, stride_w_);
            oclk_ave_pool_forward.set(argIdx++, pad_h_);
            oclk_ave_pool_forward.set(argIdx++, pad_w_);
            oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
            ret = oclk_ave_pool_forward.run(1, global, local, false);
        }
        break;
    case LIBDNN_POOLING_METHOD_STO:
        {
            CV_Assert(top_mask.empty());
            ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
-                                              cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+                ocl::dnn::ocl4dnn_pooling_oclsrc,
                format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
                       " -D STRIDE_W=%d -D STRIDE_H=%d",
                       kernel_w_, kernel_h_,
                       stride_w_, stride_h_
                ));
            if (oclk_sto_pool_forward.empty())
                return false;
-            argIdx = 0;
+            oclk_sto_pool_forward.args(
-            oclk_sto_pool_forward.set(argIdx++, count_);
+                count_,
-            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+                ocl::KernelArg::PtrReadOnly(bottom),
-            oclk_sto_pool_forward.set(argIdx++, batch_size_);
+                batch_size_,
-            oclk_sto_pool_forward.set(argIdx++, channels_);
+                channels_,
-            oclk_sto_pool_forward.set(argIdx++, height_);
+                height_,
-            oclk_sto_pool_forward.set(argIdx++, width_);
+                width_,
-            oclk_sto_pool_forward.set(argIdx++, pooled_height_);
+                pooled_height_,
-            oclk_sto_pool_forward.set(argIdx++, pooled_width_);
+                pooled_width_,
-            oclk_sto_pool_forward.set(argIdx++, kernel_h_);
+                ocl::KernelArg::PtrWriteOnly(top)
-            oclk_sto_pool_forward.set(argIdx++, kernel_w_);
+            );
            oclk_sto_pool_forward.set(argIdx++, stride_h_);
            oclk_sto_pool_forward.set(argIdx++, stride_w_);
            oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
            ret = oclk_sto_pool_forward.run(1, global, local, false);
        }
--- a/modules/dnn/src/opencl/ocl4dnn_pooling.cl
+++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl
@ -44,14 +44,23 @@
 #define TEMPLATE(name,type) CONCAT(name,type)
 #define Dtype float
-void TEMPLATE(max_pool_forward_impl, Dtype)(
+#if defined KERNEL_MAX_POOL
 __kernel void
 #ifdef HAVE_MASK
    TEMPLATE(max_pool_forward_mask, Dtype)
 #else
    TEMPLATE(max_pool_forward, Dtype)
 #endif
 (
    const int nthreads, __global const Dtype* bottom_data, const int num,
    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int pooled_height, const int pooled_width,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    __global Dtype* top_data
-    const int pad_w,
+#ifdef HAVE_MASK
-    __global Dtype* top_data,
+    , __global Dtype* mask
-    const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask)
+#endif
 )
 {
  for (int index = get_global_id(0); index < nthreads;
      index += get_global_size(0))
@ -60,10 +69,10 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * stride_h - pad_h;
+    int hstart = ph * STRIDE_H - PAD_H;
-    int wstart = pw * stride_w - pad_w;
+    int wstart = pw * STRIDE_W - PAD_W;
-    const int hend = min(hstart + kernel_h, height);
+    const int hend = min(hstart + KERNEL_H, height);
-    const int wend = min(wstart + kernel_w, width);
+    const int wend = min(wstart + KERNEL_W, width);
    hstart = max(hstart, (int)0);
    wstart = max(wstart, (int)0);
    Dtype maxval = -FLT_MAX;
@ -79,38 +88,19 @@ void TEMPLATE(max_pool_forward_impl, Dtype)(
      }
    }
    top_data[index] = maxval;
-    if (!no_mask) {
+#ifdef HAVE_MASK
-      if (use_mask == 1) {
+    mask[index] = maxidx;
-        mask[index] = maxidx;
+#endif
      } else {
        top_mask[index] = maxidx;
      }
    }
  }
 }
-__kernel void TEMPLATE(max_pool_forward, Dtype)(
+#elif defined KERNEL_AVE_POOL
    const int nthreads, __global const Dtype* bottom_data, const int num,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w,
    __global Dtype* top_data,
    const int use_mask, __global int* mask, __global Dtype* top_mask)
 {
    TEMPLATE(max_pool_forward_impl, Dtype)(
      nthreads, bottom_data, num, channels, height, width,
      pooled_height, pooled_width, kernel_h,
      kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false
    );
 }
 __kernel void TEMPLATE(ave_pool_forward, Dtype)(
    const int nthreads, __global const Dtype* const bottom_data, const int num,
    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int pooled_height, const int pooled_width,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    __global Dtype* top_data)
    const int pad_w, __global Dtype* top_data)
 {
  for (int index = get_global_id(0); index < nthreads;
      index += get_global_size(0))
@ -120,10 +110,10 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
      const int ph = (index / pooled_width) % pooled_height;
      const int c = (index / pooled_width / pooled_height) % channels;
      const int n = index / pooled_width / pooled_height / channels;
-      int hstart = ph * stride_h - pad_h;
+      int hstart = ph * STRIDE_H - PAD_H;
-      int wstart = pw * stride_w - pad_w;
+      int wstart = pw * STRIDE_W - PAD_W;
-      int hend = min(hstart + kernel_h, height + pad_h);
+      int hend = min(hstart + KERNEL_H, height + PAD_H);
-      int wend = min(wstart + kernel_w, width + pad_w);
+      int wend = min(wstart + KERNEL_W, width + PAD_W);
      const int pool_size = (hend - hstart) * (wend - wstart);
      hstart = max(hstart, (int)0);
      wstart = max(wstart, (int)0);
@ -142,11 +132,12 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
  }
 }
 #elif defined KERNEL_STO_POOL
 __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
    const int nthreads, __global const Dtype* const bottom_data, const int num,
    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int pooled_height, const int pooled_width,
    const int kernel_w, const int stride_h, const int stride_w,
    __global Dtype* top_data)
 {
  for (int index = get_global_id(0); index < nthreads;
@ -156,10 +147,10 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
+    const int hstart = ph * STRIDE_H;
-    const int hend = min(hstart + kernel_h, height);
+    const int hend = min(hstart + KERNEL_H, height);
-    const int wstart = pw * stride_w;
+    const int wstart = pw * STRIDE_W;
-    const int wend = min(wstart + kernel_w, width);
+    const int wend = min(wstart + KERNEL_W, width);
    // We set cumsum to be 0 to avoid divide-by-zero problems
    Dtype cumsum = FLT_MIN;
    Dtype cumvalues = 0.;
@ -168,10 +159,13 @@ __kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
    // First pass: get sum
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
+        Dtype v = bottom_slice[h * width + w];
-        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
+        cumsum += v;
        cumvalues += v * v;
      }
    }
    top_data[index] = cumvalues / cumsum;
  }
 }
 #endif // KERNEL_*