merged 2 kernels

11 years ago · 26b73a7bbd
parent c836613bad
commit 26b73a7bbd
2 changed files with 69 additions and 59 deletions
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@ -3411,76 +3411,76 @@ namespace cv {
 static bool ocl_reduce(InputArray _src, OutputArray _dst,
                       int dim, int op, int op0, int stype, int dtype)
 {
+    const int min_opt_cols = 128, buf_cols = 32;
    int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
            ddepth = CV_MAT_DEPTH(dtype), ddepth0 = ddepth;
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
+            useOptimized = 1 == dim && _src.cols() > min_opt_cols;

    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
        return false;

    if (op == CV_REDUCE_AVG)
    {
-        op = CV_REDUCE_SUM;
        if (sdepth < CV_32S && ddepth < CV_32S)
            ddepth = CV_32S;
    }

    const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
                                  "OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
-    char cvt[40];
-
-    const int min_opt_cols = 128;
-    if ((1 == dim) && (_src.cols() > min_opt_cols))
-    {
-        int buf_cols = 32;
-
-        cv::String build_opt_pre = format("-D BUF_COLS=%d -D %s -D dim=1 -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
-                                 buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
-                                 ocl::convertTypeStr(sdepth, ddepth, 1, cvt),
-                                 doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+    char cvt[2][40];
+
+    int wdepth = std::max(ddepth, CV_32F);
+    cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
+                                  " -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
+                                  " -D convertToDT=%s -D convertToDT0=%s%s",
+                                  ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
+                                  ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
+                                  ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
+                                  ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
+                                  ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
+                                  doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+
+    if (useOptimized)
+    {
+        cv::String build_opt_pre = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D %s -D dim=1"
+                                          "  -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
+                                          buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
+                                          ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
+                                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
        ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre);
        if (kpre.empty())
            return false;

-        cv::String build_opt_main = format("-D %s -D dim=1 -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=noconvert%s",
-                                 ops[op], cn, ddepth, ocl::typeToStr(ddepth), ocl::typeToStr(ddepth),
-                                 doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-        ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt_main);
+        ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt);
        if (kmain.empty())
            return false;

        UMat src = _src.getUMat();
        Size dsize(1, src.rows);
        _dst.create(dsize, dtype);
-        UMat dst = _dst.getUMat(), temp = dst;
+        UMat dst = _dst.getUMat();

-        if (op0 == CV_REDUCE_AVG && sdepth < CV_32S && ddepth0 < CV_32S)
-            temp.create(dsize, CV_32SC(cn));
+        UMat buf(src.rows, buf_cols, dst.type());

-        UMat buf(src.rows, buf_cols, temp.type());
+        kpre.args(ocl::KernelArg::ReadOnly(src),
+                  ocl::KernelArg::WriteOnlyNoSize(buf));

        size_t globalSize[2] = { buf_cols, src.rows };
-
-        kpre.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
        if (!kpre.run(2, globalSize, NULL, false))
            return false;

-        globalSize[0] = src.rows;
-        kmain.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnlyNoSize(temp));
-        if (!kmain.run(1, globalSize, NULL, false))
-            return false;
-
        if (op0 == CV_REDUCE_AVG)
-            temp.convertTo(dst, ddepth0, 1. / src.cols);
+            kmain.args(ocl::KernelArg::ReadOnly(buf),
+                       ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
+        else
+            kmain.args(ocl::KernelArg::ReadOnly(buf),
+                       ocl::KernelArg::WriteOnlyNoSize(dst));

-        return true;
+        globalSize[0] = src.rows;
+        return kmain.run(1, globalSize, NULL, false);
    }

-    cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
-                             ops[op], dim, cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
-                             ocl::convertTypeStr(sdepth, ddepth, 1, cvt),
-                             doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-
    ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
    if (k.empty())
        return false;
@ -3488,22 +3488,18 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
    UMat src = _src.getUMat();
    Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
    _dst.create(dsize, dtype);
-    UMat dst = _dst.getUMat(), temp = dst;
-
-    if (op0 == CV_REDUCE_AVG && sdepth < CV_32S && ddepth0 < CV_32S)
-        temp.create(dsize, CV_32SC(cn));
-
-    size_t globalsize = std::max(dsize.width, dsize.height);
+    UMat dst = _dst.getUMat();

-    k.args(ocl::KernelArg::ReadOnly(src),
-           ocl::KernelArg::WriteOnlyNoSize(temp));
-    if (!k.run(1, &globalsize, NULL, false))
-        return false;
+    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
+            temparg = ocl::KernelArg::WriteOnlyNoSize(dst);

    if (op0 == CV_REDUCE_AVG)
-        temp.convertTo(dst, ddepth0, 1. / (dim == 0 ? src.rows : src.cols));
+        k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
+    else
+        k.args(srcarg, temparg);

-    return true;
+    size_t globalsize = std::max(dsize.width, dsize.height);
+    return k.run(1, &globalsize, NULL, false);
 }

 }
--- a/modules/core/src/opencl/reduce2.cl
+++ b/modules/core/src/opencl/reduce2.cl
@ -76,24 +76,20 @@

 #define noconvert

-#ifdef OCL_CV_REDUCE_SUM
+#if defined OCL_CV_REDUCE_SUM || defined OCL_CV_REDUCE_AVG
 #define INIT_VALUE 0
 #define PROCESS_ELEM(acc, value) acc += value
-#elif defined(OCL_CV_REDUCE_MAX)
+#elif defined OCL_CV_REDUCE_MAX
 #define INIT_VALUE MIN_VAL
 #define PROCESS_ELEM(acc, value) acc = value > acc ? value : acc
-#elif defined(OCL_CV_REDUCE_MIN)
+#elif defined OCL_CV_REDUCE_MIN
 #define INIT_VALUE MAX_VAL
 #define PROCESS_ELEM(acc, value) acc = value < acc ? value : acc
-#elif defined(OCL_CV_REDUCE_AVG)
-#error "This operation should be implemented through OCL_CV_REDUCE_SUM"
 #else
 #error "No operation is specified"
 #endif

-#ifndef BUF_COLS
-#define BUF_COLS  32
-#endif
+#ifdef OP_REDUCE_PRE

 __kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
                     __global uchar * bufptr, int buf_step, int buf_offset)
@ -126,15 +122,23 @@ __kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int s
    }
 }

+#else
+
 __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
-                     __global uchar * dstptr, int dst_step, int dst_offset)
+                     __global uchar * dstptr, int dst_step, int dst_offset
+#ifdef OCL_CV_REDUCE_AVG
+                     , float fscale
+#endif
+                     )
 {
 #if dim == 0 // reduce to a single row
    int x = get_global_id(0);
    if (x < cols)
    {
        int src_index = mad24(x, (int)sizeof(srcT) * cn, src_offset);
-        __global dstT * dst = (__global dstT *)(dstptr + dst_offset) + x * cn;
+        int dst_index = mad24(x, (int)sizeof(dstT0) * cn, dst_offset);
+
+        __global dstT0 * dst = (__global dstT0 *)(dstptr + dst_index);
        dstT tmp[cn] = { INIT_VALUE };

        for (int y = 0; y < rows; ++y, src_index += src_step)
@ -150,7 +154,11 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset

        #pragma unroll
        for (int c = 0; c < cn; ++c)
-            dst[c] = tmp[c];
+#ifdef OCL_CV_REDUCE_AVG
+            dst[c] = convertToDT0(convertToWT(tmp[c]) * fscale);
+#else
+            dst[c] = convertToDT0(tmp[c]);
+#endif
    }
 #elif dim == 1 // reduce to a single column
    int y = get_global_id(0);
@ -175,9 +183,15 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset

        #pragma unroll
        for (int c = 0; c < cn; ++c)
-            dst[c] = tmp[c];
+#ifdef OCL_CV_REDUCE_AVG
+            dst[c] = convertToDT0(convertToWT(tmp[c]) * fscale);
+#else
+            dst[c] = convertToDT0(tmp[c]);
+#endif
    }
 #else
 #error "Dims must be either 0 or 1"
 #endif
 }
+
+#endif