Merge pull request #1878 from ilya-lavrenov:ocl_resize_area_master

11 years ago · daefe6983f
parent d83094b412 09795e3265
commit daefe6983f
8 changed files with 260 additions and 38 deletions
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@ -250,8 +250,12 @@ public:
    KernelArg();

    static KernelArg Local() { return KernelArg(LOCAL, 0); }
-    static KernelArg PtrOnly(const UMat & m)
-    { return KernelArg(PTR_ONLY, (UMat*)&m); }
+    static KernelArg PtrWriteOnly(const UMat& m)
+    { return KernelArg(PTR_ONLY+WRITE_ONLY, (UMat*)&m); }
+    static KernelArg PtrReadOnly(const UMat& m)
+    { return KernelArg(PTR_ONLY+READ_ONLY, (UMat*)&m); }
+    static KernelArg PtrReadWrite(const UMat& m)
+    { return KernelArg(PTR_ONLY+READ_WRITE, (UMat*)&m); }
    static KernelArg ReadWrite(const UMat& m, int wscale=1)
    { return KernelArg(READ_WRITE, (UMat*)&m, wscale); }
    static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1)
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -2197,10 +2197,10 @@ int Kernel::set(int i, const UMat& m)
 int Kernel::set(int i, const KernelArg& arg)
 {
    CV_Assert( i >= 0 );
-    if( i == 0 )
-        p->cleanupUMats();
    if( !p || !p->handle )
        return -1;
+    if( i == 0 )
+        p->cleanupUMats();
    if( arg.m )
    {
        int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
@ -2222,7 +2222,7 @@ int Kernel::set(int i, const KernelArg& arg)
            {
                int cols = u2d.cols*arg.wscale;
                clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
-                clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.cols), &cols);
+                clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
                i += 2;
            }
        }
@ -2256,10 +2256,17 @@ bool Kernel::run(int dims, size_t globalsize[], size_t localsize[],
 {
    if(!p || !p->handle || p->e != 0)
        return false;
+
+    AutoBuffer<size_t> _globalSize(dims);
+    size_t * globalSizePtr = (size_t *)_globalSize;
+    for (int i = 0; i < dims; ++i)
+        globalSizePtr[i] = localsize == NULL ? globalsize[i] :
+                                               ((globalsize[i] + localsize[i] - 1) / localsize[i]) * localsize[i];
+
    cl_command_queue qq = getQueue(q);
    size_t offset[CV_MAX_DIM] = {0};
    cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
-                                           offset, globalsize, localsize, 0, 0,
+                                           offset, globalSizePtr, localsize, 0, 0,
                                           sync ? 0 : &p->e);
    if( sync || retval < 0 )
    {
@ -2350,6 +2357,7 @@ struct Program::Impl
            void** deviceList = deviceListBuf;
            for( i = 0; i < n; i++ )
                deviceList[i] = ctx.device(i).ptr();
+
            retval = clBuildProgram(handle, n,
                                    (const cl_device_id*)deviceList,
                                    buildflags.c_str(), 0, 0);
--- a/modules/core/test/test_umat.cpp
+++ b/modules/core/test/test_umat.cpp
@ -107,8 +107,8 @@ bool CV_UMatTest::TestUMat()

        ra += Scalar::all(1.f);
        {
-        Mat temp = ura.getMat(ACCESS_RW);
-        temp += Scalar::all(1.f);
+            Mat temp = ura.getMat(ACCESS_RW);
+            temp += Scalar::all(1.f);
        }
        ra.copyTo(rb);
        CHECK_DIFF(ra, rb);
@ -146,8 +146,8 @@ bool CV_UMatTest::TestUMat()
        CHECK_DIFF(rc0, rc);

        {
-        UMat tmp = rc0.getUMat(ACCESS_WRITE);
-        cv::max(ura, urb, tmp);
+            UMat tmp = rc0.getUMat(ACCESS_WRITE);
+            cv::max(ura, urb, tmp);
        }
        CHECK_DIFF(rc0, rc);

--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@ -2875,7 +2875,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )

        k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc,
                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
-        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrOnly(c));
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
        return k.run(2, globalsize, 0, false);
    }
    case COLOR_XYZ2BGR: case COLOR_XYZ2RGB:
@ -2924,7 +2924,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )

        k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc,
                 format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx));
-        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrOnly(c));
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
        return k.run(2, globalsize, 0, false);
    }
    case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
@ -2980,8 +2980,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
                                                                      depth, hrange, bidx, scn));

            k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst),
-                   ocl::KernelArg::PtrOnly(sdiv_data), hrange == 256 ? ocl::KernelArg::PtrOnly(hdiv_data256) :
-                                                                       ocl::KernelArg::PtrOnly(hdiv_data180));
+                   ocl::KernelArg::PtrReadOnly(sdiv_data), hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) :
+                                                                       ocl::KernelArg::PtrReadOnly(hdiv_data180));

            return k.run(2, globalsize, NULL, false);
        }
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -48,8 +48,6 @@

 #include "precomp.hpp"
 #include "opencl_kernels.hpp"
-#include <iostream>
-#include <vector>

 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 static IppStatus sts = ippInit();
@ -1902,18 +1900,72 @@ private:
 };
 #endif

+static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
+                                          float * const alpha_tab, int * const ofs_tab)
+{
+    int k = 0, dx = 0;
+    for ( ; dx < dsize; dx++)
+    {
+        ofs_tab[dx] = k;
+
+        double fsx1 = dx * scale;
+        double fsx2 = fsx1 + scale;
+        double cellWidth = std::min(scale, ssize - fsx1);
+
+        int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
+
+        sx2 = std::min(sx2, ssize - 1);
+        sx1 = std::min(sx1, sx2);
+
+        if (sx1 - fsx1 > 1e-3)
+        {
+            map_tab[k] = sx1 - 1;
+            alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
+        }
+
+        for (int sx = sx1; sx < sx2; sx++)
+        {
+            map_tab[k] = sx;
+            alpha_tab[k++] = float(1.0 / cellWidth);
+        }
+
+        if (fsx2 - sx2 > 1e-3)
+        {
+            map_tab[k] = sx2;
+            alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
+        }
+    }
+    ofs_tab[dx] = k;
+}
+
+static void ocl_computeResizeAreaFastTabs(int * dmap_tab, int * smap_tab, int scale, int dcols, int scol)
+{
+    for (int i = 0; i < dcols; ++i)
+        dmap_tab[i] = scale * i;
+
+    for (int i = 0, size = dcols * scale; i < size; ++i)
+        smap_tab[i] = std::min(scol - 1, i);
+}
+
 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
                        double fx, double fy, int interpolation)
 {
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    if( !(cn <= 4 &&
-           (interpolation == INTER_NEAREST ||
-           (interpolation == INTER_LINEAR))) )
+
+    double inv_fx = 1. / fx, inv_fy = 1. / fy;
+    float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
+
+    if( cn == 3 || !(cn <= 4 &&
+           (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
+            (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
        return false;
+
    UMat src = _src.getUMat();
    _dst.create(dsize, type);
    UMat dst = _dst.getUMat();
+
    ocl::Kernel k;
+    size_t globalsize[] = { dst.cols, dst.rows };

    if (interpolation == INTER_LINEAR)
    {
@ -1929,14 +1981,86 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
    else if (interpolation == INTER_NEAREST)
    {
        k.create("resizeNN", ocl::imgproc::resize_oclsrc,
-                 format("-D INTER_NEAREST -D PIXTYPE=%s", ocl::memopTypeToStr(type) ));
+                 format("-D INTER_NEAREST -D PIXTYPE=%s -D cn", ocl::memopTypeToStr(type), cn));
+    }
+    else if (interpolation == INTER_AREA)
+    {
+        int iscale_x = saturate_cast<int>(inv_fx);
+        int iscale_y = saturate_cast<int>(inv_fy);
+        bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
+                        std::abs(inv_fy - iscale_y) < DBL_EPSILON;
+        int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
+        int wtype = CV_MAKE_TYPE(wdepth, cn);
+
+        char cvt[2][40];
+        String buildOption = format("-D INTER_AREA -D T=%s -D WTV=%s -D convertToWTV=%s",
+                                    ocl::typeToStr(type), ocl::typeToStr(wtype),
+                                    ocl::convertTypeStr(depth, wdepth, cn, cvt[0]));
+
+        UMat alphaOcl, tabofsOcl, mapOcl;
+        UMat dmap, smap;
+
+        if (is_area_fast)
+        {
+            int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
+            buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
+                                               " -D XSCALE=%d -D YSCALE=%d -D SCALE=%f",
+                                               ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
+                                               ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
+                                  iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
+
+            k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
+
+            int smap_tab_size = dst.cols * iscale_x + dst.rows * iscale_y;
+            AutoBuffer<int> dmap_tab(dst.cols + dst.rows), smap_tab(smap_tab_size);
+            int * dxmap_tab = dmap_tab, * dymap_tab = dxmap_tab + dst.cols;
+            int * sxmap_tab = smap_tab, * symap_tab = smap_tab + dst.cols * iscale_y;
+
+            ocl_computeResizeAreaFastTabs(dxmap_tab, sxmap_tab, iscale_x, dst.cols, src.cols);
+            ocl_computeResizeAreaFastTabs(dymap_tab, symap_tab, iscale_y, dst.rows, src.rows);
+
+            Mat(1, dst.cols + dst.rows, CV_32SC1, (void *)dmap_tab).copyTo(dmap);
+            Mat(1, smap_tab_size, CV_32SC1, (void *)smap_tab).copyTo(smap);
+        }
+        else
+        {
+            buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
+            k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
+
+            Size ssize = src.size();
+            int xytab_size = (ssize.width + ssize.height) << 1;
+            int tabofs_size = dsize.height + dsize.width + 2;
+
+            AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
+            AutoBuffer<float> _xyalpha_tab(xytab_size);
+            int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
+            float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
+            int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
+
+            ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
+            ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
+
+            // loading precomputed arrays to GPU
+            Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
+            Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
+            Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
+        }
+
+        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
+
+        if (is_area_fast)
+            k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(dmap), ocl::KernelArg::PtrReadOnly(smap));
+        else
+            k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
+                   ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
+
+        return k.run(2, globalsize, NULL, false);
    }

    if( k.empty() )
        return false;
    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
-           (float)(1./fx), (float)(1./fy));
-    size_t globalsize[] = { dst.cols, dst.rows };
+           (float)inv_fx, (float)inv_fy);

    return k.run(2, globalsize, 0, false);
 }
@ -2069,7 +2193,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
    }

    if( ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT &&
-        ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation) )
+            ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
        return;

    Mat src = _src.getMat();
--- a/modules/imgproc/src/opencl/resize.cl
+++ b/modules/imgproc/src/opencl/resize.cl
@ -43,16 +43,8 @@
 //
 //M*/

-
-// resize kernel
-// Currently, CV_8UC1  CV_8UC4  CV_32FC1 and CV_32FC4are supported.
-// We shall support other types later if necessary.
-
 #if defined DOUBLE_SUPPORT
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
-#define F double
-#else
-#define F float
 #endif

 #define INTER_RESIZE_COEF_BITS 11
@ -141,8 +133,8 @@ __kernel void resizeNN(__global const uchar* srcptr, int srcstep, int srcoffset,

    if( dx < dstcols && dy < dstrows )
    {
-        F s1 = dx*ifx;
-        F s2 = dy*ify;
+        float s1 = dx*ifx;
+        float s2 = dy*ify;
        int sx = min(convert_int_rtz(s1), srccols-1);
        int sy = min(convert_int_rtz(s2), srcrows-1);

@ -153,4 +145,91 @@ __kernel void resizeNN(__global const uchar* srcptr, int srcstep, int srcoffset,
    }
 }

+#elif defined INTER_AREA
+
+#define TSIZE ((int)(sizeof(T)))
+
+#ifdef INTER_AREA_FAST
+
+__kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
+                              __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                              __global const int * dmap_tab, __global const int * smap_tab)
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if (dx < dst_cols && dy < dst_rows)
+    {
+        int dst_index = mad24(dy, dst_step, dst_offset);
+
+        __global const int * xmap_tab = dmap_tab;
+        __global const int * ymap_tab = dmap_tab + dst_cols;
+        __global const int * sxmap_tab = smap_tab;
+        __global const int * symap_tab = smap_tab + XSCALE * dst_cols;
+
+        int sx = xmap_tab[dx], sy = ymap_tab[dy];
+        WTV sum = (WTV)(0);
+
+        #pragma unroll
+        for (int y = 0; y < YSCALE; ++y)
+        {
+            int src_index = mad24(symap_tab[y + sy], src_step, src_offset);
+            #pragma unroll
+            for (int x = 0; x < XSCALE; ++x)
+                sum += convertToWTV(((__global const T*)(src + src_index))[sxmap_tab[sx + x]]);
+        }
+
+        ((__global T*)(dst + dst_index))[dx] = convertToT(convertToWT2V(sum) * (WT2V)(SCALE));
+    }
+}
+
+#else
+
+__kernel void resizeAREA(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
+                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                         float ifx, float ify, __global const int * ofs_tab,
+                         __global const int * map_tab, __global const float * alpha_tab)
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if (dx < dst_cols && dy < dst_rows)
+    {
+        int dst_index = mad24(dy, dst_step, dst_offset);
+
+        __global const int * xmap_tab = map_tab;
+        __global const int * ymap_tab = (__global const int *)(map_tab + (src_cols << 1));
+        __global const float * xalpha_tab = alpha_tab;
+        __global const float * yalpha_tab = (__global const float *)(alpha_tab + (src_cols << 1));
+        __global const int * xofs_tab = ofs_tab;
+        __global const int * yofs_tab = (__global const int *)(ofs_tab + dst_cols + 1);
+
+        int xk0 = xofs_tab[dx], xk1 = xofs_tab[dx + 1];
+        int yk0 = yofs_tab[dy], yk1 = yofs_tab[dy + 1];
+
+        int sy0 = ymap_tab[yk0], sy1 = ymap_tab[yk1 - 1];
+        int sx0 = xmap_tab[xk0], sx1 = xmap_tab[xk1 - 1];
+
+        WTV sum = (WTV)(0), buf;
+        int src_index = mad24(sy0, src_step, src_offset);
+
+        for (int sy = sy0, yk = yk0; sy <= sy1; ++sy, src_index += src_step, ++yk)
+        {
+            WTV beta = (WTV)(yalpha_tab[yk]);
+            buf = (WTV)(0);
+
+            for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
+            {
+                WTV alpha = (WTV)(xalpha_tab[xk]);
+                buf += convertToWTV(((__global const T*)(src + src_index))[sx]) * alpha;
+            }
+            sum += buf * beta;
+        }
+
+        ((__global T*)(dst + dst_index))[dx] = convertToT(sum);
+    }
+}
+
+#endif
+
 #endif
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@ -127,13 +127,20 @@ OCL_TEST_P(Resize, Mat)

 /////////////////////////////////////////////////////////////////////////////////////

-OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Resize, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, CV_32FC4),
-                            Values(0.5, 1.5, 2.0),
-                            Values(0.5, 1.5, 2.0),
+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpResize, Resize, Combine(
+                            Values((MatType)CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+                            Values(0.7, 0.4, 2.0),
+                            Values(0.3, 0.6, 2.0),
                            Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR),
                            Bool()));

+OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpResizeArea, Resize, Combine(
+                            Values((MatType)CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+                            Values(0.7, 0.4, 0.5),
+                            Values(0.3, 0.6, 0.5),
+                            Values((Interpolation)INTER_AREA),
+                            Bool()));
+
 } } // namespace cvtest::ocl

 #endif // HAVE_OPENCL
--- a/modules/ts/include/opencv2/ts/ocl_test.hpp
+++ b/modules/ts/include/opencv2/ts/ocl_test.hpp
@ -305,7 +305,7 @@ IMPLEMENT_PARAM_CLASS(Channels, int)
 #define OCL_ALL_DEPTHS Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F)
 #define OCL_ALL_CHANNELS Values(1, 2, 3, 4)

-CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC)
+CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA)

 #define OCL_INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \
    INSTANTIATE_TEST_CASE_P(OCL_ ## prefix, test_case_name, generator)