Merge pull request #2495 from ilya-lavrenov:tapi_flip

11 years ago · 7193762d2d
parent d18ebfa8e8 04884ebf45
commit 7193762d2d
4 changed files with 51 additions and 40 deletions
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -482,9 +482,9 @@ enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS
 static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
 {
    CV_Assert(flipCode >= - 1 && flipCode <= 1);
-    int type = _src.type(), cn = CV_MAT_CN(type), flipType;
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), flipType;

-    if (cn > 4 || cn == 3)
+    if (cn > 4)
        return false;

    const char * kernelName;
@@ -506,7 +506,8 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
    }

    ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
-        format( "-D type=%s", ocl::memopTypeToStr(type)));
+        format( "-D T=%s -D T1=%s -D cn=%d", ocl::memopTypeToStr(type),
+                ocl::memopTypeToStr(depth), cn));
    if (k.empty())
        return false;

--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -2679,17 +2679,17 @@ namespace cv {

 static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
 {
-    int type = _m.type(), cn = CV_MAT_CN(type);
-    if (cn == 3)
-        return false;
+    int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+            sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn);

    ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
-                  format("-D T=%s", ocl::memopTypeToStr(type)));
+                  format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type),
+                         ocl::memopTypeToStr(depth), cn, ocl::memopTypeToStr(sctype)));
    if (k.empty())
        return false;

    UMat m = _m.getUMat();
-    k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, type, s)));
+    k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s)));

    size_t globalsize[2] = { m.cols, m.rows };
    return k.run(2, globalsize, NULL, false);
--- a/modules/core/src/opencl/flip.cl
+++ b/modules/core/src/opencl/flip.cl
@@ -39,10 +39,18 @@
 //
 //M*/

-#define sizeoftype ((int)sizeof(type))
+#if cn != 3
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr)  *(__global T *)(addr) = val
+#define TSIZE (int)sizeof(T)
+#else
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE ((int)sizeof(T1)*3)
+#endif

-__kernel void arithm_flip_rows(__global const uchar* srcptr, int srcstep, int srcoffset,
-                               __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int src_offset,
+                               __global uchar * dstptr, int dst_step, int dst_offset,
                               int rows, int cols, int thread_rows, int thread_cols)
 {
    int x = get_global_id(0);
@@ -50,19 +58,16 @@ __kernel void arithm_flip_rows(__global const uchar* srcptr, int srcstep, int sr

    if (x < cols && y < thread_rows)
    {
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x, sizeoftype, srcoffset)));
+        T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+        T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset)));

-        __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
-        __global type* dst1 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x, sizeoftype, dstoffset)));
-
-        dst0[0] = src1[0];
-        dst1[0] = src0[0];
+        storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+        storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset)));
    }
 }

-__kernel void arithm_flip_rows_cols(__global const uchar* srcptr, int srcstep, int srcoffset,
-                                    __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step, int src_offset,
+                                    __global uchar * dstptr, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int thread_cols)
 {
    int x = get_global_id(0);
@@ -71,19 +76,16 @@ __kernel void arithm_flip_rows_cols(__global const uchar* srcptr, int srcstep, i
    if (x < cols && y < thread_rows)
    {
        int x1 = cols - x - 1;
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x1, sizeoftype, srcoffset)));
-
-        __global type* dst0 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x1, sizeoftype, dstoffset)));
-        __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
+        T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+        T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset)));

-        dst0[0] = src0[0];
-        dst1[0] = src1[0];
+        storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset)));
+        storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
    }
 }

-__kernel void arithm_flip_cols(__global const uchar* srcptr, int srcstep, int srcoffset,
-                               __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int src_offset,
+                               __global uchar * dstptr, int dst_step, int dst_offset,
                               int rows, int cols, int thread_rows, int thread_cols)
 {
    int x = get_global_id(0);
@@ -92,13 +94,10 @@ __kernel void arithm_flip_cols(__global const uchar* srcptr, int srcstep, int sr
    if (x < thread_cols && y < rows)
    {
        int x1 = cols - x - 1;
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x1, sizeoftype, srcoffset)));
-
-        __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x1, sizeoftype, dstoffset)));
-        __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
+        T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+        T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset)));

-        dst1[0] = src1[0];
-        dst0[0] = src0[0];
+        storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset)));
+        storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
    }
 }
--- a/modules/core/src/opencl/set_identity.cl
+++ b/modules/core/src/opencl/set_identity.cl
@@ -43,17 +43,28 @@
 //
 //M*/

+#if cn != 3
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr)  *(__global T *)(addr) = val
+#define TSIZE (int)sizeof(T)
+#define scalar scalar_
+#else
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE ((int)sizeof(T1)*3)
+#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
+#endif
+
 __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
-                          T scalar)
+                          ST scalar_)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
-        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
-        __global T * src = (__global T *)(srcptr + src_index);
+        int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));

-        src[0] = x == y ? scalar : (T)(0);
+        storepix(x == y ? scalar : (T)(0), srcptr + src_index);
    }
 }