From 07b475062fa2ca671d61639a3bae1e5c86718001 Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Date: Fri, 21 Feb 2020 16:13:41 +0300
Subject: [PATCH] Merge pull request #16608 from vpisarev:fix_mac_ocl_tests

* fixed several problems when running tests on Mac:
* OCL_pyrUp
* OCL_flip
* some basic UMat tests
* histogram badarg test (out of range access)

* retained the storepix fix in ocl_flip only for 16U/16S datatype, where the OpenCL compiler on Mac generates incorrect code

* moved deletion of ACCESS_FAST flag to non-SVM branch (where SVM is shared virtual memory (in OpenCL 2.x), not support vector machine)

* force OpenCL to use read/write for GPU<=>CPU memory transfers on machines with discrete video only on Macs. On Windows/Linux the drivers are seemingly smart enough to implement map/unmap properly (and maybe more efficiently than explicit read/write)
---
 modules/core/src/copy.cpp                |  4 ++--
 modules/core/src/ocl.cpp                 | 13 +++++++++++--
 modules/core/src/opencl/flip.cl          | 24 ++++++++++++++++++------
 modules/imgproc/src/pyramids.cpp         | 23 +++++++++--------------
 modules/imgproc/test/test_histograms.cpp |  2 +-
 5 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 1f981ee871..48440ef265 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -916,9 +916,9 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
     kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn;
 
     ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
-        format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
+        format( "-D T=%s -D T1=%s -D DEPTH=%d -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
                 kercn != cn ? ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)) : ocl::vecopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
-                kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), cn, pxPerWIy, kercn));
+                kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), depth, cn, pxPerWIy, kercn));
     if (k.empty())
         return false;
 
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 7780364f1c..dbebf02f7f 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -4705,6 +4705,8 @@ public:
             int createFlags = 0, flags0 = 0;
             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
 
+            bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
+
             cl_context ctx_handle = (cl_context)ctx.ptr();
             int allocatorFlags = 0;
             int tempUMatFlags = 0;
@@ -4764,8 +4766,15 @@ public:
             else
 #endif
             {
+                if( copyOnMap )
+                    accessFlags &= ~ACCESS_FAST;
+
                 tempUMatFlags = UMatData::TEMP_UMAT;
-                if (CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
+                if (
+                #ifdef __APPLE__
+                    !copyOnMap &&
+                #endif
+                    CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
                     // There are OpenCL runtime issues for less aligned data
                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
@@ -4793,7 +4802,7 @@ public:
             u->handle = handle;
             u->prevAllocator = u->currAllocator;
             u->currAllocator = this;
-            u->flags |= tempUMatFlags;
+            u->flags |= tempUMatFlags | flags0;
             u->allocatorFlags_ = allocatorFlags;
         }
         if(accessFlags & ACCESS_WRITE)
diff --git a/modules/core/src/opencl/flip.cl b/modules/core/src/opencl/flip.cl
index bd670a5b72..afd14e4e1f 100644
--- a/modules/core/src/opencl/flip.cl
+++ b/modules/core/src/opencl/flip.cl
@@ -42,10 +42,25 @@
 #if kercn != 3
 #define loadpix(addr) *(__global const T *)(addr)
 #define storepix(val, addr)  *(__global T *)(addr) = val
+#define storepix_2(val0, val1, addr0, addr1) \
+    *(__global T *)(addr0) = val0; *(__global T *)(addr1) = val1
 #define TSIZE (int)sizeof(T)
 #else
 #define loadpix(addr) vload3(0, (__global const T1 *)(addr))
 #define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#if DEPTH == 2 || DEPTH == 3
+#define storepix_2(val0, val1, addr0, addr1) \
+    ((__global T1 *)(addr0))[0] = val0.x; \
+    ((__global T1 *)(addr1))[0] = val1.x; \
+    ((__global T1 *)(addr0))[1] = val0.y; \
+    ((__global T1 *)(addr1))[1] = val1.y; \
+    ((__global T1 *)(addr0))[2] = val0.z; \
+    ((__global T1 *)(addr1))[2] = val1.z
+#else
+#define storepix_2(val0, val1, addr0, addr1) \
+    storepix(val0, addr0); \
+    storepix(val1, addr1)
+#endif
 #define TSIZE ((int)sizeof(T1)*3)
 #endif
 
@@ -69,8 +84,7 @@ __kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int
             T src0 = loadpix(srcptr + src_index0);
             T src1 = loadpix(srcptr + src_index1);
 
-            storepix(src1, dstptr + dst_index0);
-            storepix(src0, dstptr + dst_index1);
+            storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1);
 
             src_index0 += src_step;
             src_index1 -= src_step;
@@ -115,8 +129,7 @@ __kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step,
 #endif
 #endif
 
-            storepix(src1, dstptr + dst_index0);
-            storepix(src0, dstptr + dst_index1);
+            storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1);
 
             src_index0 += src_step;
             src_index1 -= src_step;
@@ -161,8 +174,7 @@ __kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int
 #endif
 #endif
 
-            storepix(src1, dstptr + dst_index0);
-            storepix(src0, dstptr + dst_index1);
+            storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1);
 
             src_index0 += src_step;
             src_index1 += src_step;
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index ec4427f219..ab6c8fdb6f 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -1078,7 +1078,7 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
     UMat dst = _dst.getUMat();
 
     int float_depth = depth == CV_64F ? CV_64F : CV_32F;
-    const int local_size = 16;
+    const int local_size = channels == 1 ? 16 : 8;
     char cvt[2][50];
     String buildOptions = format(
             "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
@@ -1092,22 +1092,17 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
     size_t globalThreads[2] = { (size_t)dst.cols, (size_t)dst.rows };
     size_t localThreads[2] = { (size_t)local_size, (size_t)local_size };
     ocl::Kernel k;
-    if (ocl::Device::getDefault().isIntel() && channels == 1)
+    if (type == CV_8UC1 && src.cols % 2 == 0)
     {
-        if (type == CV_8UC1 && src.cols % 2 == 0)
-        {
-            buildOptions.clear();
-            k.create("pyrUp_cols2", ocl::imgproc::pyramid_up_oclsrc, buildOptions);
-            globalThreads[0] = dst.cols/4; globalThreads[1] = dst.rows/2;
-        }
-        else
-        {
-            k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions);
-            globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2;
-        }
+        buildOptions.clear();
+        k.create("pyrUp_cols2", ocl::imgproc::pyramid_up_oclsrc, buildOptions);
+        globalThreads[0] = dst.cols/4; globalThreads[1] = dst.rows/2;
     }
     else
-        k.create("pyrUp", ocl::imgproc::pyr_up_oclsrc, buildOptions);
+    {
+        k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions);
+        globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2;
+    }
 
     if (k.empty())
         return false;
diff --git a/modules/imgproc/test/test_histograms.cpp b/modules/imgproc/test/test_histograms.cpp
index fdf31fe771..afe6e53603 100644
--- a/modules/imgproc/test/test_histograms.cpp
+++ b/modules/imgproc/test/test_histograms.cpp
@@ -1966,7 +1966,7 @@ TEST(Imgproc_Hist_Calc, badarg)
     Mat img = cv::Mat::zeros(10, 10, CV_8UC1);
     Mat imgInt = cv::Mat::zeros(10, 10, CV_32SC1);
     Mat hist;
-    const int hist_size[] = { 100 };
+    const int hist_size[] = { 100, 100 };
     // base run
     EXPECT_NO_THROW(cv::calcHist(&img, 1, channels, noArray(), hist, 1, hist_size, ranges, true));
     // bad parameters