From 07b475062fa2ca671d61639a3bae1e5c86718001 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com> Date: Fri, 21 Feb 2020 16:13:41 +0300 Subject: [PATCH] Merge pull request #16608 from vpisarev:fix_mac_ocl_tests * fixed several problems when running tests on Mac: * OCL_pyrUp * OCL_flip * some basic UMat tests * histogram badarg test (out of range access) * retained the storepix fix in ocl_flip only for 16U/16S datatype, where the OpenCL compiler on Mac generates incorrect code * moved deletion of ACCESS_FAST flag to non-SVM branch (where SVM is shared virtual memory (in OpenCL 2.x), not support vector machine) * force OpenCL to use read/write for GPU<=>CPU memory transfers on machines with discrete video only on Macs. On Windows/Linux the drivers are seemingly smart enough to implement map/unmap properly (and maybe more efficiently than explicit read/write) --- modules/core/src/copy.cpp | 4 ++-- modules/core/src/ocl.cpp | 13 +++++++++++-- modules/core/src/opencl/flip.cl | 24 ++++++++++++++++++------ modules/imgproc/src/pyramids.cpp | 23 +++++++++-------------- modules/imgproc/test/test_histograms.cpp | 2 +- 5 files changed, 41 insertions(+), 25 deletions(-) diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index 1f981ee871..48440ef265 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -916,9 +916,9 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn; ocl::Kernel k(kernelName, ocl::core::flip_oclsrc, - format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d", + format( "-D T=%s -D T1=%s -D DEPTH=%d -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d", kercn != cn ? ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)) : ocl::vecopTypeToStr(CV_MAKE_TYPE(depth, kercn)), - kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), cn, pxPerWIy, kercn)); + kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), depth, cn, pxPerWIy, kercn)); if (k.empty()) return false; diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 7780364f1c..dbebf02f7f 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -4705,6 +4705,8 @@ public: int createFlags = 0, flags0 = 0; getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0); + bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0; + cl_context ctx_handle = (cl_context)ctx.ptr(); int allocatorFlags = 0; int tempUMatFlags = 0; @@ -4764,8 +4766,15 @@ public: else #endif { + if( copyOnMap ) + accessFlags &= ~ACCESS_FAST; + tempUMatFlags = UMatData::TEMP_UMAT; - if (CV_OPENCL_ENABLE_MEM_USE_HOST_PTR + if ( + #ifdef __APPLE__ + !copyOnMap && + #endif + CV_OPENCL_ENABLE_MEM_USE_HOST_PTR // There are OpenCL runtime issues for less aligned data && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0 && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR)) @@ -4793,7 +4802,7 @@ public: u->handle = handle; u->prevAllocator = u->currAllocator; u->currAllocator = this; - u->flags |= tempUMatFlags; + u->flags |= tempUMatFlags | flags0; u->allocatorFlags_ = allocatorFlags; } if(accessFlags & ACCESS_WRITE) diff --git a/modules/core/src/opencl/flip.cl b/modules/core/src/opencl/flip.cl index bd670a5b72..afd14e4e1f 100644 --- a/modules/core/src/opencl/flip.cl +++ b/modules/core/src/opencl/flip.cl @@ -42,10 +42,25 @@ #if kercn != 3 #define loadpix(addr) *(__global const T *)(addr) #define storepix(val, addr) *(__global T *)(addr) = val +#define storepix_2(val0, val1, addr0, addr1) \ + *(__global T *)(addr0) = val0; *(__global T *)(addr1) = val1 #define TSIZE (int)sizeof(T) #else #define loadpix(addr) vload3(0, (__global const T1 *)(addr)) #define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr)) +#if DEPTH == 2 || DEPTH == 3 +#define storepix_2(val0, val1, addr0, addr1) \ + ((__global T1 *)(addr0))[0] = val0.x; \ + ((__global T1 *)(addr1))[0] = val1.x; \ + ((__global T1 *)(addr0))[1] = val0.y; \ + ((__global T1 *)(addr1))[1] = val1.y; \ + ((__global T1 *)(addr0))[2] = val0.z; \ + ((__global T1 *)(addr1))[2] = val1.z +#else +#define storepix_2(val0, val1, addr0, addr1) \ + storepix(val0, addr0); \ + storepix(val1, addr1) +#endif #define TSIZE ((int)sizeof(T1)*3) #endif @@ -69,8 +84,7 @@ __kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int T src0 = loadpix(srcptr + src_index0); T src1 = loadpix(srcptr + src_index1); - storepix(src1, dstptr + dst_index0); - storepix(src0, dstptr + dst_index1); + storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1); src_index0 += src_step; src_index1 -= src_step; @@ -115,8 +129,7 @@ __kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step, #endif #endif - storepix(src1, dstptr + dst_index0); - storepix(src0, dstptr + dst_index1); + storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1); src_index0 += src_step; src_index1 -= src_step; @@ -161,8 +174,7 @@ __kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int #endif #endif - storepix(src1, dstptr + dst_index0); - storepix(src0, dstptr + dst_index1); + storepix_2(src1, src0, dstptr + dst_index0, dstptr + dst_index1); src_index0 += src_step; src_index1 += src_step; diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index ec4427f219..ab6c8fdb6f 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -1078,7 +1078,7 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int UMat dst = _dst.getUMat(); int float_depth = depth == CV_64F ? CV_64F : CV_32F; - const int local_size = 16; + const int local_size = channels == 1 ? 16 : 8; char cvt[2][50]; String buildOptions = format( "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s " @@ -1092,22 +1092,17 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int size_t globalThreads[2] = { (size_t)dst.cols, (size_t)dst.rows }; size_t localThreads[2] = { (size_t)local_size, (size_t)local_size }; ocl::Kernel k; - if (ocl::Device::getDefault().isIntel() && channels == 1) + if (type == CV_8UC1 && src.cols % 2 == 0) { - if (type == CV_8UC1 && src.cols % 2 == 0) - { - buildOptions.clear(); - k.create("pyrUp_cols2", ocl::imgproc::pyramid_up_oclsrc, buildOptions); - globalThreads[0] = dst.cols/4; globalThreads[1] = dst.rows/2; - } - else - { - k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions); - globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2; - } + buildOptions.clear(); + k.create("pyrUp_cols2", ocl::imgproc::pyramid_up_oclsrc, buildOptions); + globalThreads[0] = dst.cols/4; globalThreads[1] = dst.rows/2; } else - k.create("pyrUp", ocl::imgproc::pyr_up_oclsrc, buildOptions); + { + k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions); + globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2; + } if (k.empty()) return false; diff --git a/modules/imgproc/test/test_histograms.cpp b/modules/imgproc/test/test_histograms.cpp index fdf31fe771..afe6e53603 100644 --- a/modules/imgproc/test/test_histograms.cpp +++ b/modules/imgproc/test/test_histograms.cpp @@ -1966,7 +1966,7 @@ TEST(Imgproc_Hist_Calc, badarg) Mat img = cv::Mat::zeros(10, 10, CV_8UC1); Mat imgInt = cv::Mat::zeros(10, 10, CV_32SC1); Mat hist; - const int hist_size[] = { 100 }; + const int hist_size[] = { 100, 100 }; // base run EXPECT_NO_THROW(cv::calcHist(&img, 1, channels, noArray(), hist, 1, hist_size, ranges, true)); // bad parameters