From 4612b4b8276eba25f6fba4af20073d930d6c9820 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Tue, 13 May 2014 15:21:47 +0400 Subject: [PATCH 1/2] Added clamp() for THRESH_TRUNC mode --- modules/imgproc/src/opencl/threshold.cl | 12 ++++++------ modules/imgproc/src/thresh.cpp | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/modules/imgproc/src/opencl/threshold.cl b/modules/imgproc/src/opencl/threshold.cl index 5049426a53..6282aa86f8 100644 --- a/modules/imgproc/src/opencl/threshold.cl +++ b/modules/imgproc/src/opencl/threshold.cl @@ -53,7 +53,7 @@ __kernel void threshold(__global const uchar * srcptr, int src_step, int src_offset, __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols, - T1 thresh, T1 max_val) + T1 thresh, T1 max_val, T1 min_val) { int gx = get_global_id(0); int gy = get_global_id(1); @@ -67,15 +67,15 @@ __kernel void threshold(__global const uchar * srcptr, int src_step, int src_off __global T * dst = (__global T *)(dstptr + dst_index); #ifdef THRESH_BINARY - dst[0] = sdata > (T)(thresh) ? (T)(max_val) : (T)(0); + dst[0] = sdata > (thresh) ? (T)(max_val) : (T)(0); #elif defined THRESH_BINARY_INV - dst[0] = sdata > (T)(thresh) ? (T)(0) : (T)(max_val); + dst[0] = sdata > (thresh) ? (T)(0) : (T)(max_val); #elif defined THRESH_TRUNC - dst[0] = sdata > (T)(thresh) ? (T)(thresh) : sdata; + dst[0] = clamp(sdata, (T)min_val, (T)(thresh)); #elif defined THRESH_TOZERO - dst[0] = sdata > (T)(thresh) ? sdata : (T)(0); + dst[0] = sdata > (thresh) ? sdata : (T)(0); #elif defined THRESH_TOZERO_INV - dst[0] = sdata > (T)(thresh) ? (T)(0) : sdata; + dst[0] = sdata > (thresh) ? (T)(0) : sdata; #endif } } diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index e6a55d700d..b32a4365cf 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -847,9 +847,13 @@ static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, d if (depth <= CV_32S) thresh = cvFloor(thresh); + const double min_vals[] = { 0, CHAR_MIN, 0, SHRT_MIN, INT_MIN, -FLT_MAX, -DBL_MAX, 0 }; + double min_val = min_vals[depth]; + k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn, kercn), ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(thresh))), - ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(maxval)))); + ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(maxval))), + ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(min_val)))); size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows }; return k.run(2, globalsize, NULL, false); From 8ecb8c6a3db1ff4bedca161a9c86784474bef5b9 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Fri, 16 May 2014 10:22:03 +0400 Subject: [PATCH 2/2] Optimized memory access by using stride pattern --- modules/imgproc/src/opencl/threshold.cl | 41 ++++++++++++++++--------- modules/imgproc/src/thresh.cpp | 8 +++-- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/modules/imgproc/src/opencl/threshold.cl b/modules/imgproc/src/opencl/threshold.cl index 6282aa86f8..43f1ea2c96 100644 --- a/modules/imgproc/src/opencl/threshold.cl +++ b/modules/imgproc/src/opencl/threshold.cl @@ -56,26 +56,37 @@ __kernel void threshold(__global const uchar * srcptr, int src_step, int src_off T1 thresh, T1 max_val, T1 min_val) { int gx = get_global_id(0); - int gy = get_global_id(1); + int gy = get_global_id(1) * STRIDE_SIZE; - if (gx < cols && gy < rows) + if (gx < cols) { int src_index = mad24(gy, src_step, mad24(gx, (int)sizeof(T), src_offset)); int dst_index = mad24(gy, dst_step, mad24(gx, (int)sizeof(T), dst_offset)); - T sdata = *(__global const T *)(srcptr + src_index); - __global T * dst = (__global T *)(dstptr + dst_index); + #pragma unroll + for (int i = 0; i < STRIDE_SIZE; i++) + { + if (gy < rows) + { + T sdata = *(__global const T *)(srcptr + src_index); + __global T * dst = (__global T *)(dstptr + dst_index); -#ifdef THRESH_BINARY - dst[0] = sdata > (thresh) ? (T)(max_val) : (T)(0); -#elif defined THRESH_BINARY_INV - dst[0] = sdata > (thresh) ? (T)(0) : (T)(max_val); -#elif defined THRESH_TRUNC - dst[0] = clamp(sdata, (T)min_val, (T)(thresh)); -#elif defined THRESH_TOZERO - dst[0] = sdata > (thresh) ? sdata : (T)(0); -#elif defined THRESH_TOZERO_INV - dst[0] = sdata > (thresh) ? (T)(0) : sdata; -#endif + #ifdef THRESH_BINARY + dst[0] = sdata > (thresh) ? (T)(max_val) : (T)(0); + #elif defined THRESH_BINARY_INV + dst[0] = sdata > (thresh) ? (T)(0) : (T)(max_val); + #elif defined THRESH_TRUNC + dst[0] = clamp(sdata, (T)min_val, (T)(thresh)); + #elif defined THRESH_TOZERO + dst[0] = sdata > (thresh) ? sdata : (T)(0); + #elif defined THRESH_TOZERO_INV + dst[0] = sdata > (thresh) ? (T)(0) : sdata; + #endif + + gy++; + src_index += src_step; + dst_index += dst_step; + } + } } } diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index b32a4365cf..988fc9e9f6 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -833,9 +833,12 @@ static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, d const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC", "THRESH_TOZERO", "THRESH_TOZERO_INV" }; + ocl::Device dev = ocl::Device::getDefault(); + int stride_size = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1; + ocl::Kernel k("threshold", ocl::imgproc::threshold_oclsrc, - format("-D %s -D T=%s -D T1=%s%s", thresholdMap[thresh_type], - ocl::typeToStr(ktype), ocl::typeToStr(depth), + format("-D %s -D T=%s -D T1=%s -D STRIDE_SIZE=%d%s", thresholdMap[thresh_type], + ocl::typeToStr(ktype), ocl::typeToStr(depth), stride_size, doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; @@ -856,6 +859,7 @@ static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, d ocl::KernelArg::Constant(Mat(1, 1, depth, Scalar::all(min_val)))); size_t globalsize[2] = { dst.cols * cn / kercn, dst.rows }; + globalsize[1] = (globalsize[1] + stride_size - 1) / stride_size; return k.run(2, globalsize, NULL, false); }