From e89cee35e5821377a75b3de6d92d1c33391bc31e Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 21 Jun 2014 19:17:03 +0400 Subject: [PATCH] optimized cv::inRange --- modules/core/src/arithm.cpp | 17 ++++++--- modules/core/src/opencl/inrange.cl | 57 +++++++++++++++++++++--------- 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 7ca8b4b48d..197ebc1de4 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -3130,9 +3130,16 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb, (!haveScalar && (sdepth != ldepth || sdepth != udepth)) ) return false; - ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc, - format("%s-D cn=%d -D T=%s%s", haveScalar ? "-D HAVE_SCALAR " : "", - cn, ocl::typeToStr(sdepth), doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + int kercn = haveScalar ? cn : std::max(std::min(ocl::predictOptimalVectorWidth(_src, _lowerb, _upperb, _dst), 4), cn); + if (kercn % cn != 0) + kercn = cn; + int colsPerWI = kercn / cn; + String opts = format("%s-D cn=%d -D srcT=%s -D srcT1=%s -D dstT=%s -D kercn=%d -D depth=%d%s -D colsPerWI=%d", + haveScalar ? "-D HAVE_SCALAR " : "", cn, ocl::typeToStr(CV_MAKE_TYPE(sdepth, kercn)), + ocl::typeToStr(sdepth), ocl::typeToStr(CV_8UC(colsPerWI)), kercn, sdepth, + doubleSupport ? " -D DOUBLE_SUPPORT" : "", colsPerWI); + + ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc, opts); if (ker.empty()) return false; @@ -3180,7 +3187,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb, } ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), - dstarg = ocl::KernelArg::WriteOnly(dst); + dstarg = ocl::KernelArg::WriteOnly(dst, 1, colsPerWI); if (haveScalar) { @@ -3194,7 +3201,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb, ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru), ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI); - size_t globalsize[2] = { ssize.width, (ssize.height + rowsPerWI - 1) / rowsPerWI }; + size_t globalsize[2] = { ssize.width / colsPerWI, (ssize.height + rowsPerWI - 1) / rowsPerWI }; return ker.run(2, globalsize, NULL, false); } diff --git a/modules/core/src/opencl/inrange.cl b/modules/core/src/opencl/inrange.cl index 0de561f5f2..538259539a 100644 --- a/modules/core/src/opencl/inrange.cl +++ b/modules/core/src/opencl/inrange.cl @@ -52,7 +52,7 @@ __kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, #ifdef HAVE_SCALAR - __global const T * src2, __global const T * src3, + __global const srcT1 * src2, __global const srcT1 * src3, #else __global const uchar * src2ptr, int src2_step, int src2_offset, __global const uchar * src3ptr, int src3_step, int src3_offset, @@ -64,31 +64,56 @@ __kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_of if (x < dst_cols) { - int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset)); - int dst_index = mad24(y0, dst_step, x + dst_offset); + int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(srcT1) * kercn, src1_offset)); + int dst_index = mad24(y0, dst_step, mad24(x, colsPerWI, dst_offset)); #ifndef HAVE_SCALAR - int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset)); - int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset)); + int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(srcT1) * kercn, src2_offset)); + int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(srcT1) * kercn, src3_offset)); #endif for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src1_index += src1_step, dst_index += dst_step) { - __global const T * src1 = (__global const T *)(src1ptr + src1_index); +#if kercn >= cn && kercn == 4 && depth <= 4 && !defined HAVE_SCALAR + srcT src1 = *(__global const srcT *)(src1ptr + src1_index); + srcT src2 = *(__global const srcT *)(src2ptr + src2_index); + srcT src3 = *(__global const srcT *)(src3ptr + src3_index); + __global dstT * dst = (__global dstT *)(dstptr + dst_index); +#if cn == 1 + dst[0] = src2 > src1 || src3 < src1 ? (dstT)(0) : (dstT)(255); +#elif cn == 2 + dst[0] = (dstT)(src2.xy > src1.xy || src3.xy < src1.xy || + src2.zw > src1.zw || src3.zw < src1.zw ? (dstT)(0) : (dstT)(255); +#elif cn == 4 + dst[0] = (dstT)(src2.x > src1.x || src3.x < src1.x || + src2.y > src1.y || src3.y < src1.y || + src2.z > src1.z || src3.z < src1.z || + src2.w > src1.w || src3.w < src1.w ? 0 : 255); +#endif +#else + __global const srcT1 * src1 = (__global const srcT1 *)(src1ptr + src1_index); __global uchar * dst = dstptr + dst_index; #ifndef HAVE_SCALAR - __global const T * src2 = (__global const T *)(src2ptr + src2_index); - __global const T * src3 = (__global const T *)(src3ptr + src3_index); + __global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index); + __global const srcT1 * src3 = (__global const srcT1 *)(src3ptr + src3_index); #endif - dst[0] = 255; - - for (int c = 0; c < cn; ++c) - if (src2[c] > src1[c] || src3[c] < src1[c]) - { - dst[0] = 0; - break; - } + #pragma unroll + for (int px = 0; px < colsPerWI; ++px, src1 += cn +#ifndef HAVE_SCALAR + , src2 += cn, src3 += cn +#endif + ) + { + dst[px] = 255; + for (int c = 0; c < cn; ++c) + if (src2[c] > src1[c] || src3[c] < src1[c]) + { + dst[px] = 0; + break; + } + } +#endif // kercn >= cn #ifndef HAVE_SCALAR src2_index += src2_step; src3_index += src3_step;