From 3607da9f6b769d1e8c26c9b118747410461526de Mon Sep 17 00:00:00 2001 From: Li Peng Date: Fri, 14 Oct 2016 15:13:10 +0800 Subject: [PATCH] ocl kernel performance optimization for box filter The optimization is for CV_8UC1 format and 3x3 box filter, it is 15%~87% faster than current ocl kernel with below perf test ./modules/ts/misc/run.py -t imgproc --gtest_filter=OCL_BlurFixture* Also add test cases for this ocl kernel. Signed-off-by: Li Peng --- modules/imgproc/src/opencl/boxFilter3x3.cl | 127 ++++++++++++++++++++ modules/imgproc/src/smooth.cpp | 60 +++++++++ modules/imgproc/test/ocl/test_boxfilter.cpp | 74 ++++++++++++ 3 files changed, 261 insertions(+) create mode 100644 modules/imgproc/src/opencl/boxFilter3x3.cl diff --git a/modules/imgproc/src/opencl/boxFilter3x3.cl b/modules/imgproc/src/opencl/boxFilter3x3.cl new file mode 100644 index 0000000000..7050a4b081 --- /dev/null +++ b/modules/imgproc/src/opencl/boxFilter3x3.cl @@ -0,0 +1,127 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +__kernel void boxFilter3x3_8UC1_cols16_rows2(__global const uint* src, int src_step, + __global uint* dst, int dst_step, int rows, int cols +#ifdef NORMALIZE + , float alpha +#endif + ) +{ + int block_x = get_global_id(0); + int y = get_global_id(1) * 2; + int ssx, dsx; + + if ((block_x * 16) >= cols || y >= rows) return; + + uint4 line[4]; + uint4 line_out[2]; + ushort a; ushort16 b; ushort c; + ushort d; ushort16 e; ushort f; + ushort g; ushort16 h; ushort i; + ushort j; ushort16 k; ushort l; + + ssx = dsx = 1; + int src_index = block_x * 4 * ssx + (y - 1) * (src_step / 4); + line[1] = vload4(0, src + src_index + (src_step / 4)); + line[2] = vload4(0, src + src_index + 2 * (src_step / 4)); + +#ifdef BORDER_CONSTANT + line[0] = (y == 0) ? (uint4)0 : vload4(0, src + src_index); + line[3] = (y == (rows - 2)) ? (uint4)0 : vload4(0, src + src_index + 3 * (src_step / 4)); +#elif defined BORDER_REFLECT_101 + line[0] = (y == 0) ? line[2] : vload4(0, src + src_index); + line[3] = (y == (rows - 2)) ? line[1] : vload4(0, src + src_index + 3 * (src_step / 4)); +#elif defined (BORDER_REPLICATE) || defined(BORDER_REFLECT) + line[0] = (y == 0) ? line[1] : vload4(0, src + src_index); + line[3] = (y == (rows - 2)) ? line[2] : vload4(0, src + src_index + 3 * (src_step / 4)); +#endif + + ushort16 sum, mid; + __global uchar *src_p = (__global uchar *)src; + + src_index = block_x * 16 * ssx + (y - 1) * src_step; + bool line_end = ((block_x + 1) * 16 == cols); + + b = convert_ushort16(as_uchar16(line[0])); + e = convert_ushort16(as_uchar16(line[1])); + h = convert_ushort16(as_uchar16(line[2])); + k = convert_ushort16(as_uchar16(line[3])); + +#ifdef BORDER_CONSTANT + a = (block_x == 0 || y == 0) ? 0 : convert_ushort(src_p[src_index - 1]); + c = (line_end || y == 0) ? 0 : convert_ushort(src_p[src_index + 16]); + + d = (block_x == 0) ? 0 : convert_ushort(src_p[src_index + src_step - 1]); + f = line_end ? 0 : convert_ushort(src_p[src_index + src_step + 16]); + + g = (block_x == 0) ? 0 : convert_ushort(src_p[src_index + 2 * src_step - 1]); + i = line_end ? 0 : convert_ushort(src_p[src_index + 2 * src_step + 16]); + + j = (block_x == 0 || y == (rows - 2)) ? 0 : convert_ushort(src_p[src_index + 3 * src_step - 1]); + l = (line_end || y == (rows - 2))? 0 : convert_ushort(src_p[src_index + 3 * src_step + 16]); + +#elif defined BORDER_REFLECT_101 + int offset; + offset = (y == 0) ? (2 * src_step) : 0; + + a = (block_x == 0) ? convert_ushort(src_p[src_index + offset + 1]) : convert_ushort(src_p[src_index + offset - 1]); + c = line_end ? convert_ushort(src_p[src_index + offset + 14]) : convert_ushort(src_p[src_index + offset + 16]); + + d = (block_x == 0) ? convert_ushort(src_p[src_index + src_step + 1]) : convert_ushort(src_p[src_index + src_step - 1]); + f = line_end ? convert_ushort(src_p[src_index + src_step + 14]) : convert_ushort(src_p[src_index + src_step + 16]); + + g = (block_x == 0) ? convert_ushort(src_p[src_index + 2 * src_step + 1]) : convert_ushort(src_p[src_index + 2 * src_step - 1]); + i = line_end ? convert_ushort(src_p[src_index + 2 * src_step + 14]) : convert_ushort(src_p[src_index + 2 * src_step + 16]); + + offset = (y == (rows - 2)) ? (1 * src_step) : (3 * src_step); + + j = (block_x == 0) ? convert_ushort(src_p[src_index + offset + 1]) : convert_ushort(src_p[src_index + offset - 1]); + l = line_end ? convert_ushort(src_p[src_index + offset + 14]) : convert_ushort(src_p[src_index + offset + 16]); + +#elif defined (BORDER_REPLICATE) || defined(BORDER_REFLECT) + int offset; + offset = (y == 0) ? (1 * src_step) : 0; + + a = (block_x == 0) ? convert_ushort(src_p[src_index + offset]) : convert_ushort(src_p[src_index + offset - 1]); + c = line_end ? convert_ushort(src_p[src_index + offset + 15]) : convert_ushort(src_p[src_index + offset + 16]); + + d = (block_x == 0) ? convert_ushort(src_p[src_index + src_step]) : convert_ushort(src_p[src_index + src_step - 1]); + f = line_end ? convert_ushort(src_p[src_index + src_step + 15]) : convert_ushort(src_p[src_index + src_step + 16]); + + g = (block_x == 0) ? convert_ushort(src_p[src_index + 2 * src_step]) : convert_ushort(src_p[src_index + 2 * src_step - 1]); + i = line_end ? convert_ushort(src_p[src_index + 2 * src_step + 15]) : convert_ushort(src_p[src_index + 2 * src_step + 16]); + + offset = (y == (rows - 2)) ? (2 * src_step) : (3 * src_step); + + j = (block_x == 0) ? convert_ushort(src_p[src_index + offset]) : convert_ushort(src_p[src_index + offset - 1]); + l = line_end ? convert_ushort(src_p[src_index + offset + 15]) : convert_ushort(src_p[src_index + offset + 16]); + +#endif + + mid = (ushort16)(d, e.s0123, e.s456789ab, e.scde) + e + (ushort16)(e.s123, e.s4567, e.s89abcdef, f) + + (ushort16)(g, h.s0123, h.s456789ab, h.scde) + h + (ushort16)(h.s123, h.s4567, h.s89abcdef, i); + + sum = (ushort16)(a, b.s0123, b.s456789ab, b.scde) + b + (ushort16)(b.s123, b.s4567, b.s89abcdef, c) + + mid; + +#ifdef NORMALIZE + line_out[0] = as_uint4(convert_uchar16_sat_rte((convert_float16(sum) * alpha))); +#else + line_out[0] = as_uint4(convert_uchar16_sat_rte(sum)); +#endif + + sum = mid + + (ushort16)(j, k.s0123, k.s456789ab, k.scde) + k + (ushort16)(k.s123, k.s4567, k.s89abcdef, l); + +#ifdef NORMALIZE + line_out[1] = as_uint4(convert_uchar16_sat_rte((convert_float16(sum) * alpha))); +#else + line_out[1] = as_uint4(convert_uchar16_sat_rte(sum)); +#endif + + int dst_index = block_x * 4 * dsx + y * (dst_step / 4); + vstore4(line_out[0], 0, dst + dst_index); + vstore4(line_out[1], 0, dst + dst_index + (dst_step / 4)); +} diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index f0d6bfbfca..3b5d4e0395 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1295,6 +1295,61 @@ struct ColumnSum : #ifdef HAVE_OPENCL +static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, int borderType, bool normalize ) +{ + const ocl::Device & dev = ocl::Device::getDefault(); + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + + if (ddepth < 0) + ddepth = sdepth; + + if (anchor.x < 0) + anchor.x = ksize.width / 2; + if (anchor.y < 0) + anchor.y = ksize.height / 2; + + if ( !(dev.isIntel() && (type == CV_8UC1) && + (_src.offset() == 0) && (_src.step() % 4 == 0) && + (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) && + (anchor.x == 1) && (anchor.y == 1) && + (ksize.width == 3) && (ksize.height == 3)) ) + return false; + + float alpha = 1.0f / (ksize.height * ksize.width); + Size size = _src.size(); + size_t globalsize[2] = { 0, 0 }; + size_t localsize[2] = { 0, 0 }; + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; + + globalsize[0] = size.width / 16; + globalsize[1] = size.height / 2; + + char build_opts[1024]; + sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : ""); + + ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts); + if (kernel.empty()) + return false; + + UMat src = _src.getUMat(); + _dst.create(size, CV_MAKETYPE(ddepth, cn)); + if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) + return false; + UMat dst = _dst.getUMat(); + + int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); + idxArg = kernel.set(idxArg, (int)src.step); + idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); + idxArg = kernel.set(idxArg, (int)dst.step); + idxArg = kernel.set(idxArg, (int)dst.rows); + idxArg = kernel.set(idxArg, (int)dst.cols); + if (normalize) + idxArg = kernel.set(idxArg, (float)alpha); + + return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); +} + #define DIVUP(total, grain) ((total + grain - 1) / (grain)) #define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n))) @@ -1683,6 +1738,11 @@ void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, { CV_INSTRUMENT_REGION() + CV_OCL_RUN(_dst.isUMat() && + (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || + borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101), + ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) + CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) Mat src = _src.getMat(); diff --git a/modules/imgproc/test/ocl/test_boxfilter.cpp b/modules/imgproc/test/ocl/test_boxfilter.cpp index 19a6ace75e..5d6803a563 100644 --- a/modules/imgproc/test/ocl/test_boxfilter.cpp +++ b/modules/imgproc/test/ocl/test_boxfilter.cpp @@ -157,6 +157,80 @@ OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SqrBoxFilter, ); +PARAM_TEST_CASE(BoxFilter3x3_cols16_rows2_Base, MatDepth, Channels, BorderType, bool, bool) +{ + int depth, cn, borderType; + Size ksize, dsize; + Point anchor; + bool normalize, useRoi; + + TEST_DECLARE_INPUT_PARAMETER(src); + TEST_DECLARE_OUTPUT_PARAMETER(dst); + + virtual void SetUp() + { + depth = GET_PARAM(0); + cn = GET_PARAM(1); + borderType = GET_PARAM(2); // only not isolated border tested, because CPU module doesn't support isolated border case. + normalize = GET_PARAM(3); + useRoi = GET_PARAM(4); + } + + void random_roi() + { + int type = CV_MAKE_TYPE(depth, cn); + ksize = Size(3,3); + + Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE); + roiSize.width = std::max(ksize.width + 13, roiSize.width & (~0xf)); + roiSize.height = std::max(ksize.height + 1, roiSize.height & (~0x1)); + Border srcBorder = {0, 0, 0, 0}; + randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE); + + Border dstBorder = {0, 0, 0, 0}; + randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE); + + anchor.x = -1; + anchor.y = -1; + + UMAT_UPLOAD_INPUT_PARAMETER(src); + UMAT_UPLOAD_OUTPUT_PARAMETER(dst); + } + + void Near(double threshold = 0.0) + { + OCL_EXPECT_MATS_NEAR(dst, threshold); + } +}; + +typedef BoxFilter3x3_cols16_rows2_Base BoxFilter3x3_cols16_rows2; + +OCL_TEST_P(BoxFilter3x3_cols16_rows2, Mat) +{ + for (int j = 0; j < test_loop_times; j++) + { + random_roi(); + + OCL_OFF(cv::boxFilter(src_roi, dst_roi, -1, ksize, anchor, normalize, borderType)); + OCL_ON(cv::boxFilter(usrc_roi, udst_roi, -1, ksize, anchor, normalize, borderType)); + + Near(depth <= CV_32S ? 1 : 3e-3); + } +} + +OCL_INSTANTIATE_TEST_CASE_P(ImageProc, BoxFilter3x3_cols16_rows2, + Combine( + Values((MatDepth)CV_8U), + Values((Channels)1), + Values((BorderType)BORDER_CONSTANT, + (BorderType)BORDER_REPLICATE, + (BorderType)BORDER_REFLECT, + (BorderType)BORDER_REFLECT_101), + Bool(), + Values(false) // ROI + ) + ); + } } // namespace cvtest::ocl #endif // HAVE_OPENCL