From 8f63f51e81ef1ed2ab07b328e58e8f68a15fa887 Mon Sep 17 00:00:00 2001 From: Li Peng Date: Fri, 21 Oct 2016 13:17:45 +0800 Subject: [PATCH] gaussian blur ocl kernel optimization This ocl kernel is for 3x3 kernel size and CV_8UC1 format It is 115% ~ 300% faster than current ocl path in perf test python ./modules/ts/misc/run.py -t imgproc --gtest_filter=OCL_GaussianBlurFixture* Signed-off-by: Li Peng --- modules/imgproc/src/opencl/gaussianBlur3x3.cl | 133 ++++++++++++++++++ modules/imgproc/src/smooth.cpp | 69 ++++++++- modules/imgproc/test/ocl/test_filters.cpp | 89 ++++++++++++ 3 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 modules/imgproc/src/opencl/gaussianBlur3x3.cl diff --git a/modules/imgproc/src/opencl/gaussianBlur3x3.cl b/modules/imgproc/src/opencl/gaussianBlur3x3.cl new file mode 100644 index 0000000000..724e73b6b6 --- /dev/null +++ b/modules/imgproc/src/opencl/gaussianBlur3x3.cl @@ -0,0 +1,133 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#define DIG(a) a, +__constant float kx[] = { KERNEL_MATRIX_X }; +__constant float ky[] = { KERNEL_MATRIX_Y }; + +#define OP(delta, y, x) (convert_float16(arr[(y + delta) * 3 + x]) * ky[y] * kx[x]) + +__kernel void gaussianBlur3x3_8UC1_cols16_rows2(__global const uint* src, int src_step, + __global uint* dst, int dst_step, int rows, int cols) +{ + int block_x = get_global_id(0); + int y = get_global_id(1) * 2; + int ssx, dsx; + + if ((block_x * 16) >= cols || y >= rows) return; + + uint4 line[4]; + uint4 line_out[2]; + uchar a; uchar16 b; uchar c; + uchar d; uchar16 e; uchar f; + uchar g; uchar16 h; uchar i; + uchar j; uchar16 k; uchar l; + + ssx = dsx = 1; + int src_index = block_x * 4 * ssx + (y - 1) * (src_step / 4); + line[1] = vload4(0, src + src_index + (src_step / 4)); + line[2] = vload4(0, src + src_index + 2 * (src_step / 4)); + +#ifdef BORDER_CONSTANT + line[0] = (y == 0) ? (uint4)0 : vload4(0, src + src_index); + line[3] = (y == (rows - 2)) ? (uint4)0 : vload4(0, src + src_index + 3 * (src_step / 4)); +#elif defined BORDER_REFLECT_101 + line[0] = (y == 0) ? line[2] : vload4(0, src + src_index); + line[3] = (y == (rows - 2)) ? line[1] : vload4(0, src + src_index + 3 * (src_step / 4)); +#elif defined (BORDER_REPLICATE) || defined(BORDER_REFLECT) + line[0] = (y == 0) ? line[1] : vload4(0, src + src_index); + line[3] = (y == (rows - 2)) ? line[2] : vload4(0, src + src_index + 3 * (src_step / 4)); +#endif + + __global uchar *src_p = (__global uchar *)src; + + src_index = block_x * 16 * ssx + (y - 1) * src_step; + bool line_end = ((block_x + 1) * 16 == cols); + + b = as_uchar16(line[0]); + e = as_uchar16(line[1]); + h = as_uchar16(line[2]); + k = as_uchar16(line[3]); + +#ifdef BORDER_CONSTANT + a = (block_x == 0 || y == 0) ? 0 : src_p[src_index - 1]; + c = (line_end || y == 0) ? 0 : src_p[src_index + 16]; + + d = (block_x == 0) ? 0 : src_p[src_index + src_step - 1]; + f = line_end ? 0 : src_p[src_index + src_step + 16]; + + g = (block_x == 0) ? 0 : src_p[src_index + 2 * src_step - 1]; + i = line_end ? 0 : src_p[src_index + 2 * src_step + 16]; + + j = (block_x == 0 || y == (rows - 2)) ? 0 : src_p[src_index + 3 * src_step - 1]; + l = (line_end || y == (rows - 2))? 0 : src_p[src_index + 3 * src_step + 16]; + +#elif defined BORDER_REFLECT_101 + int offset; + offset = (y == 0) ? (2 * src_step) : 0; + + a = (block_x == 0) ? src_p[src_index + offset + 1] : src_p[src_index + offset - 1]; + c = line_end ? src_p[src_index + offset + 14] : src_p[src_index + offset + 16]; + + d = (block_x == 0) ? src_p[src_index + src_step + 1] : src_p[src_index + src_step - 1]; + f = line_end ? src_p[src_index + src_step + 14] : src_p[src_index + src_step + 16]; + + g = (block_x == 0) ? src_p[src_index + 2 * src_step + 1] : src_p[src_index + 2 * src_step - 1]; + i = line_end ? src_p[src_index + 2 * src_step + 14] : src_p[src_index + 2 * src_step + 16]; + + offset = (y == (rows - 2)) ? (1 * src_step) : (3 * src_step); + + j = (block_x == 0) ? src_p[src_index + offset + 1] : src_p[src_index + offset - 1]; + l = line_end ? src_p[src_index + offset + 14] : src_p[src_index + offset + 16]; + +#elif defined (BORDER_REPLICATE) || defined(BORDER_REFLECT) + int offset; + offset = (y == 0) ? (1 * src_step) : 0; + + a = (block_x == 0) ? src_p[src_index + offset] : src_p[src_index + offset - 1]; + c = line_end ? src_p[src_index + offset + 15] : src_p[src_index + offset + 16]; + + d = (block_x == 0) ? src_p[src_index + src_step] : src_p[src_index + src_step - 1]; + f = line_end ? src_p[src_index + src_step + 15] : src_p[src_index + src_step + 16]; + + g = (block_x == 0) ? src_p[src_index + 2 * src_step] : src_p[src_index + 2 * src_step - 1]; + i = line_end ? src_p[src_index + 2 * src_step + 15] : src_p[src_index + 2 * src_step + 16]; + + offset = (y == (rows - 2)) ? (2 * src_step) : (3 * src_step); + + j = (block_x == 0) ? src_p[src_index + offset] : src_p[src_index + offset - 1]; + l = line_end ? src_p[src_index + offset + 15] : src_p[src_index + offset + 16]; +#endif + + uchar16 arr[12]; + float16 sum[2]; + + arr[0] = (uchar16)(a, b.s0123, b.s456789ab, b.scde); + arr[1] = b; + arr[2] = (uchar16)(b.s123, b.s4567, b.s89abcdef, c); + arr[3] = (uchar16)(d, e.s0123, e.s456789ab, e.scde); + arr[4] = e; + arr[5] = (uchar16)(e.s123, e.s4567, e.s89abcdef, f); + arr[6] = (uchar16)(g, h.s0123, h.s456789ab, h.scde); + arr[7] = h; + arr[8] = (uchar16)(h.s123, h.s4567, h.s89abcdef, i); + arr[9] = (uchar16)(j, k.s0123, k.s456789ab, k.scde); + arr[10] = k; + arr[11] = (uchar16)(k.s123, k.s4567, k.s89abcdef, l); + + sum[0] = OP(0, 0, 0) + OP(0, 0, 1) + OP(0, 0, 2) + + OP(0, 1, 0) + OP(0, 1, 1) + OP(0, 1, 2) + + OP(0, 2, 0) + OP(0, 2, 1) + OP(0, 2, 2); + + sum[1] = OP(1, 0, 0) + OP(1, 0, 1) + OP(1, 0, 2) + + OP(1, 1, 0) + OP(1, 1, 1) + OP(1, 1, 2) + + OP(1, 2, 0) + OP(1, 2, 1) + OP(1, 2, 2); + + line_out[0] = as_uint4(convert_uchar16_sat_rte(sum[0])); + line_out[1] = as_uint4(convert_uchar16_sat_rte(sum[1])); + + int dst_index = block_x * 4 * dsx + y * (dst_step / 4); + vstore4(line_out[0], 0, dst + dst_index); + vstore4(line_out[1], 0, dst + dst_index + (dst_step / 4)); +} diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index a0df333d2c..1584165059 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -2016,9 +2016,68 @@ cv::Ptr cv::createGaussianFilter( int type, Size ksize, return createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType ); } -#ifdef HAVE_IPP namespace cv { +#ifdef HAVE_OPENCL + +static bool ocl_GaussianBlur3x3_8UC1(InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernelX, InputArray _kernelY, int borderType) +{ + const ocl::Device & dev = ocl::Device::getDefault(); + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + + if ( !(dev.isIntel() && (type == CV_8UC1) && + (_src.offset() == 0) && (_src.step() % 4 == 0) && + (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)) ) + return false; + + Mat kernelX = _kernelX.getMat().reshape(1, 1); + if (kernelX.cols % 2 != 1) + return false; + Mat kernelY = _kernelY.getMat().reshape(1, 1); + if (kernelY.cols % 2 != 1) + return false; + + if (ddepth < 0) + ddepth = sdepth; + + Size size = _src.size(); + size_t globalsize[2] = { 0, 0 }; + size_t localsize[2] = { 0, 0 }; + + globalsize[0] = size.width / 16; + globalsize[1] = size.height / 2; + + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; + char build_opts[1024]; + sprintf(build_opts, "-D %s %s%s", borderMap[borderType], + ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(), + ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str()); + + ocl::Kernel kernel("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts); + if (kernel.empty()) + return false; + + UMat src = _src.getUMat(); + _dst.create(size, CV_MAKETYPE(ddepth, cn)); + if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) + return false; + UMat dst = _dst.getUMat(); + + int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); + idxArg = kernel.set(idxArg, (int)src.step); + idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); + idxArg = kernel.set(idxArg, (int)dst.step); + idxArg = kernel.set(idxArg, (int)dst.rows); + idxArg = kernel.set(idxArg, (int)dst.cols); + + return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); +} + +#endif + +#ifdef HAVE_IPP + static bool ipp_GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, double sigma1, double sigma2, int borderType ) @@ -2109,8 +2168,8 @@ static bool ipp_GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, #endif return false; } -} #endif +} void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, @@ -2148,6 +2207,12 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, Mat kx, ky; createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); + + CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && + ksize.width == 3 && ksize.height == 3 && + (size_t)_src.rows() > ky.total() && (size_t)_src.cols() > kx.total(), + ocl_GaussianBlur3x3_8UC1(_src, _dst, CV_MAT_DEPTH(type), kx, ky, borderType)); + sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType ); } diff --git a/modules/imgproc/test/ocl/test_filters.cpp b/modules/imgproc/test/ocl/test_filters.cpp index f3eb3a8e51..8dd5b95c7c 100644 --- a/modules/imgproc/test/ocl/test_filters.cpp +++ b/modules/imgproc/test/ocl/test_filters.cpp @@ -229,6 +229,86 @@ OCL_TEST_P(GaussianBlurTest, Mat) } } +PARAM_TEST_CASE(GaussianBlur3x3_cols16_rows2_Base, MatType, + int, // kernel size + Size, // dx, dy + BorderType, // border type + double, // optional parameter + bool, // roi or not + int) // width multiplier +{ + int type, borderType, ksize; + Size size; + double param; + bool useRoi; + int widthMultiple; + + TEST_DECLARE_INPUT_PARAMETER(src); + TEST_DECLARE_OUTPUT_PARAMETER(dst); + + virtual void SetUp() + { + type = GET_PARAM(0); + ksize = GET_PARAM(1); + size = GET_PARAM(2); + borderType = GET_PARAM(3); + param = GET_PARAM(4); + useRoi = GET_PARAM(5); + widthMultiple = GET_PARAM(6); + } + + void random_roi() + { + size = Size(3, 3); + + Size roiSize = randomSize(size.width, MAX_VALUE, size.height, MAX_VALUE); + roiSize.width = std::max(size.width + 13, roiSize.width & (~0xf)); + roiSize.height = std::max(size.height + 1, roiSize.height & (~0x1)); + + Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); + randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256); + + Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); + randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -60, 70); + + UMAT_UPLOAD_INPUT_PARAMETER(src); + UMAT_UPLOAD_OUTPUT_PARAMETER(dst); + } + + void Near() + { + Near(1, false); + } + + void Near(double threshold, bool relative) + { + if (relative) + OCL_EXPECT_MATS_NEAR_RELATIVE(dst, threshold); + else + OCL_EXPECT_MATS_NEAR(dst, threshold); + } +}; + +typedef GaussianBlur3x3_cols16_rows2_Base GaussianBlur3x3_cols16_rows2; + +OCL_TEST_P(GaussianBlur3x3_cols16_rows2, Mat) +{ + Size kernelSize(ksize, ksize); + + for (int j = 0; j < test_loop_times; j++) + { + random_roi(); + + double sigma1 = rng.uniform(0.1, 1.0); + double sigma2 = j % 2 == 0 ? sigma1 : rng.uniform(0.1, 1.0); + + OCL_OFF(cv::GaussianBlur(src_roi, dst_roi, Size(ksize, ksize), sigma1, sigma2, borderType)); + OCL_ON(cv::GaussianBlur(usrc_roi, udst_roi, Size(ksize, ksize), sigma1, sigma2, borderType)); + + Near(CV_MAT_DEPTH(type) >= CV_32F ? 1e-3 : 4, CV_MAT_DEPTH(type) >= CV_32F); + } +} + ///////////////////////////////////////////////////////////////////////////////////////////////// // Erode @@ -490,6 +570,15 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine( Bool(), Values(1))); // not used +OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur3x3_cols16_rows2, Combine( + Values((MatType)CV_8UC1), + Values(3), // kernel size + Values(Size(0, 0)), // not used + FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED, + Values(0.0), // not used + Bool(), + Values(1))); // not used + OCL_INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine( Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4), Values(0, 3, 5, 7), // kernel size, 0 means kernel = Mat()