diff --git a/modules/imgproc/perf/opencl/perf_filters.cpp b/modules/imgproc/perf/opencl/perf_filters.cpp index b4e29ae67f..a179f7d009 100644 --- a/modules/imgproc/perf/opencl/perf_filters.cpp +++ b/modules/imgproc/perf/opencl/perf_filters.cpp @@ -313,6 +313,62 @@ OCL_PERF_TEST_P(Filter2DFixture, Filter2D, SANITY_CHECK(dst, eps); } +///////////// SepFilter2D ///////////// + +typedef FilterFixture OCL_SepFilter2D; + +PERF_TEST_P_(OCL_SepFilter2D, SepFilter2D) +{ + const FilterParams& params = GetParam(); + const Size srcSize = get<0>(params); + const int type = get<1>(params), ksize = get<2>(params); + + checkDeviceMaxMemoryAllocSize(srcSize, type); + + UMat src(srcSize, type), dst(srcSize, type); + declare.in(src, WARMUP_RNG).out(dst); + + Mat kernelX(1, ksize, CV_32FC1); + randu(kernelX, -3.0, 3.0); + Mat kernelY(1, ksize, CV_32FC1); + randu(kernelY, -3.0, 3.0); + + OCL_TEST_CYCLE() cv::sepFilter2D(src, dst, -1, kernelX, kernelY, cv::Point(-1, -1), 1.0f, cv::BORDER_CONSTANT); + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P_(OCL_SepFilter2D, SepFilter2D_BitExact) +{ + const FilterParams& params = GetParam(); + const Size srcSize = get<0>(params); + const int type = get<1>(params), ksize = get<2>(params); + + checkDeviceMaxMemoryAllocSize(srcSize, type); + + UMat src(srcSize, type), dst(srcSize, type); + declare.in(src, WARMUP_RNG).out(dst); + + Mat kernelX(1, ksize, CV_32SC1); + randu(kernelX, -16.0, 16.0); + kernelX.convertTo(kernelX, CV_32FC1, 1/16.0f, 0); + Mat kernelY(1, ksize, CV_32SC1); + randu(kernelY, -16.0, 16.0); + kernelY.convertTo(kernelY, CV_32FC1, 1/16.0f, 0); + + OCL_TEST_CYCLE() cv::sepFilter2D(src, dst, -1, kernelX, kernelY, cv::Point(-1, -1), 1.0f, cv::BORDER_CONSTANT); + + SANITY_CHECK_NOTHING(); +} + +INSTANTIATE_TEST_CASE_P(/*nothing*/, OCL_SepFilter2D, + ::testing::Combine( + ::testing::Values(sz1080p), + OCL_TEST_TYPES, + OCL_PERF_ENUM(3, 5, 7, 9, 11) + ) +); + ///////////// Bilateral //////////////////////// typedef TestBaseWithParam BilateralFixture; diff --git a/modules/imgproc/src/filter.dispatch.cpp b/modules/imgproc/src/filter.dispatch.cpp index d39c749121..c9d1bb457c 100644 --- a/modules/imgproc/src/filter.dispatch.cpp +++ b/modules/imgproc/src/filter.dispatch.cpp @@ -729,11 +729,12 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, return k.run(2, globalsize, localsize, false); } -const int shift_bits = 8; - static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor, - int borderType, int ddepth, bool fast8uc1, bool int_arithm) + int borderType, int ddepth, bool fast8uc1, + bool int_arithm, int shift_bits) { + CV_Assert(shift_bits == 0 || int_arithm); + int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; Size bufSize = buf.size(); @@ -801,8 +802,11 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX return k.run(2, globalsize, localsize, false); } -static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm) +static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, + bool int_arithm, int shift_bits) { + CV_Assert(shift_bits == 0 || int_arithm); + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if (dst.depth() == CV_64F && !doubleSupport) return false; @@ -821,13 +825,16 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1]; globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; - char cvt[40]; + char cvt[2][40]; + int floatT = std::max(CV_32F, bdepth); cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s" + " -D srcT=%s -D dstT=%s -D convertToFloatT=%s -D floatT=%s -D convertToDstT=%s" " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s", anchor, (int)localsize[0], (int)localsize[1], cn, ocl::typeToStr(buf_type), ocl::typeToStr(dtype), - ocl::convertTypeStr(bdepth, ddepth, cn, cvt), + ocl::convertTypeStr(bdepth, floatT, cn, cvt[0]), + ocl::typeToStr(CV_MAKETYPE(floatT, cn)), + ocl::convertTypeStr(shift_bits ? floatT : bdepth, ddepth, cn, cvt[1]), ocl::typeToStr(bdepth), ocl::typeToStr(ddepth), 2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "", int_arithm ? " -D INTEGER_ARITHMETIC" : ""); @@ -839,7 +846,7 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY return false; k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst), - static_cast(delta)); + static_cast(delta * (1u << (2 * shift_bits)))); return k.run(2, globalsize, localsize, false); } @@ -848,16 +855,21 @@ const int optimizedSepFilterLocalWidth = 16; const int optimizedSepFilterLocalHeight = 8; static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, - Mat row_kernel, Mat col_kernel, - double delta, int borderType, int ddepth, int bdepth, bool int_arithm) + const Mat& kernelX_, const Mat& kernelY_, + double delta, int borderType, int ddepth, int bdepth, + bool int_arithm, int shift_bits) { - Size size = _src.size(), wholeSize; - Point origin; + //CV_Assert(shift_bits == 0 || int_arithm); + + const ocl::Device& d = ocl::Device::getDefault(); + + Size size = _src.size(); int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth), dtype = CV_MAKE_TYPE(ddepth, cn); size_t src_step = _src.step(), src_offset = _src.offset(); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + + bool doubleSupport = d.doubleFPConfig() > 0; if (esz == 0 || src_step == 0 || (src_offset % src_step) % esz != 0 @@ -869,6 +881,13 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, || borderType == BORDER_REFLECT_101)) return false; + Mat kernelX, kernelY; + kernelX_.convertTo(kernelX, wdepth); + if (kernelX_.data != kernelY_.data) + kernelY_.convertTo(kernelY, wdepth); + else + kernelY = kernelX; + size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight }; size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]}; @@ -879,9 +898,9 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s", - (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2, - ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(), + (int)lt2[0], (int)lt2[1], kernelX.cols / 2, kernelY.cols / 2, + ocl::kernelToStr(kernelX, wdepth, "KERNEL_MATRIX_X").c_str(), + ocl::kernelToStr(kernelY, wdepth, "KERNEL_MATRIX_Y").c_str(), ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], @@ -896,21 +915,30 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, _dst.create(size, dtype); UMat dst = _dst.getUMat(); - int src_offset_x = static_cast((src_offset % src_step) / esz); - int src_offset_y = static_cast(src_offset / src_step); + // TODO Future: emit error on inplace processing + //CV_Assert(src.u != dst.u && "Inplace processing is not allowed with UMat"); + if (src.u == dst.u) + { + CV_LOG_ONCE_WARNING(NULL, "sepFilter2D: inplace arguments are not allowed for non-inplace operations. Performance impact warning."); + src = src.clone(); + } + Size wholeSize; + Point origin; src.locateROI(wholeSize, origin); - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y, + k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, origin.x, origin.y, wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst), - static_cast(delta)); + static_cast(delta * (1u << (2 * shift_bits)))); return k.run(2, gt2, lt2, false); } -bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ) +bool ocl_sepFilter2D( + InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernelX, InputArray _kernelY, Point anchor, + double delta, int borderType +) { const ocl::Device & d = ocl::Device::getDefault(); Size imgSize = _src.size(); @@ -934,59 +962,152 @@ bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, if (anchor.y < 0) anchor.y = kernelY.cols >> 1; - int rtype = getKernelType(kernelX, - kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x)); - int ctype = getKernelType(kernelY, - kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y)); - int bdepth = CV_32F; bool int_arithm = false; - if( sdepth == CV_8U && ddepth == CV_8U && - rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) + int shift_bits = 0; + + while (sdepth == CV_8U && ddepth == CV_8U) { - if (ocl::Device::getDefault().isIntel()) + int bits_ = 8; + if (delta * 256.0f != (float)(int)(delta * 256)) { - for (int i=0; i(0, i) = (float) cvRound(kernelX.at(0, i) * (1 << shift_bits)); - if (kernelX.data != kernelY.data) - for (int i=0; i(0, i) = (float) cvRound(kernelY.at(0, i) * (1 << shift_bits)); - } else + CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact delta can't be applied: delta=" << delta); + break; + } + Mat kernelX_BitExact, kernelY_BitExact; + bool isValidBitExactRowKernel = createBitExactKernel_32S(kernelX, kernelX_BitExact, bits_); + bool isValidBitExactColumnKernel = createBitExactKernel_32S(kernelY, kernelY_BitExact, bits_); + if (!isValidBitExactRowKernel) + { + CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact row-kernel can't be applied: ksize=" << kernelX_BitExact.total()); + } + else if (!isValidBitExactColumnKernel) + { + CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact column-kernel can't be applied: ksize=" << kernelY_BitExact.total()); + } + else { bdepth = CV_32S; - kernelX.convertTo( kernelX, bdepth, 1 << shift_bits ); - kernelY.convertTo( kernelY, bdepth, 1 << shift_bits ); + shift_bits = bits_; + int_arithm = true; + + kernelX = kernelX_BitExact; + kernelY = kernelY_BitExact; } - int_arithm = true; + break; } - CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && - imgSize.width > optimizedSepFilterLocalWidth + anchor.x && - imgSize.height > optimizedSepFilterLocalHeight + anchor.y && - (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && - anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && - OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT - ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, - borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true) + CV_OCL_RUN_( + kernelY.cols <= 21 && kernelX.cols <= 21 && + imgSize.width > optimizedSepFilterLocalWidth + anchor.x && + imgSize.height > optimizedSepFilterLocalHeight + anchor.y && + (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && + anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && + OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT + ocl_sepFilter2D_SinglePass( + _src, _dst, kernelX, kernelY, delta, + borderType & ~BORDER_ISOLATED, ddepth, + CV_32F, // force FP32 mode + false, shift_bits + ), + true + ); UMat src = _src.getUMat(); - Size srcWholeSize; Point srcOffset; - src.locateROI(srcWholeSize, srcOffset); - bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && - src.cols % 4 == 0 && src.step % 4 == 0; + bool fast8uc1 = false; + if (type == CV_8UC1) + { + Size srcWholeSize; + Point srcOffset; + src.locateROI(srcWholeSize, srcOffset); + fast8uc1 = srcOffset.x % 4 == 0 && + src.cols % 4 == 0 && src.step % 4 == 0; + } + + Size srcSize = src.size(); + Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); + UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); + if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm, shift_bits)) + return false; + + _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); + UMat dst = _dst.getUMat(); + + return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm, shift_bits); +} + +bool ocl_sepFilter2D_BitExact( + InputArray _src, OutputArray _dst, int ddepth, + const Size& ksize, + const uint16_t *fkx, const uint16_t *fky, + Point anchor, + double delta, int borderType, + int shift_bits +) +{ + const ocl::Device & d = ocl::Device::getDefault(); + Size imgSize = _src.size(); + + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + if (cn > 4) + return false; + + if (ksize.width % 2 != 1) + return false; + if (ksize.height % 2 != 1) + return false; + + Mat kernelX(1, ksize.width, CV_16SC1, (void*)fkx); + Mat kernelY(1, ksize.height, CV_16SC1, (void*)fky); + + if (ddepth < 0) + ddepth = sdepth; + + if (anchor.x < 0) + anchor.x = kernelX.cols >> 1; + if (anchor.y < 0) + anchor.y = kernelY.cols >> 1; + + int bdepth = sdepth == CV_8U ? CV_32S : CV_32F; + + CV_OCL_RUN_( + kernelY.cols <= 21 && kernelX.cols <= 21 && + imgSize.width > optimizedSepFilterLocalWidth + anchor.x && + imgSize.height > optimizedSepFilterLocalHeight + anchor.y && + (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && + anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && + OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT + ocl_sepFilter2D_SinglePass( + _src, _dst, kernelX, kernelY, delta, + borderType & ~BORDER_ISOLATED, ddepth, bdepth, + true, shift_bits + ), + true + ); + + UMat src = _src.getUMat(); + + bool fast8uc1 = false; + if (type == CV_8UC1) + { + Size srcWholeSize; + Point srcOffset; + src.locateROI(srcWholeSize, srcOffset); + fast8uc1 = srcOffset.x % 4 == 0 && + src.cols % 4 == 0 && src.step % 4 == 0; + } Size srcSize = src.size(); Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); - if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm)) + if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, true, shift_bits)) return false; _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); UMat dst = _dst.getUMat(); - return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm); + return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, true, shift_bits); } #endif @@ -1444,7 +1565,7 @@ void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, CV_Assert(!_kernelX.empty()); CV_Assert(!_kernelY.empty()); - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > _kernelY.total() && (size_t)_src.cols() > _kernelX.total(), + CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() >= _kernelY.total() && (size_t)_src.cols() >= _kernelX.total(), ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType)) Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat(); diff --git a/modules/imgproc/src/filter.hpp b/modules/imgproc/src/filter.hpp index 7b792d1935..570fecec17 100644 --- a/modules/imgproc/src/filter.hpp +++ b/modules/imgproc/src/filter.hpp @@ -46,13 +46,25 @@ namespace cv { #ifdef HAVE_OPENCL - bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ); +bool ocl_sepFilter2D( + InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernelX, InputArray _kernelY, Point anchor, + double delta, int borderType +); + +bool ocl_sepFilter2D_BitExact( + InputArray _src, OutputArray _dst, int ddepth, + const Size& ksize, + const uint16_t *fkx, const uint16_t *fky, + Point anchor, + double delta, int borderType, + int shift_bits +); #endif - void preprocess2DKernel(const Mat& kernel, std::vector& coords, std::vector& coeffs); -} +void preprocess2DKernel(const Mat& kernel, std::vector& coords, std::vector& coeffs); + +} // namespace #endif diff --git a/modules/imgproc/src/opencl/filterSepCol.cl b/modules/imgproc/src/opencl/filterSepCol.cl index afcdbea89c..f2024db334 100644 --- a/modules/imgproc/src/opencl/filterSepCol.cl +++ b/modules/imgproc/src/opencl/filterSepCol.cl @@ -61,7 +61,11 @@ #endif #define DIG(a) a, +#if defined(INTEGER_ARITHMETIC) +__constant int mat_kernel[] = { COEFF }; +#else __constant srcT1 mat_kernel[] = { COEFF }; +#endif __kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta) @@ -92,30 +96,28 @@ __kernel void col_filter(__global const uchar * src, int src_step, int src_offse barrier(CLK_LOCAL_MEM_FENCE); // read pixels from lds and calculate the result - sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY]; + sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY] + (srcT)delta; for (int i = 1; i <= RADIUSY; ++i) { temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x]; temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x]; -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); #else sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); #endif } -#ifdef INTEGER_ARITHMETIC -#ifdef INTEL_DEVICE - sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS); -#else - sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; -#endif -#endif - // write the result to dst if (x < dst_cols && y < dst_rows) { +#if defined(SHIFT_BITS) && SHIFT_BITS > 0 + dstT result = convertToDstT(convertToFloatT(sum) * (floatT)(1.0f / (1 << SHIFT_BITS))); +#else + dstT result = convertToDstT(sum); +#endif + start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset)); - storepix(convertToDstT(sum + (srcT)(delta)), dst + start_addr); + storepix(result, dst + start_addr); } } diff --git a/modules/imgproc/src/opencl/filterSepRow.cl b/modules/imgproc/src/opencl/filterSepRow.cl index 8a317ae13d..23f4b6268c 100644 --- a/modules/imgproc/src/opencl/filterSepRow.cl +++ b/modules/imgproc/src/opencl/filterSepRow.cl @@ -139,9 +139,13 @@ #endif #define DIG(a) a, +#if defined(INTEGER_ARITHMETIC) +__constant int mat_kernel[] = { COEFF }; +#else __constant dstT1 mat_kernel[] = { COEFF }; +#endif -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) #define dstT4 int4 #define convertDstVec convert_int4 #else @@ -263,7 +267,7 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel { temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i); temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i); -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]); #else sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]); @@ -368,7 +372,7 @@ __kernel void row_filter(__global const uchar * src, int src_step, int src_offse { temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i]; temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i]; -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]); #else sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]); diff --git a/modules/imgproc/src/opencl/filterSep_singlePass.cl b/modules/imgproc/src/opencl/filterSep_singlePass.cl index 1f96d7d6e1..a91cf7b0e4 100644 --- a/modules/imgproc/src/opencl/filterSep_singlePass.cl +++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl @@ -160,7 +160,7 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int { sum = (WT) 0; for (i=0; i<=2*RADIUSY; i++) -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum); #else sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum); @@ -177,25 +177,27 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int { // do second horizontal filter pass // and calculate final result - sum = 0.0f; + sum = (WT)(delta); for (i=0; i<=2*RADIUSX; i++) -#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) +#if defined(INTEGER_ARITHMETIC) sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum); #else sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); #endif -#ifdef INTEGER_ARITHMETIC -#ifdef INTEL_DEVICE - sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS); +#if defined(SHIFT_BITS) && SHIFT_BITS > 0 +#if !defined(INTEGER_ARITHMETIC) + sum = sum * (1.0f / (1 << SHIFT_BITS)); #else sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; #endif #endif + // store result into destination image - storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset))); + storepix(convertToDstT(sum), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset))); } + barrier(CLK_LOCAL_MEM_FENCE); for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y) { int clocX = i % (BLK_X+(RADIUSX*2)); diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp index 65122d20e2..65d1fc8ed6 100644 --- a/modules/imgproc/src/smooth.dispatch.cpp +++ b/modules/imgproc/src/smooth.dispatch.cpp @@ -48,6 +48,7 @@ #include #include +#include #include "opencv2/core/hal/intrin.hpp" #include "opencl_kernels_imgproc.hpp" @@ -637,10 +638,9 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, return; } - bool useOpenCL = (ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 && - ((ksize.width == 3 && ksize.height == 3) || - (ksize.width == 5 && ksize.height == 5)) && - _src.rows() > ksize.height && _src.cols() > ksize.width); + bool useOpenCL = ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 && + _src.rows() >= ksize.height && _src.cols() >= ksize.width && + ksize.width > 1 && ksize.height > 1; CV_UNUSED(useOpenCL); int sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); @@ -648,27 +648,13 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, Mat kx, ky; createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); - CV_OCL_RUN(useOpenCL, ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)); + CV_OCL_RUN(useOpenCL && sdepth == CV_8U && + ((ksize.width == 3 && ksize.height == 3) || + (ksize.width == 5 && ksize.height == 5)), + ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType) + ); - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > kx.total() && (size_t)_src.cols() > kx.total(), - ocl_sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType)) - - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - - Point ofs; - Size wsz(src.cols, src.rows); - if(!(borderType & BORDER_ISOLATED)) - src.locateROI( wsz, ofs ); - - CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn, - ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, - sigma1, sigma2, borderType&~BORDER_ISOLATED); - - CV_OVX_RUN(true, - openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)) - - if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.getMat().isSubmatrix())) + if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.isSubmatrix())) { std::vector fkx, fky; createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2); @@ -684,6 +670,17 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, } else { + CV_OCL_RUN(useOpenCL, + ocl_sepFilter2D_BitExact(_src, _dst, sdepth, + ksize, + (const uint16_t*)&fkx[0], (const uint16_t*)&fky[0], + Point(-1, -1), 0, borderType, + 8/*shift_bits*/) + ); + + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + if (src.data == dst.data) src = src.clone(); CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint16_t*)&fkx[0], (int)fkx.size(), (const uint16_t*)&fky[0], (int)fky.size(), borderType), @@ -692,6 +689,29 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, } } +#ifdef HAVE_OPENCL + if (useOpenCL) + { + sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType); + return; + } +#endif + + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + + Point ofs; + Size wsz(src.cols, src.rows); + if(!(borderType & BORDER_ISOLATED)) + src.locateROI( wsz, ofs ); + + CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn, + ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, + sigma1, sigma2, borderType&~BORDER_ISOLATED); + + CV_OVX_RUN(true, + openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)) + #if defined ENABLE_IPP_GAUSSIAN_BLUR // IPP is not bit-exact to OpenCV implementation CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)); diff --git a/modules/imgproc/test/ocl/test_sepfilter2d.cpp b/modules/imgproc/test/ocl/test_sepfilter2d.cpp index 9b1f1690ae..12f247ed36 100644 --- a/modules/imgproc/test/ocl/test_sepfilter2d.cpp +++ b/modules/imgproc/test/ocl/test_sepfilter2d.cpp @@ -73,7 +73,7 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) useRoi = GET_PARAM(4); } - void random_roi() + void random_roi(bool bitExact) { Size ksize = randomSize(kernelMinSize, kernelMaxSize); if (1 != ksize.width % 2) @@ -81,11 +81,19 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) if (1 != ksize.height % 2) ksize.height++; - Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE); + Mat temp = randomMat(Size(ksize.width, 1), CV_32FC1, -0.5, 1.0); cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1); - temp = randomMat(Size(1, ksize.height), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE); + temp = randomMat(Size(1, ksize.height), CV_32FC1, -0.5, 1.0); cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1); + if (bitExact) + { + kernelX.convertTo(temp, CV_32S, 256); + temp.convertTo(kernelX, CV_32F, 1.0 / 256); + kernelY.convertTo(temp, CV_32S, 256); + temp.convertTo(kernelY, CV_32F, 1.0 / 256); + } + Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE); Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0); randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE); @@ -96,6 +104,11 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) anchor.x = anchor.y = -1; delta = randomDouble(-100, 100); + if (bitExact) + { + delta = (int)(delta * 256) / 256.0; + } + UMAT_UPLOAD_INPUT_PARAMETER(src); UMAT_UPLOAD_OUTPUT_PARAMETER(dst); } @@ -110,7 +123,7 @@ OCL_TEST_P(SepFilter2D, Mat) { for (int j = 0; j < test_loop_times + 3; j++) { - random_roi(); + random_roi(false); OCL_OFF(cv::sepFilter2D(src_roi, dst_roi, -1, kernelX, kernelY, anchor, delta, borderType)); OCL_ON(cv::sepFilter2D(usrc_roi, udst_roi, -1, kernelX, kernelY, anchor, delta, borderType)); @@ -119,6 +132,22 @@ OCL_TEST_P(SepFilter2D, Mat) } } +OCL_TEST_P(SepFilter2D, Mat_BitExact) +{ + for (int j = 0; j < test_loop_times + 3; j++) + { + random_roi(true); + + OCL_OFF(cv::sepFilter2D(src_roi, dst_roi, -1, kernelX, kernelY, anchor, delta, borderType)); + OCL_ON(cv::sepFilter2D(usrc_roi, udst_roi, -1, kernelX, kernelY, anchor, delta, borderType)); + + if (src_roi.depth() < CV_32F) + Near(0.0); + else + Near(1e-3); + } +} + OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SepFilter2D, Combine( Values(CV_8U, CV_32F), diff --git a/modules/stitching/src/exposure_compensate.cpp b/modules/stitching/src/exposure_compensate.cpp index 8ce2dda2d9..189b2b4fb1 100644 --- a/modules/stitching/src/exposure_compensate.cpp +++ b/modules/stitching/src/exposure_compensate.cpp @@ -275,8 +275,12 @@ void BlocksGainCompensator::feed(const std::vector &corners, const std::v gain_map(by, bx) = static_cast(gains[bl_idx]); } - sepFilter2D(gain_maps_[img_idx], gain_maps_[img_idx], CV_32F, ker, ker); - sepFilter2D(gain_maps_[img_idx], gain_maps_[img_idx], CV_32F, ker, ker); + // 2 smooth passes + UMat result; + sepFilter2D(gain_maps_[img_idx], result, CV_32F, ker, ker); + UMat result2; + sepFilter2D(result, result2, CV_32F, ker, ker); + swap(gain_maps_[img_idx], result2); } }