diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index d9469530f3..e2a1964466 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -3388,12 +3388,12 @@ const int optimizedSepFilterLocalSize = 16; static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, Mat row_kernel, Mat col_kernel, - double delta, int borderType, int ddepth) + double delta, int borderType, int ddepth, int bdepth, bool int_arithm) { Size size = _src.size(), wholeSize; Point origin; int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), - esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), CV_32F), + esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth), dtype = CV_MAKE_TYPE(ddepth, cn); size_t src_step = _src.step(), src_offset = _src.offset(); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; @@ -3413,14 +3413,15 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" - " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d", (int)lt2[0], (int)lt2[1], - row_kernel.cols / 2, col_kernel.cols / 2, - ocl::kernelToStr(row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(), + " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d -D SHIFT_BITS=%d%s", + (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2, + ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(), + ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(), ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn, 2*shift_bits, + int_arithm ? " -D INTEGER_ARITHMETIC" : ""); ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); if (k.empty()) @@ -3485,14 +3486,14 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, int_arithm = true; } - CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && !int_arithm && + CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && imgSize.width > optimizedSepFilterLocalSize + anchor.x && imgSize.height > optimizedSepFilterLocalSize + anchor.y && (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, - borderType & ~BORDER_ISOLATED, ddepth), true) + borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true) UMat src = _src.getUMat(); Size srcWholeSize; Point srcOffset; diff --git a/modules/imgproc/src/opencl/filterSep_singlePass.cl b/modules/imgproc/src/opencl/filterSep_singlePass.cl index e75574035f..5fbf763e1e 100644 --- a/modules/imgproc/src/opencl/filterSep_singlePass.cl +++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl @@ -100,8 +100,8 @@ // horizontal and vertical filter kernels // should be defined on host during compile time to avoid overhead #define DIG(a) a, -__constant float mat_kernelX[] = { KERNEL_MATRIX_X }; -__constant float mat_kernelY[] = { KERNEL_MATRIX_Y }; +__constant WT mat_kernelX[] = { KERNEL_MATRIX_X }; +__constant WT mat_kernelY[] = { KERNEL_MATRIX_Y }; __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width, __global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta) @@ -159,12 +159,16 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int // do vertical filter pass // and store intermediate results to second local memory array int i, clocX = lix; - WT sum = 0.0f; + WT sum = (WT) 0; do { - sum = 0.0f; + sum = (WT) 0; for (i=0; i<=2*RADIUSY; i++) +#ifndef INTEGER_ARITHMETIC sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum); +#else + sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum); +#endif lsmemDy[liy][clocX] = sum; clocX += BLK_X; } @@ -180,8 +184,13 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int // and calculate final result sum = 0.0f; for (i=0; i<=2*RADIUSX; i++) +#ifndef INTEGER_ARITHMETIC sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); +#else + sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum); + sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; +#endif // store result into destination image storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset))); }