diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp index ab64ffe6dd..84f7a9c7eb 100644 --- a/modules/imgproc/include/opencv2/imgproc.hpp +++ b/modules/imgproc/include/opencv2/imgproc.hpp @@ -1620,6 +1620,22 @@ CV_EXPORTS_W void blur( InputArray src, OutputArray dst, Size ksize, Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT ); +/** @brief Blurs an image using the StackBlur. +The function applies the stackBlur to an image. +StackBlur can generate results similar to Gaussian blur, and the time does not increase as the kernel size increases. +It creates a kind of moving stack of colors whilst scanning through the image. Thereby it just has to add one new block of color to the right side +of the stack and remove the leftmost color. The remaining colors on the topmost layer of the stack are either added on or reduced by one, +depending on whether they are on the right or on the left side of the stack. +Described here: http://underdestruction.com/2004/02/25/stackblur-2004. +Stack Blur Algorithm by Mario Klingemann +@param src input image. The number of channels can be arbitrary, but the depth should be one of +CV_8U, CV_16U, CV_16S or CV_32F. +@param dst output image of the same size and type as src. +@param ksize stack-blurring kernel size. The ksize.width and ksize.height can differ but they both must be +positive and odd. +*/ +CV_EXPORTS_W void stackBlur(InputArray src, OutputArray dst, Size ksize); + /** @brief Convolves an image with the kernel. The function applies an arbitrary linear filter to an image. In-place operation is supported. 
When diff --git a/modules/imgproc/perf/perf_blur.cpp b/modules/imgproc/perf/perf_blur.cpp index e4092ccb16..d1f5a6b1ca 100644 --- a/modules/imgproc/perf/perf_blur.cpp +++ b/modules/imgproc/perf/perf_blur.cpp @@ -253,4 +253,52 @@ PERF_TEST_P(Size_MatType, BlendLinear, SANITY_CHECK_NOTHING(); } +///////////// Stackblur //////////////////////// +PERF_TEST_P(Size_MatType, stackblur3x3, + testing::Combine( + testing::Values(sz720p, sz1080p, sz2160p), + testing::Values(CV_8UC1, CV_8UC4, CV_16UC1, CV_16SC1, CV_32FC1) + ) +) +{ + Size size = get<0>(GetParam()); + int type = get<1>(GetParam()); + double eps = 1e-3; + + eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : eps; + + Mat src(size, type); + Mat dst(size, type); + + declare.in(src, WARMUP_RNG).out(dst); + + TEST_CYCLE() stackBlur(src, dst, Size(3,3)); + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(Size_MatType, stackblur101x101, + testing::Combine( + testing::Values(sz720p, sz1080p, sz2160p), + testing::Values(CV_8UC1, CV_8UC4, CV_16UC1, CV_16SC1, CV_32FC1) + ) +) +{ + Size size = get<0>(GetParam()); + int type = get<1>(GetParam()); + double eps = 1e-3; + + eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : eps; + + Mat src(size, type); + Mat dst(size, type); + + declare.in(src, WARMUP_RNG).out(dst); + + TEST_CYCLE() stackBlur(src, dst, Size(101,101)); + + SANITY_CHECK_NOTHING(); +} + + } // namespace diff --git a/modules/imgproc/src/stackblur.cpp b/modules/imgproc/src/stackblur.cpp new file mode 100644 index 0000000000..7a911ffbb8 --- /dev/null +++ b/modules/imgproc/src/stackblur.cpp @@ -0,0 +1,1261 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +/* +StackBlur - a fast almost Gaussian Blur +Theory: http://underdestruction.com/2004/02/25/stackblur-2004 +The code has been borrowed from (https://github.com/flozz/StackBlur) +and adapted for OpenCV by Zihao Mu. 
+ +Below is the original copyright +*/ + +/* +Copyright (c) 2010 Mario Klingemann + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+ */ + + +#include "precomp.hpp" +#include "opencv2/core/hal/intrin.hpp" + +#include + +using namespace std; + +#define STACKBLUR_MAX_RADIUS 254 + +static unsigned short const stackblurMul[255] = + { + 512,512,456,512,328,456,335,512,405,328,271,456,388,335,292,512, + 454,405,364,328,298,271,496,456,420,388,360,335,312,292,273,512, + 482,454,428,405,383,364,345,328,312,298,284,271,259,496,475,456, + 437,420,404,388,374,360,347,335,323,312,302,292,282,273,265,512, + 497,482,468,454,441,428,417,405,394,383,373,364,354,345,337,328, + 320,312,305,298,291,284,278,271,265,259,507,496,485,475,465,456, + 446,437,428,420,412,404,396,388,381,374,367,360,354,347,341,335, + 329,323,318,312,307,302,297,292,287,282,278,273,269,265,261,512, + 505,497,489,482,475,468,461,454,447,441,435,428,422,417,411,405, + 399,394,389,383,378,373,368,364,359,354,350,345,341,337,332,328, + 324,320,316,312,309,305,301,298,294,291,287,284,281,278,274,271, + 268,265,262,259,257,507,501,496,491,485,480,475,470,465,460,456, + 451,446,442,437,433,428,424,420,416,412,408,404,400,396,392,388, + 385,381,377,374,370,367,363,360,357,354,350,347,344,341,338,335, + 332,329,326,323,320,318,315,312,310,307,304,302,299,297,294,292, + 289,287,285,282,280,278,275,273,271,269,267,265,263,261,259 + }; + +static unsigned char const stackblurShr[255] = + { + 9, 11, 12, 13, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, + 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 + }; + +namespace cv{ + +#if CV_SIMD +template +inline int opRow(const T* , T* , const std::vector& , const float , const int radius, const int CN, const int ) +{ + return radius * CN; +} + +template<> +inline int opRow(const uchar* srcPtr, uchar* dstPtr, const std::vector& kVec, const float , const int radius, const int CN, const int widthCN) +{ + int kernelSize = (int)kVec.size(); + + int i = radius * CN; + if (radius > STACKBLUR_MAX_RADIUS) + return i; + + const int mulValTab= stackblurMul[radius]; + const int shrValTab= stackblurShr[radius]; + + const int VEC_LINE = v_uint8::nlanes; + + if (kernelSize == 3) + { + v_uint32 v_mulVal = vx_setall_u32(mulValTab); + for (; i <= widthCN - VEC_LINE; i += VEC_LINE) + { + v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; + v_expand(vx_load(srcPtr + i - CN), x0l, x0h); + v_expand(vx_load(srcPtr + i), x1l, x1h); + v_expand(vx_load(srcPtr + i + CN), x2l, x2h); + + x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l)); + x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h)); + + v_uint32 y00, y01, y10, y11; + v_expand(x1l, y00, y01); + v_expand(x1h, y10, y11); + + y00 = (y00 * v_mulVal)>>shrValTab; + y01 = (y01 * v_mulVal)>>shrValTab; + y10 = (y10 * v_mulVal)>>shrValTab; + y11 = (y11 * v_mulVal)>>shrValTab; + + v_store(dstPtr + i, v_pack(v_pack(y00, y01), v_pack(y10, y11))); + } + } + else + { + const ushort * kx = kVec.data() + kernelSize/2; + v_int32 v_mulVal = vx_setall_s32(mulValTab); + v_int16 k0 = vx_setall_s16((short)(kx[0])); + + srcPtr += i; + for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) + { + v_uint8 v_src = 
vx_load(srcPtr); + v_int32 s0, s1, s2, s3; + v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1); + v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3); + + int k = 1, j = CN; + for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN) + { + v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32(((int)kx[k] & 0xFFFF) | ((int)kx[k + 1] << 16))); + + v_uint8 v_src0 = vx_load(srcPtr - j); + v_uint8 v_src1 = vx_load(srcPtr - j - CN); + v_uint8 v_src2 = vx_load(srcPtr + j); + v_uint8 v_src3 = vx_load(srcPtr + j + CN); + + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh); + s0 += v_dotprod(xl, k12); + s1 += v_dotprod(xh, k12); + v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh); + s2 += v_dotprod(xl, k12); + s3 += v_dotprod(xh, k12); + } + if( k < kernelSize / 2 + 1 ) + { + v_int16 k1 = vx_setall_s16((short)(kx[k])); + + v_uint8 v_src0 = vx_load(srcPtr - j); + v_uint8 v_src1 = vx_load(srcPtr + j); + + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh); + s0 += v_dotprod(xl, k1); + s1 += v_dotprod(xh, k1); + v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh); + s2 += v_dotprod(xl, k1); + s3 += v_dotprod(xh, k1); + } + + s0 = (s0 * v_mulVal)>>shrValTab; + s1 = (s1 * v_mulVal)>>shrValTab; + s2 = (s2 * v_mulVal)>>shrValTab; + s3 = (s3 * v_mulVal)>>shrValTab; + + v_store(dstPtr + i, v_pack(v_reinterpret_as_u16(v_pack(s0, s1)), v_reinterpret_as_u16(v_pack(s2, s3)))); + } + } + return i; +} + +template<> +inline int opRow(const ushort* srcPtr, ushort* dstPtr, const std::vector& kVec, const float , const int radius, const int CN, const int widthCN) +{ + int kernelSize = (int)kVec.size(); + + int i = radius * CN; + if 
(radius > STACKBLUR_MAX_RADIUS) + return i; + + const int mulValTab= stackblurMul[radius]; + const int shrValTab= stackblurShr[radius]; + + const int VEC_LINE = v_uint16::nlanes; + + v_uint32 v_mulVal = vx_setall_u32(mulValTab); + if (kernelSize == 3) + { + for (; i <= widthCN - VEC_LINE; i += VEC_LINE) + { + v_uint32 x0l, x0h, x1l, x1h, x2l, x2h; + v_expand(vx_load(srcPtr + i - CN), x0l, x0h); + v_expand(vx_load(srcPtr + i), x1l, x1h); + v_expand(vx_load(srcPtr + i + CN), x2l, x2h); + + x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l)); + x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h)); + + v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab)); + } + } + else + { + const ushort * kx = kVec.data() + kernelSize/2; + v_uint16 k0 = vx_setall_u16(kx[0]); + + srcPtr += i; + for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) + { + v_uint16 v_src = vx_load(srcPtr); + v_uint32 s0, s1; + + v_mul_expand(v_src, k0, s0, s1); + + int k = 1, j = CN; + for (; k <= kernelSize / 2 - 1; k += 2, j += 2*CN) + { + v_uint16 k1 = vx_setall_u16(kx[k]); + v_uint16 k2 = vx_setall_u16(kx[k + 1]); + + v_uint32 y0, y1; + v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); + s0 += y0; + s1 += y1; + v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1); + s0 += y0; + s1 += y1; + } + if( k < kernelSize / 2 + 1 ) + { + v_uint16 k1 = vx_setall_u16(kx[k]); + + v_uint32 y0, y1; + v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); + s0 += y0; + s1 += y1; + } + + s0 = (s0 * v_mulVal)>>shrValTab; + s1 = (s1 * v_mulVal)>>shrValTab; + + v_store(dstPtr + i, v_pack(s0, s1)); + } + } + + return i; +} + +template<> +inline int opRow(const short* srcPtr, short* dstPtr, const std::vector& kVec, const float , const int radius, const int CN, const int widthCN) +{ + int kernelSize = (int)kVec.size(); + int i = radius * CN; + + if (radius > STACKBLUR_MAX_RADIUS) + return i; + + const int mulValTab= 
stackblurMul[radius]; + const int shrValTab= stackblurShr[radius]; + + const int VEC_LINE = v_int16::nlanes; + v_int32 v_mulVal = vx_setall_s32(mulValTab); + + if (kernelSize == 3) + { + for (; i <= widthCN - VEC_LINE; i += VEC_LINE) + { + v_int32 x0l, x0h, x1l, x1h, x2l, x2h; + v_expand(vx_load(srcPtr + i - CN), x0l, x0h); + v_expand(vx_load(srcPtr + i), x1l, x1h); + v_expand(vx_load(srcPtr + i + CN), x2l, x2h); + + x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l)); + x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h)); + + v_store(dstPtr + i, v_pack((x1l * v_mulVal)>>shrValTab, (x1h * v_mulVal)>>shrValTab)); + } + } + else + { + const ushort * kx = kVec.data() + kernelSize/2; + v_int16 k0 = vx_setall_s16((short)(kx[0])); + + srcPtr += i; + for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) + { + v_int16 v_src = vx_load(srcPtr); + v_int32 s0, s1; + v_mul_expand(v_src, k0, s0, s1); + + int k = 1, j = CN; + for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN) + { + v_int16 k1 = vx_setall_s16((short)kx[k]); + v_int16 k2 = vx_setall_s16((short)kx[k + 1]); + + v_int32 y0, y1; + + v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); + s0 += y0; + s1 += y1; + v_mul_expand(vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN), k2, y0, y1); + s0 += y0; + s1 += y1; + } + if( k < kernelSize / 2 + 1 ) + { + v_int16 k1 = vx_setall_s16((short)kx[k]); + v_int32 y0, y1; + v_mul_expand(vx_load(srcPtr - j) + vx_load(srcPtr + j), k1, y0, y1); + s0 += y0; + s1 += y1; + } + + s0 = (s0 * v_mulVal)>>shrValTab; + s1 = (s1 * v_mulVal)>>shrValTab; + + v_store(dstPtr + i, v_pack(s0, s1)); + } + } + return i; +} + +template<> +inline int opRow(const float* srcPtr, float* dstPtr, const std::vector& kVec, const float mulVal, const int radius, const int CN, const int widthCN) +{ + int kernelSize = (int)kVec.size(); + int i = radius * CN; + + v_float32 v_mulVal = vx_setall_f32(mulVal); + const int VEC_LINE = v_float32::nlanes; + const int VEC_LINE4 = VEC_LINE * 4; + + if 
(kernelSize == 3) + { + for (; i <= widthCN - VEC_LINE4; i += VEC_LINE4) + { + v_float32 v_srcPtr0 = vx_load(srcPtr + i); + v_float32 v_srcPtr1 = vx_load(srcPtr + VEC_LINE + i) ; + v_float32 v_srcPtr2 = vx_load(srcPtr + VEC_LINE * 2 + i); + v_float32 v_srcPtr3 = vx_load(srcPtr + VEC_LINE * 3 + i); + + v_float32 v_sumVal0 = v_srcPtr0 + v_srcPtr0 + vx_load(srcPtr + i - CN) + vx_load(srcPtr + i + CN); + v_float32 v_sumVal1 = v_srcPtr1 + v_srcPtr1 + vx_load(srcPtr + VEC_LINE + i - CN) + vx_load(srcPtr + VEC_LINE + i + CN); + v_float32 v_sumVal2 = v_srcPtr2 + v_srcPtr2 + vx_load(srcPtr + VEC_LINE * 2 + i - CN) + vx_load(srcPtr + VEC_LINE * 2 + i + CN); + v_float32 v_sumVal3 = v_srcPtr3 + v_srcPtr3 + vx_load(srcPtr + VEC_LINE * 3 + i - CN) + vx_load(srcPtr + VEC_LINE * 3 + i + CN); + + v_store(dstPtr + i, v_sumVal0 * v_mulVal); + v_store(dstPtr + i + VEC_LINE, v_sumVal1 * v_mulVal); + v_store(dstPtr + i + VEC_LINE * 2, v_sumVal2 * v_mulVal); + v_store(dstPtr + i + VEC_LINE * 3, v_sumVal3 * v_mulVal); + } + + for (; i <= widthCN - VEC_LINE; i += VEC_LINE) + { + v_float32 v_srcPtr = vx_load(srcPtr + i); + v_float32 v_sumVal = v_srcPtr + v_srcPtr + vx_load(srcPtr + i - CN) + vx_load(srcPtr + i + CN); + v_store(dstPtr + i, v_sumVal * v_mulVal); + } + } + else + { + const ushort * kx = kVec.data() + kernelSize/2; + v_float32 k0 = vx_setall_f32((float)(kx[0])); + + srcPtr += i; + for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) + { + v_float32 v_src = vx_load(srcPtr); + v_float32 s0; + s0 = v_src * k0; + + int k = 1, j = CN; + for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN) + { + v_float32 k1 = vx_setall_f32((float)kx[k]); + v_float32 k2 = vx_setall_f32((float)kx[k + 1]); + + s0 += (vx_load(srcPtr - j) + vx_load(srcPtr + j)) * k1; + s0 += (vx_load(srcPtr - j - CN) + vx_load(srcPtr + j + CN)) * k2; + } + if( k < kernelSize / 2 + 1 ) + { + v_float32 k1 = vx_setall_f32((float)kx[k]); + + s0 += (vx_load(srcPtr - j) + vx_load(srcPtr + j)) * k1; + } + + 
v_store(dstPtr + i, s0 * v_mulVal); + } + } + return i; +} + +template +inline int opComputeDiff(const T*& , TBuf*& , const int , const int) +{ + return 0; +} + +template<> +inline int opComputeDiff(const uchar*& srcPtr, int*& diff0, const int w, const int CNR1) +{ + int index = 0; + const int VEC_LINE_8 = v_uint8::nlanes; + const int VEC_LINE_32 = v_int32::nlanes; + for (; index <= w - VEC_LINE_8; index += VEC_LINE_8, diff0+=VEC_LINE_8, srcPtr+=VEC_LINE_8) + { + v_uint16 x0l, x0h, x1l, x1h; + v_expand(vx_load(srcPtr + CNR1), x0l, x0h); + v_expand(vx_load(srcPtr), x1l, x1h); + + v_int32 y0, y1, y2, y3; + v_expand(v_reinterpret_as_s16(x0l) - v_reinterpret_as_s16(x1l), y0, y1); + v_expand(v_reinterpret_as_s16(x0h) - v_reinterpret_as_s16(x1h), y2, y3); + + v_store(diff0, y0); + v_store(diff0 + VEC_LINE_32, y1); + v_store(diff0 + VEC_LINE_32 * 2, y2); + v_store(diff0 + VEC_LINE_32 * 3, y3); + } + return index; +} +#endif + +template +class ParallelStackBlurRow : public ParallelLoopBody +{ +public: + ParallelStackBlurRow (const Mat &_src, Mat &_dst, int _radius): src(_src), dst(_dst) ,radius(_radius) + { + width= dst.cols; + wm = width - 1; + mulVal = 1.0f / ((radius + 1) * (radius + 1)); + CN = src.channels(); + } + + ~ParallelStackBlurRow() {} + + ParallelStackBlurRow& operator=(const ParallelStackBlurRow &) { return *this; } + + /* + * The idea is as follows: + * The stack can be understood as a sliding window of length kernel size. + * The sliding window moves one element at a time from left to right. + * The sumIn stores the elements added to the stack each time, + * and sumOut stores the subtracted elements. Every time stack moves, stack, sumIn and sumOut are updated. + * The dst will be calculated using the following formula: + * dst[i] = (stack + sumIn - sumOut) / stack_num + * In the Row direction, in order to avoid redundant computation, + * we save the sumIn - sumOut as a diff vector. + * So the new formula is: + * dst[i] = (stack + diff[i]) / stack_num. 
+ * In practice, we use multiplication and bit shift right to simulate integer division: + * dst[i] = ((stack + diff[i]) * mulVal) >> shrVal. + * */ + virtual void operator ()(const Range& range) const CV_OVERRIDE + { + const int kernelSize = 2 * radius + 1; + + if (kernelSize <= 9 && width > kernelSize) // Special branch for small kernel + { + std::vector kVec; + for (int i = 0; i < kernelSize; i++) + { + if (i <= radius) + kVec.push_back(ushort(i + 1)); + else + kVec.push_back(ushort(2 * radius - i + 1)); + } + + const ushort * kx = kVec.data() + kernelSize/2; + for (int row = range.start; row < range.end; row++) + { + const T* srcPtr = src.ptr(row); + T* dstPtr = dst.ptr(row); + TBuf sumVal; + + // init + for (int i = 0; i < radius; i++) + { + for (int ci = 0; ci < CN; ci++) + { + sumVal = 0; + for (int k = 0; k < kernelSize; k++) + { + int index = std::max(k - radius + i, 0); + sumVal += (TBuf)srcPtr[index * CN + ci] * (TBuf)kVec[k]; + } + dstPtr[i*CN + ci] = (T)(sumVal * mulVal); + } + } + + int widthCN = (width - radius) * CN; + + // middle + int wc = radius * CN; +#if CV_SIMD + wc = opRow(srcPtr, dstPtr, kVec, mulVal, radius, CN, widthCN); +#endif + for (; wc < widthCN; wc++) + { + sumVal = srcPtr[wc] * kx[0]; + for (int k = 1; k <= radius; k++) + sumVal += ((TBuf)(srcPtr[wc + k * CN])+(TBuf)(srcPtr[wc - k * CN])) * (TBuf)kx[k]; + dstPtr[wc] = (T)(sumVal * mulVal); + } + + // tail + for (int i = wc / CN; i < width; i++) + { + for (int ci = 0; ci < CN; ci++) + { + sumVal = 0; + for (int k = 0; k < kernelSize; k++) + { + int index = std::min(k - radius + i, wm); + sumVal += (TBuf)srcPtr[index * CN + ci] * (TBuf)kVec[k]; + } + dstPtr[i*CN + ci] = (T)(sumVal * mulVal); + } + } + + } + } + else + { + size_t bufSize = CN * (width + radius) * sizeof(TBuf) + 2 * CN * sizeof(TBuf); + AutoBuffer _buf(bufSize + 16); + uchar* bufptr = alignPtr(_buf.data(), 16); + TBuf* diffVal = (TBuf*)bufptr; + TBuf* sum = diffVal+CN; + TBuf* diff = sum + CN; + + const int CNR1 = CN * 
(radius + 1); + const int widthCN = (width - radius - 1) * CN; + + for (int row = range.start; row < range.end; row++) + { + memset(bufptr, 0, bufSize); + + const T* srcPtr = src.ptr(row); + T* dstPtr = dst.ptr(row); + + int radiusMul = (radius + 2) * (radius + 1) / 2; + for (int ci = 0; ci < CN; ci++) + sum[ci] += (TBuf)srcPtr[ci] * radiusMul; + + // compute diff + const T* srcPtr0 = srcPtr; + + // init + for (int i = 0; i < radius; i++) + { + if (i < wm) srcPtr0 += CN; + for (int ci = 0; ci < CN; ci++) + { + diff[i*CN + ci] = (TBuf)srcPtr0[ci] - (TBuf)srcPtr[ci]; + diffVal[ci] += diff[i*CN + ci]; + sum[ci] += srcPtr0[ci] * (radius - i); + } + } + + // middle + auto diff0 = diff + radius * CN; + int index = 0; +#if CV_SIMD + index = opComputeDiff(srcPtr, diff0, widthCN, CNR1); +#endif + + for (; index < widthCN; index++, diff0++, srcPtr++) + diff0[0] = (TBuf)(srcPtr[CNR1]) - (TBuf)(srcPtr[0]); + + // tails + srcPtr0 = src.ptr(row) + index; + const T* srcPtr1 = src.ptr(row) + (width - 1) * CN; + int dist = width - index/CN; + for (int r = 0; r < radius; r++, diff0 += CN) + { + for (int ci = 0; ci < CN; ci++) + diff0[ci] = (TBuf)(srcPtr1[ci]) - (TBuf)(srcPtr0[ci]); + + if (dist >= r) + { + srcPtr0 += CN; + dist--; + } + } + + srcPtr = src.ptr(row); + diff0 = diff + radius * CN; + for (int ci = 0; ci < CN; ci++) + diffVal[ci] += diff0[ci]; + diff0 += CN; + + if (CN == 1) + { + for (int i = 0; i < width; i++, diff0 ++, dstPtr ++, srcPtr ++) + { + *(dstPtr) = saturate_cast((sum[0] * mulVal)); + sum[0] += diffVal[0]; + diffVal[0] += (diff0[0] - diff0[-CNR1]); + } + } + else if (CN == 3) + { + for (int i = 0; i < width; i++, diff0 += CN, dstPtr += CN, srcPtr += CN) + { + *(dstPtr + 0) = saturate_cast((sum[0] * mulVal)); + *(dstPtr + 1) = saturate_cast((sum[1] * mulVal)); + *(dstPtr + 2) = saturate_cast((sum[2] * mulVal)); + + sum[0] += diffVal[0]; + sum[1] += diffVal[1]; + sum[2] += diffVal[2]; + + diffVal[0] += (diff0[0] - diff0[0 - CNR1]); + diffVal[1] += (diff0[1] - 
diff0[1 - CNR1]); + diffVal[2] += (diff0[2] - diff0[2 - CNR1]); + } + } + else if (CN == 4) + { + for (int i = 0; i < width; i++, diff0 += CN, dstPtr += CN, srcPtr += CN) + { + *(dstPtr + 0) = saturate_cast((sum[0] * mulVal)); + *(dstPtr + 1) = saturate_cast((sum[1] * mulVal)); + *(dstPtr + 2) = saturate_cast((sum[2] * mulVal)); + *(dstPtr + 3) = saturate_cast((sum[3] * mulVal)); + + sum[0] += diffVal[0]; + sum[1] += diffVal[1]; + sum[2] += diffVal[2]; + sum[3] += diffVal[3]; + + diffVal[0] += (diff0[0] - diff0[0 - CNR1]); + diffVal[1] += (diff0[1] - diff0[1 - CNR1]); + diffVal[2] += (diff0[2] - diff0[2 - CNR1]); + diffVal[3] += (diff0[3] - diff0[3 - CNR1]); + } + } + else + { + int i = 0; + for (; i < width; i++, diff0 += CN, dstPtr += CN, srcPtr += CN) + { + for (int ci = 0; ci < CN; ci++) + { + *(dstPtr + ci) = saturate_cast((sum[ci] * mulVal)); + sum[ci] += diffVal[ci]; + diffVal[ci] += (diff0[ci] - diff0[ci - CNR1]); + } + } + } + } + } + } + +private: + const Mat &src; + Mat &dst; + int radius; + int width; + int wm; + int CN; + float mulVal; +}; + +#if CV_SIMD +template +inline int opColumn(const T* , T* , T* , TBuf* , TBuf* , TBuf* , const float , + const int , const int , const int , const int , const int ) +{ + return 0; +} + +template<> +inline int opColumn(const float* srcPtr, float* dstPtr, float* stack, float* sum, float* sumIn, + float* sumOut, const float mulVal, const int , const int , + const int widthLen, const int ss, const int sp1) +{ + int k = 0; + v_float32 v_mulVal = vx_setall_f32(mulVal); + const int VEC_LINE = v_float32::nlanes; + const int VEC_LINE4 = 4 * VEC_LINE; + + auto stackStartPtr = stack + ss * widthLen; + auto stackSp1Ptr = stack + sp1 * widthLen; + + for (;k <= widthLen - VEC_LINE4; k += VEC_LINE4) + { + v_float32 v_sum0 = vx_load(sum + k); + v_float32 v_sum1 = vx_load(sum + VEC_LINE + k); + v_float32 v_sum2 = vx_load(sum + VEC_LINE * 2 + k); + v_float32 v_sum3 = vx_load(sum + VEC_LINE * 3 + k); + + v_float32 v_sumOut0 = 
vx_load(sumOut + k); + v_float32 v_sumOut1 = vx_load(sumOut + VEC_LINE + k); + v_float32 v_sumOut2 = vx_load(sumOut + VEC_LINE * 2 + k); + v_float32 v_sumOut3 = vx_load(sumOut + VEC_LINE * 3 + k); + + v_float32 v_sumIn0 = vx_load(sumIn + k); + v_float32 v_sumIn1 = vx_load(sumIn + VEC_LINE + k); + v_float32 v_sumIn2 = vx_load(sumIn + VEC_LINE * 2 + k); + v_float32 v_sumIn3 = vx_load(sumIn + VEC_LINE * 3+ k); + + v_store(dstPtr + k, v_sum0 * v_mulVal); + v_store(dstPtr + VEC_LINE + k, v_sum1 * v_mulVal); + v_store(dstPtr + VEC_LINE * 2 + k, v_sum2 * v_mulVal); + v_store(dstPtr + VEC_LINE * 3 + k, v_sum3 * v_mulVal); + + v_sum0 -= v_sumOut0; + v_sum1 -= v_sumOut1; + v_sum2 -= v_sumOut2; + v_sum3 -= v_sumOut3; + + v_sumOut0 -= vx_load(stackStartPtr + k); + v_sumOut1 -= vx_load(stackStartPtr + VEC_LINE + k); + v_sumOut2 -= vx_load(stackStartPtr + VEC_LINE * 2 + k); + v_sumOut3 -= vx_load(stackStartPtr + VEC_LINE * 3 + k); + + v_float32 v_srcPtr0 = vx_load(srcPtr + k); + v_float32 v_srcPtr1 = vx_load(srcPtr + VEC_LINE + k); + v_float32 v_srcPtr2 = vx_load(srcPtr + VEC_LINE * 2 + k); + v_float32 v_srcPtr3 = vx_load(srcPtr + VEC_LINE * 3 + k); + + v_store(stackStartPtr + k, v_srcPtr0); + v_store(stackStartPtr + VEC_LINE + k, v_srcPtr1); + v_store(stackStartPtr + VEC_LINE * 2 + k, v_srcPtr2); + v_store(stackStartPtr + VEC_LINE * 3 + k, v_srcPtr3); + + v_sumIn0 += v_srcPtr0; + v_sumIn1 += v_srcPtr1; + v_sumIn2 += v_srcPtr2; + v_sumIn3 += v_srcPtr3; + + v_store(sum + k, v_sum0 + v_sumIn0); + v_store(sum + VEC_LINE + k, v_sum1 + v_sumIn1); + v_store(sum + VEC_LINE * 2 + k, v_sum2 + v_sumIn2); + v_store(sum + VEC_LINE * 3 + k, v_sum3 + v_sumIn3); + + v_srcPtr0 = vx_load(stackSp1Ptr + k); + v_srcPtr1 = vx_load(stackSp1Ptr + VEC_LINE + k); + v_srcPtr2 = vx_load(stackSp1Ptr + VEC_LINE * 2 + k); + v_srcPtr3 = vx_load(stackSp1Ptr + VEC_LINE * 3 + k); + + v_sumOut0 += v_srcPtr0; + v_sumOut1 += v_srcPtr1; + v_sumOut2 += v_srcPtr2; + v_sumOut3 += v_srcPtr3; + + v_store(sumOut + k, 
v_sumOut0); + v_store(sumOut + VEC_LINE + k, v_sumOut1); + v_store(sumOut + VEC_LINE * 2 + k, v_sumOut2); + v_store(sumOut + VEC_LINE * 3 + k, v_sumOut3); + + v_sumIn0 -= v_srcPtr0; + v_sumIn1 -= v_srcPtr1; + v_sumIn2 -= v_srcPtr2; + v_sumIn3 -= v_srcPtr3; + + v_store(sumIn + k, v_sumIn0); + v_store(sumIn + VEC_LINE + k, v_sumIn1); + v_store(sumIn + VEC_LINE * 2 + k, v_sumIn2); + v_store(sumIn + VEC_LINE * 3 + k, v_sumIn3); + } + + for (;k <= widthLen - VEC_LINE; k += VEC_LINE) + { + v_float32 v_sum = vx_load(sum + k); + v_float32 v_sumOut = vx_load(sumOut + k); + v_float32 v_sumIn = vx_load(sumIn + k); + + v_store(dstPtr + k, v_sum * v_mulVal); + v_sum -= v_sumOut; + v_sumOut -= vx_load(stackStartPtr + k); + + v_float32 v_srcPtr = vx_load(srcPtr + k); + v_store(stackStartPtr + k, v_srcPtr); + + v_sumIn += v_srcPtr; + v_store(sum + k, v_sum + v_sumIn); + + v_srcPtr = vx_load(stackSp1Ptr + k); + v_sumOut += v_srcPtr; + v_store(sumOut + k, v_sumOut); + v_sumIn -= v_srcPtr; + v_store(sumIn + k, v_sumIn); + } + return k; +} + +template<> +inline int opColumn(const uchar* srcPtr, uchar* dstPtr, uchar* stack, int* sum, int* sumIn, + int* sumOut, const float , const int mulValTab, const int shrValTab, + const int widthLen, const int ss, const int sp1) +{ + int k = 0; + if (mulValTab != 0 && shrValTab != 0) + { + const int VEC_LINE_8 = v_uint8::nlanes; + const int VEC_LINE_32 = v_int32::nlanes; + v_int32 v_mulVal = vx_setall_s32(mulValTab); + + auto stackStartPtr = stack + ss * widthLen; + auto stackSp1Ptr = stack + sp1 * widthLen; + + for (;k <= widthLen - VEC_LINE_8; k += VEC_LINE_8) + { + v_int32 v_sum0, v_sum1, v_sum2, v_sum3; + v_int32 v_sumIn0, v_sumIn1, v_sumIn2, v_sumIn3; + v_int32 v_sumOut0, v_sumOut1, v_sumOut2, v_sumOut3; + + v_sum0 = vx_load(sum + k); + v_sum1 = vx_load(sum + k + VEC_LINE_32); + v_sum2 = vx_load(sum + k + VEC_LINE_32 * 2); + v_sum3 = vx_load(sum + k + VEC_LINE_32 * 3); + + v_sumIn0 = vx_load(sumIn + k); + v_sumIn1 = vx_load(sumIn + k + 
VEC_LINE_32); + v_sumIn2 = vx_load(sumIn + k + VEC_LINE_32 * 2); + v_sumIn3 = vx_load(sumIn + k + VEC_LINE_32 * 3); + + v_sumOut0 = vx_load(sumOut + k); + v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32); + v_sumOut2 = vx_load(sumOut + k + VEC_LINE_32 * 2); + v_sumOut3 = vx_load(sumOut + k + VEC_LINE_32 * 3); + + v_store(dstPtr + k, + v_pack( + v_reinterpret_as_u16(v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab)), + v_reinterpret_as_u16(v_pack((v_sum2 * v_mulVal)>>shrValTab, (v_sum3 * v_mulVal)>>shrValTab)))); + + v_sum0 -= v_sumOut0; + v_sum1 -= v_sumOut1; + v_sum2 -= v_sumOut2; + v_sum3 -= v_sumOut3; + + v_uint16 x0l, x0h; + v_int32 v_ss0, v_ss1, v_ss2, v_ss3; + + v_expand(vx_load(stackStartPtr + k), x0l, x0h); + v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); + v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); + + v_sumOut0 -= v_ss0; + v_sumOut1 -= v_ss1; + v_sumOut2 -= v_ss2; + v_sumOut3 -= v_ss3; + + v_expand(vx_load(srcPtr + k), x0l, x0h); + v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); + v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); + + memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_8 * sizeof (uchar)); + + v_sumIn0 += v_ss0; + v_sumIn1 += v_ss1; + v_sumIn2 += v_ss2; + v_sumIn3 += v_ss3; + + v_store(sum + k, v_sum0 + v_sumIn0); + v_store(sum + VEC_LINE_32 + k, v_sum1 + v_sumIn1); + v_store(sum + VEC_LINE_32 * 2 + k, v_sum2 + v_sumIn2); + v_store(sum + VEC_LINE_32 * 3 + k, v_sum3 + v_sumIn3); + + v_expand(vx_load(stackSp1Ptr + k), x0l, x0h); + v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); + v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); + + v_sumOut0 += v_ss0; + v_sumOut1 += v_ss1; + v_sumOut2 += v_ss2; + v_sumOut3 += v_ss3; + + v_store(sumOut + k, v_sumOut0); + v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); + v_store(sumOut + VEC_LINE_32 * 2 + k, v_sumOut2); + v_store(sumOut + VEC_LINE_32 * 3 + k, v_sumOut3); + + v_sumIn0 -= v_ss0; + v_sumIn1 -= v_ss1; + v_sumIn2 -= v_ss2; + v_sumIn3 -= v_ss3; + + v_store(sumIn + k, 
v_sumIn0); + v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); + v_store(sumIn + VEC_LINE_32 * 2 + k, v_sumIn2); + v_store(sumIn + VEC_LINE_32 * 3 + k, v_sumIn3); + } + } + return k; +} + +template<> +inline int opColumn(const short* srcPtr, short* dstPtr, short* stack, int* sum, int* sumIn, + int* sumOut, const float , const int mulValTab, const int shrValTab, + const int widthLen, const int ss, const int sp1) +{ + int k = 0; + if (mulValTab != 0 && shrValTab != 0) + { + const int VEC_LINE_16 = v_int16::nlanes; + const int VEC_LINE_32 = v_int32::nlanes; + v_int32 v_mulVal = vx_setall_s32(mulValTab); + + auto stackStartPtr = stack + ss * widthLen; + auto stackSp1Ptr = stack + sp1 * widthLen; + for (;k <= widthLen - VEC_LINE_16; k += VEC_LINE_16) + { + v_int32 v_sum0, v_sum1; + v_int32 v_sumIn0, v_sumIn1; + v_int32 v_sumOut0, v_sumOut1; + + v_sum0 = vx_load(sum + k); + v_sum1 = vx_load(sum + k + VEC_LINE_32); + + v_sumIn0 = vx_load(sumIn + k); + v_sumIn1 = vx_load(sumIn + k + VEC_LINE_32); + + v_sumOut0 = vx_load(sumOut + k); + v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32); + + v_store(dstPtr + k,v_pack((v_sum0 * v_mulVal)>>shrValTab, (v_sum1 * v_mulVal)>>shrValTab)); + + v_sum0 -= v_sumOut0; + v_sum1 -= v_sumOut1; + + v_int32 v_ss0, v_ss1; + v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1); + + v_sumOut0 -= v_ss0; + v_sumOut1 -= v_ss1; + + v_expand(vx_load(srcPtr + k), v_ss0, v_ss1); + memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (short)); + + v_sumIn0 += v_ss0; + v_sumIn1 += v_ss1; + + v_sum0 += v_sumIn0; + v_sum1 += v_sumIn1; + + v_store(sum + k, v_sum0); + v_store(sum + VEC_LINE_32 + k, v_sum1); + + v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1); + + v_sumOut0 += v_ss0; + v_sumOut1 += v_ss1; + + v_store(sumOut + k, v_sumOut0); + v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); + + v_sumIn0 -= v_ss0; + v_sumIn1 -= v_ss1; + + v_store(sumIn + k, v_sumIn0); + v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); + } + } + return k; +} + +template<> +inline int 
opColumn(const ushort* srcPtr, ushort* dstPtr, ushort* stack, int* sum, int* sumIn, + int* sumOut, const float , const int mulValTab, const int shrValTab, + const int widthLen, const int ss, const int sp1) +{ + int k = 0; + if (mulValTab != 0 && shrValTab != 0) + { + const int VEC_LINE_16 = v_uint16::nlanes; + const int VEC_LINE_32 = v_int32::nlanes; + v_uint32 v_mulVal = vx_setall_u32((uint32_t)mulValTab); + + auto stackStartPtr = stack + ss * widthLen; + auto stackSp1Ptr = stack + sp1 * widthLen; + for (;k <= widthLen - VEC_LINE_16; k += VEC_LINE_16) + { + v_int32 v_sum0, v_sum1; + v_int32 v_sumIn0, v_sumIn1; + v_int32 v_sumOut0, v_sumOut1; + + v_sum0 = vx_load(sum + k); + v_sum1 = vx_load(sum + k + VEC_LINE_32); + + v_sumIn0 = vx_load(sumIn + k); + v_sumIn1 = vx_load(sumIn + k + VEC_LINE_32); + + v_sumOut0 = vx_load(sumOut + k); + v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32); + + v_store(dstPtr + k, v_pack((v_reinterpret_as_u32(v_sum0) * v_mulVal)>>shrValTab, (v_reinterpret_as_u32(v_sum1) * v_mulVal)>>shrValTab)); + + v_sum0 -= v_sumOut0; + v_sum1 -= v_sumOut1; + + v_uint32 v_ss0, v_ss1; + v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1); + + v_sumOut0 -= v_reinterpret_as_s32(v_ss0); + v_sumOut1 -= v_reinterpret_as_s32(v_ss1); + + v_expand(vx_load(srcPtr + k), v_ss0, v_ss1); + + memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (ushort)); + + v_sumIn0 += v_reinterpret_as_s32(v_ss0); + v_sumIn1 += v_reinterpret_as_s32(v_ss1); + + v_sum0 += v_sumIn0; + v_sum1 += v_sumIn1; + + v_store(sum + k, v_sum0); + v_store(sum + VEC_LINE_32 + k, v_sum1); + + v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1); + + v_sumOut0 += v_reinterpret_as_s32(v_ss0); + v_sumOut1 += v_reinterpret_as_s32(v_ss1); + + v_store(sumOut + k, v_sumOut0); + v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); + + v_sumIn0 -= v_reinterpret_as_s32(v_ss0); + v_sumIn1 -= v_reinterpret_as_s32(v_ss1); + + v_store(sumIn + k, v_sumIn0); + v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); + } + } + return k; +} 
+#endif + +template +class ParallelStackBlurColumn: + public ParallelLoopBody +{ +public: + ParallelStackBlurColumn (const Mat & _src, Mat &_dst, int _radius):src(_src), dst(_dst) ,radius(_radius) + { + CN = src.channels(); + widthElem = CN * src.cols; + height = src.rows; + hm = src.rows - 1; + mulVal = 1.0f / ((radius + 1)*(radius + 1)); + if (radius <= STACKBLUR_MAX_RADIUS) + { + shrValTab = stackblurShr[radius]; + mulValTab = stackblurMul[radius]; + } + else + { + shrValTab = 0; + mulValTab = 0; + } + } + + ~ParallelStackBlurColumn() {} + + ParallelStackBlurColumn& operator=(const ParallelStackBlurColumn &) { return *this; } + + virtual void operator ()(const Range& range) const CV_OVERRIDE + { + if (radius == 0) + return; + + const int kernelSize = 2 * radius + 1; + int widthImg = std::min(range.end, src.cols * CN); + int widthLen = widthImg - range.start; + + size_t bufSize = 3 * widthLen * sizeof(TBuf) + kernelSize * widthLen * sizeof(T); + + AutoBuffer _buf(bufSize + 16); + uchar* bufptr = alignPtr(_buf.data(), 16); + + TBuf* sum = (TBuf *)bufptr; + TBuf* sumIn = sum + widthLen; + TBuf* sumOut = sumIn + widthLen; + T* stack = (T* )(sumOut + widthLen); + + memset(bufptr, 0, bufSize); + + const T* srcPtr =dst.ptr() + range.start; + + for (int i = 0; i <= radius; i++) + { + for (int k = 0; k < widthLen; k++) + { + stack[i * widthLen + k] = *(srcPtr + k); + sum[k] += *(srcPtr + k) * (i + 1); + sumOut[k] += *(srcPtr + k); + } + } + + for (int i = 1; i <= radius; i++) + { + if (i <= hm) srcPtr += widthElem; + for (int k = 0; k < widthLen; k++) + { + T tmp = *(srcPtr + k); + stack[(i + radius) * widthLen + k] = tmp; + sum[k] += tmp * (radius - i + 1); + sumIn[k] += tmp; + } + } + + int sp = radius; + int yp = radius; + + if (yp > hm) yp = hm; + + T* dstPtr = dst.ptr() + range.start; + srcPtr = dst.ptr(yp) + range.start; + int stackStart = 0; + + for(int i = 0; i < height; i++) + { + stackStart = sp + kernelSize - radius; + if (stackStart >= kernelSize) stackStart 
-= kernelSize; + + int sp1 = sp + 1; + sp1 &= -(sp1 < kernelSize); + + if (yp < hm) + { + yp++; + srcPtr += widthElem; + } + + int k = 0; +#if CV_SIMD + k = opColumn(srcPtr, dstPtr, stack, sum, sumIn, sumOut, mulVal, mulValTab, shrValTab, + widthLen, stackStart, sp1); +#endif + + for (; k < widthLen; k++) + { + *(dstPtr + k) = static_cast(sum[k] * mulVal); + sum[k] -= sumOut[k]; + sumOut[k] -= stack[stackStart * widthLen + k]; + + stack[stackStart * widthLen + k] = *(srcPtr + k); + sumIn[k] += *(srcPtr + k); + sum[k] += sumIn[k]; + + sumOut[k] += stack[sp1 * widthLen + k]; + sumIn[k] -= stack[sp1 * widthLen + k]; + } + + dstPtr += widthElem; + ++sp; + if (sp >= kernelSize) sp = 0; + } + } + +private: + const Mat &src; + Mat &dst; + int radius; + int CN; + int height; + int widthElem; + int hm; + float mulVal; + int mulValTab; + int shrValTab; +}; + +void stackBlur(InputArray _src, OutputArray _dst, Size ksize) +{ + CV_INSTRUMENT_REGION(); + CV_Assert(!_src.empty()); + + CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && + ksize.height > 0 && ksize.height % 2 == 1 ); + + int radiusH = ksize.height / 2; + int radiusW = ksize.width / 2; + + int stype = _src.type(), sdepth = _src.depth(); + Mat src = _src.getMat(); + + if (ksize.width == 1) + { + _src.copyTo(_dst); + + if (ksize.height == 1) + return; + } + else + { + _dst.create( src.size(), stype); + } + + Mat dst = _dst.getMat(); + int numOfThreads = getNumThreads(); + int widthElem = src.cols * src.channels(); + + if (dst.rows / numOfThreads < 3) + numOfThreads = std::max(1, dst.rows / 3); + + if (sdepth == CV_8U) + { + if (ksize.width != 1) + parallel_for_(Range(0, src.rows), ParallelStackBlurRow(src, dst, radiusW), numOfThreads); + if (ksize.height != 1) + parallel_for_(Range(0, widthElem), ParallelStackBlurColumn(dst, dst, radiusH), numOfThreads); + } + else if (sdepth == CV_16S) + { + if (ksize.width != 1) + parallel_for_(Range(0, src.rows), ParallelStackBlurRow(src, dst, radiusW), numOfThreads); + if 
(ksize.height != 1) + parallel_for_(Range(0, widthElem), ParallelStackBlurColumn(dst, dst, radiusH), numOfThreads); + } + else if (sdepth == CV_16U) + { + if (ksize.width != 1) + parallel_for_(Range(0, src.rows), ParallelStackBlurRow(src, dst, radiusW), numOfThreads); + if (ksize.height != 1) + parallel_for_(Range(0, widthElem), ParallelStackBlurColumn(dst, dst, radiusH), numOfThreads); + } + else if (sdepth == CV_32F) + { + if (ksize.width != 1) + parallel_for_(Range(0, src.rows), ParallelStackBlurRow(src, dst, radiusW), numOfThreads); + if (ksize.height != 1) + parallel_for_(Range(0, widthElem), ParallelStackBlurColumn(dst, dst, radiusH), numOfThreads); + } + else + CV_Error_( CV_StsNotImplemented, + ("Unsupported input format in StackBlur, the supported formats are: CV_8U, CV_16U, CV_16S and CV_32F.")); +} +} //namespace diff --git a/modules/imgproc/test/test_stackblur.cpp b/modules/imgproc/test/test_stackblur.cpp new file mode 100644 index 0000000000..32310d2762 --- /dev/null +++ b/modules/imgproc/test/test_stackblur.cpp @@ -0,0 +1,313 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +/* +StackBlur - a fast almost Gaussian Blur +Theory: http://underdestruction.com/2004/02/25/stackblur-2004 +The code has been borrowed from (https://github.com/flozz/StackBlur). 
+ +Below is the original copyright +*/ + +/* +Copyright (c) 2010 Mario Klingemann + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+ */ + + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +template +void _stackblurRef(const Mat& src, Mat& dst, Size ksize) +{ + CV_Assert(!src.empty()); + CV_Assert(ksize.width > 0 && ksize.height > 0 && ksize.height % 2 == 1 && ksize.width % 2 == 1); + + dst.create(src.size(), src.type()); + const int CN = src.channels(); + + int rowsImg = src.rows; + int colsImg = src.cols; + int wm = colsImg - 1; + + int radiusW = ksize.width / 2; + int stackLenW = ksize.width; + const float mulW = 1.0f / (((float )radiusW + 1.0f) * ((float )radiusW + 1.0f)); + + // Horizontal direction + std::vector stack(stackLenW * CN); + for (int row = 0; row < rowsImg; row++) + { + std::vector sum(CN, 0); + std::vector sumIn(CN, 0); + std::vector sumOut(CN, 0); + + const T* srcPtr = src.ptr(row); + + for (int i = 0; i <= radiusW; i++) + { + for (int ci = 0; ci < CN; ci++) + { + T tmp = *(srcPtr + ci); + stack[i * CN + ci] = tmp; + sum[ci] += tmp * (i + 1); + sumOut[ci] += tmp; + } + } + + for (int i = 1; i <= radiusW; i++) + { + if (i <= wm) srcPtr += CN; + for(int ci = 0; ci < CN; ci++) + { + T tmp = *(srcPtr + ci); + stack[(i + radiusW) * CN + ci] = tmp; + sum[ci] += tmp * (radiusW + 1 - i); + sumIn[ci] += tmp; + } + } + + int sp = radiusW; + int xp = radiusW ; + if (xp > wm) xp = wm; + + T* dstPtr = dst.ptr(row); + srcPtr = src.ptr(row) + xp * CN; + + int stackStart= 0; + + for (int i = 0; i < colsImg; i++) + { + stackStart = sp + stackLenW - radiusW; + + if (stackStart >= stackLenW) stackStart -= stackLenW; + + for(int ci = 0; ci < CN; ci++) + { + *(dstPtr + ci) = cv::saturate_cast(sum[ci] * mulW); + sum[ci] -= sumOut[ci]; + sumOut[ci] -= stack[stackStart*CN + ci]; + } + + const T* srcNew = srcPtr; + + if(xp < wm) + srcNew += CN; + + for (int ci = 0; ci < CN; ci++) + { + stack[stackStart * CN + ci] = *(srcNew + ci); + sumIn[ci] += *(srcNew + ci); + sum[ci] += sumIn[ci]; + } + + int sp1 = sp + 1; + sp1 &= -(sp1 < stackLenW); + + for(int ci = 0; ci < CN; ci++) + { 
+ T tmp = stack[sp1*CN + ci]; + sumOut[ci] += tmp; + sumIn[ci] -= tmp; + } + + dstPtr += CN; + + if (xp < wm) + { + xp++; + srcPtr += CN; + } + + ++sp; + if (sp >= stackLenW) sp = 0; + } + } + + // Vertical direction + int hm = rowsImg - 1; + int widthElem = colsImg * CN; + int radiusH = ksize.height / 2; + int stackLenH = ksize.height; + const float mulH = 1.0f / (((float )radiusH + 1.0f) * ((float )radiusH + 1.0f)); + + stack.resize(stackLenH, 0); + for (int col = 0; col < widthElem; col++) + { + const T* srcPtr =dst.ptr() + col; + float sum0 = 0; + float sumIn0 = 0; + float sumOut0 = 0; + + for (int i = 0; i <= radiusH; i++) + { + T tmp = (T)(*srcPtr); + stack[i] = tmp; + sum0 += tmp * (i + 1); + sumOut0 += tmp; + } + + for (int i = 1; i <= radiusH; i++) + { + if (i <= hm) srcPtr += widthElem; + T tmp = (T)(*srcPtr); + stack[i + radiusH] = tmp; + sum0 += tmp * (radiusH - i + 1); + sumIn0 += tmp; + } + + int sp = radiusH; + int yp = radiusH; + + if (yp > hm) yp = hm; + + T* dstPtr = dst.ptr() + col; + srcPtr = dst.ptr(yp) + col; + + const T* srcNew; + + int stackStart = 0; + + for (int i = 0; i < rowsImg; i++) + { + stackStart = sp + stackLenH - radiusH; + if (stackStart >= stackLenH) stackStart -= stackLenH; + + *(dstPtr) = saturate_cast(sum0 * mulH); + sum0 -= sumOut0; + sumOut0 -= stack[stackStart]; + srcNew = srcPtr; + + if (yp < hm) + srcNew += widthElem; + + stack[stackStart] = *(srcNew); + sumIn0 += *(srcNew); + sum0 += sumIn0; + + int sp1 = sp + 1; + sp1 &= -(sp1 < stackLenH); + + sumOut0 += stack[sp1]; + sumIn0 -= stack[sp1]; + + dstPtr += widthElem; + + if (yp < hm) + { + yp++; + srcPtr += widthElem; + } + + ++sp; + if (sp >= stackLenH) sp = 0; + } + } +} + +void stackBlurRef(const Mat& img, Mat& dst, Size ksize) +{ + if(img.depth() == CV_8U) + _stackblurRef(img, dst, ksize); + else if (img.depth() == CV_16S) + _stackblurRef(img, dst, ksize); + else if (img.depth() == CV_16U) + _stackblurRef(img, dst, ksize); + else if (img.depth() == CV_32F) + 
_stackblurRef(img, dst, ksize); + else + CV_Error_( CV_StsNotImplemented, + ("Unsupported Mat type in stackBlurRef, " + "the supported formats are: CV_8U, CV_16U, CV_16S and CV_32F.")); +} + +std::vector kernelSizeVec = { + Size(3, 3), + Size(5, 5), + Size(101, 101), + Size(3, 9) + }; + +typedef testing::TestWithParam > StackBlur; + +TEST_P (StackBlur, regression) +{ + Mat img_ = imread(findDataFile("shared/fruits.png"), 1); + const int cn = get<0>(GetParam()); + const int kIndex = get<1>(GetParam()); + const int dtype = get<2>(GetParam()); + + Size ksize = kernelSizeVec[kIndex]; + + Mat img, dstRef, dst; + convert(img_, img, dtype); + + vector channels; + split(img, channels); + channels.push_back(channels[0]); // channels size is 4. + + Mat imgCn; + if (cn == 1) + imgCn = channels[0]; + else if (cn == 4) + merge(channels, imgCn); + else + imgCn = img; + + stackBlurRef(imgCn, dstRef, ksize); + stackBlur(imgCn, dst, ksize); + EXPECT_LE(cvtest::norm(dstRef, dst, NORM_INF), 2.); +} + +INSTANTIATE_TEST_CASE_P(Imgproc, StackBlur, + testing::Combine( + testing::Values(1, 3, 4), + testing::Values(0, 1, 2, 3), + testing::Values(CV_8U, CV_16S, CV_16U, CV_32F) + ) +); + +typedef testing::TestWithParam > StackBlur_GaussianBlur; + +// StackBlur should produce similar results as GaussianBlur output. +TEST_P(StackBlur_GaussianBlur, compare) +{ + Mat img_ = imread(findDataFile("shared/fruits.png"), 1); + const int dtype = get<0>(GetParam()); + + Size ksize(3, 3); + Mat img, dstS, dstG; + convert(img_, img, dtype); + + stackBlur(img, dstS, ksize); + GaussianBlur(img, dstG, ksize, 0); + + EXPECT_LE(cvtest::norm(dstS, dstG, NORM_INF), 13.); +} + +INSTANTIATE_TEST_CASE_P(Imgproc, StackBlur_GaussianBlur, testing::Values(CV_8U, CV_16S, CV_16U, CV_32F)); +} +}