// Source: mirror of https://github.com/opencv/opencv.git
// Open Source Computer Vision Library — https://opencv.org/
// This file is part of OpenCV project. |
|
// It is subject to the license terms in the LICENSE file found in the top-level directory |
|
// of this distribution and at http://opencv.org/license.html. |
|
|
|
/* |
|
StackBlur - a fast almost Gaussian Blur |
|
Theory: http://underdestruction.com/2004/02/25/stackblur-2004 |
|
The code has been borrowed from (https://github.com/flozz/StackBlur) |
|
and adapted for OpenCV by Zihao Mu. |
|
|
|
Below is the original copyright |
|
*/ |
|
|
|
/* |
|
Copyright (c) 2010 Mario Klingemann |
|
|
|
Permission is hereby granted, free of charge, to any person |
|
obtaining a copy of this software and associated documentation |
|
files (the "Software"), to deal in the Software without |
|
restriction, including without limitation the rights to use, |
|
copy, modify, merge, publish, distribute, sublicense, and/or sell |
|
copies of the Software, and to permit persons to whom the |
|
Software is furnished to do so, subject to the following |
|
conditions: |
|
|
|
The above copyright notice and this permission notice shall be |
|
included in all copies or substantial portions of the Software. |
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
|
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
|
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
|
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
|
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
|
OTHER DEALINGS IN THE SOFTWARE. |
|
*/ |
|
|
|
|
|
#include "precomp.hpp" |
|
#include "opencv2/core/hal/intrin.hpp" |
|
|
|
#include <iostream> |
|
|
|
using namespace std; |
|
|
|
#define STACKBLUR_MAX_RADIUS 254 |
|
|
|
static unsigned short const stackblurMul[255] = |
|
{ |
|
512,512,456,512,328,456,335,512,405,328,271,456,388,335,292,512, |
|
454,405,364,328,298,271,496,456,420,388,360,335,312,292,273,512, |
|
482,454,428,405,383,364,345,328,312,298,284,271,259,496,475,456, |
|
437,420,404,388,374,360,347,335,323,312,302,292,282,273,265,512, |
|
497,482,468,454,441,428,417,405,394,383,373,364,354,345,337,328, |
|
320,312,305,298,291,284,278,271,265,259,507,496,485,475,465,456, |
|
446,437,428,420,412,404,396,388,381,374,367,360,354,347,341,335, |
|
329,323,318,312,307,302,297,292,287,282,278,273,269,265,261,512, |
|
505,497,489,482,475,468,461,454,447,441,435,428,422,417,411,405, |
|
399,394,389,383,378,373,368,364,359,354,350,345,341,337,332,328, |
|
324,320,316,312,309,305,301,298,294,291,287,284,281,278,274,271, |
|
268,265,262,259,257,507,501,496,491,485,480,475,470,465,460,456, |
|
451,446,442,437,433,428,424,420,416,412,408,404,400,396,392,388, |
|
385,381,377,374,370,367,363,360,357,354,350,347,344,341,338,335, |
|
332,329,326,323,320,318,315,312,310,307,304,302,299,297,294,292, |
|
289,287,285,282,280,278,275,273,271,269,267,265,263,261,259 |
|
}; |
|
|
|
static unsigned char const stackblurShr[255] = |
|
{ |
|
9, 11, 12, 13, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, |
|
17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, |
|
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, |
|
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, |
|
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, |
|
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, |
|
22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, |
|
22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, |
|
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, |
|
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, |
|
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, |
|
23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
|
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
|
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
|
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
|
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 |
|
}; |
|
|
|
namespace cv{ |
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE) |
|
template<typename T> |
|
inline int opRow(const T* , T* , const std::vector<ushort>& , const float , const int radius, const int CN, const int ) |
|
{ |
|
return radius * CN; |
|
} |
|
|
|
template<> |
|
inline int opRow<uchar>(const uchar* srcPtr, uchar* dstPtr, const std::vector<ushort>& kVec, const float , const int radius, const int CN, const int widthCN) |
|
{ |
|
int kernelSize = (int)kVec.size(); |
|
|
|
int i = radius * CN; |
|
if (radius > STACKBLUR_MAX_RADIUS) |
|
return i; |
|
|
|
const int mulValTab= stackblurMul[radius]; |
|
const int shrValTab= stackblurShr[radius]; |
|
|
|
const int VEC_LINE = VTraits<v_uint8>::vlanes(); |
|
|
|
if (kernelSize == 3) |
|
{ |
|
v_uint32 v_mulVal = vx_setall_u32(mulValTab); |
|
for (; i <= widthCN - VEC_LINE; i += VEC_LINE) |
|
{ |
|
v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; |
|
v_expand(vx_load(srcPtr + i - CN), x0l, x0h); |
|
v_expand(vx_load(srcPtr + i), x1l, x1h); |
|
v_expand(vx_load(srcPtr + i + CN), x2l, x2h); |
|
|
|
x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l)); |
|
x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h)); |
|
|
|
v_uint32 y00, y01, y10, y11; |
|
v_expand(x1l, y00, y01); |
|
v_expand(x1h, y10, y11); |
|
|
|
y00 = v_shr(v_mul(y00, v_mulVal), shrValTab); |
|
y01 = v_shr(v_mul(y01, v_mulVal), shrValTab); |
|
y10 = v_shr(v_mul(y10, v_mulVal), shrValTab); |
|
y11 = v_shr(v_mul(y11, v_mulVal), shrValTab); |
|
|
|
v_store(dstPtr + i, v_pack(v_pack(y00, y01), v_pack(y10, y11))); |
|
} |
|
} |
|
else |
|
{ |
|
const ushort * kx = kVec.data() + kernelSize/2; |
|
v_int32 v_mulVal = vx_setall_s32(mulValTab); |
|
v_int16 k0 = vx_setall_s16((short)(kx[0])); |
|
|
|
srcPtr += i; |
|
for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) |
|
{ |
|
v_uint8 v_src = vx_load(srcPtr); |
|
v_int32 s0, s1, s2, s3; |
|
v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1); |
|
v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3); |
|
|
|
int k = 1, j = CN; |
|
for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN) |
|
{ |
|
v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32(((int)kx[k] & 0xFFFF) | ((int)kx[k + 1] << 16))); |
|
|
|
v_uint8 v_src0 = vx_load(srcPtr - j); |
|
v_uint8 v_src1 = vx_load(srcPtr - j - CN); |
|
v_uint8 v_src2 = vx_load(srcPtr + j); |
|
v_uint8 v_src3 = vx_load(srcPtr + j + CN); |
|
|
|
v_int16 xl, xh; |
|
v_zip(v_reinterpret_as_s16(v_add(v_expand_low(v_src0), v_expand_low(v_src2))), v_reinterpret_as_s16(v_add(v_expand_low(v_src1), v_expand_low(v_src3))), xl, xh); |
|
s0 = v_add(s0, v_dotprod(xl, k12)); |
|
s1 = v_add(s1, v_dotprod(xh, k12)); |
|
v_zip(v_reinterpret_as_s16(v_add(v_expand_high(v_src0), v_expand_high(v_src2))), v_reinterpret_as_s16(v_add(v_expand_high(v_src1), v_expand_high(v_src3))), xl, xh); |
|
s2 = v_add(s2, v_dotprod(xl, k12)); |
|
s3 = v_add(s3, v_dotprod(xh, k12)); |
|
} |
|
if( k < kernelSize / 2 + 1 ) |
|
{ |
|
v_int16 k1 = vx_setall_s16((short)(kx[k])); |
|
|
|
v_uint8 v_src0 = vx_load(srcPtr - j); |
|
v_uint8 v_src1 = vx_load(srcPtr + j); |
|
|
|
v_int16 xl, xh; |
|
v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh); |
|
s0 = v_add(s0, v_dotprod(xl, k1)); |
|
s1 = v_add(s1, v_dotprod(xh, k1)); |
|
v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh); |
|
s2 = v_add(s2, v_dotprod(xl, k1)); |
|
s3 = v_add(s3, v_dotprod(xh, k1)); |
|
} |
|
|
|
s0 = v_shr(v_mul(s0, v_mulVal), shrValTab); |
|
s1 = v_shr(v_mul(s1, v_mulVal), shrValTab); |
|
s2 = v_shr(v_mul(s2, v_mulVal), shrValTab); |
|
s3 = v_shr(v_mul(s3, v_mulVal), shrValTab); |
|
|
|
v_store(dstPtr + i, v_pack(v_reinterpret_as_u16(v_pack(s0, s1)), v_reinterpret_as_u16(v_pack(s2, s3)))); |
|
} |
|
} |
|
return i; |
|
} |
|
|
|
template<> |
|
inline int opRow<ushort>(const ushort* srcPtr, ushort* dstPtr, const std::vector<ushort>& kVec, const float , const int radius, const int CN, const int widthCN) |
|
{ |
|
int kernelSize = (int)kVec.size(); |
|
|
|
int i = radius * CN; |
|
if (radius > STACKBLUR_MAX_RADIUS) |
|
return i; |
|
|
|
const int mulValTab= stackblurMul[radius]; |
|
const int shrValTab= stackblurShr[radius]; |
|
|
|
const int VEC_LINE = VTraits<v_uint16>::vlanes(); |
|
|
|
v_uint32 v_mulVal = vx_setall_u32(mulValTab); |
|
if (kernelSize == 3) |
|
{ |
|
for (; i <= widthCN - VEC_LINE; i += VEC_LINE) |
|
{ |
|
v_uint32 x0l, x0h, x1l, x1h, x2l, x2h; |
|
v_expand(vx_load(srcPtr + i - CN), x0l, x0h); |
|
v_expand(vx_load(srcPtr + i), x1l, x1h); |
|
v_expand(vx_load(srcPtr + i + CN), x2l, x2h); |
|
|
|
x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l)); |
|
x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h)); |
|
|
|
v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab))); |
|
} |
|
} |
|
else |
|
{ |
|
const ushort * kx = kVec.data() + kernelSize/2; |
|
v_uint16 k0 = vx_setall_u16(kx[0]); |
|
|
|
srcPtr += i; |
|
for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) |
|
{ |
|
v_uint16 v_src = vx_load(srcPtr); |
|
v_uint32 s0, s1; |
|
|
|
v_mul_expand(v_src, k0, s0, s1); |
|
|
|
int k = 1, j = CN; |
|
for (; k <= kernelSize / 2 - 1; k += 2, j += 2*CN) |
|
{ |
|
v_uint16 k1 = vx_setall_u16(kx[k]); |
|
v_uint16 k2 = vx_setall_u16(kx[k + 1]); |
|
|
|
v_uint32 y0, y1; |
|
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1); |
|
s0 = v_add(s0, y0); |
|
s1 = v_add(s1, y1); |
|
v_mul_expand(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2, y0, y1); |
|
s0 = v_add(s0, y0); |
|
s1 = v_add(s1, y1); |
|
} |
|
if( k < kernelSize / 2 + 1 ) |
|
{ |
|
v_uint16 k1 = vx_setall_u16(kx[k]); |
|
|
|
v_uint32 y0, y1; |
|
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1); |
|
s0 = v_add(s0, y0); |
|
s1 = v_add(s1, y1); |
|
} |
|
|
|
s0 = v_shr(v_mul(s0, v_mulVal), shrValTab); |
|
s1 = v_shr(v_mul(s1, v_mulVal), shrValTab); |
|
|
|
v_store(dstPtr + i, v_pack(s0, s1)); |
|
} |
|
} |
|
|
|
return i; |
|
} |
|
|
|
template<> |
|
inline int opRow<short>(const short* srcPtr, short* dstPtr, const std::vector<ushort>& kVec, const float , const int radius, const int CN, const int widthCN) |
|
{ |
|
int kernelSize = (int)kVec.size(); |
|
int i = radius * CN; |
|
|
|
if (radius > STACKBLUR_MAX_RADIUS) |
|
return i; |
|
|
|
const int mulValTab= stackblurMul[radius]; |
|
const int shrValTab= stackblurShr[radius]; |
|
|
|
const int VEC_LINE = VTraits<v_int16>::vlanes(); |
|
v_int32 v_mulVal = vx_setall_s32(mulValTab); |
|
|
|
if (kernelSize == 3) |
|
{ |
|
for (; i <= widthCN - VEC_LINE; i += VEC_LINE) |
|
{ |
|
v_int32 x0l, x0h, x1l, x1h, x2l, x2h; |
|
v_expand(vx_load(srcPtr + i - CN), x0l, x0h); |
|
v_expand(vx_load(srcPtr + i), x1l, x1h); |
|
v_expand(vx_load(srcPtr + i + CN), x2l, x2h); |
|
|
|
x1l = v_add(v_add(x1l, x1l), v_add(x0l, x2l)); |
|
x1h = v_add(v_add(x1h, x1h), v_add(x0h, x2h)); |
|
|
|
v_store(dstPtr + i, v_pack(v_shr(v_mul(x1l, v_mulVal), shrValTab), v_shr(v_mul(x1h, v_mulVal), shrValTab))); |
|
} |
|
} |
|
else |
|
{ |
|
const ushort * kx = kVec.data() + kernelSize/2; |
|
v_int16 k0 = vx_setall_s16((short)(kx[0])); |
|
|
|
srcPtr += i; |
|
for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) |
|
{ |
|
v_int16 v_src = vx_load(srcPtr); |
|
v_int32 s0, s1; |
|
v_mul_expand(v_src, k0, s0, s1); |
|
|
|
int k = 1, j = CN; |
|
for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN) |
|
{ |
|
v_int16 k1 = vx_setall_s16((short)kx[k]); |
|
v_int16 k2 = vx_setall_s16((short)kx[k + 1]); |
|
|
|
v_int32 y0, y1; |
|
|
|
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1); |
|
s0 = v_add(s0, y0); |
|
s1 = v_add(s1, y1); |
|
v_mul_expand(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2, y0, y1); |
|
s0 = v_add(s0, y0); |
|
s1 = v_add(s1, y1); |
|
} |
|
if( k < kernelSize / 2 + 1 ) |
|
{ |
|
v_int16 k1 = vx_setall_s16((short)kx[k]); |
|
v_int32 y0, y1; |
|
v_mul_expand(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1, y0, y1); |
|
s0 = v_add(s0, y0); |
|
s1 = v_add(s1, y1); |
|
} |
|
|
|
s0 = v_shr(v_mul(s0, v_mulVal), shrValTab); |
|
s1 = v_shr(v_mul(s1, v_mulVal), shrValTab); |
|
|
|
v_store(dstPtr + i, v_pack(s0, s1)); |
|
} |
|
} |
|
return i; |
|
} |
|
|
|
template<> |
|
inline int opRow<float>(const float* srcPtr, float* dstPtr, const std::vector<ushort>& kVec, const float mulVal, const int radius, const int CN, const int widthCN) |
|
{ |
|
int kernelSize = (int)kVec.size(); |
|
int i = radius * CN; |
|
|
|
v_float32 v_mulVal = vx_setall_f32(mulVal); |
|
const int VEC_LINE = VTraits<v_float32>::vlanes(); |
|
const int VEC_LINE4 = VEC_LINE * 4; |
|
|
|
if (kernelSize == 3) |
|
{ |
|
for (; i <= widthCN - VEC_LINE4; i += VEC_LINE4) |
|
{ |
|
v_float32 v_srcPtr0 = vx_load(srcPtr + i); |
|
v_float32 v_srcPtr1 = vx_load(srcPtr + VEC_LINE + i) ; |
|
v_float32 v_srcPtr2 = vx_load(srcPtr + VEC_LINE * 2 + i); |
|
v_float32 v_srcPtr3 = vx_load(srcPtr + VEC_LINE * 3 + i); |
|
|
|
v_float32 v_sumVal0 = v_add(v_add(v_add(v_srcPtr0, v_srcPtr0), vx_load(srcPtr + i - CN)), vx_load(srcPtr + i + CN)); |
|
v_float32 v_sumVal1 = v_add(v_add(v_add(v_srcPtr1, v_srcPtr1), vx_load(srcPtr + VEC_LINE + i - CN)), vx_load(srcPtr + VEC_LINE + i + CN)); |
|
v_float32 v_sumVal2 = v_add(v_add(v_add(v_srcPtr2, v_srcPtr2), vx_load(srcPtr + VEC_LINE * 2 + i - CN)), vx_load(srcPtr + VEC_LINE * 2 + i + CN)); |
|
v_float32 v_sumVal3 = v_add(v_add(v_add(v_srcPtr3, v_srcPtr3), vx_load(srcPtr + VEC_LINE * 3 + i - CN)), vx_load(srcPtr + VEC_LINE * 3 + i + CN)); |
|
|
|
v_store(dstPtr + i, v_mul(v_sumVal0, v_mulVal)); |
|
v_store(dstPtr + i + VEC_LINE, v_mul(v_sumVal1, v_mulVal)); |
|
v_store(dstPtr + i + VEC_LINE * 2, v_mul(v_sumVal2, v_mulVal)); |
|
v_store(dstPtr + i + VEC_LINE * 3, v_mul(v_sumVal3, v_mulVal)); |
|
} |
|
|
|
for (; i <= widthCN - VEC_LINE; i += VEC_LINE) |
|
{ |
|
v_float32 v_srcPtr = vx_load(srcPtr + i); |
|
v_float32 v_sumVal = v_add(v_add(v_add(v_srcPtr, v_srcPtr), vx_load(srcPtr + i - CN)), vx_load(srcPtr + i + CN)); |
|
v_store(dstPtr + i, v_mul(v_sumVal, v_mulVal)); |
|
} |
|
} |
|
else |
|
{ |
|
const ushort * kx = kVec.data() + kernelSize/2; |
|
v_float32 k0 = vx_setall_f32((float)(kx[0])); |
|
|
|
srcPtr += i; |
|
for( ; i <= widthCN - VEC_LINE; i += VEC_LINE, srcPtr += VEC_LINE) |
|
{ |
|
v_float32 v_src = vx_load(srcPtr); |
|
v_float32 s0; |
|
s0 = v_mul(v_src, k0); |
|
|
|
int k = 1, j = CN; |
|
for (; k <= kernelSize / 2 - 1; k += 2, j += 2 * CN) |
|
{ |
|
v_float32 k1 = vx_setall_f32((float)kx[k]); |
|
v_float32 k2 = vx_setall_f32((float)kx[k + 1]); |
|
|
|
s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1)); |
|
s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j - CN), vx_load(srcPtr + j + CN)), k2)); |
|
} |
|
if( k < kernelSize / 2 + 1 ) |
|
{ |
|
v_float32 k1 = vx_setall_f32((float)kx[k]); |
|
|
|
s0 = v_add(s0, v_mul(v_add(vx_load(srcPtr - j), vx_load(srcPtr + j)), k1)); |
|
} |
|
|
|
v_store(dstPtr + i, v_mul(s0, v_mulVal)); |
|
} |
|
} |
|
return i; |
|
} |
|
|
|
template<typename T, typename TBuf> |
|
inline int opComputeDiff(const T*& , TBuf*& , const int , const int) |
|
{ |
|
return 0; |
|
} |
|
|
|
template<> |
|
inline int opComputeDiff<uchar, int>(const uchar*& srcPtr, int*& diff0, const int w, const int CNR1) |
|
{ |
|
int index = 0; |
|
const int VEC_LINE_8 = VTraits<v_uint8>::vlanes(); |
|
const int VEC_LINE_32 = VTraits<v_int32>::vlanes(); |
|
for (; index <= w - VEC_LINE_8; index += VEC_LINE_8, diff0+=VEC_LINE_8, srcPtr+=VEC_LINE_8) |
|
{ |
|
v_uint16 x0l, x0h, x1l, x1h; |
|
v_expand(vx_load(srcPtr + CNR1), x0l, x0h); |
|
v_expand(vx_load(srcPtr), x1l, x1h); |
|
|
|
v_int32 y0, y1, y2, y3; |
|
v_expand(v_sub(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x1l)), y0, y1); |
|
v_expand(v_sub(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x1h)), y2, y3); |
|
|
|
v_store(diff0, y0); |
|
v_store(diff0 + VEC_LINE_32, y1); |
|
v_store(diff0 + VEC_LINE_32 * 2, y2); |
|
v_store(diff0 + VEC_LINE_32 * 3, y3); |
|
} |
|
return index; |
|
} |
|
#endif |
|
|
|
template<typename T, typename TBuf> |
|
class ParallelStackBlurRow : public ParallelLoopBody |
|
{ |
|
public: |
|
ParallelStackBlurRow (const Mat &_src, Mat &_dst, int _radius): src(_src), dst(_dst) ,radius(_radius) |
|
{ |
|
width= dst.cols; |
|
wm = width - 1; |
|
mulVal = 1.0f / ((radius + 1) * (radius + 1)); |
|
CN = src.channels(); |
|
} |
|
|
|
~ParallelStackBlurRow() {} |
|
|
|
/* |
|
* The idea is as follows: |
|
* The stack can be understood as a sliding window of length kernel size. |
|
* The sliding window moves one element at a time from left to right. |
|
* The sumIn stores the elements added to the stack each time, |
|
* and sumOut stores the subtracted elements. Every time stack moves, stack, sumIn and sumOut are updated. |
|
* The dst will be calculated using the following formula: |
|
* dst[i] = (stack + sumIn - sumOut) / stack_num |
|
* In the Row direction, in order to avoid redundant computation, |
|
* we save the sumIn - sumOut as a diff vector. |
|
* So the new formula is: |
|
* dst[i] = (stack + diff[i]) / stack_num. |
|
* In practice, we use multiplication and bit shift right to simulate integer division: |
|
* dst[i] = ((stack + diff[i]) * mulVal) >> shrVal. |
|
* */ |
|
virtual void operator ()(const Range& range) const CV_OVERRIDE |
|
{ |
|
const int kernelSize = 2 * radius + 1; |
|
|
|
if (kernelSize <= 9 && width > kernelSize) // Special branch for small kernel |
|
{ |
|
std::vector<ushort> kVec; |
|
for (int i = 0; i < kernelSize; i++) |
|
{ |
|
if (i <= radius) |
|
kVec.push_back(ushort(i + 1)); |
|
else |
|
kVec.push_back(ushort(2 * radius - i + 1)); |
|
} |
|
|
|
const ushort * kx = kVec.data() + kernelSize/2; |
|
for (int row = range.start; row < range.end; row++) |
|
{ |
|
const T* srcPtr = src.ptr<T>(row); |
|
T* dstPtr = dst.ptr<T>(row); |
|
TBuf sumVal; |
|
|
|
// init |
|
for (int i = 0; i < radius; i++) |
|
{ |
|
for (int ci = 0; ci < CN; ci++) |
|
{ |
|
sumVal = 0; |
|
for (int k = 0; k < kernelSize; k++) |
|
{ |
|
int index = std::max(k - radius + i, 0); |
|
sumVal += (TBuf)srcPtr[index * CN + ci] * (TBuf)kVec[k]; |
|
} |
|
dstPtr[i*CN + ci] = (T)(sumVal * mulVal); |
|
} |
|
} |
|
|
|
int widthCN = (width - radius) * CN; |
|
|
|
// middle |
|
int wc = radius * CN; |
|
#if (CV_SIMD || CV_SIMD_SCALABLE) |
|
wc = opRow<T>(srcPtr, dstPtr, kVec, mulVal, radius, CN, widthCN); |
|
#endif |
|
for (; wc < widthCN; wc++) |
|
{ |
|
sumVal = srcPtr[wc] * kx[0]; |
|
for (int k = 1; k <= radius; k++) |
|
sumVal += ((TBuf)(srcPtr[wc + k * CN])+(TBuf)(srcPtr[wc - k * CN])) * (TBuf)kx[k]; |
|
dstPtr[wc] = (T)(sumVal * mulVal); |
|
} |
|
|
|
// tail |
|
for (int i = wc / CN; i < width; i++) |
|
{ |
|
for (int ci = 0; ci < CN; ci++) |
|
{ |
|
sumVal = 0; |
|
for (int k = 0; k < kernelSize; k++) |
|
{ |
|
int index = std::min(k - radius + i, wm); |
|
sumVal += (TBuf)srcPtr[index * CN + ci] * (TBuf)kVec[k]; |
|
} |
|
dstPtr[i*CN + ci] = (T)(sumVal * mulVal); |
|
} |
|
} |
|
|
|
} |
|
} |
|
else |
|
{ |
|
size_t bufSize = CN * (width + radius) * sizeof(TBuf) + 2 * CN * sizeof(TBuf); |
|
AutoBuffer<uchar> _buf(bufSize + 16); |
|
uchar* bufptr = alignPtr(_buf.data(), 16); |
|
TBuf* diffVal = (TBuf*)bufptr; |
|
TBuf* sum = diffVal+CN; |
|
TBuf* diff = sum + CN; |
|
|
|
const int CNR1 = CN * (radius + 1); |
|
const int widthCN = (width - radius - 1) * CN; |
|
|
|
for (int row = range.start; row < range.end; row++) |
|
{ |
|
memset(bufptr, 0, bufSize); |
|
|
|
const T* srcPtr = src.ptr<T>(row); |
|
T* dstPtr = dst.ptr<T>(row); |
|
|
|
int radiusMul = (radius + 2) * (radius + 1) / 2; |
|
for (int ci = 0; ci < CN; ci++) |
|
sum[ci] += (TBuf)srcPtr[ci] * radiusMul; |
|
|
|
// compute diff |
|
const T* srcPtr0 = srcPtr; |
|
|
|
// init |
|
for (int i = 0; i < radius; i++) |
|
{ |
|
if (i < wm) srcPtr0 += CN; |
|
for (int ci = 0; ci < CN; ci++) |
|
{ |
|
diff[i*CN + ci] = (TBuf)srcPtr0[ci] - (TBuf)srcPtr[ci]; |
|
diffVal[ci] += diff[i*CN + ci]; |
|
sum[ci] += srcPtr0[ci] * (radius - i); |
|
} |
|
} |
|
|
|
// middle |
|
auto diff0 = diff + radius * CN; |
|
int index = 0; |
|
#if (CV_SIMD || CV_SIMD_SCALABLE) |
|
index = opComputeDiff(srcPtr, diff0, widthCN, CNR1); |
|
#endif |
|
|
|
for (; index < widthCN; index++, diff0++, srcPtr++) |
|
diff0[0] = (TBuf)(srcPtr[CNR1]) - (TBuf)(srcPtr[0]); |
|
|
|
// tails |
|
srcPtr0 = src.ptr<T>(row) + index; |
|
const T* srcPtr1 = src.ptr<T>(row) + (width - 1) * CN; |
|
int dist = width - index/CN; |
|
for (int r = 0; r < radius; r++, diff0 += CN) |
|
{ |
|
for (int ci = 0; ci < CN; ci++) |
|
diff0[ci] = (TBuf)(srcPtr1[ci]) - (TBuf)(srcPtr0[ci]); |
|
|
|
if (dist >= r) |
|
{ |
|
srcPtr0 += CN; |
|
dist--; |
|
} |
|
} |
|
|
|
srcPtr = src.ptr<T>(row); |
|
diff0 = diff + radius * CN; |
|
for (int ci = 0; ci < CN; ci++) |
|
diffVal[ci] += diff0[ci]; |
|
diff0 += CN; |
|
|
|
if (CN == 1) |
|
{ |
|
for (int i = 0; i < width; i++, diff0 ++, dstPtr ++, srcPtr ++) |
|
{ |
|
*(dstPtr) = saturate_cast<T>((sum[0] * mulVal)); |
|
sum[0] += diffVal[0]; |
|
diffVal[0] += (diff0[0] - diff0[-CNR1]); |
|
} |
|
} |
|
else if (CN == 3) |
|
{ |
|
for (int i = 0; i < width; i++, diff0 += CN, dstPtr += CN, srcPtr += CN) |
|
{ |
|
*(dstPtr + 0) = saturate_cast<T>((sum[0] * mulVal)); |
|
*(dstPtr + 1) = saturate_cast<T>((sum[1] * mulVal)); |
|
*(dstPtr + 2) = saturate_cast<T>((sum[2] * mulVal)); |
|
|
|
sum[0] += diffVal[0]; |
|
sum[1] += diffVal[1]; |
|
sum[2] += diffVal[2]; |
|
|
|
diffVal[0] += (diff0[0] - diff0[0 - CNR1]); |
|
diffVal[1] += (diff0[1] - diff0[1 - CNR1]); |
|
diffVal[2] += (diff0[2] - diff0[2 - CNR1]); |
|
} |
|
} |
|
else if (CN == 4) |
|
{ |
|
for (int i = 0; i < width; i++, diff0 += CN, dstPtr += CN, srcPtr += CN) |
|
{ |
|
*(dstPtr + 0) = saturate_cast<T>((sum[0] * mulVal)); |
|
*(dstPtr + 1) = saturate_cast<T>((sum[1] * mulVal)); |
|
*(dstPtr + 2) = saturate_cast<T>((sum[2] * mulVal)); |
|
*(dstPtr + 3) = saturate_cast<T>((sum[3] * mulVal)); |
|
|
|
sum[0] += diffVal[0]; |
|
sum[1] += diffVal[1]; |
|
sum[2] += diffVal[2]; |
|
sum[3] += diffVal[3]; |
|
|
|
diffVal[0] += (diff0[0] - diff0[0 - CNR1]); |
|
diffVal[1] += (diff0[1] - diff0[1 - CNR1]); |
|
diffVal[2] += (diff0[2] - diff0[2 - CNR1]); |
|
diffVal[3] += (diff0[3] - diff0[3 - CNR1]); |
|
} |
|
} |
|
else |
|
{ |
|
int i = 0; |
|
for (; i < width; i++, diff0 += CN, dstPtr += CN, srcPtr += CN) |
|
{ |
|
for (int ci = 0; ci < CN; ci++) |
|
{ |
|
*(dstPtr + ci) = saturate_cast<T>((sum[ci] * mulVal)); |
|
sum[ci] += diffVal[ci]; |
|
diffVal[ci] += (diff0[ci] - diff0[ci - CNR1]); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|
|
private: |
|
const Mat &src; |
|
Mat &dst; |
|
int radius; |
|
int width; |
|
int wm; |
|
int CN; |
|
float mulVal; |
|
}; |
|
|
|
#if (CV_SIMD || CV_SIMD_SCALABLE) |
|
template<typename T, typename TBuf> |
|
inline int opColumn(const T* , T* , T* , TBuf* , TBuf* , TBuf* , const float , |
|
const int , const int , const int , const int , const int ) |
|
{ |
|
return 0; |
|
} |
|
|
|
template<> |
|
inline int opColumn<float, float>(const float* srcPtr, float* dstPtr, float* stack, float* sum, float* sumIn, |
|
float* sumOut, const float mulVal, const int , const int , |
|
const int widthLen, const int ss, const int sp1) |
|
{ |
|
int k = 0; |
|
v_float32 v_mulVal = vx_setall_f32(mulVal); |
|
const int VEC_LINE = VTraits<v_float32>::vlanes(); |
|
const int VEC_LINE4 = 4 * VEC_LINE; |
|
|
|
auto stackStartPtr = stack + ss * widthLen; |
|
auto stackSp1Ptr = stack + sp1 * widthLen; |
|
|
|
for (;k <= widthLen - VEC_LINE4; k += VEC_LINE4) |
|
{ |
|
v_float32 v_sum0 = vx_load(sum + k); |
|
v_float32 v_sum1 = vx_load(sum + VEC_LINE + k); |
|
v_float32 v_sum2 = vx_load(sum + VEC_LINE * 2 + k); |
|
v_float32 v_sum3 = vx_load(sum + VEC_LINE * 3 + k); |
|
|
|
v_float32 v_sumOut0 = vx_load(sumOut + k); |
|
v_float32 v_sumOut1 = vx_load(sumOut + VEC_LINE + k); |
|
v_float32 v_sumOut2 = vx_load(sumOut + VEC_LINE * 2 + k); |
|
v_float32 v_sumOut3 = vx_load(sumOut + VEC_LINE * 3 + k); |
|
|
|
v_float32 v_sumIn0 = vx_load(sumIn + k); |
|
v_float32 v_sumIn1 = vx_load(sumIn + VEC_LINE + k); |
|
v_float32 v_sumIn2 = vx_load(sumIn + VEC_LINE * 2 + k); |
|
v_float32 v_sumIn3 = vx_load(sumIn + VEC_LINE * 3+ k); |
|
|
|
v_store(dstPtr + k, v_mul(v_sum0, v_mulVal)); |
|
v_store(dstPtr + VEC_LINE + k, v_mul(v_sum1, v_mulVal)); |
|
v_store(dstPtr + VEC_LINE * 2 + k, v_mul(v_sum2, v_mulVal)); |
|
v_store(dstPtr + VEC_LINE * 3 + k, v_mul(v_sum3, v_mulVal)); |
|
|
|
v_sum0 = v_sub(v_sum0, v_sumOut0); |
|
v_sum1 = v_sub(v_sum1, v_sumOut1); |
|
v_sum2 = v_sub(v_sum2, v_sumOut2); |
|
v_sum3 = v_sub(v_sum3, v_sumOut3); |
|
|
|
v_sumOut0 = v_sub(v_sumOut0, vx_load(stackStartPtr + k)); |
|
v_sumOut1 = v_sub(v_sumOut1, vx_load(stackStartPtr + VEC_LINE + k)); |
|
v_sumOut2 = v_sub(v_sumOut2, vx_load(stackStartPtr + VEC_LINE * 2 + k)); |
|
v_sumOut3 = v_sub(v_sumOut3, vx_load(stackStartPtr + VEC_LINE * 3 + k)); |
|
|
|
v_float32 v_srcPtr0 = vx_load(srcPtr + k); |
|
v_float32 v_srcPtr1 = vx_load(srcPtr + VEC_LINE + k); |
|
v_float32 v_srcPtr2 = vx_load(srcPtr + VEC_LINE * 2 + k); |
|
v_float32 v_srcPtr3 = vx_load(srcPtr + VEC_LINE * 3 + k); |
|
|
|
v_store(stackStartPtr + k, v_srcPtr0); |
|
v_store(stackStartPtr + VEC_LINE + k, v_srcPtr1); |
|
v_store(stackStartPtr + VEC_LINE * 2 + k, v_srcPtr2); |
|
v_store(stackStartPtr + VEC_LINE * 3 + k, v_srcPtr3); |
|
|
|
v_sumIn0 = v_add(v_sumIn0, v_srcPtr0); |
|
v_sumIn1 = v_add(v_sumIn1, v_srcPtr1); |
|
v_sumIn2 = v_add(v_sumIn2, v_srcPtr2); |
|
v_sumIn3 = v_add(v_sumIn3, v_srcPtr3); |
|
|
|
v_store(sum + k, v_add(v_sum0, v_sumIn0)); |
|
v_store(sum + VEC_LINE + k, v_add(v_sum1, v_sumIn1)); |
|
v_store(sum + VEC_LINE * 2 + k, v_add(v_sum2, v_sumIn2)); |
|
v_store(sum + VEC_LINE * 3 + k, v_add(v_sum3, v_sumIn3)); |
|
|
|
v_srcPtr0 = vx_load(stackSp1Ptr + k); |
|
v_srcPtr1 = vx_load(stackSp1Ptr + VEC_LINE + k); |
|
v_srcPtr2 = vx_load(stackSp1Ptr + VEC_LINE * 2 + k); |
|
v_srcPtr3 = vx_load(stackSp1Ptr + VEC_LINE * 3 + k); |
|
|
|
v_sumOut0 = v_add(v_sumOut0, v_srcPtr0); |
|
v_sumOut1 = v_add(v_sumOut1, v_srcPtr1); |
|
v_sumOut2 = v_add(v_sumOut2, v_srcPtr2); |
|
v_sumOut3 = v_add(v_sumOut3, v_srcPtr3); |
|
|
|
v_store(sumOut + k, v_sumOut0); |
|
v_store(sumOut + VEC_LINE + k, v_sumOut1); |
|
v_store(sumOut + VEC_LINE * 2 + k, v_sumOut2); |
|
v_store(sumOut + VEC_LINE * 3 + k, v_sumOut3); |
|
|
|
v_sumIn0 = v_sub(v_sumIn0, v_srcPtr0); |
|
v_sumIn1 = v_sub(v_sumIn1, v_srcPtr1); |
|
v_sumIn2 = v_sub(v_sumIn2, v_srcPtr2); |
|
v_sumIn3 = v_sub(v_sumIn3, v_srcPtr3); |
|
|
|
v_store(sumIn + k, v_sumIn0); |
|
v_store(sumIn + VEC_LINE + k, v_sumIn1); |
|
v_store(sumIn + VEC_LINE * 2 + k, v_sumIn2); |
|
v_store(sumIn + VEC_LINE * 3 + k, v_sumIn3); |
|
} |
|
|
|
for (;k <= widthLen - VEC_LINE; k += VEC_LINE) |
|
{ |
|
v_float32 v_sum = vx_load(sum + k); |
|
v_float32 v_sumOut = vx_load(sumOut + k); |
|
v_float32 v_sumIn = vx_load(sumIn + k); |
|
|
|
v_store(dstPtr + k, v_mul(v_sum, v_mulVal)); |
|
v_sum = v_sub(v_sum, v_sumOut); |
|
v_sumOut = v_sub(v_sumOut, vx_load(stackStartPtr + k)); |
|
|
|
v_float32 v_srcPtr = vx_load(srcPtr + k); |
|
v_store(stackStartPtr + k, v_srcPtr); |
|
|
|
v_sumIn = v_add(v_sumIn, v_srcPtr); |
|
v_store(sum + k, v_add(v_sum, v_sumIn)); |
|
|
|
v_srcPtr = vx_load(stackSp1Ptr + k); |
|
v_sumOut = v_add(v_sumOut, v_srcPtr); |
|
v_store(sumOut + k, v_sumOut); |
|
v_sumIn = v_sub(v_sumIn, v_srcPtr); |
|
v_store(sumIn + k, v_sumIn); |
|
} |
|
return k; |
|
} |
|
|
|
template<> |
|
inline int opColumn<uchar, int>(const uchar* srcPtr, uchar* dstPtr, uchar* stack, int* sum, int* sumIn, |
|
int* sumOut, const float , const int mulValTab, const int shrValTab, |
|
const int widthLen, const int ss, const int sp1) |
|
{ |
|
int k = 0; |
|
if (mulValTab != 0 && shrValTab != 0) |
|
{ |
|
const int VEC_LINE_8 = VTraits<v_uint8>::vlanes(); |
|
const int VEC_LINE_32 = VTraits<v_int32>::vlanes(); |
|
v_int32 v_mulVal = vx_setall_s32(mulValTab); |
|
|
|
auto stackStartPtr = stack + ss * widthLen; |
|
auto stackSp1Ptr = stack + sp1 * widthLen; |
|
|
|
for (;k <= widthLen - VEC_LINE_8; k += VEC_LINE_8) |
|
{ |
|
v_int32 v_sum0, v_sum1, v_sum2, v_sum3; |
|
v_int32 v_sumIn0, v_sumIn1, v_sumIn2, v_sumIn3; |
|
v_int32 v_sumOut0, v_sumOut1, v_sumOut2, v_sumOut3; |
|
|
|
v_sum0 = vx_load(sum + k); |
|
v_sum1 = vx_load(sum + k + VEC_LINE_32); |
|
v_sum2 = vx_load(sum + k + VEC_LINE_32 * 2); |
|
v_sum3 = vx_load(sum + k + VEC_LINE_32 * 3); |
|
|
|
v_sumIn0 = vx_load(sumIn + k); |
|
v_sumIn1 = vx_load(sumIn + k + VEC_LINE_32); |
|
v_sumIn2 = vx_load(sumIn + k + VEC_LINE_32 * 2); |
|
v_sumIn3 = vx_load(sumIn + k + VEC_LINE_32 * 3); |
|
|
|
v_sumOut0 = vx_load(sumOut + k); |
|
v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32); |
|
v_sumOut2 = vx_load(sumOut + k + VEC_LINE_32 * 2); |
|
v_sumOut3 = vx_load(sumOut + k + VEC_LINE_32 * 3); |
|
|
|
v_store(dstPtr + k, |
|
v_pack( |
|
v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab))), |
|
v_reinterpret_as_u16(v_pack(v_shr(v_mul(v_sum2, v_mulVal), shrValTab), v_shr(v_mul(v_sum3, v_mulVal), shrValTab))))); |
|
|
|
v_sum0 = v_sub(v_sum0, v_sumOut0); |
|
v_sum1 = v_sub(v_sum1, v_sumOut1); |
|
v_sum2 = v_sub(v_sum2, v_sumOut2); |
|
v_sum3 = v_sub(v_sum3, v_sumOut3); |
|
|
|
v_uint16 x0l, x0h; |
|
v_int32 v_ss0, v_ss1, v_ss2, v_ss3; |
|
|
|
v_expand(vx_load(stackStartPtr + k), x0l, x0h); |
|
v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); |
|
v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); |
|
|
|
v_sumOut0 = v_sub(v_sumOut0, v_ss0); |
|
v_sumOut1 = v_sub(v_sumOut1, v_ss1); |
|
v_sumOut2 = v_sub(v_sumOut2, v_ss2); |
|
v_sumOut3 = v_sub(v_sumOut3, v_ss3); |
|
|
|
v_expand(vx_load(srcPtr + k), x0l, x0h); |
|
v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); |
|
v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); |
|
|
|
memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_8 * sizeof (uchar)); |
|
|
|
v_sumIn0 = v_add(v_sumIn0, v_ss0); |
|
v_sumIn1 = v_add(v_sumIn1, v_ss1); |
|
v_sumIn2 = v_add(v_sumIn2, v_ss2); |
|
v_sumIn3 = v_add(v_sumIn3, v_ss3); |
|
|
|
v_store(sum + k, v_add(v_sum0, v_sumIn0)); |
|
v_store(sum + VEC_LINE_32 + k, v_add(v_sum1, v_sumIn1)); |
|
v_store(sum + VEC_LINE_32 * 2 + k, v_add(v_sum2, v_sumIn2)); |
|
v_store(sum + VEC_LINE_32 * 3 + k, v_add(v_sum3, v_sumIn3)); |
|
|
|
v_expand(vx_load(stackSp1Ptr + k), x0l, x0h); |
|
v_expand(v_reinterpret_as_s16(x0l), v_ss0, v_ss1); |
|
v_expand(v_reinterpret_as_s16(x0h), v_ss2, v_ss3); |
|
|
|
v_sumOut0 = v_add(v_sumOut0, v_ss0); |
|
v_sumOut1 = v_add(v_sumOut1, v_ss1); |
|
v_sumOut2 = v_add(v_sumOut2, v_ss2); |
|
v_sumOut3 = v_add(v_sumOut3, v_ss3); |
|
|
|
v_store(sumOut + k, v_sumOut0); |
|
v_store(sumOut + VEC_LINE_32 + k, v_sumOut1); |
|
v_store(sumOut + VEC_LINE_32 * 2 + k, v_sumOut2); |
|
v_store(sumOut + VEC_LINE_32 * 3 + k, v_sumOut3); |
|
|
|
v_sumIn0 = v_sub(v_sumIn0, v_ss0); |
|
v_sumIn1 = v_sub(v_sumIn1, v_ss1); |
|
v_sumIn2 = v_sub(v_sumIn2, v_ss2); |
|
v_sumIn3 = v_sub(v_sumIn3, v_ss3); |
|
|
|
v_store(sumIn + k, v_sumIn0); |
|
v_store(sumIn + VEC_LINE_32 + k, v_sumIn1); |
|
v_store(sumIn + VEC_LINE_32 * 2 + k, v_sumIn2); |
|
v_store(sumIn + VEC_LINE_32 * 3 + k, v_sumIn3); |
|
} |
|
} |
|
return k; |
|
} |
|
|
|
// SIMD specialization of the stackBlur column-pass inner loop for CV_16S data.
//
// Each of the widthLen "columns" (interleaved-channel elements of one image
// row) carries three running accumulators:
//   sum    - weighted sum of the whole stack (the value to output),
//   sumIn  - sum of the "incoming" half of the stack,
//   sumOut - sum of the "outgoing" half of the stack.
// `stack` is a ring buffer of kernel-size rows of widthLen elements; `ss`
// indexes the row being evicted/overwritten this step and `sp1` the row just
// past the stack pivot.
//
// The division by the total stack weight is done in fixed point:
//   dst = (sum * mulValTab) >> shrValTab
// (see the stackblurMul/stackblurShr tables). The unnamed float parameter is
// the float multiplier used only by the generic scalar path; it is ignored
// here.
//
// Returns the number of elements processed; the caller finishes the remaining
// widthLen - k elements with scalar code.
template<>
inline int opColumn<short, int>(const short* srcPtr, short* dstPtr, short* stack, int* sum, int* sumIn,
                                int* sumOut, const float , const int mulValTab, const int shrValTab,
                                const int widthLen, const int ss, const int sp1)
{
    int k = 0;
    // The fixed-point tables are only populated for radius <= STACKBLUR_MAX_RADIUS
    // (both values are set to 0 otherwise); without them, fall back to scalar code.
    if (mulValTab != 0 && shrValTab != 0)
    {
        const int VEC_LINE_16 = VTraits<v_int16>::vlanes();
        const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
        v_int32 v_mulVal = vx_setall_s32(mulValTab);

        auto stackStartPtr = stack + ss * widthLen;
        auto stackSp1Ptr = stack + sp1 * widthLen;
        for (;k <= widthLen - VEC_LINE_16; k += VEC_LINE_16)
        {
            // One int16 vector expands into two int32 vectors, hence the
            // paired 0/1 halves throughout.
            v_int32 v_sum0, v_sum1;
            v_int32 v_sumIn0, v_sumIn1;
            v_int32 v_sumOut0, v_sumOut1;

            v_sum0 = vx_load(sum + k);
            v_sum1 = vx_load(sum + k + VEC_LINE_32);

            v_sumIn0 = vx_load(sumIn + k);
            v_sumIn1 = vx_load(sumIn + k + VEC_LINE_32);

            v_sumOut0 = vx_load(sumOut + k);
            v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32);

            // Emit the blurred value: (sum * mul) >> shr, packed back to int16.
            v_store(dstPtr + k,v_pack(v_shr(v_mul(v_sum0, v_mulVal), shrValTab), v_shr(v_mul(v_sum1, v_mulVal), shrValTab)));

            // Slide the window: drop the outgoing sum from the total...
            v_sum0 = v_sub(v_sum0, v_sumOut0);
            v_sum1 = v_sub(v_sum1, v_sumOut1);

            // ...remove the evicted stack row from sumOut (must be read
            // BEFORE the memcpy below overwrites it)...
            v_int32 v_ss0, v_ss1;
            v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1);

            v_sumOut0 = v_sub(v_sumOut0, v_ss0);
            v_sumOut1 = v_sub(v_sumOut1, v_ss1);

            // ...and refill the evicted slot with the newly read source row.
            v_expand(vx_load(srcPtr + k), v_ss0, v_ss1);
            memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (short));

            v_sumIn0 = v_add(v_sumIn0, v_ss0);
            v_sumIn1 = v_add(v_sumIn1, v_ss1);

            v_sum0 = v_add(v_sum0, v_sumIn0);
            v_sum1 = v_add(v_sum1, v_sumIn1);

            v_store(sum + k, v_sum0);
            v_store(sum + VEC_LINE_32 + k, v_sum1);

            // The row just past the pivot migrates from the incoming half of
            // the stack to the outgoing half.
            v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1);

            v_sumOut0 = v_add(v_sumOut0, v_ss0);
            v_sumOut1 = v_add(v_sumOut1, v_ss1);

            v_store(sumOut + k, v_sumOut0);
            v_store(sumOut + VEC_LINE_32 + k, v_sumOut1);

            v_sumIn0 = v_sub(v_sumIn0, v_ss0);
            v_sumIn1 = v_sub(v_sumIn1, v_ss1);

            v_store(sumIn + k, v_sumIn0);
            v_store(sumIn + VEC_LINE_32 + k, v_sumIn1);
        }
    }
    return k;
}
|
|
|
// SIMD specialization of the stackBlur column-pass inner loop for CV_16U data.
//
// Mirrors the CV_16S specialization above: per-column accumulators sum /
// sumIn / sumOut over a ring buffer `stack` of kernel-size rows, with `ss`
// the slot being evicted and `sp1` the slot just past the stack pivot.
// Output uses the fixed-point normalization dst = (sum * mulValTab) >> shrValTab;
// the unnamed float parameter (the float multiplier for the scalar path) is
// unused here.
//
// The accumulators are kept as signed int32 vectors; loads from the ushort
// buffers expand to unsigned lanes, so values are reinterpreted between
// v_uint32 and v_int32 for the adds/subs (all stored values are non-negative,
// so the bit patterns agree). The multiply/shift/pack for the output is done
// entirely in the unsigned domain to produce u16 results.
//
// Returns the number of elements processed; the caller handles the scalar tail.
template<>
inline int opColumn<ushort, int>(const ushort* srcPtr, ushort* dstPtr, ushort* stack, int* sum, int* sumIn,
                                 int* sumOut, const float , const int mulValTab, const int shrValTab,
                                 const int widthLen, const int ss, const int sp1)
{
    int k = 0;
    // Fixed-point tables exist only for radius <= STACKBLUR_MAX_RADIUS
    // (zeroed otherwise); without them the scalar float path is used.
    if (mulValTab != 0 && shrValTab != 0)
    {
        const int VEC_LINE_16 = VTraits<v_uint16>::vlanes();
        const int VEC_LINE_32 = VTraits<v_int32>::vlanes();
        v_uint32 v_mulVal = vx_setall_u32((uint32_t)mulValTab);

        auto stackStartPtr = stack + ss * widthLen;
        auto stackSp1Ptr = stack + sp1 * widthLen;
        for (;k <= widthLen - VEC_LINE_16; k += VEC_LINE_16)
        {
            // One u16 vector expands into two 32-bit vectors (0/1 halves).
            v_int32 v_sum0, v_sum1;
            v_int32 v_sumIn0, v_sumIn1;
            v_int32 v_sumOut0, v_sumOut1;

            v_sum0 = vx_load(sum + k);
            v_sum1 = vx_load(sum + k + VEC_LINE_32);

            v_sumIn0 = vx_load(sumIn + k);
            v_sumIn1 = vx_load(sumIn + k + VEC_LINE_32);

            v_sumOut0 = vx_load(sumOut + k);
            v_sumOut1 = vx_load(sumOut + k + VEC_LINE_32);

            // Emit blurred values: (sum * mul) >> shr in unsigned arithmetic,
            // packed down to u16.
            v_store(dstPtr + k, v_pack(v_shr(v_mul(v_reinterpret_as_u32(v_sum0), v_mulVal), shrValTab), v_shr(v_mul(v_reinterpret_as_u32(v_sum1), v_mulVal), shrValTab)));

            // Slide the window: remove the outgoing sum from the total...
            v_sum0 = v_sub(v_sum0, v_sumOut0);
            v_sum1 = v_sub(v_sum1, v_sumOut1);

            // ...subtract the evicted stack row from sumOut (read BEFORE the
            // memcpy below overwrites the slot)...
            v_uint32 v_ss0, v_ss1;
            v_expand(vx_load(stackStartPtr + k), v_ss0, v_ss1);

            v_sumOut0 = v_sub(v_sumOut0, v_reinterpret_as_s32(v_ss0));
            v_sumOut1 = v_sub(v_sumOut1, v_reinterpret_as_s32(v_ss1));

            // ...and refill the evicted slot with the newly read source row.
            v_expand(vx_load(srcPtr + k), v_ss0, v_ss1);

            memcpy(stackStartPtr + k,srcPtr + k, VEC_LINE_16 * sizeof (ushort));

            v_sumIn0 = v_add(v_sumIn0, v_reinterpret_as_s32(v_ss0));
            v_sumIn1 = v_add(v_sumIn1, v_reinterpret_as_s32(v_ss1));

            v_sum0 = v_add(v_sum0, v_sumIn0);
            v_sum1 = v_add(v_sum1, v_sumIn1);

            v_store(sum + k, v_sum0);
            v_store(sum + VEC_LINE_32 + k, v_sum1);

            // The row just past the pivot migrates from the incoming half of
            // the stack to the outgoing half.
            v_expand(vx_load(stackSp1Ptr + k), v_ss0, v_ss1);

            v_sumOut0 = v_add(v_sumOut0, v_reinterpret_as_s32(v_ss0));
            v_sumOut1 = v_add(v_sumOut1, v_reinterpret_as_s32(v_ss1));

            v_store(sumOut + k, v_sumOut0);
            v_store(sumOut + VEC_LINE_32 + k, v_sumOut1);

            v_sumIn0 = v_sub(v_sumIn0, v_reinterpret_as_s32(v_ss0));
            v_sumIn1 = v_sub(v_sumIn1, v_reinterpret_as_s32(v_ss1));

            v_store(sumIn + k, v_sumIn0);
            v_store(sumIn + VEC_LINE_32 + k, v_sumIn1);
        }
    }
    return k;
}
|
#endif |
|
|
|
// Vertical (column) pass of stackBlur. Runs IN PLACE on `dst`, which already
// holds the result of the row pass — note that operator() reads from dst, not
// src; the `src` member is only consulted for geometry (channels/size).
//
// operator()'s Range selects a span of interleaved-channel columns
// [range.start, range.end) out of cols * channels total, so the image can be
// split column-wise across threads.
//
// T is the pixel type, TBuf the accumulator type (int for integer images,
// float for CV_32F).
template<typename T, typename TBuf>
class ParallelStackBlurColumn:
        public ParallelLoopBody
{
public:
    ParallelStackBlurColumn (const Mat & _src, Mat &_dst, int _radius):src(_src), dst(_dst) ,radius(_radius)
    {
        CN = src.channels();
        widthElem = CN * src.cols;  // elements per image row
        height = src.rows;
        hm = src.rows - 1;          // last valid row index, used for border clamping
        // Float normalization factor: the triangular stack weights sum to
        // (radius+1)^2.
        mulVal = 1.0f / ((radius + 1)*(radius + 1));
        if (radius <= STACKBLUR_MAX_RADIUS)
        {
            // Fixed-point replacement for the float multiply used by the
            // SIMD integer paths: value = (sum * mulValTab) >> shrValTab.
            shrValTab = stackblurShr[radius];
            mulValTab = stackblurMul[radius];
        }
        else
        {
            // No table entry for this radius: signal the SIMD integer paths
            // to bail out so the scalar float path is used instead.
            shrValTab = 0;
            mulValTab = 0;
        }
    }

    ~ParallelStackBlurColumn() {}

    virtual void operator ()(const Range& range) const CV_OVERRIDE
    {
        if (radius == 0)
            return;

        const int kernelSize = 2 * radius + 1;
        // Clamp the range end to the actual row width in elements.
        int widthImg = std::min(range.end, src.cols * CN);
        int widthLen = widthImg - range.start;

        // Single scratch allocation: three accumulator rows (sum / sumIn /
        // sumOut, TBuf each) followed by the stack ring buffer of kernelSize
        // rows of T. +16 allows 16-byte alignment of the base pointer.
        size_t bufSize = 3 * widthLen * sizeof(TBuf) + kernelSize * widthLen * sizeof(T);

        AutoBuffer<uchar> _buf(bufSize + 16);
        uchar* bufptr = alignPtr(_buf.data(), 16);

        TBuf* sum = (TBuf *)bufptr;
        TBuf* sumIn = sum + widthLen;
        TBuf* sumOut = sumIn + widthLen;
        T* stack = (T* )(sumOut + widthLen);

        memset(bufptr, 0, bufSize);

        // In-place pass: the input of the column blur is dst itself.
        const T* srcPtr =dst.ptr<T>() + range.start;

        // Prime the stack for output row 0: slots 0..radius all hold the
        // (border-replicated) top row, contributing to sum with triangular
        // weights 1..radius+1 and to the outgoing half (sumOut).
        for (int i = 0; i <= radius; i++)
        {
            for (int k = 0; k < widthLen; k++)
            {
                stack[i * widthLen + k] = *(srcPtr + k);
                sum[k] += *(srcPtr + k) * (i + 1);
                sumOut[k] += *(srcPtr + k);
            }
        }

        // Slots radius+1..2*radius hold rows 1..radius (clamped to the last
        // row at the bottom border), with descending weights radius..1,
        // contributing to the incoming half (sumIn).
        for (int i = 1; i <= radius; i++)
        {
            if (i <= hm) srcPtr += widthElem;
            for (int k = 0; k < widthLen; k++)
            {
                T tmp = *(srcPtr + k);
                stack[(i + radius) * widthLen + k] = tmp;
                sum[k] += tmp * (radius - i + 1);
                sumIn[k] += tmp;
            }
        }

        int sp = radius;  // stack pivot: ring slot holding the centre row
        int yp = radius;  // next image row to read

        if (yp > hm) yp = hm;  // clamp to the bottom border

        T* dstPtr = dst.ptr<T>() + range.start;
        srcPtr = dst.ptr<T>(yp) + range.start;
        int stackStart = 0;

        for(int i = 0; i < height; i++)
        {
            // Ring slot that falls out of the kernel window this iteration
            // (radius slots behind the pivot, modulo kernelSize).
            stackStart = sp + kernelSize - radius;
            if (stackStart >= kernelSize) stackStart -= kernelSize;

            int sp1 = sp + 1;
            if (sp1 >= kernelSize)
                sp1 = 0;

            // Advance the input row, replicating the last row at the border.
            if (yp < hm)
            {
                yp++;
                srcPtr += widthElem;
            }

            int k = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
            // Vectorized bulk of the row; returns how many elements it
            // handled so the scalar loop below can finish the tail.
            k = opColumn<T, TBuf>(srcPtr, dstPtr, stack, sum, sumIn, sumOut, mulVal, mulValTab, shrValTab,
                                  widthLen, stackStart, sp1);
#endif

            // Scalar tail (and the whole row when SIMD is unavailable).
            for (; k < widthLen; k++)
            {
                // Output the normalized weighted sum, then slide the window.
                *(dstPtr + k) = static_cast<T>(sum[k] * mulVal);
                sum[k] -= sumOut[k];
                sumOut[k] -= stack[stackStart * widthLen + k];

                // Refill the evicted slot with the newly read row.
                stack[stackStart * widthLen + k] = *(srcPtr + k);
                sumIn[k] += *(srcPtr + k);
                sum[k] += sumIn[k];

                // Row just past the pivot moves from the incoming half of
                // the stack to the outgoing half.
                sumOut[k] += stack[sp1 * widthLen + k];
                sumIn[k] -= stack[sp1 * widthLen + k];
            }

            dstPtr += widthElem;
            ++sp;
            if (sp >= kernelSize)
                sp = 0;
        }
    }

private:
    const Mat &src;  // geometry only (channels / cols / rows)
    Mat &dst;        // blurred in place
    int radius;
    int CN;          // number of channels
    int height;
    int widthElem;   // elements (cols * CN) per row
    int hm;          // height - 1 (border clamp limit)
    float mulVal;    // float normalization: 1 / (radius+1)^2
    int mulValTab;   // fixed-point multiplier (0 when radius > STACKBLUR_MAX_RADIUS)
    int shrValTab;   // fixed-point shift (0 when radius > STACKBLUR_MAX_RADIUS)
};
|
|
|
// Public entry point for stackBlur: a separable approximation of Gaussian
// blur, executed as a parallel horizontal (row) pass followed by a parallel
// vertical (column) pass. Supported depths: CV_8U, CV_16U, CV_16S, CV_32F.
void stackBlur(InputArray _src, OutputArray _dst, Size ksize)
{
    CV_INSTRUMENT_REGION();
    CV_Assert(!_src.empty());

    // The kernel must be odd-sized in both dimensions.
    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 &&
               ksize.height > 0 && ksize.height % 2 == 1 );

    const int radiusW = ksize.width / 2;
    const int radiusH = ksize.height / 2;

    const int stype = _src.type();
    const int sdepth = _src.depth();
    Mat src = _src.getMat();

    if (ksize.width != 1)
    {
        _dst.create( src.size(), stype);
    }
    else
    {
        // Width-1 kernel: the row pass is a no-op, so seed dst with src.
        _src.copyTo(_dst);

        // 1x1 kernel degenerates to the identity.
        if (ksize.height == 1)
            return;
    }

    Mat dst = _dst.getMat();
    const int widthElem = src.cols * src.channels();

    // Keep at least ~3 rows of work per thread.
    int numOfThreads = getNumThreads();
    if (dst.rows / numOfThreads < 3)
        numOfThreads = std::max(1, dst.rows / 3);

    const bool doRowPass = (ksize.width != 1);
    const bool doColPass = (ksize.height != 1);
    const Range rowRange(0, src.rows);
    const Range colRange(0, widthElem);

    // Dispatch on depth; integer depths accumulate in int, float in float.
    // The column pass runs in place on dst.
    switch (sdepth)
    {
    case CV_8U:
        if (doRowPass)
            parallel_for_(rowRange, ParallelStackBlurRow<uchar, int>(src, dst, radiusW), numOfThreads);
        if (doColPass)
            parallel_for_(colRange, ParallelStackBlurColumn<uchar, int>(dst, dst, radiusH), numOfThreads);
        break;
    case CV_16S:
        if (doRowPass)
            parallel_for_(rowRange, ParallelStackBlurRow<short, int>(src, dst, radiusW), numOfThreads);
        if (doColPass)
            parallel_for_(colRange, ParallelStackBlurColumn<short, int>(dst, dst, radiusH), numOfThreads);
        break;
    case CV_16U:
        if (doRowPass)
            parallel_for_(rowRange, ParallelStackBlurRow<ushort, int>(src, dst, radiusW), numOfThreads);
        if (doColPass)
            parallel_for_(colRange, ParallelStackBlurColumn<ushort, int>(dst, dst, radiusH), numOfThreads);
        break;
    case CV_32F:
        if (doRowPass)
            parallel_for_(rowRange, ParallelStackBlurRow<float, float>(src, dst, radiusW), numOfThreads);
        if (doColPass)
            parallel_for_(colRange, ParallelStackBlurColumn<float, float>(dst, dst, radiusH), numOfThreads);
        break;
    default:
        CV_Error(Error::StsNotImplemented,
                 ("Unsupported input format in StackBlur, the supported formats are: CV_8U, CV_16U, CV_16S and CV_32F."));
    }
}
|
} //namespace
|
|
|