From e5d7f446d68c6e99956b7a79beb473d5d2e193e9 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Wed, 7 Nov 2018 23:59:36 +0300 Subject: [PATCH] Merge pull request #13056 from terfendail:box_wintr * Updated boxFilter implementations to use wide universal intrinsics * boxFilter implementation moved to separate file * Replaced ROUNDUP macro with roundUp() function --- modules/core/include/opencv2/core/utility.hpp | 17 + modules/imgproc/src/box_filter.cpp | 1804 +++++++++++++++++ modules/imgproc/src/smooth.cpp | 1634 --------------- 3 files changed, 1821 insertions(+), 1634 deletions(-) create mode 100644 modules/imgproc/src/box_filter.cpp diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp index c576c56b86..8534b6e729 100644 --- a/modules/core/include/opencv2/core/utility.hpp +++ b/modules/core/include/opencv2/core/utility.hpp @@ -517,6 +517,23 @@ static inline size_t divUp(size_t a, unsigned int b) return (a + b - 1) / b; } +/** @brief Round first value up to the nearest multiple of second value. + +Use this function instead of `ceil((float)a / b) * b` expressions. + +@sa divUp +*/ +static inline int roundUp(int a, unsigned int b) +{ + CV_DbgAssert(a >= 0); + return a + b - 1 - (a + b -1) % b; +} +/** @overload */ +static inline size_t roundUp(size_t a, unsigned int b) +{ + return a + b - 1 - (a + b - 1) % b; +} + /** @brief Enables or disables the optimized code. The function can be used to dynamically turn on and off optimized dispatched code (code that uses SSE4.2, AVX/AVX2, diff --git a/modules/imgproc/src/box_filter.cpp b/modules/imgproc/src/box_filter.cpp new file mode 100644 index 0000000000..7b20abb850 --- /dev/null +++ b/modules/imgproc/src/box_filter.cpp @@ -0,0 +1,1804 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, 2018, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +#include + +#include "opencv2/core/hal/intrin.hpp" +#include "opencl_kernels_imgproc.hpp" + +#include "opencv2/core/openvx/ovx_defs.hpp" + +namespace cv +{ + +/****************************************************************************************\ + Box Filter +\****************************************************************************************/ + +template +struct RowSum : + public BaseRowFilter +{ + RowSum( int _ksize, int _anchor ) : + BaseRowFilter() + { + ksize = _ksize; + anchor = _anchor; + } + + virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE + { + const T* S = (const T*)src; + ST* D = (ST*)dst; + int i = 0, k, ksz_cn = ksize*cn; + + width = (width - 1)*cn; + if( ksize == 3 ) + { + for( i = 0; i < width + cn; i++ ) + { + D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2]; + } + } + else if( ksize == 5 ) + { + for( i = 0; i < width + cn; i++ ) + { + D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2] + (ST)S[i + cn*3] + (ST)S[i + cn*4]; + } + } + else if( cn == 1 ) + { + ST s = 0; + for( i = 0; i < ksz_cn; i++ ) + s += (ST)S[i]; + D[0] = s; + for( i = 0; i < width; i++ ) + { + s += (ST)S[i + ksz_cn] - (ST)S[i]; + D[i+1] = s; + } + } + else if( cn == 3 ) + { + ST s0 = 0, s1 = 0, s2 = 0; + for( i = 0; i < ksz_cn; i += 3 ) + { + s0 += (ST)S[i]; + s1 += (ST)S[i+1]; + s2 += (ST)S[i+2]; + } + D[0] = s0; + D[1] = s1; + D[2] = s2; + for( i = 0; i < width; i += 3 ) + { + s0 += (ST)S[i + ksz_cn] - (ST)S[i]; + s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; + s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; + D[i+3] = s0; + D[i+4] = s1; + D[i+5] = s2; + } + } + else if( cn == 4 ) + { + ST s0 = 0, s1 = 0, s2 = 0, s3 = 0; + for( i = 0; i < ksz_cn; i += 4 ) + { + s0 += (ST)S[i]; + s1 += (ST)S[i+1]; + s2 += (ST)S[i+2]; + s3 += (ST)S[i+3]; + } + D[0] = s0; + D[1] = s1; + D[2] = s2; + D[3] = s3; + for( i = 0; i < width; i += 4 ) + { + s0 += (ST)S[i + ksz_cn] - (ST)S[i]; + s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; + s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; + s3 += (ST)S[i + ksz_cn + 3] - (ST)S[i + 3]; + D[i+4] = s0; + D[i+5] = s1; + D[i+6] = s2; + D[i+7] = s3; + } + } + else + for( k = 0; k < cn; k++, S++, D++ ) + { + ST s = 0; + for( i = 0; i < ksz_cn; i += cn ) + s += (ST)S[i]; + D[0] = s; + for( i = 0; i < width; i += cn ) + { + s += (ST)S[i + ksz_cn] - (ST)S[i]; + D[i+cn] = s; + } + } + } +}; + + +template +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int i; + ST* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(ST)); + + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const ST* Sp = (const ST*)src[0]; + + for( i = 0; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const ST* Sp = (const ST*)src[0]; + const ST* Sm = (const ST*)src[1-ksize]; + T* D = (T*)dst; + if( haveScale ) + { + for( i = 0; i <= width - 2; i += 2 ) + { + ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; + D[i] = saturate_cast(s0*_scale); + D[i+1] = saturate_cast(s1*_scale); + s0 -= Sm[i]; s1 -= Sm[i+1]; + SUM[i] = s0; SUM[i+1] = s1; + } + + for( ; i < width; i++ ) + { + ST s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + for( i = 0; i <= width - 2; i += 2 ) + { + ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; + D[i] = saturate_cast(s0); + D[i+1] = saturate_cast(s1); + s0 -= Sm[i]; s1 -= Sm[i+1]; + SUM[i] = s0; SUM[i+1] = s1; + } + + for( ; i < width; i++ ) + { + ST s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } + } + + double scale; + int sumCount; + std::vector sum; +}; + + +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(int)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + int i = 0; +#if CV_SIMD + for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int* Sp = (const int*)src[0]; + const int* Sm = (const int*)src[1-ksize]; + uchar* D = (uchar*)dst; + if( haveScale ) + { + int i = 0; +#if CV_SIMD + v_float32 _v_scale = vx_setall_f32((float)_scale); + for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale)); + v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale)); + + v_uint16 v_dst = v_pack(v_s0d, v_s01d); + v_pack_store(D + i, v_dst); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + v_float32x4 v_scale = v_setall_f32((float)_scale); + for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); + v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); + + v_uint16x8 v_dst = v_pack(v_s0d, v_s01d); + v_pack_store(D + i, v_dst); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + int i = 0; +#if CV_SIMD + for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); + v_pack_store(D + i, v_dst); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); + v_pack_store(D + i, v_dst); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + std::vector sum; +}; + + +template<> +struct ColumnSum : +public BaseColumnFilter +{ + enum { SHIFT = 23 }; + + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + divDelta = 0; + divScale = 1; + if( scale != 1 ) + { + int d = cvRound(1./scale); + double scalef = ((double)(1 << SHIFT))/d; + divScale = cvFloor(scalef); + scalef -= divScale; + divDelta = d/2; + if( scalef < 0.5 ) + divDelta++; + else + divScale++; + } + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + const int ds = divScale; + const int dd = divDelta; + ushort* SUM; + const bool haveScale = scale != 1; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(SUM[0])); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const ushort* Sp = (const ushort*)src[0]; + int i = 0; +#if CV_SIMD + for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes ) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes ) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const ushort* Sp = (const ushort*)src[0]; + const ushort* Sm = (const ushort*)src[1-ksize]; + uchar* D = (uchar*)dst; + if( haveScale ) + { + int i = 0; +#if CV_SIMD + v_uint32 _ds4 = vx_setall_u32((unsigned)ds); + v_uint16 _dd8 = vx_setall_u16((ushort)dd); + + for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes ) + { + v_uint16 _sm0 = vx_load(Sm + i); + v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes); + + v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i)); + v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes)); + + v_uint32 _s00, _s01, _s10, _s11; + + v_expand(_s0 + _dd8, _s00, _s01); + v_expand(_s1 + _dd8, _s10, _s11); + + _s00 = v_shr(_s00*_ds4); + _s01 = v_shr(_s01*_ds4); + _s10 = v_shr(_s10*_ds4); + _s11 = v_shr(_s11*_ds4); + + v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01)); + v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11)); + + _s0 = v_sub_wrap(_s0, _sm0); + _s1 = v_sub_wrap(_s1, _sm1); + + v_store(D + i, v_pack_u(r0, r1)); + v_store(SUM + i, _s0); + v_store(SUM + i + v_uint16::nlanes, _s1); + } +#if CV_SIMD_WIDTH > 16 + v_uint32x4 ds4 = v_setall_u32((unsigned)ds); + v_uint16x8 dd8 = v_setall_u16((ushort)dd); + + for( ; i <= width-v_uint8x16::nlanes; i+=v_uint8x16::nlanes ) + { + v_uint16x8 _sm0 = v_load(Sm + i); + v_uint16x8 _sm1 = v_load(Sm + i + v_uint16x8::nlanes); + + v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i)); + v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + v_uint16x8::nlanes), v_load(Sp + i + v_uint16x8::nlanes)); + + v_uint32x4 _s00, _s01, _s10, _s11; + + v_expand(_s0 + dd8, _s00, _s01); + v_expand(_s1 + dd8, _s10, _s11); + + _s00 = v_shr(_s00*ds4); + _s01 = v_shr(_s01*ds4); + _s10 = v_shr(_s10*ds4); + _s11 = v_shr(_s11*ds4); + + v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01)); + v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11)); + + _s0 = v_sub_wrap(_s0, _sm0); + _s1 = v_sub_wrap(_s1, _sm1); + + v_store(D + i, v_pack_u(r0, r1)); + v_store(SUM + i, _s0); + v_store(SUM + i + v_uint16x8::nlanes, _s1); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = (uchar)((s0 + dd)*ds >> SHIFT); + SUM[i] = (ushort)(s0 - Sm[i]); + } + } + else + { + int i = 0; + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0); + SUM[i] = (ushort)(s0 - Sm[i]); + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + int divDelta; + int divScale; + std::vector sum; +}; + + +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int i; + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(int)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + i = 0; +#if CV_SIMD + for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int* Sp = (const int*)src[0]; + const int* Sm = (const int*)src[1-ksize]; + short* D = (short*)dst; + if( haveScale ) + { + i = 0; +#if CV_SIMD + v_float32 _v_scale = vx_setall_f32((float)_scale); + for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale); + v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale); + v_store(D + i, v_pack(v_s0d, v_s01d)); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + v_float32x4 v_scale = v_setall_f32((float)_scale); + for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); + v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale); + v_store(D + i, v_pack(v_s0d, v_s01d)); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + i = 0; +#if CV_SIMD + for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_store(D + i, v_pack(v_s0, v_s01)); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_store(D + i, v_pack(v_s0, v_s01)); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + std::vector sum; +}; + + +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(int)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + int i = 0; +#if CV_SIMD + for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int* Sp = (const int*)src[0]; + const int* Sm = (const int*)src[1-ksize]; + ushort* D = (ushort*)dst; + if( haveScale ) + { + int i = 0; +#if CV_SIMD + v_float32 _v_scale = vx_setall_f32((float)_scale); + for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale)); + v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale)); + v_store(D + i, v_pack(v_s0d, v_s01d)); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + v_float32x4 v_scale = v_setall_f32((float)_scale); + for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); + v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); + v_store(D + i, v_pack(v_s0d, v_s01d)); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + int i = 0; +#if CV_SIMD + for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + std::vector sum; +}; + +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(int)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + int i = 0; +#if CV_SIMD + for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int* Sp = (const int*)src[0]; + const int* Sm = (const int*)src[1-ksize]; + int* D = (int*)dst; + if( haveScale ) + { + int i = 0; +#if CV_SIMD + v_float32 _v_scale = vx_setall_f32((float)_scale); + for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale); + + v_store(D + i, v_s0d); + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + } +#if CV_SIMD_WIDTH > 16 + v_float32x4 v_scale = v_setall_f32((float)_scale); + for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); + + v_store(D + i, v_s0d); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + int i = 0; +#if CV_SIMD + for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + + v_store(D + i, v_s0); + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + + v_store(D + i, v_s0); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = s0; + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + std::vector sum; +}; + + +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(int)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + int i = 0; +#if CV_SIMD + for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int * Sp = (const int*)src[0]; + const int * Sm = (const int*)src[1-ksize]; + float* D = (float*)dst; + if( haveScale ) + { + int i = 0; + +#if CV_SIMD + v_float32 _v_scale = vx_setall_f32((float)_scale); + for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_store(D + i, v_cvt_f32(v_s0) * _v_scale); + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + } +#if CV_SIMD_WIDTH > 16 + v_float32x4 v_scale = v_setall_f32((float)_scale); + for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_store(D + i, v_cvt_f32(v_s0) * v_scale); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = (float)(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + int i = 0; + +#if CV_SIMD + for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_store(D + i, v_cvt_f32(v_s0)); + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_store(D + i, v_cvt_f32(v_s0)); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = (float)(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + std::vector sum; +}; + +#ifdef HAVE_OPENCL + +static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, int borderType, bool normalize ) +{ + const ocl::Device & dev = ocl::Device::getDefault(); + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + + if (ddepth < 0) + ddepth = sdepth; + + if (anchor.x < 0) + anchor.x = ksize.width / 2; + if (anchor.y < 0) + anchor.y = ksize.height / 2; + + if ( !(dev.isIntel() && (type == CV_8UC1) && + (_src.offset() == 0) && (_src.step() % 4 == 0) && + (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) && + (anchor.x == 1) && (anchor.y == 1) && + (ksize.width == 3) && (ksize.height == 3)) ) + return false; + + float alpha = 1.0f / (ksize.height * ksize.width); + Size size = _src.size(); + size_t globalsize[2] = { 0, 0 }; + size_t localsize[2] = { 0, 0 }; + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; + + globalsize[0] = size.width / 16; + globalsize[1] = size.height / 2; + + char build_opts[1024]; + sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : ""); + + ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts); + if (kernel.empty()) + return false; + + UMat src = _src.getUMat(); + _dst.create(size, CV_MAKETYPE(ddepth, cn)); + if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) + return false; + UMat dst = _dst.getUMat(); + + int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); + idxArg = kernel.set(idxArg, (int)src.step); + idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); + idxArg = kernel.set(idxArg, (int)dst.step); + idxArg = kernel.set(idxArg, (int)dst.rows); + idxArg = kernel.set(idxArg, (int)dst.cols); + if (normalize) + idxArg = kernel.set(idxArg, (float)alpha); + + return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); +} + +static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false ) +{ + const ocl::Device & dev = ocl::Device::getDefault(); + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type); + bool doubleSupport = dev.doubleFPConfig() > 0; + + if (ddepth < 0) + ddepth = sdepth; + + if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || + _src.offset() % esz != 0 || _src.step() % esz != 0) + return false; + + if (anchor.x < 0) + anchor.x = ksize.width / 2; + if (anchor.y < 0) + anchor.y = ksize.height / 2; + + int computeUnits = ocl::Device::getDefault().maxComputeUnits(); + float alpha = 1.0f / (ksize.height * ksize.width); + Size size = _src.size(), wholeSize; + bool isolated = (borderType & BORDER_ISOLATED) != 0; + borderType &= ~BORDER_ISOLATED; + int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)), + wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn); + + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; + size_t globalsize[2] = { (size_t)size.width, (size_t)size.height }; + size_t localsize_general[2] = { 0, 1 }, * localsize = NULL; + + UMat src = _src.getUMat(); + if (!isolated) + { + Point ofs; + src.locateROI(wholeSize, ofs); + } + + int h = isolated ? size.height : wholeSize.height; + int w = isolated ? size.width : wholeSize.width; + + size_t maxWorkItemSizes[32]; + ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes); + int tryWorkItems = (int)maxWorkItemSizes[0]; + + ocl::Kernel kernel; + + if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) && + ((ksize.width < 5 && ksize.height < 5 && esz <= 4) || + (ksize.width == 5 && ksize.height == 5 && cn == 1))) + { + if (w < ksize.width || h < ksize.height) + return false; + + // Figure out what vector size to use for loading the pixels. + int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4; + int pxLoadVecSize = cn * pxLoadNumPixels; + + // Figure out how many pixels per work item to compute in X and Y + // directions. Too many and we run out of registers. + int pxPerWorkItemX = 1, pxPerWorkItemY = 1; + if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) + { + pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8; + pxPerWorkItemY = size.height % 2 ? 1 : 2; + } + else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) + { + pxPerWorkItemX = size.width % 2 ? 1 : 2; + pxPerWorkItemY = size.height % 2 ? 1 : 2; + } + globalsize[0] = size.width / pxPerWorkItemX; + globalsize[1] = size.height / pxPerWorkItemY; + + // Need some padding in the private array for pixels + int privDataWidth = roundUp(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); + + // Make the global size a nice round number so the runtime can pick + // from reasonable choices for the workgroup size + const int wgRound = 256; + globalsize[0] = roundUp(globalsize[0], wgRound); + + char build_options[1024], cvt[2][40]; + sprintf(build_options, "-D cn=%d " + "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " + "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " + "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " + "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " + "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " + "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER", + cn, anchor.x, anchor.y, ksize.width, ksize.height, + pxLoadVecSize, pxLoadNumPixels, + pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], + isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", + privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, + ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), + ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), + normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", + ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV + ); + + + if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options)) + return false; + } + else + { + localsize = localsize_general; + for ( ; ; ) + { + int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height); + + while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2) + BLOCK_SIZE_X /= 2; + while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height) + BLOCK_SIZE_Y *= 2; + + if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height) + return false; + + char cvt[2][50]; + String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s" + " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s" + " -D ST1=%s -D DT1=%s -D cn=%d", + BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)), + ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]), + anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType], + isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "", + normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); + + localsize[0] = BLOCK_SIZE_X; + globalsize[0] = divUp(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X; + globalsize[1] = divUp(size.height, BLOCK_SIZE_Y); + + kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts); + if (kernel.empty()) + return false; + + size_t kernelWorkGroupSize = kernel.workGroupSize(); + if (localsize[0] <= kernelWorkGroupSize) + break; + if (BLOCK_SIZE_X < (int)kernelWorkGroupSize) + return false; + + tryWorkItems = (int)kernelWorkGroupSize; + } + } + + _dst.create(size, CV_MAKETYPE(ddepth, cn)); + UMat dst = _dst.getUMat(); + + int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); + idxArg = kernel.set(idxArg, (int)src.step); + int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); + int srcOffsetY = (int)(src.offset / src.step); + int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width; + int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height; + idxArg = kernel.set(idxArg, srcOffsetX); + idxArg = kernel.set(idxArg, srcOffsetY); + idxArg = kernel.set(idxArg, srcEndX); + idxArg = kernel.set(idxArg, srcEndY); + idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst)); + if (normalize) + idxArg = kernel.set(idxArg, (float)alpha); + + return kernel.run(2, globalsize, localsize, false); +} + +#endif + +} + + +cv::Ptr cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor) +{ + int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); + CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); + + if( anchor < 0 ) + anchor = ksize/2; + + if( sdepth == CV_8U && ddepth == CV_32S ) + return makePtr >(ksize, anchor); + if( sdepth == CV_8U && ddepth == CV_16U ) + return makePtr >(ksize, anchor); + if( sdepth == CV_8U && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16U && ddepth == CV_32S ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16U && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16S && ddepth == CV_32S ) + return makePtr >(ksize, anchor); + if( sdepth == CV_32S && ddepth == CV_32S ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16S && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_32F && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_64F && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + + CV_Error_( CV_StsNotImplemented, + ("Unsupported combination of source format (=%d), and buffer format (=%d)", + srcType, sumType)); +} + + +cv::Ptr cv::getColumnSumFilter(int sumType, int dstType, int ksize, + int anchor, double scale) +{ + int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType); + CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) ); + + if( anchor < 0 ) + anchor = ksize/2; + + if( ddepth == CV_8U && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_8U && sdepth == CV_16U ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_8U && sdepth == CV_64F ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_16U && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_16U && sdepth == CV_64F ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_16S && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_16S && sdepth == CV_64F ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_32S && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_32F && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_32F && sdepth == CV_64F ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_64F && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_64F && sdepth == CV_64F ) + return makePtr >(ksize, anchor, scale); + + CV_Error_( CV_StsNotImplemented, + ("Unsupported combination of sum format (=%d), and destination format (=%d)", + sumType, dstType)); +} + + +cv::Ptr cv::createBoxFilter( int srcType, int dstType, Size ksize, + Point anchor, bool normalize, int borderType ) +{ + int sdepth = CV_MAT_DEPTH(srcType); + int cn = CV_MAT_CN(srcType), sumType = CV_64F; + if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U && + ksize.width*ksize.height <= 256 ) + sumType = CV_16U; + else if( sdepth <= CV_32S && (!normalize || + ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) : + sdepth == CV_16U ? (1 << 15) : (1 << 16))) ) + sumType = CV_32S; + sumType = CV_MAKETYPE( sumType, cn ); + + Ptr rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x ); + Ptr columnFilter = getColumnSumFilter(sumType, + dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1); + + return makePtr(Ptr(), rowFilter, columnFilter, + srcType, dstType, sumType, borderType ); +} + +#ifdef HAVE_OPENVX +namespace cv +{ + namespace ovx { + template <> inline bool skipSmallImages(int w, int h) { return w*h < 640 * 480; } + } + static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType) + { + if (ddepth < 0) + ddepth = CV_8UC1; + if (_src.type() != CV_8UC1 || ddepth != CV_8U || !normalize || + _src.cols() < 3 || _src.rows() < 3 || + ksize.width != 3 || ksize.height != 3 || + (anchor.x >= 0 && anchor.x != 1) || + (anchor.y >= 0 && anchor.y != 1) || + ovx::skipSmallImages(_src.cols(), _src.rows())) + return false; + + Mat src = _src.getMat(); + + if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) + return false; //Process isolated borders only + vx_enum border; + switch (borderType & ~BORDER_ISOLATED) + { + case BORDER_CONSTANT: + border = VX_BORDER_CONSTANT; + break; + case BORDER_REPLICATE: + border = VX_BORDER_REPLICATE; + break; + default: + return false; + } + + _dst.create(src.size(), CV_8UC1); + Mat dst = _dst.getMat(); + + try + { + ivx::Context ctx = ovx::getOpenVXContext(); + + Mat a; + if (dst.data != src.data) + a = src; + else + src.copyTo(a); + + ivx::Image + ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, + ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), + ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, + ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); + + //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments + //since OpenVX standard says nothing about thread-safety for now + ivx::border_t prevBorder = ctx.immediateBorder(); + ctx.setImmediateBorder(border, (vx_uint8)(0)); + ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib)); + ctx.setImmediateBorder(prevBorder); + } + catch (ivx::RuntimeError & e) + { + VX_DbgThrow(e.what()); + } + catch (ivx::WrapperError & e) + { + VX_DbgThrow(e.what()); + } + + return true; + } +} +#endif + +#if defined(HAVE_IPP) +namespace cv +{ +static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType) +{ +#ifdef HAVE_IPP_IW + CV_INSTRUMENT_REGION_IPP(); + +#if IPP_VERSION_X100 < 201801 + // Problem with SSE42 optimization for 16s and some 8u modes + if(ipp::getIppTopFeatures() == ippCPUID_SSE42 && (((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 3 || src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 3 && (ksize.width > 5 || ksize.height > 5)))) + return false; + + // Other optimizations has some degradations too + if((((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 1 && (ksize.width > 5 || ksize.height > 5)))) + return false; +#endif + + if(!normalize) + return false; + + if(!ippiCheckAnchor(anchor, ksize)) + return false; + + try + { + ::ipp::IwiImage iwSrc = ippiGetImage(src); + ::ipp::IwiImage iwDst = ippiGetImage(dst); + ::ipp::IwiSize iwKSize = ippiGetSize(ksize); + ::ipp::IwiBorderSize borderSize(iwKSize); + ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); + if(!ippBorder) + return false; + + CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBox, iwSrc, iwDst, iwKSize, ::ipp::IwDefault(), ippBorder); + } + catch (const ::ipp::IwException &) + { + return false; + } + + return true; +#else + CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(anchor); CV_UNUSED(normalize); CV_UNUSED(borderType); + return false; +#endif +} +} +#endif + + +void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType ) +{ + CV_INSTRUMENT_REGION(); + + CV_OCL_RUN(_dst.isUMat() && + (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || + borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101), + ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) + + CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) + + Mat src = _src.getMat(); + int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype); + if( ddepth < 0 ) + ddepth = sdepth; + _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) ); + Mat dst = _dst.getMat(); + if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 ) + { + if( src.rows == 1 ) + ksize.height = 1; + if( src.cols == 1 ) + ksize.width = 1; + } + + Point ofs; + Size wsz(src.cols, src.rows); + if(!(borderType&BORDER_ISOLATED)) + src.locateROI( wsz, ofs ); + + CALL_HAL(boxFilter, cv_hal_boxFilter, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, ddepth, cn, + ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, + anchor.x, anchor.y, normalize, borderType&~BORDER_ISOLATED); + + CV_OVX_RUN(true, + openvx_boxfilter(src, dst, ddepth, ksize, anchor, normalize, borderType)) + + CV_IPP_RUN_FAST(ipp_boxfilter(src, dst, ksize, anchor, normalize, borderType)); + + borderType = (borderType&~BORDER_ISOLATED); + + Ptr f = createBoxFilter( src.type(), dst.type(), + ksize, anchor, normalize, borderType ); + + f->apply( src, dst, wsz, ofs ); +} + + +void cv::blur( InputArray src, OutputArray dst, + Size ksize, Point anchor, int borderType ) +{ + CV_INSTRUMENT_REGION(); + + boxFilter( src, dst, -1, ksize, anchor, true, borderType ); +} + + +/****************************************************************************************\ + Squared Box Filter +\****************************************************************************************/ + +namespace cv +{ + +template +struct SqrRowSum : + public BaseRowFilter +{ + SqrRowSum( int _ksize, int _anchor ) : + BaseRowFilter() + { + ksize = _ksize; + anchor = _anchor; + } + + virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE + { + const T* S = (const T*)src; + ST* D = (ST*)dst; + int i = 0, k, ksz_cn = ksize*cn; + + width = (width - 1)*cn; + for( k = 0; k < cn; k++, S++, D++ ) + { + ST s = 0; + for( i = 0; i < ksz_cn; i += cn ) + { + ST val = (ST)S[i]; + s += val*val; + } + D[0] = s; + for( i = 0; i < width; i += cn ) + { + ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn]; + s += val1*val1 - val0*val0; + D[i+cn] = s; + } + } + } +}; + +static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) +{ + int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); + CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); + + if( anchor < 0 ) + anchor = ksize/2; + + if( sdepth == CV_8U && ddepth == CV_32S ) + return makePtr >(ksize, anchor); + if( sdepth == CV_8U && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16U && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16S && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_32F && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_64F && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + + CV_Error_( CV_StsNotImplemented, + ("Unsupported combination of source format (=%d), and buffer format (=%d)", + srcType, sumType)); +} + +} + +void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType ) +{ + CV_INSTRUMENT_REGION(); + + int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType); + Size size = _src.size(); + + if( ddepth < 0 ) + ddepth = sdepth < CV_32F ? CV_32F : CV_64F; + + if( borderType != BORDER_CONSTANT && normalize ) + { + if( size.height == 1 ) + ksize.height = 1; + if( size.width == 1 ) + ksize.width = 1; + } + + CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, + ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true)) + + int sumDepth = CV_64F; + if( sdepth == CV_8U ) + sumDepth = CV_32S; + int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn); + + Mat src = _src.getMat(); + _dst.create( size, dstType ); + Mat dst = _dst.getMat(); + + Ptr rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x ); + Ptr columnFilter = getColumnSumFilter(sumType, + dstType, ksize.height, anchor.y, + normalize ? 1./(ksize.width*ksize.height) : 1); + + Ptr f = makePtr(Ptr(), rowFilter, columnFilter, + srcType, dstType, sumType, borderType ); + Point ofs; + Size wsz(src.cols, src.rows); + src.locateROI( wsz, ofs ); + + f->apply( src, dst, wsz, ofs ); +} + +/* End of file. */ diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 4d64caac66..a4292b6089 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -54,1640 +54,6 @@ #include "fixedpoint.inl.hpp" -namespace cv -{ - -/****************************************************************************************\ - Box Filter -\****************************************************************************************/ - -template -struct RowSum : - public BaseRowFilter -{ - RowSum( int _ksize, int _anchor ) : - BaseRowFilter() - { - ksize = _ksize; - anchor = _anchor; - } - - virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - const T* S = (const T*)src; - ST* D = (ST*)dst; - int i = 0, k, ksz_cn = ksize*cn; - - width = (width - 1)*cn; - if( ksize == 3 ) - { - for( i = 0; i < width + cn; i++ ) - { - D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2]; - } - } - else if( ksize == 5 ) - { - for( i = 0; i < width + cn; i++ ) - { - D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2] + (ST)S[i + cn*3] + (ST)S[i + cn*4]; - } - } - else if( cn == 1 ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i++ ) - s += (ST)S[i]; - D[0] = s; - for( i = 0; i < width; i++ ) - { - s += (ST)S[i + ksz_cn] - (ST)S[i]; - D[i+1] = s; - } - } - else if( cn == 3 ) - { - ST s0 = 0, s1 = 0, s2 = 0; - for( i = 0; i < ksz_cn; i += 3 ) - { - s0 += (ST)S[i]; - s1 += (ST)S[i+1]; - s2 += (ST)S[i+2]; - } - D[0] = s0; - D[1] = s1; - D[2] = s2; - for( i = 0; i < width; i += 3 ) - { - s0 += (ST)S[i + ksz_cn] - (ST)S[i]; - s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; - s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; - D[i+3] = s0; - D[i+4] = s1; - D[i+5] = s2; - } - } - else if( cn == 4 ) - { - ST s0 = 0, s1 = 0, s2 = 0, s3 = 0; - for( i = 0; i < ksz_cn; i += 4 ) - { - s0 += (ST)S[i]; - s1 += (ST)S[i+1]; - s2 += (ST)S[i+2]; - s3 += (ST)S[i+3]; - } - D[0] = s0; - D[1] = s1; - D[2] = s2; - D[3] = s3; - for( i = 0; i < width; i += 4 ) - { - s0 += (ST)S[i + ksz_cn] - (ST)S[i]; - s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; - s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; - s3 += (ST)S[i + ksz_cn + 3] - (ST)S[i + 3]; - D[i+4] = s0; - D[i+5] = s1; - D[i+6] = s2; - D[i+7] = s3; - } - } - else - for( k = 0; k < cn; k++, S++, D++ ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i += cn ) - s += (ST)S[i]; - D[0] = s; - for( i = 0; i < width; i += cn ) - { - s += (ST)S[i + ksz_cn] - (ST)S[i]; - D[i+cn] = s; - } - } - } -}; - - -template -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int i; - ST* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(ST)); - - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const ST* Sp = (const ST*)src[0]; - - for( i = 0; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const ST* Sp = (const ST*)src[0]; - const ST* Sm = (const ST*)src[1-ksize]; - T* D = (T*)dst; - if( haveScale ) - { - for( i = 0; i <= width - 2; i += 2 ) - { - ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; - D[i] = saturate_cast(s0*_scale); - D[i+1] = saturate_cast(s1*_scale); - s0 -= Sm[i]; s1 -= Sm[i+1]; - SUM[i] = s0; SUM[i+1] = s1; - } - - for( ; i < width; i++ ) - { - ST s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - for( i = 0; i <= width - 2; i += 2 ) - { - ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; - D[i] = saturate_cast(s0); - D[i+1] = saturate_cast(s1); - s0 -= Sm[i]; s1 -= Sm[i+1]; - SUM[i] = s0; SUM[i+1] = s1; - } - - for( ; i < width; i++ ) - { - ST s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector sum; -}; - - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for (; i <= width - 4; i += 4) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - uchar* D = (uchar*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); - v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); - - v_uint16x8 v_dst = v_pack(v_s0d, v_s01d); - v_pack_store(D + i, v_dst); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); - v_pack_store(D + i, v_dst); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector sum; -}; - - -template<> -struct ColumnSum : -public BaseColumnFilter -{ - enum { SHIFT = 23 }; - - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - divDelta = 0; - divScale = 1; - if( scale != 1 ) - { - int d = cvRound(1./scale); - double scalef = ((double)(1 << SHIFT))/d; - divScale = cvFloor(scalef); - scalef -= divScale; - divDelta = d/2; - if( scalef < 0.5 ) - divDelta++; - else - divScale++; - } - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - const int ds = divScale; - const int dd = divDelta; - ushort* SUM; - const bool haveScale = scale != 1; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(SUM[0])); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const ushort* Sp = (const ushort*)src[0]; - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width - 8; i += 8 ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const ushort* Sp = (const ushort*)src[0]; - const ushort* Sm = (const ushort*)src[1-ksize]; - uchar* D = (uchar*)dst; - if( haveScale ) - { - int i = 0; - #if CV_SIMD128 - v_uint32x4 ds4 = v_setall_u32((unsigned)ds); - v_uint16x8 dd8 = v_setall_u16((ushort)dd); - - for( ; i <= width-16; i+=16 ) - { - v_uint16x8 _sm0 = v_load(Sm + i); - v_uint16x8 _sm1 = v_load(Sm + i + 8); - - v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i)); - v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + 8), v_load(Sp + i + 8)); - - v_uint32x4 _s00, _s01, _s10, _s11; - - v_expand(_s0 + dd8, _s00, _s01); - v_expand(_s1 + dd8, _s10, _s11); - - _s00 = v_shr(_s00*ds4); - _s01 = v_shr(_s01*ds4); - _s10 = v_shr(_s10*ds4); - _s11 = v_shr(_s11*ds4); - - v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01)); - v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11)); - - _s0 = v_sub_wrap(_s0, _sm0); - _s1 = v_sub_wrap(_s1, _sm1); - - v_store(D + i, v_pack_u(r0, r1)); - v_store(SUM + i, _s0); - v_store(SUM + i + 8, _s1); - } - #endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = (uchar)((s0 + dd)*ds >> SHIFT); - SUM[i] = (ushort)(s0 - Sm[i]); - } - } - else - { - int i = 0; - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = (ushort)(s0 - Sm[i]); - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - int divDelta; - int divScale; - std::vector sum; -}; - - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int i; - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width - 4; i+=4 ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } - #endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - short* D = (short*)dst; - if( haveScale ) - { - i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); - v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale); - v_store(D + i, v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_store(D + i, v_pack(v_s0, v_s01)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector sum; -}; - - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for (; i <= width - 4; i += 4) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - ushort* D = (ushort*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); - v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); - v_store(D + i, v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector sum; -}; - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width - 4; i+=4 ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - int* D = (int*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-4; i+=4 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); - - v_store(D + i, v_s0d); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width-4; i+=4 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - - v_store(D + i, v_s0); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = s0; - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector sum; -}; - - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width - 4; i+=4 ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } -#endif - - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int * Sp = (const int*)src[0]; - const int * Sm = (const int*)src[1-ksize]; - float* D = (float*)dst; - if( haveScale ) - { - int i = 0; - -#if CV_SIMD128 - if( haveSIMD128 ) - { - v_float32x4 v_scale = v_setall_f32((float)_scale); - for (; i <= width - 8; i += 8) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_store(D + i, v_cvt_f32(v_s0) * v_scale); - v_store(D + i + 4, v_cvt_f32(v_s01) * v_scale); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = (float)(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; - -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_store(D + i, v_cvt_f32(v_s0)); - v_store(D + i + 4, v_cvt_f32(v_s01)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = (float)(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector sum; -}; - -#ifdef HAVE_OPENCL - -static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, int borderType, bool normalize ) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - if (ddepth < 0) - ddepth = sdepth; - - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - if ( !(dev.isIntel() && (type == CV_8UC1) && - (_src.offset() == 0) && (_src.step() % 4 == 0) && - (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) && - (anchor.x == 1) && (anchor.y == 1) && - (ksize.width == 3) && (ksize.height == 3)) ) - return false; - - float alpha = 1.0f / (ksize.height * ksize.width); - Size size = _src.size(); - size_t globalsize[2] = { 0, 0 }; - size_t localsize[2] = { 0, 0 }; - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - - globalsize[0] = size.width / 16; - globalsize[1] = size.height / 2; - - char build_opts[1024]; - sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : ""); - - ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts); - if (kernel.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) - return false; - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); - idxArg = kernel.set(idxArg, (int)dst.step); - idxArg = kernel.set(idxArg, (int)dst.rows); - idxArg = kernel.set(idxArg, (int)dst.cols); - if (normalize) - idxArg = kernel.set(idxArg, (float)alpha); - - return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); -} - -#define DIVUP(total, grain) ((total + grain - 1) / (grain)) -#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n))) - -static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false ) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type); - bool doubleSupport = dev.doubleFPConfig() > 0; - - if (ddepth < 0) - ddepth = sdepth; - - if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || - _src.offset() % esz != 0 || _src.step() % esz != 0) - return false; - - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - int computeUnits = ocl::Device::getDefault().maxComputeUnits(); - float alpha = 1.0f / (ksize.height * ksize.width); - Size size = _src.size(), wholeSize; - bool isolated = (borderType & BORDER_ISOLATED) != 0; - borderType &= ~BORDER_ISOLATED; - int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)), - wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn); - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - size_t globalsize[2] = { (size_t)size.width, (size_t)size.height }; - size_t localsize_general[2] = { 0, 1 }, * localsize = NULL; - - UMat src = _src.getUMat(); - if (!isolated) - { - Point ofs; - src.locateROI(wholeSize, ofs); - } - - int h = isolated ? size.height : wholeSize.height; - int w = isolated ? size.width : wholeSize.width; - - size_t maxWorkItemSizes[32]; - ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes); - int tryWorkItems = (int)maxWorkItemSizes[0]; - - ocl::Kernel kernel; - - if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) && - ((ksize.width < 5 && ksize.height < 5 && esz <= 4) || - (ksize.width == 5 && ksize.height == 5 && cn == 1))) - { - if (w < ksize.width || h < ksize.height) - return false; - - // Figure out what vector size to use for loading the pixels. - int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4; - int pxLoadVecSize = cn * pxLoadNumPixels; - - // Figure out how many pixels per work item to compute in X and Y - // directions. Too many and we run out of registers. - int pxPerWorkItemX = 1, pxPerWorkItemY = 1; - if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) - { - pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8; - pxPerWorkItemY = size.height % 2 ? 1 : 2; - } - else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) - { - pxPerWorkItemX = size.width % 2 ? 1 : 2; - pxPerWorkItemY = size.height % 2 ? 1 : 2; - } - globalsize[0] = size.width / pxPerWorkItemX; - globalsize[1] = size.height / pxPerWorkItemY; - - // Need some padding in the private array for pixels - int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); - - // Make the global size a nice round number so the runtime can pick - // from reasonable choices for the workgroup size - const int wgRound = 256; - globalsize[0] = ROUNDUP(globalsize[0], wgRound); - - char build_options[1024], cvt[2][40]; - sprintf(build_options, "-D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " - "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " - "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " - "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER", - cn, anchor.x, anchor.y, ksize.width, ksize.height, - pxLoadVecSize, pxLoadNumPixels, - pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), - normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", - ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV - ); - - - if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options)) - return false; - } - else - { - localsize = localsize_general; - for ( ; ; ) - { - int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height); - - while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2) - BLOCK_SIZE_X /= 2; - while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height) - BLOCK_SIZE_Y *= 2; - - if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height) - return false; - - char cvt[2][50]; - String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s" - " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s" - " -D ST1=%s -D DT1=%s -D cn=%d", - BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]), - anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType], - isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "", - normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); - - localsize[0] = BLOCK_SIZE_X; - globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X; - globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y); - - kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts); - if (kernel.empty()) - return false; - - size_t kernelWorkGroupSize = kernel.workGroupSize(); - if (localsize[0] <= kernelWorkGroupSize) - break; - if (BLOCK_SIZE_X < (int)kernelWorkGroupSize) - return false; - - tryWorkItems = (int)kernelWorkGroupSize; - } - } - - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); - int srcOffsetY = (int)(src.offset / src.step); - int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width; - int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height; - idxArg = kernel.set(idxArg, srcOffsetX); - idxArg = kernel.set(idxArg, srcOffsetY); - idxArg = kernel.set(idxArg, srcEndX); - idxArg = kernel.set(idxArg, srcEndY); - idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst)); - if (normalize) - idxArg = kernel.set(idxArg, (float)alpha); - - return kernel.run(2, globalsize, localsize, false); -} - -#undef DIVUP -#undef ROUNDUP - -#endif - -} - - -cv::Ptr cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor) -{ - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); - - if( anchor < 0 ) - anchor = ksize/2; - - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_16U ) - return makePtr >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_32S && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_32F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and buffer format (=%d)", - srcType, sumType)); -} - - -cv::Ptr cv::getColumnSumFilter(int sumType, int dstType, int ksize, - int anchor, double scale) -{ - int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) ); - - if( anchor < 0 ) - anchor = ksize/2; - - if( ddepth == CV_8U && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_8U && sdepth == CV_16U ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_8U && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16U && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16U && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16S && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16S && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_32S && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_32F && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_32F && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_64F && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_64F && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of sum format (=%d), and destination format (=%d)", - sumType, dstType)); -} - - -cv::Ptr cv::createBoxFilter( int srcType, int dstType, Size ksize, - Point anchor, bool normalize, int borderType ) -{ - int sdepth = CV_MAT_DEPTH(srcType); - int cn = CV_MAT_CN(srcType), sumType = CV_64F; - if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U && - ksize.width*ksize.height <= 256 ) - sumType = CV_16U; - else if( sdepth <= CV_32S && (!normalize || - ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) : - sdepth == CV_16U ? (1 << 15) : (1 << 16))) ) - sumType = CV_32S; - sumType = CV_MAKETYPE( sumType, cn ); - - Ptr rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x ); - Ptr columnFilter = getColumnSumFilter(sumType, - dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1); - - return makePtr(Ptr(), rowFilter, columnFilter, - srcType, dstType, sumType, borderType ); -} - -#ifdef HAVE_OPENVX -namespace cv -{ - namespace ovx { - template <> inline bool skipSmallImages(int w, int h) { return w*h < 640 * 480; } - } - static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType) - { - if (ddepth < 0) - ddepth = CV_8UC1; - if (_src.type() != CV_8UC1 || ddepth != CV_8U || !normalize || - _src.cols() < 3 || _src.rows() < 3 || - ksize.width != 3 || ksize.height != 3 || - (anchor.x >= 0 && anchor.x != 1) || - (anchor.y >= 0 && anchor.y != 1) || - ovx::skipSmallImages(_src.cols(), _src.rows())) - return false; - - Mat src = _src.getMat(); - - if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) - return false; //Process isolated borders only - vx_enum border; - switch (borderType & ~BORDER_ISOLATED) - { - case BORDER_CONSTANT: - border = VX_BORDER_CONSTANT; - break; - case BORDER_REPLICATE: - border = VX_BORDER_REPLICATE; - break; - default: - return false; - } - - _dst.create(src.size(), CV_8UC1); - Mat dst = _dst.getMat(); - - try - { - ivx::Context ctx = ovx::getOpenVXContext(); - - Mat a; - if (dst.data != src.data) - a = src; - else - src.copyTo(a); - - ivx::Image - ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), - ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); - - //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments - //since OpenVX standard says nothing about thread-safety for now - ivx::border_t prevBorder = ctx.immediateBorder(); - ctx.setImmediateBorder(border, (vx_uint8)(0)); - ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib)); - ctx.setImmediateBorder(prevBorder); - } - catch (ivx::RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (ivx::WrapperError & e) - { - VX_DbgThrow(e.what()); - } - - return true; - } -} -#endif - -#if defined(HAVE_IPP) -namespace cv -{ -static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201801 - // Problem with SSE42 optimization for 16s and some 8u modes - if(ipp::getIppTopFeatures() == ippCPUID_SSE42 && (((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 3 || src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 3 && (ksize.width > 5 || ksize.height > 5)))) - return false; - - // Other optimizations has some degradations too - if((((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 1 && (ksize.width > 5 || ksize.height > 5)))) - return false; -#endif - - if(!normalize) - return false; - - if(!ippiCheckAnchor(anchor, ksize)) - return false; - - try - { - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiSize iwKSize = ippiGetSize(ksize); - ::ipp::IwiBorderSize borderSize(iwKSize); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBox, iwSrc, iwDst, iwKSize, ::ipp::IwDefault(), ippBorder); - } - catch (const ::ipp::IwException &) - { - return false; - } - - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(anchor); CV_UNUSED(normalize); CV_UNUSED(borderType); - return false; -#endif -} -} -#endif - - -void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_dst.isUMat() && - (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || - borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101), - ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) - - CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) - - Mat src = _src.getMat(); - int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype); - if( ddepth < 0 ) - ddepth = sdepth; - _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) ); - Mat dst = _dst.getMat(); - if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 ) - { - if( src.rows == 1 ) - ksize.height = 1; - if( src.cols == 1 ) - ksize.width = 1; - } - - Point ofs; - Size wsz(src.cols, src.rows); - if(!(borderType&BORDER_ISOLATED)) - src.locateROI( wsz, ofs ); - - CALL_HAL(boxFilter, cv_hal_boxFilter, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, ddepth, cn, - ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, - anchor.x, anchor.y, normalize, borderType&~BORDER_ISOLATED); - - CV_OVX_RUN(true, - openvx_boxfilter(src, dst, ddepth, ksize, anchor, normalize, borderType)) - - CV_IPP_RUN_FAST(ipp_boxfilter(src, dst, ksize, anchor, normalize, borderType)); - - borderType = (borderType&~BORDER_ISOLATED); - - Ptr f = createBoxFilter( src.type(), dst.type(), - ksize, anchor, normalize, borderType ); - - f->apply( src, dst, wsz, ofs ); -} - - -void cv::blur( InputArray src, OutputArray dst, - Size ksize, Point anchor, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - boxFilter( src, dst, -1, ksize, anchor, true, borderType ); -} - - -/****************************************************************************************\ - Squared Box Filter -\****************************************************************************************/ - -namespace cv -{ - -template -struct SqrRowSum : - public BaseRowFilter -{ - SqrRowSum( int _ksize, int _anchor ) : - BaseRowFilter() - { - ksize = _ksize; - anchor = _anchor; - } - - virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - const T* S = (const T*)src; - ST* D = (ST*)dst; - int i = 0, k, ksz_cn = ksize*cn; - - width = (width - 1)*cn; - for( k = 0; k < cn; k++, S++, D++ ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i += cn ) - { - ST val = (ST)S[i]; - s += val*val; - } - D[0] = s; - for( i = 0; i < width; i += cn ) - { - ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn]; - s += val1*val1 - val0*val0; - D[i+cn] = s; - } - } - } -}; - -static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) -{ - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); - - if( anchor < 0 ) - anchor = ksize/2; - - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_32F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and buffer format (=%d)", - srcType, sumType)); -} - -} - -void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType); - Size size = _src.size(); - - if( ddepth < 0 ) - ddepth = sdepth < CV_32F ? CV_32F : CV_64F; - - if( borderType != BORDER_CONSTANT && normalize ) - { - if( size.height == 1 ) - ksize.height = 1; - if( size.width == 1 ) - ksize.width = 1; - } - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, - ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true)) - - int sumDepth = CV_64F; - if( sdepth == CV_8U ) - sumDepth = CV_32S; - int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn); - - Mat src = _src.getMat(); - _dst.create( size, dstType ); - Mat dst = _dst.getMat(); - - Ptr rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x ); - Ptr columnFilter = getColumnSumFilter(sumType, - dstType, ksize.height, anchor.y, - normalize ? 1./(ksize.width*ksize.height) : 1); - - Ptr f = makePtr(Ptr(), rowFilter, columnFilter, - srcType, dstType, sumType, borderType ); - Point ofs; - Size wsz(src.cols, src.rows); - src.locateROI( wsz, ofs ); - - f->apply( src, dst, wsz, ofs ); -} - - /****************************************************************************************\ Gaussian Blur \****************************************************************************************/