From 06f32e3b3e327200cf153ea87acc3346cf9ee79c Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Mon, 3 Dec 2018 22:58:31 +0300 Subject: [PATCH] Reworked separable filter to use wide universal intrinsics --- .../include/opencv2/core/hal/intrin_avx.hpp | 10 + modules/imgproc/src/filter.cpp | 2790 +++++------------ 2 files changed, 875 insertions(+), 1925 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 19de221005..c3797d67c1 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1278,6 +1278,16 @@ OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15) OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd) +inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) +{ + return a * b + c; +} + +inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) +{ + return v_fma(a, b, c); +} + inline v_float32x8 v_invsqrt(const v_float32x8& x) { v_float32x8 half = x * v256_setall_f32(0.5); diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index da2370e5a7..9107d0019b 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -44,6 +44,7 @@ #include "opencv2/core/opencl/ocl_defs.hpp" #include "opencl_kernels_imgproc.hpp" #include "hal_replacement.hpp" +#include "opencv2/core/hal/intrin.hpp" #include "filter.hpp" @@ -477,7 +478,7 @@ struct FilterNoVec }; -#if CV_SSE2 +#if CV_SIMD ///////////////////////////////////// 8u-16s & 8u-8u ////////////////////////////////// @@ -502,9 +503,6 @@ struct RowVec_8u32s int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; int* dst = (int*)_dst; const int* _kx = kernel.ptr(); @@ -512,52 +510,81 @@ struct RowVec_8u32s if( smallValues ) { - __m128i z = _mm_setzero_si128(); - for( ; i <= width - 8; i += 8 ) + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { const uchar* src = _src + i; - __m128i s0 = z, s1 = z; - - for( k = 0; k < _ksize; k++, src += cn ) + v_int32 s0 = vx_setzero_s32(); + v_int32 s1 = vx_setzero_s32(); + v_int32 s2 = vx_setzero_s32(); + v_int32 s3 = vx_setzero_s32(); + k = 0; + for (; k <= _ksize - 2; k += 2, src += 2 * cn) + { + v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); + v_uint8 x0, x1; + v_zip(vx_load(src), vx_load(src + cn), x0, x1); + s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)); + s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f)); + s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)); + s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)); + } + if (k < _ksize) + { + v_int32 f = vx_setall_s32(_kx[k]); + v_uint16 x0, x1; + v_expand(vx_load(src), x0, x1); + s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)); + s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f)); + s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)); + s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)); + } + v_store(dst + i, s0); + v_store(dst + i + v_int32::nlanes, s1); + v_store(dst + i + 2*v_int32::nlanes, s2); + v_store(dst + i + 3*v_int32::nlanes, s3); + } + if( i <= width - v_uint16::nlanes ) + { + const uchar* src = _src + i; + v_int32 s0 = vx_setzero_s32(); + v_int32 s1 = vx_setzero_s32(); + k = 0; + for( ; k <= _ksize - 2; k += 2, src += 2*cn ) { - __m128i f = _mm_cvtsi32_si128(_kx[k]); - f = _mm_shuffle_epi32(f, 0); - - __m128i x0 = _mm_loadl_epi64((const __m128i*)src); - x0 = _mm_unpacklo_epi8(x0, z); - - __m128i x1 = _mm_unpackhi_epi16(x0, z); - x0 = _mm_unpacklo_epi16(x0, z); - - x0 = _mm_madd_epi16(x0, f); - x1 = _mm_madd_epi16(x1, f); - - s0 = _mm_add_epi32(s0, x0); - s1 = _mm_add_epi32(s1, x1); + v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); + v_uint16 x0, x1; + v_zip(vx_load_expand(src), vx_load_expand(src + cn), x0, x1); + s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)); + s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)); } - - _mm_store_si128((__m128i*)(dst + i), s0); - _mm_store_si128((__m128i*)(dst + i + 4), s1); + if( k < _ksize ) + { + v_int32 f = vx_setall_s32(_kx[k]); + v_uint32 x0, x1; + v_expand(vx_load_expand(src), x0, x1); + s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)); + s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)); + } + v_store(dst + i, s0); + v_store(dst + i + v_int32::nlanes, s1); + i += v_uint16::nlanes; } - - if( i <= width - 4 ) + if( i <= width - v_uint32::nlanes ) { + v_int32 d = vx_setzero_s32(); + k = 0; const uchar* src = _src + i; - __m128i s0 = z; - - for( k = 0; k < _ksize; k++, src += cn ) + for (; k <= _ksize - 2; k += 2, src += 2*cn) { - __m128i f = _mm_cvtsi32_si128(_kx[k]); - f = _mm_shuffle_epi32(f, 0); - - __m128i x0 = _mm_cvtsi32_si128(*(const int*)src); - x0 = _mm_unpacklo_epi8(x0, z); - x0 = _mm_unpacklo_epi16(x0, z); - x0 = _mm_madd_epi16(x0, f); - s0 = _mm_add_epi32(s0, x0); + v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); + v_uint32 x0, x1; + v_zip(vx_load_expand_q(src), vx_load_expand_q(src + cn), x0, x1); + d += v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f)); } - _mm_store_si128((__m128i*)(dst + i), s0); - i += 4; + if (k < _ksize) + d += v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k]))); + v_store(dst + i, d); + i += v_uint32::nlanes; } } return i; @@ -590,9 +617,6 @@ struct SymmRowSmallVec_8u32s int operator()(const uchar* src, uchar* _dst, int width, int cn) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1; int* dst = (int*)_dst; bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; @@ -603,7 +627,6 @@ struct SymmRowSmallVec_8u32s src += (_ksize/2)*cn; width *= cn; - __m128i z = _mm_setzero_si128(); if( symmetrical ) { if( _ksize == 1 ) @@ -611,143 +634,276 @@ struct SymmRowSmallVec_8u32s if( _ksize == 3 ) { if( kx[0] == 2 && kx[1] == 1 ) - for( ; i <= width - 16; i += 16, src += 16 ) + { + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + { + v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; + v_expand(vx_load(src - cn), x0l, x0h); + v_expand(vx_load(src), x1l, x1h); + v_expand(vx_load(src + cn), x2l, x2h); + x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l)); + x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h)); + v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x1l))); + v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1l))); + v_store(dst + i + 2*v_int32::nlanes, v_reinterpret_as_s32(v_expand_low(x1h))); + v_store(dst + i + 3*v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1h))); + } + if( i <= width - v_uint16::nlanes ) + { + v_uint16 x = vx_load_expand(src); + x = v_add_wrap(v_add_wrap(x, x), v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn))); + v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x))); + v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x))); + i += v_uint16::nlanes; src += v_uint16::nlanes; + } + if( i <= width - v_uint32::nlanes ) { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_si128((__m128i*)(src - cn)); - x1 = _mm_loadu_si128((__m128i*)src); - x2 = _mm_loadu_si128((__m128i*)(src + cn)); - y0 = _mm_unpackhi_epi8(x0, z); - x0 = _mm_unpacklo_epi8(x0, z); - y1 = _mm_unpackhi_epi8(x1, z); - x1 = _mm_unpacklo_epi8(x1, z); - y2 = _mm_unpackhi_epi8(x2, z); - x2 = _mm_unpacklo_epi8(x2, z); - x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2)); - y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2)); - _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z)); - _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z)); - _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z)); - _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z)); + v_uint32 x = vx_load_expand_q(src); + x = (x + x) + vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn); + v_store(dst + i, v_reinterpret_as_s32(x)); + i += v_uint32::nlanes; } + } else if( kx[0] == -2 && kx[1] == 1 ) - for( ; i <= width - 16; i += 16, src += 16 ) + { + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + { + v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; + v_expand(vx_load(src - cn), x0l, x0h); + v_expand(vx_load(src), x1l, x1h); + v_expand(vx_load(src + cn), x2l, x2h); + x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l)); + x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h)); + v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l))); + v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l))); + v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h))); + v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h))); + } + if( i <= width - v_uint16::nlanes ) + { + v_uint16 x = vx_load_expand(src); + x = v_sub_wrap(v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_add_wrap(x, x)); + v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x))); + v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x))); + i += v_uint16::nlanes; src += v_uint16::nlanes; + } + if( i <= width - v_uint32::nlanes ) { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_si128((__m128i*)(src - cn)); - x1 = _mm_loadu_si128((__m128i*)src); - x2 = _mm_loadu_si128((__m128i*)(src + cn)); - y0 = _mm_unpackhi_epi8(x0, z); - x0 = _mm_unpacklo_epi8(x0, z); - y1 = _mm_unpackhi_epi8(x1, z); - x1 = _mm_unpacklo_epi8(x1, z); - y2 = _mm_unpackhi_epi8(x2, z); - x2 = _mm_unpacklo_epi8(x2, z); - x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1))); - y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1))); - _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16)); - _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16)); - _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16)); - _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16)); + v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src)); + x = v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) - (x + x); + v_store(dst + i, x); + i += v_uint32::nlanes; } + } else { - __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0), - k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0); - k1 = _mm_packs_epi32(k1, k1); - - for( ; i <= width - 8; i += 8, src += 8 ) + v_int16 k0 = vx_setall_s16((short)kx[0]); + v_int16 k1 = vx_setall_s16((short)kx[1]); + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + { + v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; + v_expand(vx_load(src - cn), x0l, x0h); + v_expand(vx_load(src), x1l, x1h); + v_expand(vx_load(src + cn), x2l, x2h); + + v_int32 dl, dh; + v_int16 x0, x1; + v_mul_expand(v_reinterpret_as_s16(x1l), k0, dl, dh); + v_zip(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x2l), x0, x1); + dl += v_dotprod(x0, k1); + dh += v_dotprod(x1, k1); + v_store(dst + i, dl); + v_store(dst + i + v_int32::nlanes, dh); + + v_mul_expand(v_reinterpret_as_s16(x1h), k0, dl, dh); + v_zip(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x2h), x0, x1); + dl += v_dotprod(x0, k1); + dh += v_dotprod(x1, k1); + v_store(dst + i + 2*v_int32::nlanes, dl); + v_store(dst + i + 3*v_int32::nlanes, dh); + } + if ( i <= width - v_uint16::nlanes ) + { + v_int32 dl, dh; + v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, dl, dh); + v_int16 x0, x1; + v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn)), v_reinterpret_as_s16(vx_load_expand(src + cn)), x0, x1); + dl += v_dotprod(x0, k1); + dh += v_dotprod(x1, k1); + v_store(dst + i, dl); + v_store(dst + i + v_int32::nlanes, dh); + i += v_uint16::nlanes; src += v_uint16::nlanes; + } + if ( i <= width - v_uint32::nlanes ) { - __m128i x0 = _mm_loadl_epi64((__m128i*)(src - cn)); - __m128i x1 = _mm_loadl_epi64((__m128i*)src); - __m128i x2 = _mm_loadl_epi64((__m128i*)(src + cn)); - - x0 = _mm_unpacklo_epi8(x0, z); - x1 = _mm_unpacklo_epi8(x1, z); - x2 = _mm_unpacklo_epi8(x2, z); - __m128i x3 = _mm_unpacklo_epi16(x0, x2); - __m128i x4 = _mm_unpackhi_epi16(x0, x2); - __m128i x5 = _mm_unpacklo_epi16(x1, z); - __m128i x6 = _mm_unpackhi_epi16(x1, z); - x3 = _mm_madd_epi16(x3, k1); - x4 = _mm_madd_epi16(x4, k1); - x5 = _mm_madd_epi16(x5, k0); - x6 = _mm_madd_epi16(x6, k0); - x3 = _mm_add_epi32(x3, x5); - x4 = _mm_add_epi32(x4, x6); - - _mm_store_si128((__m128i*)(dst + i), x3); - _mm_store_si128((__m128i*)(dst + i + 4), x4); + v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) * vx_setall_s32(kx[1]))); + i += v_uint32::nlanes; } } } else if( _ksize == 5 ) { if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) - for( ; i <= width - 16; i += 16, src += 16 ) + { + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + { + v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; + v_expand(vx_load(src - 2*cn), x0l, x0h); + v_expand(vx_load(src), x1l, x1h); + v_expand(vx_load(src + 2*cn), x2l, x2h); + x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l)); + x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h)); + v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l))); + v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l))); + v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h))); + v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h))); + } + if( i <= width - v_uint16::nlanes ) { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_si128((__m128i*)(src - cn*2)); - x1 = _mm_loadu_si128((__m128i*)src); - x2 = _mm_loadu_si128((__m128i*)(src + cn*2)); - y0 = _mm_unpackhi_epi8(x0, z); - x0 = _mm_unpacklo_epi8(x0, z); - y1 = _mm_unpackhi_epi8(x1, z); - x1 = _mm_unpacklo_epi8(x1, z); - y2 = _mm_unpackhi_epi8(x2, z); - x2 = _mm_unpacklo_epi8(x2, z); - x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1))); - y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1))); - _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16)); - _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16)); - _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16)); - _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16)); + v_uint16 x = vx_load_expand(src); + x = v_sub_wrap(v_add_wrap(vx_load_expand(src - 2*cn), vx_load_expand(src + 2*cn)), v_add_wrap(x, x)); + v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x))); + v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x))); + i += v_uint16::nlanes; src += v_uint16::nlanes; } + if( i <= width - v_uint32::nlanes ) + { + v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src)); + x = v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) - (x + x); + v_store(dst + i, x); + i += v_uint32::nlanes; + } + } else { - __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0), - k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0), - k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0); - k1 = _mm_packs_epi32(k1, k1); - k2 = _mm_packs_epi32(k2, k2); + v_int16 k0 = vx_setall_s16((short)(kx[0])); + v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16))); + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + { + v_int32 x0, x1, x2, x3; + v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h; + v_int16 xl, xh; + + v_expand(vx_load(src), x0l, x0h); + v_mul_expand(v_reinterpret_as_s16(x0l), k0, x0, x1); + v_mul_expand(v_reinterpret_as_s16(x0h), k0, x2, x3); + + v_expand(vx_load(src - cn), x0l, x0h); + v_expand(vx_load(src + cn), x1l, x1h); + v_expand(vx_load(src - 2*cn), x2l, x2h); + v_expand(vx_load(src + 2*cn), x3l, x3h); + v_zip(v_reinterpret_as_s16(x0l + x1l), v_reinterpret_as_s16(x2l + x3l), xl, xh); + x0 += v_dotprod(xl, k12); + x1 += v_dotprod(xh, k12); + v_zip(v_reinterpret_as_s16(x0h + x1h), v_reinterpret_as_s16(x2h + x3h), xl, xh); + x2 += v_dotprod(xl, k12); + x3 += v_dotprod(xh, k12); + + v_store(dst + i, x0); + v_store(dst + i + v_int32::nlanes, x1); + v_store(dst + i + 2*v_int32::nlanes, x2); + v_store(dst + i + 3*v_int32::nlanes, x3); + } + if( i <= width - v_uint16::nlanes ) + { + v_int32 x1, x2; + v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, x1, x2); - for( ; i <= width - 8; i += 8, src += 8 ) + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn) + vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)), xl, xh); + x1 += v_dotprod(xl, k12); + x2 += v_dotprod(xh, k12); + + v_store(dst + i, x1); + v_store(dst + i + v_int32::nlanes, x2); + i += v_uint16::nlanes, src += v_uint16::nlanes; + } + if( i <= width - v_uint32::nlanes ) + { + v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), + v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), + v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) * vx_setall_s32(kx[2])))); + i += v_uint32::nlanes; + } + } + } + else + { + v_int16 k0 = vx_setall_s16((short)(kx[0])); + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + { + v_uint8 v_src = vx_load(src); + v_int32 s0, s1, s2, s3; + v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1); + v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3); + for (k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn) + { + v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16))); + + v_uint8 v_src0 = vx_load(src - j); + v_uint8 v_src1 = vx_load(src - j - cn); + v_uint8 v_src2 = vx_load(src + j); + v_uint8 v_src3 = vx_load(src + j + cn); + + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh); + s0 += v_dotprod(xl, k12); + s1 += v_dotprod(xh, k12); + v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh); + s2 += v_dotprod(xl, k12); + s3 += v_dotprod(xh, k12); + } + if( k < _ksize / 2 + 1 ) + { + v_int16 k1 = vx_setall_s16((short)(kx[k])); + + v_uint8 v_src0 = vx_load(src - j); + v_uint8 v_src1 = vx_load(src + j); + + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh); + s0 += v_dotprod(xl, k1); + s1 += v_dotprod(xh, k1); + v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh); + s2 += v_dotprod(xl, k1); + s3 += v_dotprod(xh, k1); + } + v_store(dst + i, s0); + v_store(dst + i + v_int32::nlanes, s1); + v_store(dst + i + 2*v_int32::nlanes, s2); + v_store(dst + i + 3*v_int32::nlanes, s3); + } + if( i <= width - v_uint16::nlanes ) + { + v_int32 s0, s1; + v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1); + for (k = 1, j = cn; k <= _ksize / 2 - 1; k+=2, j += 2*cn) + { + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(vx_load_expand(src - j) + vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j - cn) + vx_load_expand(src + j + cn)), xl, xh); + v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k+1] << 16))); + s0 += v_dotprod(xl, k12); + s1 += v_dotprod(xh, k12); + } + if ( k < _ksize / 2 + 1 ) { - __m128i x0 = _mm_loadl_epi64((__m128i*)src); - - x0 = _mm_unpacklo_epi8(x0, z); - __m128i x1 = _mm_unpacklo_epi16(x0, z); - __m128i x2 = _mm_unpackhi_epi16(x0, z); - x1 = _mm_madd_epi16(x1, k0); - x2 = _mm_madd_epi16(x2, k0); - - __m128i x3 = _mm_loadl_epi64((__m128i*)(src - cn)); - __m128i x4 = _mm_loadl_epi64((__m128i*)(src + cn)); - - x3 = _mm_unpacklo_epi8(x3, z); - x4 = _mm_unpacklo_epi8(x4, z); - __m128i x5 = _mm_unpacklo_epi16(x3, x4); - __m128i x6 = _mm_unpackhi_epi16(x3, x4); - x5 = _mm_madd_epi16(x5, k1); - x6 = _mm_madd_epi16(x6, k1); - x1 = _mm_add_epi32(x1, x5); - x2 = _mm_add_epi32(x2, x6); - - x3 = _mm_loadl_epi64((__m128i*)(src - cn*2)); - x4 = _mm_loadl_epi64((__m128i*)(src + cn*2)); - - x3 = _mm_unpacklo_epi8(x3, z); - x4 = _mm_unpacklo_epi8(x4, z); - x5 = _mm_unpacklo_epi16(x3, x4); - x6 = _mm_unpackhi_epi16(x3, x4); - x5 = _mm_madd_epi16(x5, k2); - x6 = _mm_madd_epi16(x6, k2); - x1 = _mm_add_epi32(x1, x5); - x2 = _mm_add_epi32(x2, x6); - - _mm_store_si128((__m128i*)(dst + i), x1); - _mm_store_si128((__m128i*)(dst + i + 4), x2); + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(vx_load_expand(src - j)), v_reinterpret_as_s16(vx_load_expand(src + j)), xl, xh); + v_int16 k1 = vx_setall_s16((short)(kx[k])); + s0 += v_dotprod(xl, k1); + s1 += v_dotprod(xh, k1); } + v_store(dst + i, s0); + v_store(dst + i + v_int32::nlanes, s1); + i += v_uint16::nlanes; src += v_uint16::nlanes; + } + if( i <= width - v_uint32::nlanes ) + { + v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]); + for( k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn ) + s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - j) + vx_load_expand_q(src + j)), vx_setall_s32(kx[k]), s0); + v_store(dst + i, s0); + i += v_uint32::nlanes; } } } @@ -756,111 +912,175 @@ struct SymmRowSmallVec_8u32s if( _ksize == 3 ) { if( kx[0] == 0 && kx[1] == 1 ) - for( ; i <= width - 16; i += 16, src += 16 ) + { + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + { + v_uint16 x0l, x0h, x2l, x2h; + v_expand(vx_load(src - cn), x0l, x0h); + v_expand(vx_load(src + cn), x2l, x2h); + v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)); + v_int16 dh = v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)); + v_store(dst + i, v_expand_low(dl)); + v_store(dst + i + v_int32::nlanes, v_expand_high(dl)); + v_store(dst + i + 2*v_int32::nlanes, v_expand_low(dh)); + v_store(dst + i + 3*v_int32::nlanes, v_expand_high(dh)); + } + if( i <= width - v_uint16::nlanes ) { - __m128i x0, x1, y0; - x0 = _mm_loadu_si128((__m128i*)(src + cn)); - x1 = _mm_loadu_si128((__m128i*)(src - cn)); - y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z)); - x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z)); - _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16)); - _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16)); - _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16)); - _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16)); + v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))); + v_store(dst + i, v_expand_low(dl)); + v_store(dst + i + v_int32::nlanes, v_expand_high(dl)); + i += v_uint16::nlanes; src += v_uint16::nlanes; } + if (i <= width - v_uint32::nlanes) + { + v_store(dst + i, v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn))); + i += v_uint32::nlanes; + } + } else { - __m128i k0 = _mm_set_epi32(-kx[1], kx[1], -kx[1], kx[1]); - k0 = _mm_packs_epi32(k0, k0); - - for( ; i <= width - 16; i += 16, src += 16 ) + v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (-kx[1] << 16))); + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + { + v_uint16 x0l, x0h, x2l, x2h; + v_expand(vx_load(src - cn), x0l, x0h); + v_expand(vx_load(src + cn), x2l, x2h); + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(x2l), v_reinterpret_as_s16(x0l), xl, xh); + v_store(dst + i, v_dotprod(xl, k0)); + v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0)); + v_zip(v_reinterpret_as_s16(x2h), v_reinterpret_as_s16(x0h), xl, xh); + v_store(dst + i + 2*v_int32::nlanes, v_dotprod(xl, k0)); + v_store(dst + i + 3*v_int32::nlanes, v_dotprod(xh, k0)); + } + if( i <= width - v_uint16::nlanes ) + { + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - cn)), xl, xh); + v_store(dst + i, v_dotprod(xl, k0)); + v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0)); + i += v_uint16::nlanes; src += v_uint16::nlanes; + } + if (i <= width - v_uint32::nlanes) { - __m128i x0 = _mm_loadu_si128((__m128i*)(src + cn)); - __m128i x1 = _mm_loadu_si128((__m128i*)(src - cn)); - - __m128i x2 = _mm_unpacklo_epi8(x0, z); - __m128i x3 = _mm_unpacklo_epi8(x1, z); - __m128i x4 = _mm_unpackhi_epi8(x0, z); - __m128i x5 = _mm_unpackhi_epi8(x1, z); - __m128i x6 = _mm_unpacklo_epi16(x2, x3); - __m128i x7 = _mm_unpacklo_epi16(x4, x5); - __m128i x8 = _mm_unpackhi_epi16(x2, x3); - __m128i x9 = _mm_unpackhi_epi16(x4, x5); - x6 = _mm_madd_epi16(x6, k0); - x7 = _mm_madd_epi16(x7, k0); - x8 = _mm_madd_epi16(x8, k0); - x9 = _mm_madd_epi16(x9, k0); - - _mm_store_si128((__m128i*)(dst + i), x6); - _mm_store_si128((__m128i*)(dst + i + 4), x8); - _mm_store_si128((__m128i*)(dst + i + 8), x7); - _mm_store_si128((__m128i*)(dst + i + 12), x9); + v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_reinterpret_as_s32(vx_load_expand_q(src - cn)) * vx_setall_s32(-kx[1]))); + i += v_uint32::nlanes; } } } else if( _ksize == 5 ) { - __m128i k0 = _mm_loadl_epi64((__m128i*)(kx + 1)); - k0 = _mm_unpacklo_epi64(k0, k0); - k0 = _mm_packs_epi32(k0, k0); - - for( ; i <= width - 16; i += 16, src += 16 ) + v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16))); + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + { + v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h; + v_expand(vx_load(src - cn), x0l, x0h); + v_expand(vx_load(src - 2*cn), x1l, x1h); + v_expand(vx_load(src + cn), x2l, x2h); + v_expand(vx_load(src + 2*cn), x3l, x3h); + v_int16 x0, x1; + v_zip(v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)), v_reinterpret_as_s16(v_sub_wrap(x3l, x1l)), x0, x1); + v_store(dst + i, v_dotprod(x0, k0)); + v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0)); + v_zip(v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)), v_reinterpret_as_s16(v_sub_wrap(x3h, x1h)), x0, x1); + v_store(dst + i + 2*v_int32::nlanes, v_dotprod(x0, k0)); + v_store(dst + i + 3*v_int32::nlanes, v_dotprod(x1, k0)); + } + if( i <= width - v_uint16::nlanes ) + { + v_int16 x0, x1; + v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))), + v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + 2*cn), vx_load_expand(src - 2*cn))), x0, x1); + v_store(dst + i, v_dotprod(x0, k0)); + v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0)); + i += v_uint16::nlanes; src += v_uint16::nlanes; + } + if( i <= width - v_uint32::nlanes ) { - __m128i x0 = _mm_loadu_si128((__m128i*)(src + cn)); - __m128i x1 = _mm_loadu_si128((__m128i*)(src - cn)); - - __m128i x2 = _mm_unpackhi_epi8(x0, z); - __m128i x3 = _mm_unpackhi_epi8(x1, z); - x0 = _mm_unpacklo_epi8(x0, z); - x1 = _mm_unpacklo_epi8(x1, z); - __m128i x5 = _mm_sub_epi16(x2, x3); - __m128i x4 = _mm_sub_epi16(x0, x1); - - __m128i x6 = _mm_loadu_si128((__m128i*)(src + cn * 2)); - __m128i x7 = _mm_loadu_si128((__m128i*)(src - cn * 2)); - - __m128i x8 = _mm_unpackhi_epi8(x6, z); - __m128i x9 = _mm_unpackhi_epi8(x7, z); - x6 = _mm_unpacklo_epi8(x6, z); - x7 = _mm_unpacklo_epi8(x7, z); - __m128i x11 = _mm_sub_epi16(x8, x9); - __m128i x10 = _mm_sub_epi16(x6, x7); - - __m128i x13 = _mm_unpackhi_epi16(x5, x11); - __m128i x12 = _mm_unpackhi_epi16(x4, x10); - x5 = _mm_unpacklo_epi16(x5, x11); - x4 = _mm_unpacklo_epi16(x4, x10); - x5 = _mm_madd_epi16(x5, k0); - x4 = _mm_madd_epi16(x4, k0); - x13 = _mm_madd_epi16(x13, k0); - x12 = _mm_madd_epi16(x12, k0); - - _mm_store_si128((__m128i*)(dst + i), x4); - _mm_store_si128((__m128i*)(dst + i + 4), x12); - _mm_store_si128((__m128i*)(dst + i + 8), x5); - _mm_store_si128((__m128i*)(dst + i + 12), x13); + v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(kx[1]), + (v_reinterpret_as_s32(vx_load_expand_q(src + 2*cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn))) * vx_setall_s32(kx[2]))); + i += v_uint32::nlanes; } } - } - - src -= (_ksize/2)*cn; - kx -= _ksize/2; - for( ; i <= width - 4; i += 4, src += 4 ) - { - __m128i s0 = z; - - for( k = j = 0; k < _ksize; k++, j += cn ) + else { - __m128i f = _mm_cvtsi32_si128(kx[k]); - f = _mm_shuffle_epi32(f, 0); - - __m128i x0 = _mm_cvtsi32_si128(*(const int*)(src + j)); - x0 = _mm_unpacklo_epi8(x0, z); - x0 = _mm_unpacklo_epi16(x0, z); - x0 = _mm_madd_epi16(x0, f); - s0 = _mm_add_epi32(s0, x0); + v_int16 k0 = vx_setall_s16((short)(kx[0])); + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) + { + v_uint8 v_src = vx_load(src); + v_int32 s0, s1, s2, s3; + v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1); + v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3); + for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn ) + { + v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16))); + + v_uint8 v_src0 = vx_load(src - j); + v_uint8 v_src1 = vx_load(src - j - cn); + v_uint8 v_src2 = vx_load(src + j); + v_uint8 v_src3 = vx_load(src + j + cn); + + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src2), v_expand_low(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src3), v_expand_low(v_src1))), xl, xh); + s0 += v_dotprod(xl, k12); + s1 += v_dotprod(xh, k12); + v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src2), v_expand_high(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src3), v_expand_high(v_src1))), xl, xh); + s2 += v_dotprod(xl, k12); + s3 += v_dotprod(xh, k12); + } + if( k < _ksize / 2 + 1 ) + { + v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16))); + v_uint8 v_src0 = vx_load(src - j); + v_uint8 v_src1 = vx_load(src + j); + + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(v_expand_low(v_src1)), v_reinterpret_as_s16(v_expand_low(v_src0)), xl, xh); + s0 += v_dotprod(xl, k12); + s1 += v_dotprod(xh, k12); + v_zip(v_reinterpret_as_s16(v_expand_high(v_src1)), v_reinterpret_as_s16(v_expand_high(v_src0)), xl, xh); + s2 += v_dotprod(xl, k12); + s3 += v_dotprod(xh, k12); + } + v_store(dst + i, s0); + v_store(dst + i + v_int32::nlanes, s1); + v_store(dst + i + 2*v_int32::nlanes, s2); + v_store(dst + i + 3*v_int32::nlanes, s3); + } + if( i <= width - v_uint16::nlanes ) + { + v_int32 s0, s1; + v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1); + for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn ) + { + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j), vx_load_expand(src - j))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j + cn), vx_load_expand(src - j - cn))), xl, xh); + v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16))); + s0 += v_dotprod(xl, k12); + s1 += v_dotprod(xh, k12); + } + if( k < _ksize / 2 + 1 ) + { + v_int16 k1 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16))); + v_int16 xl, xh; + v_zip(v_reinterpret_as_s16(vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j)), xl, xh); + s0 += v_dotprod(xl, k1); + s1 += v_dotprod(xh, k1); + } + v_store(dst + i, s0); + v_store(dst + i + v_int32::nlanes, s1); + i += v_uint16::nlanes; src += v_uint16::nlanes; + } + if( i <= width - v_uint32::nlanes ) + { + v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]); + for (k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn) + s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + j)) - v_reinterpret_as_s32(vx_load_expand_q(src - j)), vx_setall_s32(kx[k]), s0); + v_store(dst + i, s0); + i += v_uint32::nlanes; + } } - _mm_store_si128((__m128i*)(dst + i), s0); } return i; @@ -885,129 +1105,117 @@ struct SymmColumnVec_32s8u int operator()(const uchar** _src, uchar* dst, int width) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - int ksize2 = (kernel.rows + kernel.cols - 1)/2; + int _ksize = kernel.rows + kernel.cols - 1; + int ksize2 = _ksize/2; const float* ky = kernel.ptr() + ksize2; int i = 0, k; bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; const int** src = (const int**)_src; - const __m128i *S, *S2; - __m128 d4 = _mm_set1_ps(delta); + v_float32 d4 = vx_setall_f32(delta); if( symmetrical ) { - for( ; i <= width - 16; i += 16 ) - { - __m128 f = _mm_load_ss(ky); - f = _mm_shuffle_ps(f, f, 0); - __m128 s0, s1, s2, s3; - __m128i x0, x1; - S = (const __m128i*)(src[0] + i); - s0 = _mm_cvtepi32_ps(_mm_load_si128(S)); - s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1)); - s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); - s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4); - s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2)); - s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3)); - s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4); - s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4); - + if (_ksize == 1) + return 0; + v_float32 f0 = vx_setall_f32(ky[0]); + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) + { + const int* S = src[0] + i; + v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); + v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4); + v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4); for( k = 1; k <= ksize2; k++ ) { - S = (const __m128i*)(src[k] + i); - S2 = (const __m128i*)(src[-k] + i); - f = _mm_load_ss(ky+k); - f = _mm_shuffle_ps(f, f, 0); - x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2)); - x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1)); - s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); - s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); - x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2)); - x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3)); - s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); - s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); + v_float32 f = vx_setall_f32(ky[k]); + const int* S0 = src[k] + i; + const int* S1 = src[-k] + i; + s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); + s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); + s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2); + s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) + vx_load(S1 + 3*v_int32::nlanes)), f, s3); } - - x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); - x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3)); - x0 = _mm_packus_epi16(x0, x1); - _mm_storeu_si128((__m128i*)(dst + i), x0); + v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } - - for( ; i <= width - 4; i += 4 ) + if( i <= width - v_uint16::nlanes ) { - __m128 f = _mm_load_ss(ky); - f = _mm_shuffle_ps(f, f, 0); - __m128i x0; - __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i))); - s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); - + const int* S = src[0] + i; + v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); + v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); for( k = 1; k <= ksize2; k++ ) { - S = (const __m128i*)(src[k] + i); - S2 = (const __m128i*)(src[-k] + i); - f = _mm_load_ss(ky+k); - f = _mm_shuffle_ps(f, f, 0); - x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2)); - s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); + v_float32 f = vx_setall_f32(ky[k]); + const int* S0 = src[k] + i; + const int* S1 = src[-k] + i; + s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); + s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); } - - x0 = _mm_cvtps_epi32(s0); - x0 = _mm_packs_epi32(x0, x0); - x0 = _mm_packus_epi16(x0, x0); - *(int*)(dst + i) = _mm_cvtsi128_si32(x0); + v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); + i += v_uint16::nlanes; + } +#if CV_SIMD_WIDTH > 16 + while( i <= width - v_int32x4::nlanes ) +#else + if( i <= width - v_int32x4::nlanes ) +#endif + { + v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta)); + for( k = 1; k <= ksize2; k++ ) + s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); + v_int32x4 s32 = v_round(s0); + v_int16x8 s16 = v_pack(s32, s32); + *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); + i += v_int32x4::nlanes; } } else { - for( ; i <= width - 16; i += 16 ) + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { - __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4; - __m128i x0, x1; - - for( k = 1; k <= ksize2; k++ ) + v_float32 s0 = d4; + v_float32 s1 = d4; + v_float32 s2 = d4; + v_float32 s3 = d4; + for ( k = 1; k <= ksize2; k++ ) { - S = (const __m128i*)(src[k] + i); - S2 = (const __m128i*)(src[-k] + i); - f = _mm_load_ss(ky+k); - f = _mm_shuffle_ps(f, f, 0); - x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2)); - x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1)); - s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); - s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); - x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2)); - x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3)); - s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); - s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); + v_float32 f = vx_setall_f32(ky[k]); + const int* S0 = src[k] + i; + const int* S1 = src[-k] + i; + s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); + s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); + s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2); + s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) - vx_load(S1 + 3*v_int32::nlanes)), f, s3); } - - x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); - x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3)); - x0 = _mm_packus_epi16(x0, x1); - _mm_storeu_si128((__m128i*)(dst + i), x0); + v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } - - for( ; i <= width - 4; i += 4 ) + if( i <= width - v_uint16::nlanes ) { - __m128 f, s0 = d4; - __m128i x0; - - for( k = 1; k <= ksize2; k++ ) + v_float32 s0 = d4; + v_float32 s1 = d4; + for ( k = 1; k <= ksize2; k++ ) { - S = (const __m128i*)(src[k] + i); - S2 = (const __m128i*)(src[-k] + i); - f = _mm_load_ss(ky+k); - f = _mm_shuffle_ps(f, f, 0); - x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2)); - s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); + v_float32 f = vx_setall_f32(ky[k]); + const int* S0 = src[k] + i; + const int* S1 = src[-k] + i; + s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); + s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); } - - x0 = _mm_cvtps_epi32(s0); - x0 = _mm_packs_epi32(x0, x0); - x0 = _mm_packus_epi16(x0, x0); - *(int*)(dst + i) = _mm_cvtsi128_si32(x0); + v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); + i += v_uint16::nlanes; + } +#if CV_SIMD_WIDTH > 16 + while( i <= width - v_int32x4::nlanes ) +#else + if( i <= width - v_int32x4::nlanes ) +#endif + { + v_float32x4 s0 = v_setall_f32(delta); + for (k = 1; k <= ksize2; k++) + s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); + v_int32x4 s32 = v_round(s0); + v_int16x8 s16 = v_pack(s32, s32); + *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); + i += v_int32x4::nlanes; } } @@ -1033,9 +1241,6 @@ struct SymmColumnSmallVec_32s16s int operator()(const uchar** _src, uchar* _dst, int width) const { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + ksize2; int i = 0; @@ -1043,66 +1248,63 @@ struct SymmColumnSmallVec_32s16s const int** src = (const int**)_src; const int *S0 = src[-1], *S1 = src[0], *S2 = src[1]; short* dst = (short*)_dst; - __m128 df4 = _mm_set1_ps(delta); - __m128i d4 = _mm_cvtps_epi32(df4); + v_float32 df4 = vx_setall_f32(delta); + v_int32 d4 = v_round(df4); if( symmetrical ) { if( ky[0] == 2 && ky[1] == 1 ) { - for( ; i <= width - 8; i += 8 ) + for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + { + v_int32 sl = vx_load(S1 + i); + v_int32 sh = vx_load(S1 + i + v_int32::nlanes); + v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 + (sh + sh))); + } + if( i <= width - v_int32::nlanes ) { - __m128i s0, s1, s2, s3, s4, s5; - s0 = _mm_load_si128((__m128i*)(S0 + i)); - s1 = _mm_load_si128((__m128i*)(S0 + i + 4)); - s2 = _mm_load_si128((__m128i*)(S1 + i)); - s3 = _mm_load_si128((__m128i*)(S1 + i + 4)); - s4 = _mm_load_si128((__m128i*)(S2 + i)); - s5 = _mm_load_si128((__m128i*)(S2 + i + 4)); - s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2))); - s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3))); - s0 = _mm_add_epi32(s0, d4); - s1 = _mm_add_epi32(s1, d4); - _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1)); + v_int32 s = vx_load(S1 + i); + v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (s + s)); + i += v_int32::nlanes; } } else if( ky[0] == -2 && ky[1] == 1 ) { - for( ; i <= width - 8; i += 8 ) + for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + { + v_int32 sl = vx_load(S1 + i); + v_int32 sh = vx_load(S1 + i + v_int32::nlanes); + v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 - (sh + sh))); + } + if( i <= width - v_int32::nlanes ) + { + v_int32 s = vx_load(S1 + i); + v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (s + s)); + i += v_int32::nlanes; + } + } + else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) ) + { + v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]); + for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), + v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); + if( i <= width - v_int32::nlanes ) { - __m128i s0, s1, s2, s3, s4, s5; - s0 = _mm_load_si128((__m128i*)(S0 + i)); - s1 = _mm_load_si128((__m128i*)(S0 + i + 4)); - s2 = _mm_load_si128((__m128i*)(S1 + i)); - s3 = _mm_load_si128((__m128i*)(S1 + i + 4)); - s4 = _mm_load_si128((__m128i*)(S2 + i)); - s5 = _mm_load_si128((__m128i*)(S2 + i + 4)); - s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2))); - s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3))); - s0 = _mm_add_epi32(s0, d4); - s1 = _mm_add_epi32(s1, d4); - _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1)); + v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); + i += v_int32::nlanes; } } else { - __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]); - for( ; i <= width - 8; i += 8 ) + v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); + for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), + v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); + if( i <= width - v_int32::nlanes ) { - __m128 s0, s1; - s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i))); - s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4))); - s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4); - s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4); - __m128i x0, x1; - x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)), - _mm_load_si128((__m128i*)(S2 + i))); - x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)), - _mm_load_si128((__m128i*)(S2 + i + 4))); - s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1)); - s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1)); - x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); - _mm_storeu_si128((__m128i*)(dst + i), x0); + v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4)))); + i += v_int32::nlanes; } } } @@ -1112,33 +1314,24 @@ struct SymmColumnSmallVec_32s16s { if( ky[1] < 0 ) std::swap(S0, S2); - for( ; i <= width - 8; i += 8 ) + for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i) + d4, vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes) + d4)); + if( i <= width - v_int32::nlanes ) { - __m128i s0, s1, s2, s3; - s0 = _mm_load_si128((__m128i*)(S2 + i)); - s1 = _mm_load_si128((__m128i*)(S2 + i + 4)); - s2 = _mm_load_si128((__m128i*)(S0 + i)); - s3 = _mm_load_si128((__m128i*)(S0 + i + 4)); - s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4); - s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4); - _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1)); + v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4); + i += v_int32::nlanes; } } else { - __m128 k1 = _mm_set1_ps(ky[1]); - for( ; i <= width - 8; i += 8 ) + v_float32 k1 = vx_setall_f32(ky[1]); + for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) + v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), + v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); + if( i <= width - v_int32::nlanes ) { - __m128 s0 = df4, s1 = df4; - __m128i x0, x1; - x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i)), - _mm_load_si128((__m128i*)(S0 + i))); - x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i + 4)), - _mm_load_si128((__m128i*)(S0 + i + 4))); - s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1)); - s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1)); - x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); - _mm_storeu_si128((__m128i*)(dst + i), x0); + v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4))); + i += v_int32::nlanes; } } } @@ -1156,188 +1349,118 @@ struct SymmColumnSmallVec_32s16s struct RowVec_16s32f { - RowVec_16s32f() { sse2_supported = false; } + RowVec_16s32f() {} RowVec_16s32f( const Mat& _kernel ) { kernel = _kernel; - sse2_supported = checkHardwareSupport(CV_CPU_SSE2); } int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { - if( !sse2_supported ) - return 0; - int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; float* dst = (float*)_dst; const float* _kx = kernel.ptr(); width *= cn; - for( ; i <= width - 8; i += 8 ) + for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) { const short* src = (const short*)_src + i; - __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1; + v_float32 s0 = vx_setzero_f32(); + v_float32 s1 = vx_setzero_f32(); for( k = 0; k < _ksize; k++, src += cn ) { - f = _mm_load_ss(_kx+k); - f = _mm_shuffle_ps(f, f, 0); - - __m128i x0i = _mm_loadu_si128((const __m128i*)src); - __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16); - x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16); - x0 = _mm_cvtepi32_ps(x0i); - x1 = _mm_cvtepi32_ps(x1i); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); + v_int16 x = vx_load(src); + s0 = v_muladd(v_cvt_f32(v_expand_low(x)), vx_setall_f32(_kx[k]), s0); + s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(_kx[k]), s1); } - _mm_store_ps(dst + i, s0); - _mm_store_ps(dst + i + 4, s1); + v_store(dst + i, s0); + v_store(dst + i + v_float32::nlanes, s1); + } + if( i <= width - v_float32::nlanes ) + { + const short* src = (const short*)_src + i; + v_float32 s0 = vx_setzero_f32(); + for( k = 0; k < _ksize; k++, src += cn ) + s0 = v_muladd(v_cvt_f32(vx_load_expand(src)), vx_setall_f32(_kx[k]), s0); + v_store(dst + i, s0); + i += v_float32::nlanes; } return i; } Mat kernel; - bool sse2_supported; }; struct SymmColumnVec_32f16s { - SymmColumnVec_32f16s() { symmetryType=0; delta = 0; sse2_supported = false; } + SymmColumnVec_32f16s() { symmetryType=0; delta = 0; } SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta) { symmetryType = _symmetryType; kernel = _kernel; delta = (float)_delta; CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - sse2_supported = checkHardwareSupport(CV_CPU_SSE2); } int operator()(const uchar** _src, uchar* _dst, int width) const { - if( !sse2_supported ) - return 0; - - int ksize2 = (kernel.rows + kernel.cols - 1)/2; + int _ksize = kernel.rows + kernel.cols - 1; + int ksize2 = _ksize / 2; const float* ky = kernel.ptr() + ksize2; int i = 0, k; bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; const float** src = (const float**)_src; - const float *S, *S2; short* dst = (short*)_dst; - __m128 d4 = _mm_set1_ps(delta); + v_float32 d4 = vx_setall_f32(delta); if( symmetrical ) { - for( ; i <= width - 16; i += 16 ) + if (_ksize == 1) + return 0; + v_float32 k0 = vx_setall_f32(ky[0]); + for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) { - __m128 f = _mm_load_ss(ky); - f = _mm_shuffle_ps(f, f, 0); - __m128 s0, s1, s2, s3; - __m128 x0, x1; - S = src[0] + i; - s0 = _mm_load_ps(S); - s1 = _mm_load_ps(S+4); - s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); - s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4); - s2 = _mm_load_ps(S+8); - s3 = _mm_load_ps(S+12); - s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4); - s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4); - + v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); + v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); for( k = 1; k <= ksize2; k++ ) { - S = src[k] + i; - S2 = src[-k] + i; - f = _mm_load_ss(ky+k); - f = _mm_shuffle_ps(f, f, 0); - x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2)); - x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); - x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8)); - x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12)); - s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f)); - s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f)); + v_float32 k1 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); } - - __m128i s0i = _mm_cvtps_epi32(s0); - __m128i s1i = _mm_cvtps_epi32(s1); - __m128i s2i = _mm_cvtps_epi32(s2); - __m128i s3i = _mm_cvtps_epi32(s3); - - _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i)); - _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i)); + v_store(dst + i, v_pack(v_round(s0), v_round(s1))); } - - for( ; i <= width - 4; i += 4 ) + if( i <= width - v_float32::nlanes ) { - __m128 f = _mm_load_ss(ky); - f = _mm_shuffle_ps(f, f, 0); - __m128 x0, s0 = _mm_load_ps(src[0] + i); - s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); - + v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); for( k = 1; k <= ksize2; k++ ) - { - f = _mm_load_ss(ky+k); - f = _mm_shuffle_ps(f, f, 0); - S = src[k] + i; - S2 = src[-k] + i; - x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - } - - __m128i s0i = _mm_cvtps_epi32(s0); - _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i)); + s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + v_pack_store(dst + i, v_round(s0)); + i += v_float32::nlanes; } } else { - for( ; i <= width - 16; i += 16 ) + for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes ) { - __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4; - __m128 x0, x1; - S = src[0] + i; - + v_float32 s0 = d4; + v_float32 s1 = d4; for( k = 1; k <= ksize2; k++ ) { - S = src[k] + i; - S2 = src[-k] + i; - f = _mm_load_ss(ky+k); - f = _mm_shuffle_ps(f, f, 0); - x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2)); - x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); - x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8)); - x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12)); - s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f)); - s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f)); + v_float32 k1 = vx_setall_f32(ky[k]); + s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k1, s0); + s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k1, s1); } - - __m128i s0i = _mm_cvtps_epi32(s0); - __m128i s1i = _mm_cvtps_epi32(s1); - __m128i s2i = _mm_cvtps_epi32(s2); - __m128i s3i = _mm_cvtps_epi32(s3); - - _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i)); - _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i)); + v_store(dst + i, v_pack(v_round(s0), v_round(s1))); } - - for( ; i <= width - 4; i += 4 ) + if( i <= width - v_float32::nlanes ) { - __m128 f, x0, s0 = d4; - + v_float32 s0 = d4; for( k = 1; k <= ksize2; k++ ) - { - f = _mm_load_ss(ky+k); - f = _mm_shuffle_ps(f, f, 0); - x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - } - - __m128i s0i = _mm_cvtps_epi32(s0); - _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i)); + s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + v_pack_store(dst + i, v_round(s0)); + i += v_float32::nlanes; } } @@ -1347,7 +1470,6 @@ struct SymmColumnVec_32f16s int symmetryType; float delta; Mat kernel; - bool sse2_supported; }; @@ -1357,7 +1479,6 @@ struct RowVec_32f { RowVec_32f() { - haveSSE = checkHardwareSupport(CV_CPU_SSE); haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #if defined USE_IPP_SEP_FILTERS bufsz = -1; @@ -1367,7 +1488,6 @@ struct RowVec_32f RowVec_32f( const Mat& _kernel ) { kernel = _kernel; - haveSSE = checkHardwareSupport(CV_CPU_SSE); haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #if defined USE_IPP_SEP_FILTERS bufsz = -1; @@ -1389,9 +1509,6 @@ struct RowVec_32f float* dst = (float*)_dst; const float* _kx = kernel.ptr(); - if( !haveSSE ) - return 0; - int i = 0, k; width *= cn; @@ -1399,27 +1516,18 @@ struct RowVec_32f if (haveAVX2) return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize); #endif - for( ; i <= width - 8; i += 8 ) + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) { const float* src = src0 + i; - __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1; + v_float32 s0 = vx_setzero_f32(); for( k = 0; k < _ksize; k++, src += cn ) - { - f = _mm_set1_ps(_kx[k]); - - x0 = _mm_loadu_ps(src); - x1 = _mm_loadu_ps(src + 4); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); - } - _mm_store_ps(dst + i, s0); - _mm_store_ps(dst + i + 4, s1); + s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0); + v_store(dst + i, s0); } return i; } Mat kernel; - bool haveSSE; bool haveAVX2; #if defined USE_IPP_SEP_FILTERS private: @@ -1475,9 +1583,6 @@ struct SymmRowSmallVec_32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - int i = 0, _ksize = kernel.rows + kernel.cols - 1; float* dst = (float*)_dst; const float* src = (const float*)_src + (_ksize/2)*cn; @@ -1491,101 +1596,32 @@ struct SymmRowSmallVec_32f return 0; if( _ksize == 3 ) { - if( kx[0] == 2 && kx[1] == 1 ) - for( ; i <= width - 8; i += 8, src += 8 ) - { - __m128 x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_ps(src - cn); - x1 = _mm_loadu_ps(src); - x2 = _mm_loadu_ps(src + cn); - y0 = _mm_loadu_ps(src - cn + 4); - y1 = _mm_loadu_ps(src + 4); - y2 = _mm_loadu_ps(src + cn + 4); - x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2)); - y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2)); - _mm_store_ps(dst + i, x0); - _mm_store_ps(dst + i + 4, y0); - } - else if( kx[0] == -2 && kx[1] == 1 ) - for( ; i <= width - 8; i += 8, src += 8 ) - { - __m128 x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_ps(src - cn); - x1 = _mm_loadu_ps(src); - x2 = _mm_loadu_ps(src + cn); - y0 = _mm_loadu_ps(src - cn + 4); - y1 = _mm_loadu_ps(src + 4); - y2 = _mm_loadu_ps(src + cn + 4); - x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1))); - y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1))); - _mm_store_ps(dst + i, x0); - _mm_store_ps(dst + i + 4, y0); - } + if( fabs(kx[0]) == 2 && kx[1] == 1 ) + { + v_float32 k0 = vx_setall_f32(kx[0]); + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn))); + } else { - __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]); - for( ; i <= width - 8; i += 8, src += 8 ) - { - __m128 x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_ps(src - cn); - x1 = _mm_loadu_ps(src); - x2 = _mm_loadu_ps(src + cn); - y0 = _mm_loadu_ps(src - cn + 4); - y1 = _mm_loadu_ps(src + 4); - y2 = _mm_loadu_ps(src + cn + 4); - - x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1); - y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1); - x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0)); - y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0)); - _mm_store_ps(dst + i, x0); - _mm_store_ps(dst + i + 4, y0); - } + v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]); + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + v_store(dst + i, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1)); } } else if( _ksize == 5 ) { if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) - for( ; i <= width - 8; i += 8, src += 8 ) - { - __m128 x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_ps(src - cn*2); - x1 = _mm_loadu_ps(src); - x2 = _mm_loadu_ps(src + cn*2); - y0 = _mm_loadu_ps(src - cn*2 + 4); - y1 = _mm_loadu_ps(src + 4); - y2 = _mm_loadu_ps(src + cn*2 + 4); - x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1))); - y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1))); - _mm_store_ps(dst + i, x0); - _mm_store_ps(dst + i + 4, y0); - } + { + v_float32 k0 = vx_setall_f32(-2); + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn))); + } else { - __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]); - for( ; i <= width - 8; i += 8, src += 8 ) - { - __m128 x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_ps(src - cn); - x1 = _mm_loadu_ps(src); - x2 = _mm_loadu_ps(src + cn); - y0 = _mm_loadu_ps(src - cn + 4); - y1 = _mm_loadu_ps(src + 4); - y2 = _mm_loadu_ps(src + cn + 4); - - x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1); - y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1); - x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0)); - y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0)); - - x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2)); - y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4)); - x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2)); - y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2)); - - _mm_store_ps(dst + i, x0); - _mm_store_ps(dst + i + 4, y0); - } + v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]); + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + v_store(dst + i, v_muladd(vx_load(src + 2*cn) + vx_load(src - 2*cn), k2, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1))); } } } @@ -1594,58 +1630,20 @@ struct SymmRowSmallVec_32f if( _ksize == 3 ) { if( kx[0] == 0 && kx[1] == 1 ) - for( ; i <= width - 8; i += 8, src += 8 ) - { - __m128 x0, x2, y0, y2; - x0 = _mm_loadu_ps(src + cn); - x2 = _mm_loadu_ps(src - cn); - y0 = _mm_loadu_ps(src + cn + 4); - y2 = _mm_loadu_ps(src - cn + 4); - x0 = _mm_sub_ps(x0, x2); - y0 = _mm_sub_ps(y0, y2); - _mm_store_ps(dst + i, x0); - _mm_store_ps(dst + i + 4, y0); - } + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + v_store(dst + i, vx_load(src + cn) - vx_load(src - cn)); else { - __m128 k1 = _mm_set1_ps(kx[1]); - for( ; i <= width - 8; i += 8, src += 8 ) - { - __m128 x0, x2, y0, y2; - x0 = _mm_loadu_ps(src + cn); - x2 = _mm_loadu_ps(src - cn); - y0 = _mm_loadu_ps(src + cn + 4); - y2 = _mm_loadu_ps(src - cn + 4); - - x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1); - y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1); - _mm_store_ps(dst + i, x0); - _mm_store_ps(dst + i + 4, y0); - } + v_float32 k1 = vx_setall_f32(kx[1]); + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + v_store(dst + i, (vx_load(src + cn) - vx_load(src - cn)) * k1); } } else if( _ksize == 5 ) { - __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]); - for( ; i <= width - 8; i += 8, src += 8 ) - { - __m128 x0, x2, y0, y2; - x0 = _mm_loadu_ps(src + cn); - x2 = _mm_loadu_ps(src - cn); - y0 = _mm_loadu_ps(src + cn + 4); - y2 = _mm_loadu_ps(src - cn + 4); - - x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1); - y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1); - - x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2)); - y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4)); - x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2)); - y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2)); - - _mm_store_ps(dst + i, x0); - _mm_store_ps(dst + i + 4, y0); - } + v_float32 k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]); + for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) + v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1)); } } @@ -1661,7 +1659,6 @@ struct SymmColumnVec_32f { SymmColumnVec_32f() { symmetryType=0; - haveSSE = checkHardwareSupport(CV_CPU_SSE); haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; delta = 0; } @@ -1670,22 +1667,17 @@ struct SymmColumnVec_32f symmetryType = _symmetryType; kernel = _kernel; delta = (float)_delta; - haveSSE = checkHardwareSupport(CV_CPU_SSE); haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); } int operator()(const uchar** _src, uchar* _dst, int width) const { - if( !haveSSE ) - return 0; - int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + ksize2; int i = 0, k; bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; const float** src = (const float**)_src; - const float *S, *S2; float* dst = (float*)_dst; if( symmetrical ) @@ -1695,869 +1687,28 @@ struct SymmColumnVec_32f if (haveAVX2) return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2); #endif - const __m128 d4 = _mm_set1_ps(delta); - for( ; i <= width - 16; i += 16 ) - { - __m128 f = _mm_set1_ps(ky[0]); - __m128 s0, s1, s2, s3; - __m128 x0, x1; - S = src[0] + i; - s0 = _mm_load_ps(S); - s1 = _mm_load_ps(S+4); - s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); - s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4); - s2 = _mm_load_ps(S+8); - s3 = _mm_load_ps(S+12); - s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4); - s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4); - - for( k = 1; k <= ksize2; k++ ) - { - S = src[k] + i; - S2 = src[-k] + i; - f = _mm_set1_ps(ky[k]); - x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2)); - x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); - x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8)); - x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12)); - s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f)); - s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f)); - } - - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - _mm_storeu_ps(dst + i + 8, s2); - _mm_storeu_ps(dst + i + 12, s3); - } - - for( ; i <= width - 4; i += 4 ) - { - __m128 f = _mm_set1_ps(ky[0]); - __m128 x0, s0 = _mm_load_ps(src[0] + i); - s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); - - for( k = 1; k <= ksize2; k++ ) - { - f = _mm_set1_ps(ky[k]); - S = src[k] + i; - S2 = src[-k] + i; - x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - } - - _mm_storeu_ps(dst + i, s0); - } - } - else - { -#if CV_TRY_AVX2 - if (haveAVX2) - return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); -#endif - const __m128 d4 = _mm_set1_ps(delta); - for( ; i <= width - 16; i += 16 ) - { - __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4; - __m128 x0, x1; - S = src[0] + i; - - for( k = 1; k <= ksize2; k++ ) - { - S = src[k] + i; - S2 = src[-k] + i; - f = _mm_set1_ps(ky[k]); - x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2)); - x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); - x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8)); - x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12)); - s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f)); - s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f)); - } - - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - _mm_storeu_ps(dst + i + 8, s2); - _mm_storeu_ps(dst + i + 12, s3); - } - - for( ; i <= width - 4; i += 4 ) + const v_float32 d4 = vx_setall_f32(delta); + for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) { - __m128 f, x0, s0 = d4; - + v_float32 s0 = v_muladd(vx_load(src[0] + i), vx_setall_f32(ky[0]), d4); for( k = 1; k <= ksize2; k++ ) - { - f = _mm_set1_ps(ky[k]); - x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - } - - _mm_storeu_ps(dst + i, s0); + s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + v_store(dst + i, s0); } } - - return i; - } - - int symmetryType; - float delta; - Mat kernel; - bool haveSSE; - bool haveAVX2; -}; - - -struct SymmColumnSmallVec_32f -{ - SymmColumnSmallVec_32f() { symmetryType=0; delta = 0; } - SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) - { - symmetryType = _symmetryType; - kernel = _kernel; - delta = (float)_delta; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - int ksize2 = (kernel.rows + kernel.cols - 1)/2; - const float* ky = kernel.ptr() + ksize2; - int i = 0; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const float** src = (const float**)_src; - const float *S0 = src[-1], *S1 = src[0], *S2 = src[1]; - float* dst = (float*)_dst; - __m128 d4 = _mm_set1_ps(delta); - - if( symmetrical ) - { - if( ky[0] == 2 && ky[1] == 1 ) - { - for( ; i <= width - 8; i += 8 ) - { - __m128 s0, s1, s2, s3, s4, s5; - s0 = _mm_load_ps(S0 + i); - s1 = _mm_load_ps(S0 + i + 4); - s2 = _mm_load_ps(S1 + i); - s3 = _mm_load_ps(S1 + i + 4); - s4 = _mm_load_ps(S2 + i); - s5 = _mm_load_ps(S2 + i + 4); - s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2))); - s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3))); - s0 = _mm_add_ps(s0, d4); - s1 = _mm_add_ps(s1, d4); - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - } - } - else if( ky[0] == -2 && ky[1] == 1 ) - { - for( ; i <= width - 8; i += 8 ) - { - __m128 s0, s1, s2, s3, s4, s5; - s0 = _mm_load_ps(S0 + i); - s1 = _mm_load_ps(S0 + i + 4); - s2 = _mm_load_ps(S1 + i); - s3 = _mm_load_ps(S1 + i + 4); - s4 = _mm_load_ps(S2 + i); - s5 = _mm_load_ps(S2 + i + 4); - s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2))); - s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3))); - s0 = _mm_add_ps(s0, d4); - s1 = _mm_add_ps(s1, d4); - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - } - } - else - { - __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]); - for( ; i <= width - 8; i += 8 ) - { - __m128 s0, s1, x0, x1; - s0 = _mm_load_ps(S1 + i); - s1 = _mm_load_ps(S1 + i + 4); - s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4); - s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4); - x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i)); - x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1)); - s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1)); - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - } - } - } - else - { - if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) - { - if( ky[1] < 0 ) - std::swap(S0, S2); - for( ; i <= width - 8; i += 8 ) - { - __m128 s0, s1, s2, s3; - s0 = _mm_load_ps(S2 + i); - s1 = _mm_load_ps(S2 + i + 4); - s2 = _mm_load_ps(S0 + i); - s3 = _mm_load_ps(S0 + i + 4); - s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4); - s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4); - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - } - } - else - { - __m128 k1 = _mm_set1_ps(ky[1]); - for( ; i <= width - 8; i += 8 ) - { - __m128 s0 = d4, s1 = d4, x0, x1; - x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i)); - x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1)); - s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1)); - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - } - } - } - - return i; - } - - int symmetryType; - float delta; - Mat kernel; -}; - - -/////////////////////////////// non-separable filters /////////////////////////////// - -///////////////////////////////// 8u<->8u, 8u<->16s ///////////////////////////////// - -struct FilterVec_8u -{ - FilterVec_8u() { delta = 0; _nz = 0; } - FilterVec_8u(const Mat& _kernel, int _bits, double _delta) - { - Mat kernel; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << _bits)); - std::vector coords; - preprocess2DKernel(kernel, coords, coeffs); - _nz = (int)coords.size(); - } - - int operator()(const uchar** src, uchar* dst, int width) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float* kf = (const float*)&coeffs[0]; - int i = 0, k, nz = _nz; - __m128 d4 = _mm_set1_ps(delta); - - for( ; i <= width - 16; i += 16 ) - { - __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4; - __m128i x0, x1, z = _mm_setzero_si128(); - - for( k = 0; k < nz; k++ ) - { - __m128 f = _mm_load_ss(kf+k), t0, t1; - f = _mm_shuffle_ps(f, f, 0); - - x0 = _mm_loadu_si128((const __m128i*)(src[k] + i)); - x1 = _mm_unpackhi_epi8(x0, z); - x0 = _mm_unpacklo_epi8(x0, z); - - t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z)); - t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z)); - s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); - s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f)); - - t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z)); - t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z)); - s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f)); - s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f)); - } - - x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); - x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3)); - x0 = _mm_packus_epi16(x0, x1); - _mm_storeu_si128((__m128i*)(dst + i), x0); - } - - for( ; i <= width - 4; i += 4 ) - { - __m128 s0 = d4; - __m128i x0, z = _mm_setzero_si128(); - - for( k = 0; k < nz; k++ ) - { - __m128 f = _mm_load_ss(kf+k), t0; - f = _mm_shuffle_ps(f, f, 0); - - x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i)); - x0 = _mm_unpacklo_epi8(x0, z); - t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z)); - s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); - } - - x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z); - x0 = _mm_packus_epi16(x0, x0); - *(int*)(dst + i) = _mm_cvtsi128_si32(x0); - } - - return i; - } - - int _nz; - std::vector coeffs; - float delta; -}; - - -struct FilterVec_8u16s -{ - FilterVec_8u16s() { delta = 0; _nz = 0; } - FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta) - { - Mat kernel; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << _bits)); - std::vector coords; - preprocess2DKernel(kernel, coords, coeffs); - _nz = (int)coords.size(); - } - - int operator()(const uchar** src, uchar* _dst, int width) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float* kf = (const float*)&coeffs[0]; - short* dst = (short*)_dst; - int i = 0, k, nz = _nz; - __m128 d4 = _mm_set1_ps(delta); - - for( ; i <= width - 16; i += 16 ) - { - __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4; - __m128i x0, x1, z = _mm_setzero_si128(); - - for( k = 0; k < nz; k++ ) - { - __m128 f = _mm_load_ss(kf+k), t0, t1; - f = _mm_shuffle_ps(f, f, 0); - - x0 = _mm_loadu_si128((const __m128i*)(src[k] + i)); - x1 = _mm_unpackhi_epi8(x0, z); - x0 = _mm_unpacklo_epi8(x0, z); - - t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z)); - t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z)); - s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); - s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f)); - - t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z)); - t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z)); - s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f)); - s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f)); - } - - x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); - x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3)); - _mm_storeu_si128((__m128i*)(dst + i), x0); - _mm_storeu_si128((__m128i*)(dst + i + 8), x1); - } - - for( ; i <= width - 4; i += 4 ) - { - __m128 s0 = d4; - __m128i x0, z = _mm_setzero_si128(); - - for( k = 0; k < nz; k++ ) - { - __m128 f = _mm_load_ss(kf+k), t0; - f = _mm_shuffle_ps(f, f, 0); - - x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i)); - x0 = _mm_unpacklo_epi8(x0, z); - t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z)); - s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); - } - - x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z); - _mm_storel_epi64((__m128i*)(dst + i), x0); - } - - return i; - } - - int _nz; - std::vector coeffs; - float delta; -}; - - -struct FilterVec_32f -{ - FilterVec_32f() { delta = 0; _nz = 0; } - FilterVec_32f(const Mat& _kernel, int, double _delta) - { - delta = (float)_delta; - std::vector coords; - preprocess2DKernel(_kernel, coords, coeffs); - _nz = (int)coords.size(); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float* kf = (const float*)&coeffs[0]; - const float** src = (const float**)_src; - float* dst = (float*)_dst; - int i = 0, k, nz = _nz; - __m128 d4 = _mm_set1_ps(delta); - - for( ; i <= width - 16; i += 16 ) - { - __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4; - - for( k = 0; k < nz; k++ ) - { - __m128 f = _mm_load_ss(kf+k), t0, t1; - f = _mm_shuffle_ps(f, f, 0); - const float* S = src[k] + i; - - t0 = _mm_loadu_ps(S); - t1 = _mm_loadu_ps(S + 4); - s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); - s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f)); - - t0 = _mm_loadu_ps(S + 8); - t1 = _mm_loadu_ps(S + 12); - s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f)); - s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f)); - } - - _mm_storeu_ps(dst + i, s0); - _mm_storeu_ps(dst + i + 4, s1); - _mm_storeu_ps(dst + i + 8, s2); - _mm_storeu_ps(dst + i + 12, s3); - } - - for( ; i <= width - 4; i += 4 ) - { - __m128 s0 = d4; - - for( k = 0; k < nz; k++ ) - { - __m128 f = _mm_load_ss(kf+k), t0; - f = _mm_shuffle_ps(f, f, 0); - t0 = _mm_loadu_ps(src[k] + i); - s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); - } - _mm_storeu_ps(dst + i, s0); - } - - return i; - } - - int _nz; - std::vector coeffs; - float delta; -}; - - -#elif CV_NEON - -struct SymmRowSmallVec_8u32s -{ - SymmRowSmallVec_8u32s() { smallValues = false; } - SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType ) - { - kernel = _kernel; - symmetryType = _symmetryType; - smallValues = true; - int k, ksize = kernel.rows + kernel.cols - 1; - for( k = 0; k < ksize; k++ ) - { - int v = kernel.ptr()[k]; - if( v < SHRT_MIN || v > SHRT_MAX ) - { - smallValues = false; - break; - } - } - } - - int operator()(const uchar* src, uchar* _dst, int width, int cn) const - { - if( !checkHardwareSupport(CV_CPU_NEON) ) - return 0; - - int i = 0, _ksize = kernel.rows + kernel.cols - 1; - int* dst = (int*)_dst; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const int* kx = kernel.ptr() + _ksize/2; - if( !smallValues ) - return 0; - - src += (_ksize/2)*cn; - width *= cn; - - if( symmetrical ) - { - if( _ksize == 1 ) - return 0; - if( _ksize == 3 ) - { - if( kx[0] == 2 && kx[1] == 1 ) - { - uint16x8_t zq = vdupq_n_u16(0); - - for( ; i <= width - 8; i += 8, src += 8 ) - { - uint8x8_t x0, x1, x2; - x0 = vld1_u8( (uint8_t *) (src - cn) ); - x1 = vld1_u8( (uint8_t *) (src) ); - x2 = vld1_u8( (uint8_t *) (src + cn) ); - - uint16x8_t y0, y1, y2; - y0 = vaddl_u8(x0, x2); - y1 = vshll_n_u8(x1, 1); - y2 = vaddq_u16(y0, y1); - - uint16x8x2_t str; - str.val[0] = y2; str.val[1] = zq; - vst2q_u16( (uint16_t *) (dst + i), str ); - } - } - else if( kx[0] == -2 && kx[1] == 1 ) - return 0; - else - { - int32x4_t k32 = vdupq_n_s32(0); - k32 = vld1q_lane_s32(kx, k32, 0); - k32 = vld1q_lane_s32(kx + 1, k32, 1); - - int16x4_t k = vqmovn_s32(k32); - - uint8x8_t z = vdup_n_u8(0); - - for( ; i <= width - 8; i += 8, src += 8 ) - { - uint8x8_t x0, x1, x2; - x0 = vld1_u8( (uint8_t *) (src - cn) ); - x1 = vld1_u8( (uint8_t *) (src) ); - x2 = vld1_u8( (uint8_t *) (src + cn) ); - - int16x8_t y0, y1; - int32x4_t y2, y3; - y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z)); - y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2)); - y2 = vmull_lane_s16(vget_low_s16(y0), k, 0); - y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1); - y3 = vmull_lane_s16(vget_high_s16(y0), k, 0); - y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1); - - vst1q_s32((int32_t *)(dst + i), y2); - vst1q_s32((int32_t *)(dst + i + 4), y3); - } - } - } - else if( _ksize == 5 ) - { - if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) - return 0; - else - { - int32x4_t k32 = vdupq_n_s32(0); - k32 = vld1q_lane_s32(kx, k32, 0); - k32 = vld1q_lane_s32(kx + 1, k32, 1); - k32 = vld1q_lane_s32(kx + 2, k32, 2); - - int16x4_t k = vqmovn_s32(k32); - - uint8x8_t z = vdup_n_u8(0); - - for( ; i <= width - 8; i += 8, src += 8 ) - { - uint8x8_t x0, x1, x2, x3, x4; - x0 = vld1_u8( (uint8_t *) (src - cn) ); - x1 = vld1_u8( (uint8_t *) (src) ); - x2 = vld1_u8( (uint8_t *) (src + cn) ); - - int16x8_t y0, y1; - int32x4_t accl, acch; - y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z)); - y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2)); - accl = vmull_lane_s16(vget_low_s16(y0), k, 0); - accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1); - acch = vmull_lane_s16(vget_high_s16(y0), k, 0); - acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1); - - int16x8_t y2; - x3 = vld1_u8( (uint8_t *) (src - cn*2) ); - x4 = vld1_u8( (uint8_t *) (src + cn*2) ); - y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4)); - accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2); - acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2); - - vst1q_s32((int32_t *)(dst + i), accl); - vst1q_s32((int32_t *)(dst + i + 4), acch); - } - } - } - } - else - { - if( _ksize == 3 ) - { - if( kx[0] == 0 && kx[1] == 1 ) - { - uint8x8_t z = vdup_n_u8(0); - - for( ; i <= width - 8; i += 8, src += 8 ) - { - uint8x8_t x0, x1; - x0 = vld1_u8( (uint8_t *) (src - cn) ); - x1 = vld1_u8( (uint8_t *) (src + cn) ); - - int16x8_t y0; - y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)), - vreinterpretq_s16_u16(vaddl_u8(x0, z))); - - vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0))); - vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0))); - } - } - else - { - int32x4_t k32 = vdupq_n_s32(0); - k32 = vld1q_lane_s32(kx + 1, k32, 1); - - int16x4_t k = vqmovn_s32(k32); - - uint8x8_t z = vdup_n_u8(0); - - for( ; i <= width - 8; i += 8, src += 8 ) - { - uint8x8_t x0, x1; - x0 = vld1_u8( (uint8_t *) (src - cn) ); - x1 = vld1_u8( (uint8_t *) (src + cn) ); - - int16x8_t y0; - int32x4_t y1, y2; - y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)), - vreinterpretq_s16_u16(vaddl_u8(x0, z))); - y1 = vmull_lane_s16(vget_low_s16(y0), k, 1); - y2 = vmull_lane_s16(vget_high_s16(y0), k, 1); - - vst1q_s32((int32_t *)(dst + i), y1); - vst1q_s32((int32_t *)(dst + i + 4), y2); - } - } - } - else if( _ksize == 5 ) - { - int32x4_t k32 = vdupq_n_s32(0); - k32 = vld1q_lane_s32(kx + 1, k32, 1); - k32 = vld1q_lane_s32(kx + 2, k32, 2); - - int16x4_t k = vqmovn_s32(k32); - - uint8x8_t z = vdup_n_u8(0); - - for( ; i <= width - 8; i += 8, src += 8 ) - { - uint8x8_t x0, x1; - x0 = vld1_u8( (uint8_t *) (src - cn) ); - x1 = vld1_u8( (uint8_t *) (src + cn) ); - - int32x4_t accl, acch; - int16x8_t y0; - y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)), - vreinterpretq_s16_u16(vaddl_u8(x0, z))); - accl = vmull_lane_s16(vget_low_s16(y0), k, 1); - acch = vmull_lane_s16(vget_high_s16(y0), k, 1); - - uint8x8_t x2, x3; - x2 = vld1_u8( (uint8_t *) (src - cn*2) ); - x3 = vld1_u8( (uint8_t *) (src + cn*2) ); - - int16x8_t y1; - y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)), - vreinterpretq_s16_u16(vaddl_u8(x2, z))); - accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2); - acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2); - - vst1q_s32((int32_t *)(dst + i), accl); - vst1q_s32((int32_t *)(dst + i + 4), acch); - } - } - } - - return i; - } - - Mat kernel; - int symmetryType; - bool smallValues; -}; - - -struct SymmColumnVec_32s8u -{ - SymmColumnVec_32s8u() { symmetryType=0; } - SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta) - { - symmetryType = _symmetryType; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << _bits)); - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* dst, int width) const - { - if( !checkHardwareSupport(CV_CPU_NEON) ) - return 0; - - int _ksize = kernel.rows + kernel.cols - 1; - int ksize2 = _ksize / 2; - const float* ky = kernel.ptr() + ksize2; - int i = 0, k; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const int** src = (const int**)_src; - const int *S, *S2; - - float32x4_t d4 = vdupq_n_f32(delta); - - if( symmetrical ) - { - if( _ksize == 1 ) - return 0; - - - float32x2_t k32; - k32 = vdup_n_f32(0); - k32 = vld1_lane_f32(ky, k32, 0); - k32 = vld1_lane_f32(ky + 1, k32, 1); - - for( ; i <= width - 8; i += 8 ) - { - float32x4_t accl, acch; - float32x4_t f0l, f0h, f1l, f1h, f2l, f2h; - - S = src[0] + i; - - f0l = vcvtq_f32_s32( vld1q_s32(S) ); - f0h = vcvtq_f32_s32( vld1q_s32(S + 4) ); - - S = src[1] + i; - S2 = src[-1] + i; - - f1l = vcvtq_f32_s32( vld1q_s32(S) ); - f1h = vcvtq_f32_s32( vld1q_s32(S + 4) ); - f2l = vcvtq_f32_s32( vld1q_s32(S2) ); - f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) ); - - accl = acch = d4; - accl = vmlaq_lane_f32(accl, f0l, k32, 0); - acch = vmlaq_lane_f32(acch, f0h, k32, 0); - accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1); - acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1); - - for( k = 2; k <= ksize2; k++ ) - { - S = src[k] + i; - S2 = src[-k] + i; - - float32x4_t f3l, f3h, f4l, f4h; - f3l = vcvtq_f32_s32( vld1q_s32(S) ); - f3h = vcvtq_f32_s32( vld1q_s32(S + 4) ); - f4l = vcvtq_f32_s32( vld1q_s32(S2) ); - f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) ); - - accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]); - acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]); - } - - int32x4_t s32l, s32h; - s32l = vcvtq_s32_f32(accl); - s32h = vcvtq_s32_f32(acch); - - int16x4_t s16l, s16h; - s16l = vqmovn_s32(s32l); - s16h = vqmovn_s32(s32h); - - uint8x8_t u8; - u8 = vqmovun_s16(vcombine_s16(s16l, s16h)); - - vst1_u8((uint8_t *)(dst + i), u8); - } - } - else - { - float32x2_t k32; - k32 = vdup_n_f32(0); - k32 = vld1_lane_f32(ky + 1, k32, 1); - - for( ; i <= width - 8; i += 8 ) - { - float32x4_t accl, acch; - float32x4_t f1l, f1h, f2l, f2h; - - S = src[1] + i; - S2 = src[-1] + i; - - f1l = vcvtq_f32_s32( vld1q_s32(S) ); - f1h = vcvtq_f32_s32( vld1q_s32(S + 4) ); - f2l = vcvtq_f32_s32( vld1q_s32(S2) ); - f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) ); - - accl = acch = d4; - accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1); - acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1); - - for( k = 2; k <= ksize2; k++ ) - { - S = src[k] + i; - S2 = src[-k] + i; - - float32x4_t f3l, f3h, f4l, f4h; - f3l = vcvtq_f32_s32( vld1q_s32(S) ); - f3h = vcvtq_f32_s32( vld1q_s32(S + 4) ); - f4l = vcvtq_f32_s32( vld1q_s32(S2) ); - f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) ); - - accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]); - acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]); - } - - int32x4_t s32l, s32h; - s32l = vcvtq_s32_f32(accl); - s32h = vcvtq_s32_f32(acch); - - int16x4_t s16l, s16h; - s16l = vqmovn_s32(s32l); - s16h = vqmovn_s32(s32h); - - uint8x8_t u8; - u8 = vqmovun_s16(vcombine_s16(s16l, s16h)); - - vst1_u8((uint8_t *)(dst + i), u8); + else + { +#if CV_TRY_AVX2 + if (haveAVX2) + return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); +#endif + const v_float32 d4 = vx_setall_f32(delta); + for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + { + v_float32 s0 = d4; + for( k = 1; k <= ksize2; k++ ) + s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); + v_store(dst + i, s0); } } @@ -2567,128 +1718,45 @@ struct SymmColumnVec_32s8u int symmetryType; float delta; Mat kernel; + bool haveAVX2; }; -struct SymmColumnSmallVec_32s16s +struct SymmColumnSmallVec_32f { - SymmColumnSmallVec_32s16s() { symmetryType=0; } - SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta) + SymmColumnSmallVec_32f() { symmetryType=0; delta = 0; } + SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) { symmetryType = _symmetryType; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << _bits)); + kernel = _kernel; + delta = (float)_delta; CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); } int operator()(const uchar** _src, uchar* _dst, int width) const { - if( !checkHardwareSupport(CV_CPU_NEON) ) - return 0; - int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + ksize2; int i = 0; bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const int** src = (const int**)_src; - const int *S0 = src[-1], *S1 = src[0], *S2 = src[1]; - short* dst = (short*)_dst; - float32x4_t df4 = vdupq_n_f32(delta); - int32x4_t d4 = vcvtq_s32_f32(df4); + const float** src = (const float**)_src; + const float *S0 = src[-1], *S1 = src[0], *S2 = src[1]; + float* dst = (float*)_dst; + v_float32 d4 = vx_setall_f32(delta); if( symmetrical ) { - if( ky[0] == 2 && ky[1] == 1 ) - { - for( ; i <= width - 4; i += 4 ) - { - int32x4_t x0, x1, x2; - x0 = vld1q_s32((int32_t const *)(S0 + i)); - x1 = vld1q_s32((int32_t const *)(S1 + i)); - x2 = vld1q_s32((int32_t const *)(S2 + i)); - - int32x4_t y0, y1, y2, y3; - y0 = vaddq_s32(x0, x2); - y1 = vqshlq_n_s32(x1, 1); - y2 = vaddq_s32(y0, y1); - y3 = vaddq_s32(y2, d4); - - int16x4_t t; - t = vqmovn_s32(y3); - - vst1_s16((int16_t *)(dst + i), t); - } - } - else if( ky[0] == -2 && ky[1] == 1 ) - { - for( ; i <= width - 4; i += 4 ) - { - int32x4_t x0, x1, x2; - x0 = vld1q_s32((int32_t const *)(S0 + i)); - x1 = vld1q_s32((int32_t const *)(S1 + i)); - x2 = vld1q_s32((int32_t const *)(S2 + i)); - - int32x4_t y0, y1, y2, y3; - y0 = vaddq_s32(x0, x2); - y1 = vqshlq_n_s32(x1, 1); - y2 = vsubq_s32(y0, y1); - y3 = vaddq_s32(y2, d4); - - int16x4_t t; - t = vqmovn_s32(y3); - - vst1_s16((int16_t *)(dst + i), t); - } - } - else if( ky[0] == 10 && ky[1] == 3 ) + if( fabs(ky[0]) == 2 && ky[1] == 1 ) { - for( ; i <= width - 4; i += 4 ) - { - int32x4_t x0, x1, x2, x3; - x0 = vld1q_s32((int32_t const *)(S0 + i)); - x1 = vld1q_s32((int32_t const *)(S1 + i)); - x2 = vld1q_s32((int32_t const *)(S2 + i)); - - x3 = vaddq_s32(x0, x2); - - int32x4_t y0; - y0 = vmlaq_n_s32(d4, x1, 10); - y0 = vmlaq_n_s32(y0, x3, 3); - - int16x4_t t; - t = vqmovn_s32(y0); - - vst1_s16((int16_t *)(dst + i), t); - } + v_float32 k0 = vx_setall_f32(ky[0]); + for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4)); } else { - float32x2_t k32 = vdup_n_f32(0); - k32 = vld1_lane_f32(ky, k32, 0); - k32 = vld1_lane_f32(ky + 1, k32, 1); - - for( ; i <= width - 4; i += 4 ) - { - int32x4_t x0, x1, x2, x3, x4; - x0 = vld1q_s32((int32_t const *)(S0 + i)); - x1 = vld1q_s32((int32_t const *)(S1 + i)); - x2 = vld1q_s32((int32_t const *)(S2 + i)); - - x3 = vaddq_s32(x0, x2); - - float32x4_t s0, s1, s2; - s0 = vcvtq_f32_s32(x1); - s1 = vcvtq_f32_s32(x3); - s2 = vmlaq_lane_f32(df4, s0, k32, 0); - s2 = vmlaq_lane_f32(s2, s1, k32, 1); - - x4 = vcvtq_s32_f32(s2); - - int16x4_t x5; - x5 = vqmovn_s32(x4); - - vst1_s16((int16_t *)(dst + i), x5); - } + v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); + for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + v_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); } } else @@ -2697,46 +1765,14 @@ struct SymmColumnSmallVec_32s16s { if( ky[1] < 0 ) std::swap(S0, S2); - for( ; i <= width - 4; i += 4 ) - { - int32x4_t x0, x1; - x0 = vld1q_s32((int32_t const *)(S0 + i)); - x1 = vld1q_s32((int32_t const *)(S2 + i)); - - int32x4_t y0, y1; - y0 = vsubq_s32(x1, x0); - y1 = vqaddq_s32(y0, d4); - - int16x4_t t; - t = vqmovn_s32(y1); - - vst1_s16((int16_t *)(dst + i), t); - } + for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + v_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4); } else { - float32x2_t k32 = vdup_n_f32(0); - k32 = vld1_lane_f32(ky + 1, k32, 1); - - for( ; i <= width - 4; i += 4 ) - { - int32x4_t x0, x1, x2, x3; - x0 = vld1q_s32((int32_t const *)(S0 + i)); - x1 = vld1q_s32((int32_t const *)(S2 + i)); - - x2 = vsubq_s32(x1, x0); - - float32x4_t s0, s1; - s0 = vcvtq_f32_s32(x2); - s1 = vmlaq_lane_f32(df4, s0, k32, 1); - - x3 = vcvtq_s32_f32(s1); - - int16x4_t x4; - x4 = vqmovn_s32(x3); - - vst1_s16((int16_t *)(dst + i), x4); - } + v_float32 k1 = vx_setall_f32(ky[1]); + for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4)); } } @@ -2749,276 +1785,186 @@ struct SymmColumnSmallVec_32s16s }; -struct SymmColumnVec_32f16s +/////////////////////////////// non-separable filters /////////////////////////////// + +///////////////////////////////// 8u<->8u, 8u<->16s ///////////////////////////////// + +struct FilterVec_8u { - SymmColumnVec_32f16s() { symmetryType=0; } - SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta) + FilterVec_8u() { delta = 0; _nz = 0; } + FilterVec_8u(const Mat& _kernel, int _bits, double _delta) { - symmetryType = _symmetryType; - kernel = _kernel; - delta = (float)_delta; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - neon_supported = checkHardwareSupport(CV_CPU_NEON); + Mat kernel; + _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); + delta = (float)(_delta/(1 << _bits)); + std::vector coords; + preprocess2DKernel(kernel, coords, coeffs); + _nz = (int)coords.size(); } - int operator()(const uchar** _src, uchar* _dst, int width) const + int operator()(const uchar** src, uchar* dst, int width) const { - if( !neon_supported ) - return 0; - - int _ksize = kernel.rows + kernel.cols - 1; - int ksize2 = _ksize / 2; - const float* ky = kernel.ptr() + ksize2; - int i = 0, k; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const float** src = (const float**)_src; - const float *S, *S2; - short* dst = (short*)_dst; - - float32x4_t d4 = vdupq_n_f32(delta); + const float* kf = (const float*)&coeffs[0]; + int i = 0, k, nz = _nz; - if( symmetrical ) + v_float32 d4 = vx_setall_f32(delta); + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { - if( _ksize == 1 ) - return 0; - - - float32x2_t k32; - k32 = vdup_n_f32(0); - k32 = vld1_lane_f32(ky, k32, 0); - k32 = vld1_lane_f32(ky + 1, k32, 1); - - for( ; i <= width - 8; i += 8 ) + v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4; + for( k = 0; k < nz; k++ ) { - float32x4_t x0l, x0h, x1l, x1h, x2l, x2h; - float32x4_t accl, acch; - - S = src[0] + i; - - x0l = vld1q_f32(S); - x0h = vld1q_f32(S + 4); - - S = src[1] + i; - S2 = src[-1] + i; - - x1l = vld1q_f32(S); - x1h = vld1q_f32(S + 4); - x2l = vld1q_f32(S2); - x2h = vld1q_f32(S2 + 4); - - accl = acch = d4; - accl = vmlaq_lane_f32(accl, x0l, k32, 0); - acch = vmlaq_lane_f32(acch, x0h, k32, 0); - accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1); - acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1); - - for( k = 2; k <= ksize2; k++ ) - { - S = src[k] + i; - S2 = src[-k] + i; - - float32x4_t x3l, x3h, x4l, x4h; - x3l = vld1q_f32(S); - x3h = vld1q_f32(S + 4); - x4l = vld1q_f32(S2); - x4h = vld1q_f32(S2 + 4); - - accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]); - acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]); - } - - int32x4_t s32l, s32h; - s32l = vcvtq_s32_f32(accl); - s32h = vcvtq_s32_f32(acch); - - int16x4_t s16l, s16h; - s16l = vqmovn_s32(s32l); - s16h = vqmovn_s32(s32h); - - vst1_s16((int16_t *)(dst + i), s16l); - vst1_s16((int16_t *)(dst + i + 4), s16h); - } + v_float32 f = vx_setall_f32(kf[k]); + v_uint16 xl, xh; + v_expand(vx_load(src[k] + i), xl, xh); + v_uint32 x0, x1, x2, x3; + v_expand(xl, x0, x1); + v_expand(xh, x2, x3); + s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); + s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); + s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f, s2); + s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f, s3); + } + v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); } - else + if( i <= width - v_uint16::nlanes ) { - float32x2_t k32; - k32 = vdup_n_f32(0); - k32 = vld1_lane_f32(ky + 1, k32, 1); - - for( ; i <= width - 8; i += 8 ) + v_float32 s0 = d4, s1 = d4; + for( k = 0; k < nz; k++ ) { - float32x4_t x1l, x1h, x2l, x2h; - float32x4_t accl, acch; - - S = src[1] + i; - S2 = src[-1] + i; - - x1l = vld1q_f32(S); - x1h = vld1q_f32(S + 4); - x2l = vld1q_f32(S2); - x2h = vld1q_f32(S2 + 4); - - accl = acch = d4; - accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1); - acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1); - - for( k = 2; k <= ksize2; k++ ) - { - S = src[k] + i; - S2 = src[-k] + i; - - float32x4_t x3l, x3h, x4l, x4h; - x3l = vld1q_f32(S); - x3h = vld1q_f32(S + 4); - x4l = vld1q_f32(S2); - x4h = vld1q_f32(S2 + 4); - - accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]); - acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]); - } - - int32x4_t s32l, s32h; - s32l = vcvtq_s32_f32(accl); - s32h = vcvtq_s32_f32(acch); - - int16x4_t s16l, s16h; - s16l = vqmovn_s32(s32l); - s16h = vqmovn_s32(s32h); - - vst1_s16((int16_t *)(dst + i), s16l); - vst1_s16((int16_t *)(dst + i + 4), s16h); + v_float32 f = vx_setall_f32(kf[k]); + v_uint32 x0, x1; + v_expand(vx_load_expand(src[k] + i), x0, x1); + s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); + s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); } + v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); + i += v_uint16::nlanes; + } +#if CV_SIMD_WIDTH > 16 + while( i <= width - v_int32x4::nlanes ) +#else + if( i <= width - v_int32x4::nlanes ) +#endif + { + v_float32x4 s0 = v_setall_f32(delta); + for( k = 0; k < nz; k++ ) + s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0); + v_int32x4 s32 = v_round(s0); + v_int16x8 s16 = v_pack(s32, s32); + *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); + i += v_int32x4::nlanes; } return i; } - int symmetryType; + int _nz; + std::vector coeffs; float delta; - Mat kernel; - bool neon_supported; }; -struct SymmRowSmallVec_32f +struct FilterVec_8u16s { - SymmRowSmallVec_32f() {} - SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType ) + FilterVec_8u16s() { delta = 0; _nz = 0; } + FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta) { - kernel = _kernel; - symmetryType = _symmetryType; + Mat kernel; + _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); + delta = (float)(_delta/(1 << _bits)); + std::vector coords; + preprocess2DKernel(kernel, coords, coeffs); + _nz = (int)coords.size(); } - int operator()(const uchar* _src, uchar* _dst, int width, int cn) const + int operator()(const uchar** src, uchar* _dst, int width) const { - if( !checkHardwareSupport(CV_CPU_NEON) ) - return 0; - - int i = 0, _ksize = kernel.rows + kernel.cols - 1; - float* dst = (float*)_dst; - const float* src = (const float*)_src + (_ksize/2)*cn; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const float* kx = kernel.ptr() + _ksize/2; - width *= cn; + const float* kf = (const float*)&coeffs[0]; + short* dst = (short*)_dst; + int i = 0, k, nz = _nz; - if( symmetrical ) + v_float32 d4 = vx_setall_f32(delta); + for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) { - if( _ksize == 1 ) - return 0; - if( _ksize == 3 ) - { - if( kx[0] == 2 && kx[1] == 1 ) - return 0; - else if( kx[0] == -2 && kx[1] == 1 ) - return 0; - else - { - return 0; - } - } - else if( _ksize == 5 ) + v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4; + for( k = 0; k < nz; k++ ) { - if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) - return 0; - else - { - float32x2_t k0, k1; - k0 = k1 = vdup_n_f32(0); - k0 = vld1_lane_f32(kx + 0, k0, 0); - k0 = vld1_lane_f32(kx + 1, k0, 1); - k1 = vld1_lane_f32(kx + 2, k1, 0); - - for( ; i <= width - 4; i += 4, src += 4 ) - { - float32x4_t x0, x1, x2, x3, x4; - x0 = vld1q_f32(src); - x1 = vld1q_f32(src - cn); - x2 = vld1q_f32(src + cn); - x3 = vld1q_f32(src - cn*2); - x4 = vld1q_f32(src + cn*2); - - float32x4_t y0; - y0 = vmulq_lane_f32(x0, k0, 0); - y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1); - y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0); - - vst1q_f32(dst + i, y0); - } - } + v_float32 f = vx_setall_f32(kf[k]); + v_uint16 xl, xh; + v_expand(vx_load(src[k] + i), xl, xh); + s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0); + s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1); + s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f, s2); + s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f, s3); } + v_store(dst + i, v_pack(v_round(s0), v_round(s1))); + v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); } - else + if( i <= width - v_uint16::nlanes ) { - if( _ksize == 3 ) - { - if( kx[0] == 0 && kx[1] == 1 ) - return 0; - else - { - return 0; - } - } - else if( _ksize == 5 ) + v_float32 s0 = d4, s1 = d4; + for( k = 0; k < nz; k++ ) { - float32x2_t k; - k = vdup_n_f32(0); - k = vld1_lane_f32(kx + 1, k, 0); - k = vld1_lane_f32(kx + 2, k, 1); - - for( ; i <= width - 4; i += 4, src += 4 ) - { - float32x4_t x0, x1, x2, x3; - x0 = vld1q_f32(src - cn); - x1 = vld1q_f32(src + cn); - x2 = vld1q_f32(src - cn*2); - x3 = vld1q_f32(src + cn*2); - - float32x4_t y0; - y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0); - y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1); - - vst1q_f32(dst + i, y0); - } + v_float32 f = vx_setall_f32(kf[k]); + v_uint16 x = vx_load_expand(src[k] + i); + s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0); + s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1); } + v_store(dst + i, v_pack(v_round(s0), v_round(s1))); + i += v_uint16::nlanes; + } + if( i <= width - v_int32::nlanes ) + { + v_float32 s0 = d4; + for( k = 0; k < nz; k++ ) + s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0); + v_pack_store(dst + i, v_round(s0)); + i += v_int32::nlanes; } return i; } - Mat kernel; - int symmetryType; + int _nz; + std::vector coeffs; + float delta; }; -typedef RowNoVec RowVec_8u32s; -typedef RowNoVec RowVec_16s32f; -typedef RowNoVec RowVec_32f; -typedef ColumnNoVec SymmColumnVec_32f; -typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f; -typedef FilterNoVec FilterVec_8u; -typedef FilterNoVec FilterVec_8u16s; -typedef FilterNoVec FilterVec_32f; +struct FilterVec_32f +{ + FilterVec_32f() { delta = 0; _nz = 0; } + FilterVec_32f(const Mat& _kernel, int, double _delta) + { + delta = (float)_delta; + std::vector coords; + preprocess2DKernel(_kernel, coords, coeffs); + _nz = (int)coords.size(); + } + + int operator()(const uchar** _src, uchar* _dst, int width) const + { + const float* kf = (const float*)&coeffs[0]; + const float** src = (const float**)_src; + float* dst = (float*)_dst; + int i = 0, k, nz = _nz; + + v_float32 d4 = vx_setall_f32(delta); + for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) + { + v_float32 s0 = d4; + for( k = 0; k < nz; k++ ) + s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0); + v_store(dst + i, s0); + } + + return i; + } + int _nz; + std::vector coeffs; + float delta; +}; #else @@ -4655,15 +3601,9 @@ static bool dftFilter2D(int stype, int dtype, int kernel_type, double delta, int borderType) { { -#if CV_SSE2 int sdepth = CV_MAT_DEPTH(stype); int ddepth = CV_MAT_DEPTH(dtype); - int dft_filter_size = ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3) ? 130 : 50; -#else - CV_UNUSED(stype); - CV_UNUSED(dtype); - int dft_filter_size = 50; -#endif + int dft_filter_size = checkHardwareSupport(CV_CPU_SSE3) && ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) ? 130 : 50; if (kernel_width * kernel_height < dft_filter_size) return false; }