diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 476178d4ce..b40f1de777 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -899,6 +899,15 @@ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
 
+#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
+
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64);
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64);
+
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
@@ -1520,6 +1529,35 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
     v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
 }
 
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
+{
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
+    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));
+
+    a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
+    b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
+    c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
+}
+
+inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
+{
+    v_uint64x2 t0, t1, t2;
+    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
+    a = v_reinterpret_as_s64(t0);
+    b = v_reinterpret_as_s64(t1);
+    c = v_reinterpret_as_s64(t2);
+}
+
+inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
+{
+    v_uint64x2 t0, t1, t2;
+    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
+    a = v_reinterpret_as_f64(t0);
+    b = v_reinterpret_as_f64(t1);
+    c = v_reinterpret_as_f64(t2);
+}
+
 // 2-channel, float only
 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
 {
@@ -1717,6 +1755,27 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32
     _mm_storeu_ps((ptr + 4), u1);
 }
 
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
+{
+    __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
+    __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
+    __m128i t2 = _mm_unpackhi_epi64(b.val, c.val);
+
+    _mm_storeu_si128((__m128i*)ptr, t0);
+    _mm_storeu_si128((__m128i*)(ptr + 2), t1);
+    _mm_storeu_si128((__m128i*)(ptr + 4), t2);
+}
+
+inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
+{
+    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
+}
+
+inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
+}
+
 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
 inline void
v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \ _Tpvec& b0, _Tpvec& c0 ) \ diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index afe89aaa1e..87d0711809 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,2 +1,3 @@ set(the_description "Image Processing") +ocv_add_dispatched_file(accum SSE2 AVX NEON) ocv_define_module(imgproc opencv_core WRAP java python) diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp index 941c100814..2b43f4e42c 100644 --- a/modules/imgproc/src/accum.cpp +++ b/modules/imgproc/src/accum.cpp @@ -44,1718 +44,17 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" #include "opencv2/core/hal/intrin.hpp" - +#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +#include "accum.simd.hpp" +#include "accum.simd_declarations.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" namespace cv { -template -struct Acc_SIMD -{ - int operator() (const T *, AT *, const uchar *, int, int) const - { - return 0; - } -}; - -template -struct AccSqr_SIMD -{ - int operator() (const T *, AT *, const uchar *, int, int) const - { - return 0; - } -}; - -template -struct AccProd_SIMD -{ - int operator() (const T *, const T *, AT *, const uchar *, int, int) const - { - return 0; - } -}; - -template -struct AccW_SIMD -{ - int operator() (const T *, AT *, const uchar *, int, int, AT) const - { - return 0; - } -}; - -#if CV_AVX -template <> -struct Acc_SIMD -{ - int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - if (!mask) - { - len *= cn; - for ( ; x <= len - 8 ; x += 8) - { - __m256 v_src = _mm256_loadu_ps(src + x); - __m256 v_dst = _mm256_loadu_ps(dst + x); - v_dst = _mm256_add_ps(v_src, v_dst); - _mm256_storeu_ps(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct Acc_SIMD -{ - int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - if (!mask) - { - len *= cn; - for ( ; x <= len - 8 ; x += 8) - { - __m256 v_src = _mm256_loadu_ps(src + x); - __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0)); - __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1)); - __m256d v_dst0 = _mm256_loadu_pd(dst + x); - __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); - v_dst0 = _mm256_add_pd(v_src0, v_dst0); - v_dst1 = _mm256_add_pd(v_src1, v_dst1); - _mm256_storeu_pd(dst + x, v_dst0); - _mm256_storeu_pd(dst + x + 4, v_dst1); - } - } - return x; - } -}; - -template <> -struct Acc_SIMD -{ - int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 4; x += 4) - { - __m256d v_src = _mm256_loadu_pd(src + x); - __m256d v_dst = _mm256_loadu_pd(dst + x); - - v_dst = _mm256_add_pd(v_dst, v_src); - _mm256_storeu_pd(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct AccSqr_SIMD -{ - int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - if (!mask) - { - len *= cn; - for ( ; x <= len - 8 ; x += 8) - { - __m256 v_src = _mm256_loadu_ps(src + x); - __m256 v_dst = _mm256_loadu_ps(dst + x); - - v_src = _mm256_mul_ps(v_src, v_src); - v_dst = _mm256_add_ps(v_src, v_dst); - _mm256_storeu_ps(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct AccSqr_SIMD -{ - int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - if (!mask) - { - len *= cn; - for ( ; x <= len - 
8 ; x += 8) - { - __m256 v_src = _mm256_loadu_ps(src + x); - __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0)); - __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1)); - __m256d v_dst0 = _mm256_loadu_pd(dst + x); - __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); - - v_src0 = _mm256_mul_pd(v_src0, v_src0); - v_src1 = _mm256_mul_pd(v_src1, v_src1); - v_dst0 = _mm256_add_pd(v_src0, v_dst0); - v_dst1 = _mm256_add_pd(v_src1, v_dst1); - _mm256_storeu_pd(dst + x, v_dst0); - _mm256_storeu_pd(dst + x + 4, v_dst1); - } - } - return x; - } -}; - -template <> -struct AccSqr_SIMD -{ - int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 4; x += 4) - { - __m256d v_src = _mm256_loadu_pd(src + x); - __m256d v_dst = _mm256_loadu_pd(dst + x); - - v_src = _mm256_mul_pd(v_src, v_src); - v_dst = _mm256_add_pd(v_dst, v_src); - _mm256_storeu_pd(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct AccProd_SIMD -{ - int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - __m256 v_src0 = _mm256_loadu_ps(src1 + x); - __m256 v_src1 = _mm256_loadu_ps(src2 + x); - __m256 v_dst = _mm256_loadu_ps(dst + x); - __m256 v_src = _mm256_mul_ps(v_src0, v_src1); - - v_dst = _mm256_add_ps(v_src, v_dst); - _mm256_storeu_ps(dst + x, v_dst); - } - } - - return x; - } -}; - -template <> -struct AccProd_SIMD -{ - int operator() (const float * src1, const float * src2, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - __m256 v_1src = _mm256_loadu_ps(src1 + x); - __m256 v_2src = _mm256_loadu_ps(src2 + x); - __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,0)); - __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,1)); - __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,0)); - __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,1)); - __m256d v_dst0 = _mm256_loadu_pd(dst + x); - __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); - - __m256d v_src0 = _mm256_mul_pd(v_src00, v_src10); - __m256d v_src1 = _mm256_mul_pd(v_src01, v_src11); - v_dst0 = _mm256_add_pd(v_src0, v_dst0); - v_dst1 = _mm256_add_pd(v_src1, v_dst1); - _mm256_storeu_pd(dst + x, v_dst0); - _mm256_storeu_pd(dst + x + 4, v_dst1); - } - } - return x; - } -}; - -template <> -struct AccProd_SIMD -{ - int operator() (const double * src1, const double * src2, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 4; x += 4) - { - __m256d v_src0 = _mm256_loadu_pd(src1 + x); - __m256d v_src1 = _mm256_loadu_pd(src2 + x); - __m256d v_dst = _mm256_loadu_pd(dst + x); - - v_src0 = _mm256_mul_pd(v_src0, v_src1); - v_dst = _mm256_add_pd(v_dst, v_src0); - _mm256_storeu_pd(dst + x, v_dst); - } - } - return x; - } -}; - -template <> -struct AccW_SIMD -{ - int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const - { - int x = 0; - __m256 v_alpha = _mm256_set1_ps(alpha); - __m256 v_beta = _mm256_set1_ps(1.0f - alpha); - - if (!mask) - { - len *= cn; - for ( ; x <= len - 16; x += 16) - { - _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha))); - _mm256_storeu_ps(dst + x 
+ 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha))); - } - } - - return x; - } -}; - -template <> -struct AccW_SIMD -{ - int operator() (const float * src, double * dst, const uchar * mask, int len, int cn, double alpha) const - { - int x = 0; - __m256d v_alpha = _mm256_set1_pd(alpha); - __m256d v_beta = _mm256_set1_pd(1.0f - alpha); - - if (!mask) - { - len *= cn; - for ( ; x <= len - 16; x += 16) - { - __m256 v_src0 = _mm256_loadu_ps(src + x); - __m256 v_src1 = _mm256_loadu_ps(src + x + 8); - __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,0)); - __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,1)); - __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,0)); - __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,1)); - - _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src00, v_alpha))); - _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src01, v_alpha))); - _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta), _mm256_mul_pd(v_src10, v_alpha))); - _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha))); - } - } - - return x; - } -}; - -template <> -struct AccW_SIMD -{ - int operator() (const double * src, double * dst, const uchar * mask, int len, int cn, double alpha) const - { - int x = 0; - __m256d v_alpha = _mm256_set1_pd(alpha); - __m256d v_beta = _mm256_set1_pd(1.0f - alpha); - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - __m256d v_src0 = _mm256_loadu_pd(src + x); - __m256d v_src1 = _mm256_loadu_pd(src + x + 4); - - _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src0, v_alpha))); - _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha))); - } - } - - return x; - } -}; -#elif CV_SIMD128 -template <> -struct Acc_SIMD -{ - int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_store(dst + x, v_load(dst + x) + v_load(src + x)); - v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src + x + 4)); - } - } - - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct Acc_SIMD -{ - int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 4; x += 4) - { - v_float32x4 v_src = v_load(src + x); - v_float64x2 v_src0 = v_cvt_f64(v_src); - v_float64x2 v_src1 = v_cvt_f64_high(v_src); - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); - } - } - return x; - } -}; - -template <> -struct Acc_SIMD -{ - int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 4; x += 4) - { - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = v_load(src + x + 2); - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); - } - } - return x; - } -}; -#endif //CV_SIMD128_64F - -template <> -struct AccSqr_SIMD -{ - int operator() (const float * src, float * dst, const uchar * mask, 
int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_float32x4 v_src0 = v_load(src + x); - v_float32x4 v_src1 = v_load(src + x + 4); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); - } - } - - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct AccSqr_SIMD -{ - int operator() (const float * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 4; x += 4) - { - v_float32x4 v_src = v_load(src + x); - v_float64x2 v_src0 = v_cvt_f64(v_src); - v_float64x2 v_src1 = v_cvt_f64_high(v_src); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); - } - } - return x; - } -}; - -template <> -struct AccSqr_SIMD -{ - int operator() (const double * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 4; x += 4) - { - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = v_load(src + x + 2); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); - } - } - return x; - } -}; -#endif //CV_SIMD128_64F - -template <> -struct AccProd_SIMD -{ - int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_store(dst + x, v_load(dst + x) + v_load(src1 + x) * v_load(src2 + x)); - v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src1 + x + 4) * v_load(src2 + x + 4)); - } - } - - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct AccProd_SIMD -{ - int operator() (const float * src1, const float * src2, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 4; x += 4) - { - v_float32x4 v_1src = v_load(src1 + x); - v_float32x4 v_2src = v_load(src2 + x); - - v_float64x2 v_1src0 = v_cvt_f64(v_1src); - v_float64x2 v_1src1 = v_cvt_f64_high(v_1src); - v_float64x2 v_2src0 = v_cvt_f64(v_2src); - v_float64x2 v_2src1 = v_cvt_f64_high(v_2src); - - v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0)); - v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1)); - } - } - return x; - } -}; - -template <> -struct AccProd_SIMD -{ - int operator() (const double * src1, const double * src2, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 4; x += 4) - { - v_float64x2 v_src00 = v_load(src1 + x); - v_float64x2 v_src01 = v_load(src1 + x + 2); - v_float64x2 v_src10 = v_load(src2 + x); - v_float64x2 v_src11 = v_load(src2 + x + 2); - - v_store(dst + x, v_load(dst + x) + (v_src00 * v_src10)); - v_store(dst + x + 2, v_load(dst + x + 2) + (v_src01 * v_src11)); - } - } - return x; - } -}; -#endif //CV_SIMD128_64F - -template <> -struct AccW_SIMD -{ - int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const - { - int x = 0; - v_float32x4 v_alpha = v_setall_f32(alpha); - v_float32x4 v_beta = v_setall_f32(1.0f - alpha); - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_load(src + x) * 
v_alpha))); - v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_load(src + x + 4) * v_alpha))); - } - } - - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct AccW_SIMD -{ - int operator() (const float * src, double * dst, const uchar * mask, int len, int cn, double alpha) const - { - int x = 0; - v_float64x2 v_alpha = v_setall_f64(alpha); - v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_float32x4 v_src0 = v_load(src + x); - v_float32x4 v_src1 = v_load(src + x + 4); - v_float64x2 v_src00 = v_cvt_f64(v_src0); - v_float64x2 v_src01 = v_cvt_f64_high(v_src0); - v_float64x2 v_src10 = v_cvt_f64(v_src1); - v_float64x2 v_src11 = v_cvt_f64_high(v_src1); - - v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src00 * v_alpha))); - v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src01 * v_alpha))); - v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_src10 * v_alpha))); - v_store(dst + x + 6, ((v_load(dst + x + 6) * v_beta) + (v_src11 * v_alpha))); - } - } - - return x; - } -}; - -template <> -struct AccW_SIMD -{ - int operator() (const double * src, double * dst, const uchar * mask, int len, int cn, double alpha) const - { - int x = 0; - v_float64x2 v_alpha = v_setall_f64(alpha); - v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - - if (!mask) - { - len *= cn; - for ( ; x <= len - 4; x += 4) - { - v_float64x2 v_src0 = v_load(src + x); - v_float64x2 v_src1 = v_load(src + x + 2); - - v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src0 * v_alpha))); - v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src1 * v_alpha))); - } - } - - return x; - } -}; -#endif //CV_SIMD128_64F -#endif //CV_SIMD128 - -#if CV_SIMD128 -template <> -struct Acc_SIMD -{ - int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 16; x += 16) - { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_src0, v_src1; - v_expand(v_src, v_src0, v_src1); - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; - v_expand(v_src0, v_src00, v_src01); - v_expand(v_src1, v_src10, v_src11); - - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); - } - } - else if (cn == 1) - { - v_uint8x16 v_0 = v_setall_u8(0); - - for ( ; x <= len - 16; x += 16) - { - v_uint8x16 v_mask = v_load(mask + x); - v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src = v_load(src + x); - v_src = v_src & v_mask; - v_uint16x8 v_src0, v_src1; - v_expand(v_src, v_src0, v_src1); - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; - v_expand(v_src0, v_src00, v_src01); - v_expand(v_src1, v_src10, v_src11); - - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); - } - } - - return x; - } -}; - -template <> -struct Acc_SIMD -{ - int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - 
{ - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src0, v_src1; - v_expand(v_src, v_src0, v_src1); - - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); - } - } - - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct Acc_SIMD -{ - int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 16; x += 16) - { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_int0, v_int1; - v_expand(v_src, v_int0, v_int1); - - v_uint32x4 v_int00, v_int01, v_int10, v_int11; - v_expand(v_int0, v_int00, v_int01); - v_expand(v_int1, v_int10, v_int11); - - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); - v_float64x2 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); - v_float64x2 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); - v_float64x2 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - v_float64x2 v_dst4 = v_load(dst + x + 8); - v_float64x2 v_dst5 = v_load(dst + x + 10); - v_float64x2 v_dst6 = v_load(dst + x + 12); - v_float64x2 v_dst7 = v_load(dst + x + 14); - - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; - v_dst4 = v_dst4 + v_src4; - v_dst5 = v_dst5 + v_src5; - v_dst6 = v_dst6 + v_src6; - v_dst7 = v_dst7 + v_src7; - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - v_store(dst + x + 8, v_dst4); - v_store(dst + x + 10, v_dst5); - v_store(dst + x + 12, v_dst6); - v_store(dst + x + 14, v_dst7); - } - } - return x; - } -}; - -template <> -struct Acc_SIMD -{ - int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int0, v_int1; - v_expand(v_src, v_int0, v_int1); - - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - return x; - } -}; -#endif - -template <> -struct AccSqr_SIMD -{ - int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 16; x += 16) - { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_src0, v_src1; - v_expand(v_src, v_src0, v_src1); - v_src0 = v_src0 * 
v_src0; - v_src1 = v_src1 * v_src1; - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; - v_expand(v_src0, v_src00, v_src01); - v_expand(v_src1, v_src10, v_src11); - - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); - } - } - else if (cn == 1) - { - v_uint8x16 v_0 = v_setall_u8(0); - for ( ; x <= len - 16; x += 16) - { - v_uint8x16 v_mask = v_load(mask + x); - v_mask = ~(v_0 == v_mask); - v_uint8x16 v_src = v_load(src + x); - v_src = v_src & v_mask; - v_uint16x8 v_src0, v_src1; - v_expand(v_src, v_src0, v_src1); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; - v_expand(v_src0, v_src00, v_src01); - v_expand(v_src1, v_src10, v_src11); - - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); - } - } - - return x; - } -}; - -template <> -struct AccSqr_SIMD -{ - int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_src0, v_src1; - v_expand(v_src, v_src0, v_src1); - - v_float32x4 v_float0, v_float1; - v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0)); - v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1)); - v_float0 = v_float0 * v_float0; - v_float1 = v_float1 * v_float1; - - v_store(dst + x, v_load(dst + x) + v_float0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_float1); - } - } - - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct AccSqr_SIMD -{ - int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_int, dummy; - v_expand(v_src, v_int, dummy); - - v_uint32x4 v_int0, v_int1; - v_expand(v_int, v_int0, v_int1); - - v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); - v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); - v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_src2 = v_src2 * v_src2; - v_src3 = v_src3 * v_src3; - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - return x; - } -}; - -template <> -struct AccSqr_SIMD -{ - int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int_0, v_int_1; - v_expand(v_src, v_int_0, 
v_int_1); - - v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); - v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_int0); - v_float64x2 v_src2 = v_cvt_f64(v_int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_int1); - v_src0 = v_src0 * v_src0; - v_src1 = v_src1 * v_src1; - v_src2 = v_src2 * v_src2; - v_src3 = v_src3 * v_src3; - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - return x; - } -}; -#endif - -template <> -struct AccProd_SIMD -{ - int operator() (const uchar * src1, const uchar * src2, float * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - len *= cn; - if (!mask) - { - for ( ; x <= len - 16; x += 16) - { - v_uint8x16 v_1src = v_load(src1 + x); - v_uint8x16 v_2src = v_load(src2 + x); - - v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1; - v_expand(v_1src, v_1src0, v_1src1); - v_expand(v_2src, v_2src0, v_2src1); - - v_uint16x8 v_src0, v_src1; - v_src0 = v_1src0 * v_2src0; - v_src1 = v_1src1 * v_2src1; - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; - v_expand(v_src0, v_src00, v_src01); - v_expand(v_src1, v_src10, v_src11); - - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); - } - } - else if (cn == 1) - { - v_uint8x16 v_0 = v_setzero_u8(); - - for ( ; x <= len - 16; x += 16) - { - v_uint8x16 v_mask = v_load(mask + x); - v_mask = ~(v_0 == v_mask); - - v_uint8x16 v_1src = v_load(src1 + x) & v_mask; - v_uint8x16 v_2src = v_load(src2 + x) & v_mask; - - v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1; - v_expand(v_1src, v_1src0, v_1src1); - v_expand(v_2src, v_2src0, v_2src1); - - v_uint16x8 v_src0, v_src1; - v_src0 = v_1src0 * v_2src0; - v_src1 = v_1src1 * v_2src1; - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; - v_expand(v_src0, v_src00, v_src01); - v_expand(v_src1, v_src10, v_src11); - - v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); - v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); - v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); - v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); - } - } - - return x; - } -}; - -template <> -struct AccProd_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, float * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_uint16x8 v_1src = v_load(src1 + x); - v_uint16x8 v_2src = v_load(src2 + x); - - v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1; - v_expand(v_1src, v_1src0, v_1src1); - v_expand(v_2src, v_2src0, v_2src1); - - v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); - v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); - v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); - v_float32x4 v_2float1 = 
v_cvt_f32(v_reinterpret_as_s32(v_2src1)); - - v_float32x4 v_src0 = v_1float0 * v_2float0; - v_float32x4 v_src1 = v_1float1 * v_2float1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); - } - } - else if (cn == 1) - { - v_uint16x8 v_0 = v_setzero_u16(); - - for ( ; x <= len - 8; x += 8) - { - v_uint8x16 v_mask = v_load_halves(mask + x, mask + x); - v_uint16x8 v_mask0, v_mask1; - v_expand(v_mask, v_mask0, v_mask1); - v_mask0 = ~(v_0 == v_mask0); - - v_uint16x8 v_1src = v_load(src1 + x) & v_mask0; - v_uint16x8 v_2src = v_load(src2 + x) & v_mask0; - - v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1; - v_expand(v_1src, v_1src0, v_1src1); - v_expand(v_2src, v_2src0, v_2src1); - - v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); - v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); - v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); - v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1)); - - v_float32x4 v_src0 = v_1float0 * v_2float0; - v_float32x4 v_src1 = v_1float1 * v_2float1; - - v_store(dst + x, v_load(dst + x) + v_src0); - v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); - } - } - - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct AccProd_SIMD -{ - int operator() (const uchar * src1, const uchar * src2, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_uint8x16 v_1src = v_load(src1 + x); - v_uint8x16 v_2src = v_load(src2 + x); - - v_uint16x8 v_1int, v_2int, dummy; - v_expand(v_1src, v_1int, dummy); - v_expand(v_2src, v_2int, dummy); - - v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; - v_expand(v_1int, v_1int_0, v_1int_1); - v_expand(v_2int, v_2int_0, v_2int_1); - - v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); - v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); - v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); - v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); - v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 += v_src0; - v_dst1 += v_src1; - v_dst2 += v_src2; - v_dst3 += v_src3; - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - return x; - } -}; - -template <> -struct AccProd_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, double * dst, const uchar * mask, int len, int cn) const - { - int x = 0; - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_uint16x8 v_1src = v_load(src1 + x); - v_uint16x8 v_2src = v_load(src2 + x); - - v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; - v_expand(v_1src, v_1int_0, v_1int_1); - v_expand(v_2src, v_2int_0, v_2int_1); - - v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); - v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); - v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); - v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); - v_float64x2 v_src2 = 
v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 = v_dst0 + v_src0; - v_dst1 = v_dst1 + v_src1; - v_dst2 = v_dst2 + v_src2; - v_dst3 = v_dst3 + v_src3; - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - return x; - } -}; -#endif - -template <> -struct AccW_SIMD -{ - int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn, float alpha) const - { - int x = 0; - v_float32x4 v_alpha = v_setall_f32(alpha); - v_float32x4 v_beta = v_setall_f32(1.0f - alpha); - - if (!mask) - { - len *= cn; - for ( ; x <= len - 16; x += 16) - { - v_uint8x16 v_src = v_load(src + x); - - v_uint16x8 v_src0, v_src1; - v_expand(v_src, v_src0, v_src1); - - v_uint32x4 v_src00, v_src01, v_src10, v_src11; - v_expand(v_src0, v_src00, v_src01); - v_expand(v_src1, v_src10, v_src11); - - v_float32x4 v_dst00 = v_load(dst + x); - v_float32x4 v_dst01 = v_load(dst + x + 4); - v_float32x4 v_dst10 = v_load(dst + x + 8); - v_float32x4 v_dst11 = v_load(dst + x + 12); - - v_dst00 = (v_dst00 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha); - v_dst01 = (v_dst01 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha); - v_dst10 = (v_dst10 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha); - v_dst11 = (v_dst11 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha); - - v_store(dst + x, v_dst00); - v_store(dst + x + 4, v_dst01); - v_store(dst + x + 8, v_dst10); - v_store(dst + x + 12, v_dst11); - } - } - - return x; - } -}; - -template <> -struct AccW_SIMD -{ - int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn, float alpha) const - { - int x = 0; - v_float32x4 v_alpha = v_setall_f32(alpha); - v_float32x4 v_beta = v_setall_f32(1.0f - alpha); - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int0, v_int1; - v_expand(v_src, v_int0, v_int1); - - v_float32x4 v_src0 = v_cvt_f32(v_reinterpret_as_s32(v_int0)); - v_float32x4 v_src1 = v_cvt_f32(v_reinterpret_as_s32(v_int1)); - v_src0 = v_src0 * v_alpha; - v_src1 = v_src1 * v_alpha; - - v_float32x4 v_dst0 = v_load(dst + x) * v_beta; - v_float32x4 v_dst1 = v_load(dst + x + 4) * v_beta; - - v_store(dst + x, v_dst0 + v_src0); - v_store(dst + x + 4, v_dst1 + v_src1); - } - } - - return x; - } -}; - -#if CV_SIMD128_64F -template <> -struct AccW_SIMD -{ - int operator() (const uchar * src, double * dst, const uchar * mask, int len, int cn, double alpha) const - { - int x = 0; - v_float64x2 v_alpha = v_setall_f64(alpha); - v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_int, dummy; - v_expand(v_src, v_int, dummy); - - v_uint32x4 v_int_0, v_int_1; - v_expand(v_int, v_int_0, v_int_1); - - v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); - v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); - - v_float64x2 v_src0 = v_cvt_f64(v_int0); - v_float64x2 v_src1 = v_cvt_f64_high(v_int0); - v_float64x2 v_src2 = v_cvt_f64(v_int1); - v_float64x2 v_src3 = v_cvt_f64_high(v_int1); - - v_float64x2 v_dst0 = v_load(dst + x); - v_float64x2 v_dst1 = v_load(dst + x + 2); - v_float64x2 v_dst2 = 
v_load(dst + x + 4); - v_float64x2 v_dst3 = v_load(dst + x + 6); - - v_dst0 = (v_dst0 * v_beta) + (v_src0 * v_alpha); - v_dst1 = (v_dst1 * v_beta) + (v_src1 * v_alpha); - v_dst2 = (v_dst2 * v_beta) + (v_src2 * v_alpha); - v_dst3 = (v_dst3 * v_beta) + (v_src3 * v_alpha); - - v_store(dst + x, v_dst0); - v_store(dst + x + 2, v_dst1); - v_store(dst + x + 4, v_dst2); - v_store(dst + x + 6, v_dst3); - } - } - - return x; - } -}; - -template <> -struct AccW_SIMD -{ - int operator() (const ushort * src, double * dst, const uchar * mask, int len, int cn, double alpha) const - { - int x = 0; - v_float64x2 v_alpha = v_setall_f64(alpha); - v_float64x2 v_beta = v_setall_f64(1.0f - alpha); - - if (!mask) - { - len *= cn; - for ( ; x <= len - 8; x += 8) - { - v_uint16x8 v_src = v_load(src + x); - v_uint32x4 v_int_0, v_int_1; - v_expand(v_src, v_int_0, v_int_1); - - v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); - v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); - - v_float64x2 v_src00 = v_cvt_f64(v_int0); - v_float64x2 v_src01 = v_cvt_f64_high(v_int0); - v_float64x2 v_src10 = v_cvt_f64(v_int1); - v_float64x2 v_src11 = v_cvt_f64_high(v_int1); - - v_float64x2 v_dst00 = v_load(dst + x); - v_float64x2 v_dst01 = v_load(dst + x + 2); - v_float64x2 v_dst10 = v_load(dst + x + 4); - v_float64x2 v_dst11 = v_load(dst + x + 6); - - v_dst00 = (v_dst00 * v_beta) + (v_src00 * v_alpha); - v_dst01 = (v_dst01 * v_beta) + (v_src01 * v_alpha); - v_dst10 = (v_dst10 * v_beta) + (v_src10 * v_alpha); - v_dst11 = (v_dst11 * v_beta) + (v_src11 * v_alpha); - - v_store(dst + x, v_dst00); - v_store(dst + x + 2, v_dst01); - v_store(dst + x + 4, v_dst10); - v_store(dst + x + 6, v_dst11); - } - } - - return x; - } -}; -#endif //CV_SIMD128_64F -#endif //CV_SIMD128 - -template void -acc_( const T* src, AT* dst, const uchar* mask, int len, int cn ) -{ - int i = Acc_SIMD()(src, dst, mask, len, cn); - - if( !mask ) - { - len *= cn; - #if CV_ENABLE_UNROLLED - for( ; i <= len - 4; i += 4 ) - { - AT t0, t1; - t0 = src[i] + dst[i]; - t1 = src[i+1] + dst[i+1]; - dst[i] = t0; dst[i+1] = t1; - - t0 = src[i+2] + dst[i+2]; - t1 = src[i+3] + dst[i+3]; - dst[i+2] = t0; dst[i+3] = t1; - } - #endif - for( ; i < len; i++ ) - dst[i] += src[i]; - } - else if( cn == 1 ) - { - for( ; i < len; i++ ) - { - if( mask[i] ) - dst[i] += src[i]; - } - } - else if( cn == 3 ) - { - for( ; i < len; i++, src += 3, dst += 3 ) - { - if( mask[i] ) - { - AT t0 = src[0] + dst[0]; - AT t1 = src[1] + dst[1]; - AT t2 = src[2] + dst[2]; - - dst[0] = t0; dst[1] = t1; dst[2] = t2; - } - } - } - else - { - for( ; i < len; i++, src += cn, dst += cn ) - if( mask[i] ) - { - for( int k = 0; k < cn; k++ ) - dst[k] += src[k]; - } - } -} - - -template void -accSqr_( const T* src, AT* dst, const uchar* mask, int len, int cn ) -{ - int i = AccSqr_SIMD()(src, dst, mask, len, cn); - - if( !mask ) - { - len *= cn; - #if CV_ENABLE_UNROLLED - for( ; i <= len - 4; i += 4 ) - { - AT t0, t1; - t0 = (AT)src[i]*src[i] + dst[i]; - t1 = (AT)src[i+1]*src[i+1] + dst[i+1]; - dst[i] = t0; dst[i+1] = t1; - - t0 = (AT)src[i+2]*src[i+2] + dst[i+2]; - t1 = (AT)src[i+3]*src[i+3] + dst[i+3]; - dst[i+2] = t0; dst[i+3] = t1; - } - #endif - for( ; i < len; i++ ) - dst[i] += (AT)src[i]*src[i]; - } - else if( cn == 1 ) - { - for( ; i < len; i++ ) - { - if( mask[i] ) - dst[i] += (AT)src[i]*src[i]; - } - } - else if( cn == 3 ) - { - for( ; i < len; i++, src += 3, dst += 3 ) - { - if( mask[i] ) - { - AT t0 = (AT)src[0]*src[0] + dst[0]; - AT t1 = (AT)src[1]*src[1] + dst[1]; - AT t2 = (AT)src[2]*src[2] + dst[2]; - - 
dst[0] = t0; dst[1] = t1; dst[2] = t2; - } - } - } - else - { - for( ; i < len; i++, src += cn, dst += cn ) - if( mask[i] ) - { - for( int k = 0; k < cn; k++ ) - dst[k] += (AT)src[k]*src[k]; - } - } -} - - -template void -accProd_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn ) -{ - int i = AccProd_SIMD()(src1, src2, dst, mask, len, cn); - - if( !mask ) - { - len *= cn; - #if CV_ENABLE_UNROLLED - for( ; i <= len - 4; i += 4 ) - { - AT t0, t1; - t0 = (AT)src1[i]*src2[i] + dst[i]; - t1 = (AT)src1[i+1]*src2[i+1] + dst[i+1]; - dst[i] = t0; dst[i+1] = t1; - - t0 = (AT)src1[i+2]*src2[i+2] + dst[i+2]; - t1 = (AT)src1[i+3]*src2[i+3] + dst[i+3]; - dst[i+2] = t0; dst[i+3] = t1; - } - #endif - for( ; i < len; i++ ) - dst[i] += (AT)src1[i]*src2[i]; - } - else if( cn == 1 ) - { - for( ; i < len; i++ ) - { - if( mask[i] ) - dst[i] += (AT)src1[i]*src2[i]; - } - } - else if( cn == 3 ) - { - for( ; i < len; i++, src1 += 3, src2 += 3, dst += 3 ) - { - if( mask[i] ) - { - AT t0 = (AT)src1[0]*src2[0] + dst[0]; - AT t1 = (AT)src1[1]*src2[1] + dst[1]; - AT t2 = (AT)src1[2]*src2[2] + dst[2]; - - dst[0] = t0; dst[1] = t1; dst[2] = t2; - } - } - } - else - { - for( ; i < len; i++, src1 += cn, src2 += cn, dst += cn ) - if( mask[i] ) - { - for( int k = 0; k < cn; k++ ) - dst[k] += (AT)src1[k]*src2[k]; - } - } -} - - -template void -accW_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha ) -{ - AT a = (AT)alpha, b = 1 - a; - int i = AccW_SIMD()(src, dst, mask, len, cn, a); - - if( !mask ) - { - len *= cn; - #if CV_ENABLE_UNROLLED - for( ; i <= len - 4; i += 4 ) - { - AT t0, t1; - t0 = src[i]*a + dst[i]*b; - t1 = src[i+1]*a + dst[i+1]*b; - dst[i] = t0; dst[i+1] = t1; - - t0 = src[i+2]*a + dst[i+2]*b; - t1 = src[i+3]*a + dst[i+3]*b; - dst[i+2] = t0; dst[i+3] = t1; - } - #endif - for( ; i < len; i++ ) - dst[i] = src[i]*a + dst[i]*b; - } - else if( cn == 1 ) - { - for( ; i < len; i++ ) - { - if( mask[i] ) - dst[i] = src[i]*a + dst[i]*b; - } - } - else if( cn == 3 ) - { - for( ; i < len; i++, src += 3, dst += 3 ) - { - if( mask[i] ) - { - AT t0 = src[0]*a + dst[0]*b; - AT t1 = src[1]*a + dst[1]*b; - AT t2 = src[2]*a + dst[2]*b; - - dst[0] = t0; dst[1] = t1; dst[2] = t2; - } - } - } - else - { - for( ; i < len; i++, src += cn, dst += cn ) - if( mask[i] ) - { - for( int k = 0; k < cn; k++ ) - dst[k] = src[k]*a + dst[k]*b; - } - } -} - - -#define DEF_ACC_FUNCS(suffix, type, acctype) \ -static void acc_##suffix(const type* src, acctype* dst, \ - const uchar* mask, int len, int cn) \ -{ acc_(src, dst, mask, len, cn); } \ -\ -static void accSqr_##suffix(const type* src, acctype* dst, \ - const uchar* mask, int len, int cn) \ -{ accSqr_(src, dst, mask, len, cn); } \ -\ -static void accProd_##suffix(const type* src1, const type* src2, \ - acctype* dst, const uchar* mask, int len, int cn) \ -{ accProd_(src1, src2, dst, mask, len, cn); } \ -\ -static void accW_##suffix(const type* src, acctype* dst, \ - const uchar* mask, int len, int cn, double alpha) \ -{ accW_(src, dst, mask, len, cn, alpha); } - - -DEF_ACC_FUNCS(8u32f, uchar, float) -DEF_ACC_FUNCS(8u64f, uchar, double) -DEF_ACC_FUNCS(16u32f, ushort, float) -DEF_ACC_FUNCS(16u64f, ushort, double) -DEF_ACC_FUNCS(32f, float, float) -DEF_ACC_FUNCS(32f64f, float, double) -DEF_ACC_FUNCS(64f, double, double) - - -typedef void (*AccFunc)(const uchar*, uchar*, const uchar*, int, int); -typedef void (*AccProdFunc)(const uchar*, const uchar*, uchar*, const uchar*, int, int); -typedef void (*AccWFunc)(const uchar*, uchar*, const uchar*, 
int, int, double);
+typedef void(*AccFunc)(const uchar*, uchar*, const uchar*, int, int);
+typedef void(*AccProdFunc)(const uchar*, const uchar*, uchar*, const uchar*, int, int);
+typedef void(*AccWFunc)(const uchar*, uchar*, const uchar*, int, int, double);
 static AccFunc accTab[] =
 {
diff --git a/modules/imgproc/src/accum.dispatch.cpp b/modules/imgproc/src/accum.dispatch.cpp
new file mode 100644
index 0000000000..8bbf37cc4a
--- /dev/null
+++ b/modules/imgproc/src/accum.dispatch.cpp
@@ -0,0 +1,20 @@
+ // This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+
+#include "accum.simd.hpp"
+#include "accum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
+
+namespace cv {
+
+DEF_ACC_INT_FUNCS(8u32f, uchar, float)
+DEF_ACC_INT_FUNCS(8u64f, uchar, double)
+DEF_ACC_INT_FUNCS(16u32f, ushort, float)
+DEF_ACC_INT_FUNCS(16u64f, ushort, double)
+DEF_ACC_FLT_FUNCS(32f, float, float)
+DEF_ACC_FLT_FUNCS(32f64f, float, double)
+DEF_ACC_FLT_FUNCS(64f, double, double)
+
+} //cv::hal
\ No newline at end of file
diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp
new file mode 100644
index 0000000000..e2be2c952e
--- /dev/null
+++ b/modules/imgproc/src/accum.simd.hpp
@@ -0,0 +1,3187 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "opencv2/core/hal/intrin.hpp"
+
+#define DEF_ACC_INT_FUNCS(suffix, type, acctype) \
+void acc_##suffix(const type* src, acctype* dst, \
+                  const uchar* mask, int len, int cn) \
+{ \
+    CV_CPU_CALL_NEON(acc_simd_, (src, dst, mask, len, cn)); \
+    CV_CPU_CALL_SSE2(acc_simd_, (src, dst, mask, len, cn)); \
+    CV_CPU_CALL_BASELINE(acc_general_, (src, dst, mask, len, cn)); \
+} \
+void accSqr_##suffix(const type* src, acctype* dst, \
+                     const uchar* mask, int len, int cn) \
+{ \
+    CV_CPU_CALL_NEON(accSqr_simd_, (src, dst, mask, len, cn)); \
+    CV_CPU_CALL_SSE2(accSqr_simd_, (src, dst, mask, len, cn)); \
+    CV_CPU_CALL_BASELINE(accSqr_general_, (src, dst, mask, len, cn)); \
+} \
+void accProd_##suffix(const type* src1, const type* src2, \
+                      acctype* dst, const uchar* mask, int len, int cn) \
+{ \
+    CV_CPU_CALL_NEON(accProd_simd_, (src1, src2, dst, mask, len, cn)); \
+    CV_CPU_CALL_SSE2(accProd_simd_, (src1, src2, dst, mask, len, cn)); \
+    CV_CPU_CALL_BASELINE(accProd_general_, (src1, src2, dst, mask, len, cn)); \
+} \
+void accW_##suffix(const type* src, acctype* dst, \
+                   const uchar* mask, int len, int cn, double alpha) \
+{ \
+    CV_CPU_CALL_NEON(accW_simd_, (src, dst, mask, len, cn, alpha)); \
+    CV_CPU_CALL_SSE2(accW_simd_, (src, dst, mask, len, cn, alpha)); \
+    CV_CPU_CALL_BASELINE(accW_general_, (src, dst, mask, len, cn, alpha)); \
+}
+#define DEF_ACC_FLT_FUNCS(suffix, type, acctype) \
+void acc_##suffix(const type* src, acctype* dst, \
+                  const uchar* mask, int len, int cn) \
+{ \
+    CV_CPU_CALL_AVX(acc_avx_##suffix, (src, dst, mask, len, cn)); \
+    CV_CPU_CALL_NEON(acc_simd_, (src, dst, mask, len, cn)); \
+    CV_CPU_CALL_SSE2(acc_simd_, (src, dst, mask, len, cn)); \
+    CV_CPU_CALL_BASELINE(acc_general_, (src, dst, mask, len, cn)); \
+} \
+void accSqr_##suffix(const type* src, acctype* dst, \
+                     const uchar* mask, int len, int cn) \
+{ \
+    CV_CPU_CALL_AVX(accSqr_avx_##suffix, (src, dst, mask, len, cn)); \
+    CV_CPU_CALL_NEON(accSqr_simd_, (src, dst, mask, len, cn)); \
+    CV_CPU_CALL_SSE2(accSqr_simd_, (src, dst, mask, len, cn)); \
+    CV_CPU_CALL_BASELINE(accSqr_general_, (src, dst, mask, len, cn)); \
+} \
+void accProd_##suffix(const type* src1, const type* src2, \
+                      acctype* dst, const uchar* mask, int len, int cn) \
+{ \
+    CV_CPU_CALL_AVX(accProd_avx_##suffix, (src1, src2, dst, mask, len, cn)); \
+    CV_CPU_CALL_NEON(accProd_simd_, (src1, src2, dst, mask, len, cn)); \
+    CV_CPU_CALL_SSE2(accProd_simd_, (src1, src2, dst, mask, len, cn)); \
+    CV_CPU_CALL_BASELINE(accProd_general_, (src1, src2, dst, mask, len, cn)); \
+} \
+void accW_##suffix(const type* src, acctype* dst, \
+                   const uchar* mask, int len, int cn, double alpha) \
+{ \
+    CV_CPU_CALL_AVX(accW_avx_##suffix, (src, dst, mask, len, cn, alpha)); \
+    CV_CPU_CALL_NEON(accW_simd_, (src, dst, mask, len, cn, alpha)); \
+    CV_CPU_CALL_SSE2(accW_simd_, (src, dst, mask, len, cn, alpha)); \
+    CV_CPU_CALL_BASELINE(accW_general_, (src, dst, mask, len, cn, alpha)); \
+}
+#define DECLARATE_ACC_FUNCS(suffix, type, acctype) \
+void acc_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn); \
+void accSqr_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn); \
+void accProd_##suffix(const type* src1, const type* src2, acctype* dst, const uchar* mask, int len, int cn); \
+void accW_##suffix(const type* src, acctype* dst, const uchar* mask, int len, int cn, double alpha);
+
+
+namespace cv {
+
+DECLARATE_ACC_FUNCS(8u32f, uchar, float)
+DECLARATE_ACC_FUNCS(8u64f, uchar, double)
+DECLARATE_ACC_FUNCS(16u32f, ushort, float)
+DECLARATE_ACC_FUNCS(16u64f, ushort, double)
+DECLARATE_ACC_FUNCS(32f, float, float)
+DECLARATE_ACC_FUNCS(32f64f, float, double)
+DECLARATE_ACC_FUNCS(64f, double, double)
+
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn);
+void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn);
+void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn);
+void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn);
+void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn);
+void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn);
+void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn);
+void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn);
+void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn);
+void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn);
+void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn);
+void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn);
+void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn);
+void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn);
+void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn);
+void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn);
+void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn);
+void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn);
+void accProd_simd_(const float* src1, const float* src2, float* dst,
const uchar* mask, int len, int cn); +void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn); +void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn); +void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha); +void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha); +void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha); +void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha); +void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha); +void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha); +void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha); + +// accumulate series optimized by AVX +void acc_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn); +void acc_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn); +void acc_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn); +void accSqr_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn); +void accSqr_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn); +void accSqr_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn); +void accProd_avx_32f(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn); +void accProd_avx_32f64f(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn); +void accProd_avx_64f(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn); +void accW_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha); +void accW_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha); +void accW_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +template +void acc_general_(const T* src, AT* dst, const uchar* mask, int len, int cn, int start = 0 ) +{ + int i = start; + + if( !mask ) + { + len *= cn; + #if CV_ENABLE_UNROLLED + for( ; i <= len - 4; i += 4 ) + { + AT t0, t1; + t0 = src[i] + dst[i]; + t1 = src[i+1] + dst[i+1]; + dst[i] = t0; dst[i+1] = t1; + + t0 = src[i+2] + dst[i+2]; + t1 = src[i+3] + dst[i+3]; + dst[i+2] = t0; dst[i+3] = t1; + } + #endif + for( ; i < len; i++ ) + { + dst[i] += src[i]; + } + } + else + { + src += (i * cn); + dst += (i * cn); + for( ; i < len; i++, src += cn, dst += cn ) + { + if( mask[i] ) + { + for( int k = 0; k < cn; k++ ) + { + dst[k] += src[k]; + } + } + } + } + +} + +template void +accSqr_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, int start = 0 ) +{ + int i = start; + + if( !mask ) + { + len *= cn; + #if CV_ENABLE_UNROLLED + for( ; i <= len - 4; i += 4 ) + { + AT t0, t1; + t0 = (AT)src[i]*src[i] + dst[i]; + t1 = (AT)src[i+1]*src[i+1] + dst[i+1]; + dst[i] = t0; dst[i+1] = t1; + + t0 = (AT)src[i+2]*src[i+2] + dst[i+2]; + t1 = (AT)src[i+3]*src[i+3] + dst[i+3]; + dst[i+2] = t0; dst[i+3] = t1; + } + #endif + for( ; i < len; i++ ) + { + dst[i] += (AT)src[i]*src[i]; + } + } + else + { + src += (i * cn); + dst += (i * cn); + for( ; i < len; i++, src += cn, dst += cn ) + { + if( mask[i] ) + { + for( int k = 0; 
k < cn; k++ ) + { + dst[k] += (AT)src[k]*src[k]; + } + } + } + } +} + +template void +accProd_general_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn, int start = 0 ) +{ + int i = start; + + if( !mask ) + { + len *= cn; + #if CV_ENABLE_UNROLLED + for( ; i <= len - 4; i += 4 ) + { + AT t0, t1; + t0 = (AT)src1[i]*src2[i] + dst[i]; + t1 = (AT)src1[i+1]*src2[i+1] + dst[i+1]; + dst[i] = t0; dst[i+1] = t1; + + t0 = (AT)src1[i+2]*src2[i+2] + dst[i+2]; + t1 = (AT)src1[i+3]*src2[i+3] + dst[i+3]; + dst[i+2] = t0; dst[i+3] = t1; + } + #endif + for( ; i < len; i++ ) + { + dst[i] += (AT)src1[i]*src2[i]; + } + } + else + { + src1 += (i * cn); + src2 += (i * cn); + dst += (i * cn); + for( ; i < len; i++, src1 += cn, src2 += cn, dst += cn ) + { + if( mask[i] ) + { + for( int k = 0; k < cn; k++ ) + { + dst[k] += (AT)src1[k]*src2[k]; + } + } + } + } +} + +template void +accW_general_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha, int start = 0 ) +{ + AT a = (AT)alpha, b = 1 - a; + int i = start; + + if( !mask ) + { + len *= cn; + #if CV_ENABLE_UNROLLED + for( ; i <= len - 4; i += 4 ) + { + AT t0, t1; + t0 = src[i]*a + dst[i]*b; + t1 = src[i+1]*a + dst[i+1]*b; + dst[i] = t0; dst[i+1] = t1; + + t0 = src[i+2]*a + dst[i+2]*b; + t1 = src[i+3]*a + dst[i+3]*b; + dst[i+2] = t0; dst[i+3] = t1; + } + #endif + for( ; i < len; i++ ) + { + dst[i] = src[i]*a + dst[i]*b; + } + } + else + { + src += (i * cn); + dst += (i * cn); + for( ; i < len; i++, src += cn, dst += cn ) + { + if( mask[i] ) + { + for( int k = 0; k < cn; k++ ) + { + dst[k] = src[k]*a + dst[k]*b; + } + } + } + } +} + +#if CV_SIMD128 + +void acc_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 16; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_src = v_load(src + x); + v_uint16x8 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + + v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + + v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + } + } + else + { + v_uint8x16 v_0 = v_setall_u8(0); + if (cn == 1) + { + for ( ; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_0 == v_mask); + v_uint8x16 v_src = v_load(src + x); + v_src = v_src & v_mask; + v_uint16x8 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + + v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + + v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + } + } + else if (cn == 3) + { + for ( ; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_0 == v_mask); + v_uint8x16 v_src0, v_src1, v_src2; + v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2); + v_src0 = v_src0 & v_mask; + v_src1 
= v_src1 & v_mask; + v_src2 = v_src2 & v_mask; + v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + v_expand(v_src2, v_src20, v_src21); + + v_uint32x4 v_src000, v_src001, v_src010, v_src011; + v_uint32x4 v_src100, v_src101, v_src110, v_src111; + v_uint32x4 v_src200, v_src201, v_src210, v_src211; + v_expand(v_src00, v_src000, v_src001); + v_expand(v_src01, v_src010, v_src011); + v_expand(v_src10, v_src100, v_src101); + v_expand(v_src11, v_src110, v_src111); + v_expand(v_src20, v_src200, v_src201); + v_expand(v_src21, v_src210, v_src211); + + v_float32x4 v_dst000, v_dst001, v_dst010, v_dst011; + v_float32x4 v_dst100, v_dst101, v_dst110, v_dst111; + v_float32x4 v_dst200, v_dst201, v_dst210, v_dst211; + v_load_deinterleave(dst + (x * cn), v_dst000, v_dst100, v_dst200); + v_load_deinterleave(dst + ((x + 4) * cn), v_dst001, v_dst101, v_dst201); + v_load_deinterleave(dst + ((x + 8) * cn), v_dst010, v_dst110, v_dst210); + v_load_deinterleave(dst + ((x + 12) * cn), v_dst011, v_dst111, v_dst211); + + v_store_interleave(dst + (x * cn), v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)), v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100)), v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200))); + v_store_interleave(dst + ((x + 4) * cn), v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)), v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101)), v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201))); + v_store_interleave(dst + ((x + 8) * cn), v_dst010 + v_cvt_f32(v_reinterpret_as_s32(v_src010)), v_dst110 + v_cvt_f32(v_reinterpret_as_s32(v_src110)), v_dst210 + v_cvt_f32(v_reinterpret_as_s32(v_src210))); + v_store_interleave(dst + ((x + 12) * cn), v_dst011 + v_cvt_f32(v_reinterpret_as_s32(v_src011)), v_dst111 + v_cvt_f32(v_reinterpret_as_s32(v_src111)), v_dst211 + v_cvt_f32(v_reinterpret_as_s32(v_src211))); + } + } + } + + acc_general_(src, dst, mask, len, cn, x); +} + +void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + + v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); + v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); + } + } + else + { + if (cn == 1) + { + v_uint16x8 v_0 = v_setall_u16(0); + for ( ; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 _v_mask = v_load(mask + x); + v_uint16x8 v_mask, dummy; + v_expand(_v_mask, v_mask, dummy); + v_mask = ~(v_mask == v_0); + v_uint16x8 v_src = v_load(src + x); + v_src = v_src & v_mask; + v_uint32x4 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + + v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src0))); + v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src1))); + } + } + else if (cn == 3) + { + v_uint16x8 v_0 = v_setall_u16(0); + for ( ; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 _v_mask = v_load(mask + x); + v_uint16x8 v_mask, dummy; + v_expand(_v_mask, v_mask, dummy); + v_mask = ~(v_mask == v_0); + v_uint16x8 v_src0, v_src1, v_src2; + v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); + v_src0 = v_src0 & v_mask; + v_src1 = v_src1 & v_mask; + v_src2 = v_src2 & v_mask; + v_uint32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + 
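+ // Widen each masked u16 channel to two u32 halves, convert to f32, and add into the
+ // channel-deinterleaved f32 accumulator; lanes whose mask byte was 0 were zeroed by the
+ // AND above, so they leave dst unchanged. Equivalent scalar form (illustrative sketch only):
+ //   for (int c = 0; c < 3; c++)
+ //       if (mask[i]) dst[i*3 + c] += (float)src[i*3 + c];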
v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + v_expand(v_src2, v_src20, v_src21); + + v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_cvt_f32(v_reinterpret_as_s32(v_src00)), v_dst10 + v_cvt_f32(v_reinterpret_as_s32(v_src10)), v_dst20 + v_cvt_f32(v_reinterpret_as_s32(v_src20))); + v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_cvt_f32(v_reinterpret_as_s32(v_src01)), v_dst11 + v_cvt_f32(v_reinterpret_as_s32(v_src11)), v_dst21 + v_cvt_f32(v_reinterpret_as_s32(v_src21))); + } + } + } + + acc_general_(src, dst, mask, len, cn, x); +} + +void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_store(dst + x, v_load(dst + x) + v_load(src + x)); + v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src + x + 4)); + } + } + else + { + v_float32x4 v_0 = v_setzero_f32(); + if (cn == 1) + { + for ( ; x <= len - cVectorWidth ; x += cVectorWidth) + { + v_uint16x8 v_masku16, dummy0; + v_expand(v_load(mask + x), v_masku16, dummy0); + v_uint32x4 v_masku320, v_masku321; + v_expand(v_masku16, v_masku320, v_masku321); + v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); + v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); + + v_store(dst + x, v_load(dst + x) + (v_load(src + x) & v_mask0)); + v_store(dst + x + 4, v_load(dst + x + 4) + (v_load(src + x + 4) & v_mask1)); + } + } + else if (cn == 3) + { + for ( ; x <= len - cVectorWidth ; x += cVectorWidth) + { + v_uint16x8 v_masku16, dummy0; + v_expand(v_load(mask + x), v_masku16, dummy0); + v_uint32x4 v_masku320, v_masku321; + v_expand(v_masku16, v_masku320, v_masku321); + v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); + v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_masku321 == v_reinterpret_as_u32(v_0))); + + v_float32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); + v_load_deinterleave(src + (x + 4) * cn, v_src01, v_src11, v_src21); + v_src00 = v_src00 & v_mask0; + v_src01 = v_src01 & v_mask1; + v_src10 = v_src10 & v_mask0; + v_src11 = v_src11 & v_mask1; + v_src20 = v_src20 & v_mask0; + v_src21 = v_src21 & v_mask1; + + v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } + } + + acc_general_(src, dst, mask, len, cn, x); +} + +#if CV_SIMD128_64F +void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 16; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_src = v_load(src + x); + v_uint16x8 v_int0, v_int1; + v_expand(v_src, v_int0, v_int1); + + v_uint32x4 v_int00, v_int01, v_int10, v_int11; + v_expand(v_int0, v_int00, v_int01); + v_expand(v_int1, v_int10, v_int11); + + v_float64x2 v_src0 = 
v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64x2 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64x2 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64x2 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64x2 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64x2 v_dst4 = v_load(dst + x + 8); + v_float64x2 v_dst5 = v_load(dst + x + 10); + v_float64x2 v_dst6 = v_load(dst + x + 12); + v_float64x2 v_dst7 = v_load(dst + x + 14); + + v_dst0 = v_dst0 + v_src0; + v_dst1 = v_dst1 + v_src1; + v_dst2 = v_dst2 + v_src2; + v_dst3 = v_dst3 + v_src3; + v_dst4 = v_dst4 + v_src4; + v_dst5 = v_dst5 + v_src5; + v_dst6 = v_dst6 + v_src6; + v_dst7 = v_dst7 + v_src7; + + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + v_store(dst + x + 8, v_dst4); + v_store(dst + x + 10, v_dst5); + v_store(dst + x + 12, v_dst6); + v_store(dst + x + 14, v_dst7); + } + } + else + { + v_uint8x16 v_0 = v_setall_u8(0); + if (cn == 1) + { + for ( ; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_mask == v_0); + v_uint8x16 v_src = v_load(src + x); + v_src = v_src & v_mask; + v_uint16x8 v_int0, v_int1; + v_expand(v_src, v_int0, v_int1); + + v_uint32x4 v_int00, v_int01, v_int10, v_int11; + v_expand(v_int0, v_int00, v_int01); + v_expand(v_int1, v_int10, v_int11); + + v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64x2 v_src4 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64x2 v_src5 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64x2 v_src6 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64x2 v_src7 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + v_float64x2 v_dst4 = v_load(dst + x + 8); + v_float64x2 v_dst5 = v_load(dst + x + 10); + v_float64x2 v_dst6 = v_load(dst + x + 12); + v_float64x2 v_dst7 = v_load(dst + x + 14); + + v_dst0 = v_dst0 + v_src0; + v_dst1 = v_dst1 + v_src1; + v_dst2 = v_dst2 + v_src2; + v_dst3 = v_dst3 + v_src3; + v_dst4 = v_dst4 + v_src4; + v_dst5 = v_dst5 + v_src5; + v_dst6 = v_dst6 + v_src6; + v_dst7 = v_dst7 + v_src7; + + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + v_store(dst + x + 8, v_dst4); + v_store(dst + x + 10, v_dst5); + v_store(dst + x + 12, v_dst6); + v_store(dst + x + 14, v_dst7); + } + } + else if (cn == 3) + { + for ( ; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_0 == v_mask); + v_uint8x16 v_src0, v_src1, v_src2; + v_load_deinterleave(src + (x * cn), v_src0, v_src1, v_src2); + v_src0 = v_src0 & v_mask; + v_src1 = v_src1 & v_mask; + v_src2 = v_src2 & v_mask; + v_uint16x8 v_src00, 
v_src01, v_src10, v_src11, v_src20, v_src21; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + v_expand(v_src2, v_src20, v_src21); + + v_uint32x4 v_src000, v_src001, v_src010, v_src011; + v_uint32x4 v_src100, v_src101, v_src110, v_src111; + v_uint32x4 v_src200, v_src201, v_src210, v_src211; + v_expand(v_src00, v_src000, v_src001); + v_expand(v_src01, v_src010, v_src011); + v_expand(v_src10, v_src100, v_src101); + v_expand(v_src11, v_src110, v_src111); + v_expand(v_src20, v_src200, v_src201); + v_expand(v_src21, v_src210, v_src211); + + v_float64x2 v_src0000, v_src0001, v_src0010, v_src0011, v_src0100, v_src0101, v_src0110, v_src0111; + v_float64x2 v_src1000, v_src1001, v_src1010, v_src1011, v_src1100, v_src1101, v_src1110, v_src1111; + v_float64x2 v_src2000, v_src2001, v_src2010, v_src2011, v_src2100, v_src2101, v_src2110, v_src2111; + v_src0000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src000))); + v_src0001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src000))); + v_src0010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src001))); + v_src0011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src001))); + v_src0100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src010))); + v_src0101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src010))); + v_src0110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src011))); + v_src0111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src011))); + v_src1000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src100))); + v_src1001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src100))); + v_src1010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src101))); + v_src1011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src101))); + v_src1100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src110))); + v_src1101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src110))); + v_src1110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src111))); + v_src1111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src111))); + v_src2000 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src200))); + v_src2001 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src200))); + v_src2010 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src201))); + v_src2011 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src201))); + v_src2100 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src210))); + v_src2101 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src210))); + v_src2110 = v_cvt_f64(v_cvt_f32(v_reinterpret_as_s32(v_src211))); + v_src2111 = v_cvt_f64_high(v_cvt_f32(v_reinterpret_as_s32(v_src211))); + + v_float64x2 v_dst0000, v_dst0001, v_dst0010, v_dst0011, v_dst0100, v_dst0101, v_dst0110, v_dst0111; + v_float64x2 v_dst1000, v_dst1001, v_dst1010, v_dst1011, v_dst1100, v_dst1101, v_dst1110, v_dst1111; + v_float64x2 v_dst2000, v_dst2001, v_dst2010, v_dst2011, v_dst2100, v_dst2101, v_dst2110, v_dst2111; + v_load_deinterleave(dst + (x * cn), v_dst0000, v_dst1000, v_dst2000); + v_load_deinterleave(dst + ((x + 2) * cn), v_dst0001, v_dst1001, v_dst2001); + v_load_deinterleave(dst + ((x + 4) * cn), v_dst0010, v_dst1010, v_dst2010); + v_load_deinterleave(dst + ((x + 6) * cn), v_dst0011, v_dst1011, v_dst2011); + v_load_deinterleave(dst + ((x + 8) * cn), v_dst0100, v_dst1100, v_dst2100); + v_load_deinterleave(dst + ((x + 10) * cn), v_dst0101, v_dst1101, v_dst2101); + v_load_deinterleave(dst + ((x + 12) * cn), v_dst0110, v_dst1110, v_dst2110); + v_load_deinterleave(dst + ((x + 14) * cn), v_dst0111, v_dst1111, v_dst2111); + + v_store_interleave(dst + (x * cn), v_dst0000 + 
v_src0000, v_dst1000 + v_src1000, v_dst2000 + v_src2000); + v_store_interleave(dst + ((x + 2) * cn), v_dst0001 + v_src0001, v_dst1001 + v_src1001, v_dst2001 + v_src2001); + v_store_interleave(dst + ((x + 4) * cn), v_dst0010 + v_src0010, v_dst1010 + v_src1010, v_dst2010 + v_src2010); + v_store_interleave(dst + ((x + 6) * cn), v_dst0011 + v_src0011, v_dst1011 + v_src1011, v_dst2011 + v_src2011); + v_store_interleave(dst + ((x + 8) * cn), v_dst0100 + v_src0100, v_dst1100 + v_src1100, v_dst2100 + v_src2100); + v_store_interleave(dst + ((x + 10) * cn), v_dst0101 + v_src0101, v_dst1101 + v_src1101, v_dst2101 + v_src2101); + v_store_interleave(dst + ((x + 12) * cn), v_dst0110 + v_src0110, v_dst1110 + v_src1110, v_dst2110 + v_src2110); + v_store_interleave(dst + ((x + 14) * cn), v_dst0111 + v_src0111, v_dst1111 + v_src1111, v_dst2111 + v_src2111); + } + } + } + + acc_general_(src, dst, mask, len, cn, x); +} + +void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_int0, v_int1; + v_expand(v_src, v_int0, v_int1); + + v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + + v_dst0 = v_dst0 + v_src0; + v_dst1 = v_dst1 + v_src1; + v_dst2 = v_dst2 + v_src2; + v_dst3 = v_dst3 + v_src3; + + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + } + } + else + { + v_uint16x8 v_0 = v_setzero_u16(); + if (cn == 1) + { + for ( ; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_mask, dummy; + v_expand(v_load(mask + x), v_mask, dummy); + v_mask = ~(v_mask == v_0); + v_uint16x8 v_src = v_load(src + x); + v_src = v_src & v_mask; + v_uint32x4 v_int0, v_int1; + v_expand(v_src, v_int0, v_int1); + + v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + + v_dst0 = v_dst0 + v_src0; + v_dst1 = v_dst1 + v_src1; + v_dst2 = v_dst2 + v_src2; + v_dst3 = v_dst3 + v_src3; + + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + } + } + if (cn == 3) + { + for ( ; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_mask, dummy; + v_expand(v_load(mask + x), v_mask, dummy); + v_mask = ~(v_mask == v_0); + v_uint16x8 v_src0, v_src1, v_src2; + v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); + v_src0 = v_src0 & v_mask; + v_src1 = v_src1 & v_mask; + v_src2 = v_src2 & v_mask; + v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_expand(v_src0, v_int00, v_int01); + v_expand(v_src1, v_int10, v_int11); + v_expand(v_src2, v_int20, v_int21); + + v_float64x2 v_src00 = 
v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); + v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); + v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); + v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); + + v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); + v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); + } + } + } + + acc_general_(src, dst, mask, len, cn, x); +} + +void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 4; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float32x4 v_src = v_load(src + x); + v_float64x2 v_src0 = v_cvt_f64(v_src); + v_float64x2 v_src1 = v_cvt_f64_high(v_src); + + v_store(dst + x, v_load(dst + x) + v_src0); + v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + } + } + else + { + v_uint64x2 v_0 = v_setzero_u64(); + if (cn == 1) + { + for ( ; x <= len - cVectorWidth ; x += cVectorWidth) + { + v_uint16x8 v_masku16, dummy0; + v_expand(v_load(mask + x), v_masku16, dummy0); + v_uint32x4 v_masku32, dummy1; + v_expand(v_masku16, v_masku32, dummy1); + v_uint64x2 v_masku640, v_masku641; + v_expand(v_masku32, v_masku640, v_masku641); + v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + + v_float32x4 v_src = v_load(src + x); + v_float64x2 v_src0 = v_cvt_f64(v_src) & v_mask0; + v_float64x2 v_src1 = v_cvt_f64_high(v_src) & v_mask1; + + v_store(dst + x, v_load(dst + x) + v_src0); + v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + } + } + else if (cn == 3) + { + for ( ; x <= len - cVectorWidth ; x += cVectorWidth) + { + v_uint16x8 v_masku16, dummy0; + v_expand(v_load(mask + x), v_masku16, dummy0); + v_uint32x4 v_masku32, dummy1; + v_expand(v_masku16, v_masku32, dummy1); + v_uint64x2 v_masku640, v_masku641; + v_expand(v_masku32, v_masku640, v_masku641); + v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + + v_float32x4 v_src0, v_src1, v_src2; + v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); + v_float64x2 v_src00 = v_cvt_f64(v_src0) & v_mask0; + v_float64x2 v_src01 = v_cvt_f64_high(v_src0) & v_mask1; + 
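+ // Channels 1 and 2 get the same treatment as channel 0 above: each f32 lane is widened
+ // to f64 (low pair via v_cvt_f64, high pair via v_cvt_f64_high) and ANDed with the
+ // widened mask, so masked-out pixels add 0.0 to the double accumulator.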
v_float64x2 v_src10 = v_cvt_f64(v_src1) & v_mask0; + v_float64x2 v_src11 = v_cvt_f64_high(v_src1) & v_mask1; + v_float64x2 v_src20 = v_cvt_f64(v_src2) & v_mask0; + v_float64x2 v_src21 = v_cvt_f64_high(v_src2) & v_mask1; + + v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } + } + + acc_general_(src, dst, mask, len, cn, x); +} + +void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 4; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float64x2 v_src0 = v_load(src + x); + v_float64x2 v_src1 = v_load(src + x + 2); + + v_store(dst + x, v_load(dst + x) + v_src0); + v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + } + } + else + { + v_uint64x2 v_0 = v_setzero_u64(); + if (cn == 1) + { + for ( ; x <= len - cVectorWidth ; x += cVectorWidth) + { + v_uint16x8 v_masku16, dummy0; + v_expand(v_load(mask + x), v_masku16, dummy0); + v_uint32x4 v_masku32, dummy1; + v_expand(v_masku16, v_masku32, dummy1); + v_uint64x2 v_masku640, v_masku641; + v_expand(v_masku32, v_masku640, v_masku641); + v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + + v_float64x2 v_src0 = v_load(src + x); + v_float64x2 v_src1 = v_load(src + x + 2); + + v_store(dst + x, v_load(dst + x) + (v_src0 & v_mask0)); + v_store(dst + x + 2, v_load(dst + x + 2) + (v_src1 & v_mask1)); + } + } + else if (cn == 3) + { + for ( ; x <= len - cVectorWidth ; x += cVectorWidth) + { + v_uint16x8 v_masku16, dummy0; + v_expand(v_load(mask + x), v_masku16, dummy0); + v_uint32x4 v_masku32, dummy1; + v_expand(v_masku16, v_masku32, dummy1); + v_uint64x2 v_masku640, v_masku641; + v_expand(v_masku32, v_masku640, v_masku641); + v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + + v_float64x2 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; + v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); + v_load_deinterleave(src + (x + 2) * cn, v_src01, v_src11, v_src21); + v_src00 = v_src00 & v_mask0; + v_src01 = v_src01 & v_mask1; + v_src10 = v_src10 & v_mask0; + v_src11 = v_src11 & v_mask1; + v_src20 = v_src20 & v_mask0; + v_src21 = v_src21 & v_mask1; + + v_float64x2 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } + } + + acc_general_(src, dst, mask, len, cn, x); +} +#else +void acc_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) +{ + acc_general_(src, dst, mask, len, cn, 0); +} + +void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) +{ + acc_general_(src, dst, mask, len, cn, 0); +} + +void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) +{ + acc_general_(src, dst, mask, len, cn, 0); +} + +void 
acc_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) +{ + acc_general_(src, dst, mask, len, cn, 0); +} +#endif + +// square accumulate optimized by universal intrinsic +void accSqr_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 16; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_src = v_load(src + x); + v_uint16x8 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + + v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + + v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + } + } + else + { + v_uint8x16 v_0 = v_setall_u8(0); + if (cn == 1) + { + for ( ; x <= len - cVectorWidth ; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_0 == v_mask); + v_uint8x16 v_src = v_load(src + x); + v_src = v_src & v_mask; + v_uint16x8 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + + v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + + v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + } + } + else if (cn == 3) + { + for ( ; x <= len - cVectorWidth ; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_0 == v_mask); + + v_uint8x16 v_src0, v_src1, v_src2; + v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); + v_src0 = v_src0 & v_mask; + v_src1 = v_src1 & v_mask; + v_src2 = v_src2 & v_mask; + + v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + v_expand(v_src2, v_src20, v_src21); + v_src00 = v_src00 * v_src00; + v_src01 = v_src01 * v_src01; + v_src10 = v_src10 * v_src10; + v_src11 = v_src11 * v_src11; + v_src20 = v_src20 * v_src20; + v_src21 = v_src21 * v_src21; + + v_uint32x4 v_src000, v_src001, v_src010, v_src011; + v_uint32x4 v_src100, v_src101, v_src110, v_src111; + v_uint32x4 v_src200, v_src201, v_src210, v_src211; + v_expand(v_src00, v_src000, v_src001); + v_expand(v_src01, v_src010, v_src011); + v_expand(v_src10, v_src100, v_src101); + v_expand(v_src11, v_src110, v_src111); + v_expand(v_src20, v_src200, v_src201); + v_expand(v_src21, v_src210, v_src211); + + v_float32x4 v_dst000, v_dst001, v_dst010, v_dst011; + v_float32x4 v_dst100, v_dst101, v_dst110, v_dst111; + v_float32x4 v_dst200, v_dst201, v_dst210, v_dst211; + v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200); + v_load_deinterleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201); + v_load_deinterleave(dst + (x + 8) * cn, v_dst010, v_dst110, v_dst210); + v_load_deinterleave(dst + (x + 12) * cn, v_dst011, v_dst111, v_dst211); + + v_store_interleave(dst + x * cn, v_dst000 + 
v_cvt_f32(v_reinterpret_as_s32(v_src000)), v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100)), v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200))); + v_store_interleave(dst + (x + 4) * cn, v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)), v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101)), v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201))); + v_store_interleave(dst + (x + 8) * cn, v_dst010 + v_cvt_f32(v_reinterpret_as_s32(v_src010)), v_dst110 + v_cvt_f32(v_reinterpret_as_s32(v_src110)), v_dst210 + v_cvt_f32(v_reinterpret_as_s32(v_src210))); + v_store_interleave(dst + (x + 12) * cn, v_dst011 + v_cvt_f32(v_reinterpret_as_s32(v_src011)), v_dst111 + v_cvt_f32(v_reinterpret_as_s32(v_src111)), v_dst211 + v_cvt_f32(v_reinterpret_as_s32(v_src211))); + } + } + } + + accSqr_general_(src, dst, mask, len, cn, x); +} + +void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + + v_float32x4 v_float0, v_float1; + v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0)); + v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1)); + v_float0 = v_float0 * v_float0; + v_float1 = v_float1 * v_float1; + + v_store(dst + x, v_load(dst + x) + v_float0); + v_store(dst + x + 4, v_load(dst + x + 4) + v_float1); + } + } + else + { + v_uint32x4 v_0 = v_setzero_u32(); + if (cn == 1) + { + for ( ; x <= len - cVectorWidth ; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 v_mask0, v_mask1; + v_expand(stub, v_mask0, v_mask1); + v_mask0 = ~(v_mask0 == v_0); + v_mask1 = ~(v_mask1 == v_0); + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + v_src0 = v_src0 & v_mask0; + v_src1 = v_src1 & v_mask1; + + v_float32x4 v_float0, v_float1; + v_float0 = v_cvt_f32(v_reinterpret_as_s32(v_src0)); + v_float1 = v_cvt_f32(v_reinterpret_as_s32(v_src1)); + v_float0 = v_float0 * v_float0; + v_float1 = v_float1 * v_float1; + + v_store(dst + x, v_load(dst + x) + v_float0); + v_store(dst + x + 4, v_load(dst + x + 4) + v_float1); + } + } + else if (cn == 3) + { + for ( ; x <= len - cVectorWidth ; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 v_mask0, v_mask1; + v_expand(stub, v_mask0, v_mask1); + v_mask0 = ~(v_mask0 == v_0); + v_mask1 = ~(v_mask1 == v_0); + + v_uint16x8 v_src0, v_src1, v_src2; + v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); + v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_expand(v_src0, v_int00, v_int01); + v_expand(v_src1, v_int10, v_int11); + v_expand(v_src2, v_int20, v_int21); + v_int00 = v_int00 & v_mask0; + v_int01 = v_int01 & v_mask1; + v_int10 = v_int10 & v_mask0; + v_int11 = v_int11 & v_mask1; + v_int20 = v_int20 & v_mask0; + v_int21 = v_int21 & v_mask1; + + v_float32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_src00 = v_cvt_f32(v_reinterpret_as_s32(v_int00)); + v_src01 = v_cvt_f32(v_reinterpret_as_s32(v_int01)); + v_src10 = v_cvt_f32(v_reinterpret_as_s32(v_int10)); + v_src11 = v_cvt_f32(v_reinterpret_as_s32(v_int11)); + v_src20 = v_cvt_f32(v_reinterpret_as_s32(v_int20)); + v_src21 = v_cvt_f32(v_reinterpret_as_s32(v_int21)); + v_src00 = v_src00 * v_src00; + v_src01 = v_src01 * v_src01; + v_src10 = v_src10 * v_src10; + v_src11 = v_src11 * v_src11; + v_src20 = v_src20 * v_src20; + 
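+ // Note: unlike the u8 path above, the squaring here is performed after conversion to f32,
+ // since a u16*u16 product would not fit back into 16 bits.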
v_src21 = v_src21 * v_src21; + + v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } + } + + accSqr_general_(src, dst, mask, len, cn, x); +} + +void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float32x4 v_src0 = v_load(src + x); + v_float32x4 v_src1 = v_load(src + x + 4); + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + + v_store(dst + x, v_load(dst + x) + v_src0); + v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + } + } + else + { + v_uint32x4 v_0 = v_setzero_u32(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_stub = v_load_expand(mask + x); + v_uint32x4 v_stub0, v_stub1; + v_expand(v_stub, v_stub0, v_stub1); + v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_stub0 == v_0)); + v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_stub1 == v_0)); + v_float32x4 v_src0 = v_load(src + x); + v_float32x4 v_src1 = v_load(src + x + 4); + v_src0 = v_src0 & v_mask0; + v_src1 = v_src1 & v_mask1; + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + + v_store(dst + x, v_load(dst + x) + v_src0); + v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_stub = v_load_expand(mask + x); + v_uint32x4 v_stub0, v_stub1; + v_expand(v_stub, v_stub0, v_stub1); + v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_stub0 == v_0)); + v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_stub1 == v_0)); + + v_float32x4 v_src00, v_src10, v_src20, v_src01, v_src11, v_src21; + v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); + v_load_deinterleave(src + (x + 4) * cn, v_src01, v_src11, v_src21); + v_src00 = v_src00 & v_mask0; + v_src01 = v_src01 & v_mask1; + v_src10 = v_src10 & v_mask0; + v_src11 = v_src11 & v_mask1; + v_src20 = v_src20 & v_mask0; + v_src21 = v_src21 & v_mask1; + v_src00 = v_src00 * v_src00; + v_src01 = v_src01 * v_src01; + v_src10 = v_src10 * v_src10; + v_src11 = v_src11 * v_src11; + v_src20 = v_src20 * v_src20; + v_src21 = v_src21 * v_src21; + + v_float32x4 v_dst00, v_dst10, v_dst20, v_dst01, v_dst11, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } + } + + accSqr_general_(src, dst, mask, len, cn, x); +} +#if CV_SIMD128_64F +void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_src = v_load(src + x); + v_uint16x8 v_int, dummy; + v_expand(v_src, v_int, dummy); + + v_uint32x4 v_int0, v_int1; + v_expand(v_int, v_int0, v_int1); + + v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64x2 v_src1 = 
v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + v_src2 = v_src2 * v_src2; + v_src3 = v_src3 * v_src3; + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + + v_dst0 += v_src0; + v_dst1 += v_src1; + v_dst2 += v_src2; + v_dst3 += v_src3; + + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + } + } + else + { + v_uint8x16 v_0 = v_setzero_u8(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_mask == v_0); + v_uint8x16 v_src = v_load(src + x); + v_src = v_src & v_mask; + v_uint16x8 v_int, dummy; + v_expand(v_src, v_int, dummy); + + v_uint32x4 v_int0, v_int1; + v_expand(v_int, v_int0, v_int1); + + v_float64x2 v_src0 = v_cvt_f64(v_reinterpret_as_s32(v_int0)); + v_float64x2 v_src1 = v_cvt_f64_high(v_reinterpret_as_s32(v_int0)); + v_float64x2 v_src2 = v_cvt_f64(v_reinterpret_as_s32(v_int1)); + v_float64x2 v_src3 = v_cvt_f64_high(v_reinterpret_as_s32(v_int1)); + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + v_src2 = v_src2 * v_src2; + v_src3 = v_src3 * v_src3; + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + + v_dst0 += v_src0; + v_dst1 += v_src1; + v_dst2 += v_src2; + v_dst3 += v_src3; + + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_mask == v_0); + v_uint8x16 v_src0, v_src1, v_src2; + v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); + v_src0 = v_src0 & v_mask; + v_src1 = v_src1 & v_mask; + v_src2 = v_src2 & v_mask; + v_uint16x8 v_int0, v_int1, v_int2, dummy; + v_expand(v_src0, v_int0, dummy); + v_expand(v_src1, v_int1, dummy); + v_expand(v_src2, v_int2, dummy); + + v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_expand(v_int0, v_int00, v_int01); + v_expand(v_int1, v_int10, v_int11); + v_expand(v_int2, v_int20, v_int21); + + v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); + v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); + v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); + v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); + v_src00 = v_src00 * v_src00; + v_src01 = v_src01 * v_src01; + v_src02 = v_src02 * v_src02; + v_src03 = v_src03 * v_src03; + v_src10 = v_src10 * v_src10; + v_src11 = v_src11 * v_src11; + v_src12 = v_src12 * v_src12; + v_src13 = 
v_src13 * v_src13; + v_src20 = v_src20 * v_src20; + v_src21 = v_src21 * v_src21; + v_src22 = v_src22 * v_src22; + v_src23 = v_src23 * v_src23; + + v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); + v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); + } + } + } + + accSqr_general_(src, dst, mask, len, cn, x); +} + +void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_int_0, v_int_1; + v_expand(v_src, v_int_0, v_int_1); + + v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); + v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); + + v_float64x2 v_src0 = v_cvt_f64(v_int0); + v_float64x2 v_src1 = v_cvt_f64_high(v_int0); + v_float64x2 v_src2 = v_cvt_f64(v_int1); + v_float64x2 v_src3 = v_cvt_f64_high(v_int1); + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + v_src2 = v_src2 * v_src2; + v_src3 = v_src3 * v_src3; + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + + v_dst0 += v_src0; + v_dst1 += v_src1; + v_dst2 += v_src2; + v_dst3 += v_src3; + + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + } + } + else + { + v_uint16x8 v_0 = v_setzero_u16(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_mask = v_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_uint16x8 v_src = v_load(src + x); + v_src = v_src & v_mask; + v_uint32x4 v_int_0, v_int_1; + v_expand(v_src, v_int_0, v_int_1); + + v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); + v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); + + v_float64x2 v_src0 = v_cvt_f64(v_int0); + v_float64x2 v_src1 = v_cvt_f64_high(v_int0); + v_float64x2 v_src2 = v_cvt_f64(v_int1); + v_float64x2 v_src3 = v_cvt_f64_high(v_int1); + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + v_src2 = v_src2 * v_src2; + v_src3 = v_src3 * v_src3; + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + + v_dst0 += v_src0; + v_dst1 += v_src1; + v_dst2 += v_src2; + v_dst3 += v_src3; + + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_mask = v_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_uint16x8 v_src0, v_src1, v_src2; + v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); + v_src0 = v_src0 & v_mask; + v_src1 = v_src1 & 
v_mask; + v_src2 = v_src2 & v_mask; + v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; + v_expand(v_src0, v_int00, v_int01); + v_expand(v_src1, v_int10, v_int11); + v_expand(v_src2, v_int20, v_int21); + + v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_int00)); + v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_int00)); + v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_int01)); + v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_int01)); + v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_int10)); + v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_int10)); + v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_int11)); + v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_int11)); + v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_int20)); + v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_int20)); + v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_int21)); + v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_int21)); + v_src00 = v_src00 * v_src00; + v_src01 = v_src01 * v_src01; + v_src02 = v_src02 * v_src02; + v_src03 = v_src03 * v_src03; + v_src10 = v_src10 * v_src10; + v_src11 = v_src11 * v_src11; + v_src12 = v_src12 * v_src12; + v_src13 = v_src13 * v_src13; + v_src20 = v_src20 * v_src20; + v_src21 = v_src21 * v_src21; + v_src22 = v_src22 * v_src22; + v_src23 = v_src23 * v_src23; + + v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03; + v_float64x2 v_dst10, v_dst11, v_dst12, v_dst13; + v_float64x2 v_dst20, v_dst21, v_dst22, v_dst23; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 2)* cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + 4)* cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + 6)* cn, v_dst03, v_dst13, v_dst23); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); + v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); + } + } + } + + accSqr_general_(src, dst, mask, len, cn, x); +} + +void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 4; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float32x4 v_src = v_load(src + x); + v_float64x2 v_src0 = v_cvt_f64(v_src); + v_float64x2 v_src1 = v_cvt_f64_high(v_src); + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + + v_store(dst + x, v_load(dst + x) + v_src0); + v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + } + } + else + { + v_uint32x4 v_0 = v_setzero_u32(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 v_mask, dummy; + v_expand(stub, v_mask, dummy); + v_mask = ~(v_mask == v_0); + v_float32x4 v_src = v_load(src + x); + v_src = v_src & v_reinterpret_as_f32(v_mask); + v_float64x2 v_src0 = v_cvt_f64(v_src); + v_float64x2 v_src1 = v_cvt_f64_high(v_src); + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + + v_store(dst + x, v_load(dst + x) + v_src0); + v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 
v_mask, dummy; + v_expand(stub, v_mask, dummy); + v_mask = ~(v_mask == v_0); + + v_float32x4 v_src0, v_src1, v_src2; + v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); + v_src0 = v_src0 & v_reinterpret_as_f32(v_mask); + v_src1 = v_src1 & v_reinterpret_as_f32(v_mask); + v_src2 = v_src2 & v_reinterpret_as_f32(v_mask); + + v_float64x2 v_src00 = v_cvt_f64(v_src0); + v_float64x2 v_src01 = v_cvt_f64_high(v_src0); + v_float64x2 v_src10 = v_cvt_f64(v_src1); + v_float64x2 v_src11 = v_cvt_f64_high(v_src1); + v_float64x2 v_src20 = v_cvt_f64(v_src2); + v_float64x2 v_src21 = v_cvt_f64_high(v_src2); + v_src00 = v_src00 * v_src00; + v_src01 = v_src01 * v_src01; + v_src10 = v_src10 * v_src10; + v_src11 = v_src11 * v_src11; + v_src20 = v_src20 * v_src20; + v_src21 = v_src21 * v_src21; + + v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } + } + + accSqr_general_(src, dst, mask, len, cn, x); +} + +void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 4; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float64x2 v_src0 = v_load(src + x); + v_float64x2 v_src1 = v_load(src + x + 2); + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + + v_store(dst + x, v_load(dst + x) + v_src0); + v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + } + } + else + { + v_uint64x2 v_0 = v_setzero_u64(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 stub0, stub1; + v_expand(stub, stub0, stub1); + v_uint64x2 v_masku640, v_masku641; + v_expand(stub0, v_masku640, v_masku641); + v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + v_float64x2 v_src0 = v_load(src + x); + v_float64x2 v_src1 = v_load(src + x + 2); + v_src0 = v_src0 & v_mask0; + v_src1 = v_src1 & v_mask1; + v_src0 = v_src0 * v_src0; + v_src1 = v_src1 * v_src1; + + v_store(dst + x, v_load(dst + x) + v_src0); + v_store(dst + x + 2, v_load(dst + x + 2) + v_src1); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 stub0, stub1; + v_expand(stub, stub0, stub1); + v_uint64x2 v_masku640, v_masku641; + v_expand(stub0, v_masku640, v_masku641); + v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + + v_float64x2 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); + v_load_deinterleave(src + (x + 2) * cn, v_src01, v_src11, v_src21); + v_src00 = v_src00 & v_mask0; + v_src01 = v_src01 & v_mask1; + v_src10 = v_src10 & v_mask0; + v_src11 = v_src11 & v_mask1; + v_src20 = v_src20 & v_mask0; + v_src21 = v_src21 & v_mask1; + v_src00 = v_src00 * v_src00; + v_src01 = v_src01 * v_src01; + v_src10 = v_src10 * v_src10; + v_src11 = v_src11 * v_src11; + v_src20 = v_src20 * v_src20; + v_src21 = v_src21 * v_src21; + + v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + 
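+ // Load the three double accumulator channels deinterleaved, add the masked squares
+ // (masked-out lanes were zeroed before squaring, so they contribute 0.0), and store the
+ // result back interleaved. Scalar equivalent (illustrative sketch only):
+ //   for (int c = 0; c < 3; c++)
+ //       if (mask[i]) dst[i*3 + c] += src[i*3 + c] * src[i*3 + c];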
v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } + } + + accSqr_general_(src, dst, mask, len, cn, x); +} +#else +void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn) +{ + accSqr_general_(src, dst, mask, len, cn, 0); +} + +void accSqr_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn) +{ + accSqr_general_(src, dst, mask, len, cn, 0); +} + +void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int cn) +{ + accSqr_general_(src, dst, mask, len, cn, 0); +} + +void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, int cn) +{ + accSqr_general_(src, dst, mask, len, cn, 0); +} +#endif + +// product accumulate optimized by universal intrinsic +void accProd_simd_(const uchar* src1, const uchar* src2, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 16; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_1src = v_load(src1 + x); + v_uint8x16 v_2src = v_load(src2 + x); + + v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1; + v_expand(v_1src, v_1src0, v_1src1); + v_expand(v_2src, v_2src0, v_2src1); + + v_uint16x8 v_src0, v_src1; + v_src0 = v_1src0 * v_2src0; + v_src1 = v_1src1 * v_2src1; + + v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + + v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + } + } + else + { + v_uint8x16 v_0 = v_setzero_u8(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_mask == v_0); + v_uint8x16 v_1src = v_load(src1 + x); + v_uint8x16 v_2src = v_load(src2 + x); + v_1src = v_1src & v_mask; + v_2src = v_2src & v_mask; + + v_uint16x8 v_1src0, v_1src1, v_2src0, v_2src1; + v_expand(v_1src, v_1src0, v_1src1); + v_expand(v_2src, v_2src0, v_2src1); + + v_uint16x8 v_src0, v_src1; + v_src0 = v_1src0 * v_2src0; + v_src1 = v_1src1 * v_2src1; + + v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + + v_store(dst + x, v_load(dst + x) + v_cvt_f32(v_reinterpret_as_s32(v_src00))); + v_store(dst + x + 4, v_load(dst + x + 4) + v_cvt_f32(v_reinterpret_as_s32(v_src01))); + v_store(dst + x + 8, v_load(dst + x + 8) + v_cvt_f32(v_reinterpret_as_s32(v_src10))); + v_store(dst + x + 12, v_load(dst + x + 12) + v_cvt_f32(v_reinterpret_as_s32(v_src11))); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_mask == v_0); + v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); + v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); + v_1src0 = v_1src0 & v_mask; + v_1src1 = v_1src1 & v_mask; + v_1src2 = v_1src2 & v_mask; + v_2src0 = v_2src0 & 
v_mask; + v_2src1 = v_2src1 & v_mask; + v_2src2 = v_2src2 & v_mask; + + v_uint16x8 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; + v_expand(v_1src0, v_1src00, v_1src01); + v_expand(v_1src1, v_1src10, v_1src11); + v_expand(v_1src2, v_1src20, v_1src21); + v_expand(v_2src0, v_2src00, v_2src01); + v_expand(v_2src1, v_2src10, v_2src11); + v_expand(v_2src2, v_2src20, v_2src21); + + v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_src00 = v_1src00 * v_2src00; + v_src01 = v_1src01 * v_2src01; + v_src10 = v_1src10 * v_2src10; + v_src11 = v_1src11 * v_2src11; + v_src20 = v_1src20 * v_2src20; + v_src21 = v_1src21 * v_2src21; + + v_uint32x4 v_src000, v_src001, v_src002, v_src003, v_src100, v_src101, v_src102, v_src103, v_src200, v_src201, v_src202, v_src203; + v_expand(v_src00, v_src000, v_src001); + v_expand(v_src01, v_src002, v_src003); + v_expand(v_src10, v_src100, v_src101); + v_expand(v_src11, v_src102, v_src103); + v_expand(v_src20, v_src200, v_src201); + v_expand(v_src21, v_src202, v_src203); + + v_float32x4 v_dst000, v_dst001, v_dst002, v_dst003, v_dst100, v_dst101, v_dst102, v_dst103, v_dst200, v_dst201, v_dst202, v_dst203; + v_load_deinterleave(dst + x * cn, v_dst000, v_dst100, v_dst200); + v_load_deinterleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201); + v_load_deinterleave(dst + (x + 8) * cn, v_dst002, v_dst102, v_dst202); + v_load_deinterleave(dst + (x + 12) * cn, v_dst003, v_dst103, v_dst203); + v_dst000 = v_dst000 + v_cvt_f32(v_reinterpret_as_s32(v_src000)); + v_dst001 = v_dst001 + v_cvt_f32(v_reinterpret_as_s32(v_src001)); + v_dst002 = v_dst002 + v_cvt_f32(v_reinterpret_as_s32(v_src002)); + v_dst003 = v_dst003 + v_cvt_f32(v_reinterpret_as_s32(v_src003)); + v_dst100 = v_dst100 + v_cvt_f32(v_reinterpret_as_s32(v_src100)); + v_dst101 = v_dst101 + v_cvt_f32(v_reinterpret_as_s32(v_src101)); + v_dst102 = v_dst102 + v_cvt_f32(v_reinterpret_as_s32(v_src102)); + v_dst103 = v_dst103 + v_cvt_f32(v_reinterpret_as_s32(v_src103)); + v_dst200 = v_dst200 + v_cvt_f32(v_reinterpret_as_s32(v_src200)); + v_dst201 = v_dst201 + v_cvt_f32(v_reinterpret_as_s32(v_src201)); + v_dst202 = v_dst202 + v_cvt_f32(v_reinterpret_as_s32(v_src202)); + v_dst203 = v_dst203 + v_cvt_f32(v_reinterpret_as_s32(v_src203)); + + v_store_interleave(dst + x * cn, v_dst000, v_dst100, v_dst200); + v_store_interleave(dst + (x + 4) * cn, v_dst001, v_dst101, v_dst201); + v_store_interleave(dst + (x + 8) * cn, v_dst002, v_dst102, v_dst202); + v_store_interleave(dst + (x + 12) * cn, v_dst003, v_dst103, v_dst203); + } + } + } + + accProd_general_(src1, src2, dst, mask, len, cn, x); +} + +void accProd_simd_(const ushort* src1, const ushort* src2, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_1src = v_load(src1 + x); + v_uint16x8 v_2src = v_load(src2 + x); + + v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1; + v_expand(v_1src, v_1src0, v_1src1); + v_expand(v_2src, v_2src0, v_2src1); + + v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); + v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); + v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); + v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1)); + + v_float32x4 v_src0 = v_1float0 * v_2float0; + v_float32x4 v_src1 = v_1float1 * v_2float1; + + v_store(dst + x, v_load(dst + x) + 
v_src0); + v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + } + } + else + { + v_uint16x8 v_0 = v_setzero_u16(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_mask = v_load_expand(mask + x); + v_mask = ~(v_0 == v_mask); + + v_uint16x8 v_1src = v_load(src1 + x) & v_mask; + v_uint16x8 v_2src = v_load(src2 + x) & v_mask; + + v_uint32x4 v_1src0, v_1src1, v_2src0, v_2src1; + v_expand(v_1src, v_1src0, v_1src1); + v_expand(v_2src, v_2src0, v_2src1); + + v_float32x4 v_1float0 = v_cvt_f32(v_reinterpret_as_s32(v_1src0)); + v_float32x4 v_1float1 = v_cvt_f32(v_reinterpret_as_s32(v_1src1)); + v_float32x4 v_2float0 = v_cvt_f32(v_reinterpret_as_s32(v_2src0)); + v_float32x4 v_2float1 = v_cvt_f32(v_reinterpret_as_s32(v_2src1)); + + v_float32x4 v_src0 = v_1float0 * v_2float0; + v_float32x4 v_src1 = v_1float1 * v_2float1; + + v_store(dst + x, v_load(dst + x) + v_src0); + v_store(dst + x + 4, v_load(dst + x + 4) + v_src1); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_mask = v_load_expand(mask + x); + v_mask = ~(v_0 == v_mask); + + v_uint16x8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); + v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); + v_1src0 = v_1src0 & v_mask; + v_1src1 = v_1src1 & v_mask; + v_1src2 = v_1src2 & v_mask; + v_2src0 = v_2src0 & v_mask; + v_2src1 = v_2src1 & v_mask; + v_2src2 = v_2src2 & v_mask; + + v_uint32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21, v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; + v_expand(v_1src0, v_1src00, v_1src01); + v_expand(v_1src1, v_1src10, v_1src11); + v_expand(v_1src2, v_1src20, v_1src21); + v_expand(v_2src0, v_2src00, v_2src01); + v_expand(v_2src1, v_2src10, v_2src11); + v_expand(v_2src2, v_2src20, v_2src21); + + v_float32x4 v_1float00 = v_cvt_f32(v_reinterpret_as_s32(v_1src00)); + v_float32x4 v_1float01 = v_cvt_f32(v_reinterpret_as_s32(v_1src01)); + v_float32x4 v_1float10 = v_cvt_f32(v_reinterpret_as_s32(v_1src10)); + v_float32x4 v_1float11 = v_cvt_f32(v_reinterpret_as_s32(v_1src11)); + v_float32x4 v_1float20 = v_cvt_f32(v_reinterpret_as_s32(v_1src20)); + v_float32x4 v_1float21 = v_cvt_f32(v_reinterpret_as_s32(v_1src21)); + v_float32x4 v_2float00 = v_cvt_f32(v_reinterpret_as_s32(v_2src00)); + v_float32x4 v_2float01 = v_cvt_f32(v_reinterpret_as_s32(v_2src01)); + v_float32x4 v_2float10 = v_cvt_f32(v_reinterpret_as_s32(v_2src10)); + v_float32x4 v_2float11 = v_cvt_f32(v_reinterpret_as_s32(v_2src11)); + v_float32x4 v_2float20 = v_cvt_f32(v_reinterpret_as_s32(v_2src20)); + v_float32x4 v_2float21 = v_cvt_f32(v_reinterpret_as_s32(v_2src21)); + + v_float32x4 v_src00 = v_1float00 * v_2float00; + v_float32x4 v_src01 = v_1float01 * v_2float01; + v_float32x4 v_src10 = v_1float10 * v_2float10; + v_float32x4 v_src11 = v_1float11 * v_2float11; + v_float32x4 v_src20 = v_1float20 * v_2float20; + v_float32x4 v_src21 = v_1float21 * v_2float21; + + v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 4) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } + } + + accProd_general_(src1, src2, dst, mask, len, cn, x); +} + +void accProd_simd_(const float* src1, const float* 
src2, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_store(dst + x, v_load(dst + x) + v_load(src1 + x) * v_load(src2 + x)); + v_store(dst + x + 4, v_load(dst + x + 4) + v_load(src1 + x + 4) * v_load(src2 + x + 4)); + } + } + else + { + v_uint32x4 v_0 = v_setzero_u32(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 stub0, stub1; + v_expand(stub, stub0, stub1); + v_float32x4 v_mask0 = v_reinterpret_as_f32(~(stub0 == v_0)); + v_float32x4 v_mask1 = v_reinterpret_as_f32(~(stub1 == v_0)); + + v_store(dst + x, v_load(dst + x) + ((v_load(src1 + x) * v_load(src2 + x)) & v_mask0)); + v_store(dst + x + 4, v_load(dst + x + 4) + ((v_load(src1 + x + 4) * v_load(src2 + x + 4)) & v_mask1)); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 stub0, stub1; + v_expand(stub, stub0, stub1); + v_float32x4 v_mask0 = v_reinterpret_as_f32(~(stub0 == v_0)); + v_float32x4 v_mask1 = v_reinterpret_as_f32(~(stub1 == v_0)); + + v_float32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; + v_float32x4 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; + v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20); + v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); + v_load_deinterleave(src1 + (x + 4) * cn, v_1src01, v_1src11, v_1src21); + v_load_deinterleave(src2 + (x + 4) * cn, v_2src01, v_2src11, v_2src21); + + v_float32x4 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 4) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + ((v_1src00 * v_2src00) & v_mask0), v_dst10 + ((v_1src10 * v_2src10) & v_mask0), v_dst20 + ((v_1src20 * v_2src20) & v_mask0)); + v_store_interleave(dst + (x + 4) * cn, v_dst01 + ((v_1src01 * v_2src01) & v_mask1), v_dst11 + ((v_1src11 * v_2src11) & v_mask1), v_dst21 + ((v_1src21 * v_2src21) & v_mask1)); + } + } + } + + accProd_general_(src1, src2, dst, mask, len, cn, x); +} +#if CV_SIMD128_64F +void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_1src = v_load(src1 + x); + v_uint8x16 v_2src = v_load(src2 + x); + + v_uint16x8 v_1int, v_2int, dummy; + v_expand(v_1src, v_1int, dummy); + v_expand(v_2src, v_2int, dummy); + + v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1int, v_1int_0, v_1int_1); + v_expand(v_2int, v_2int_0, v_2int_1); + + v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); + v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); + v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); + v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); + v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); + v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = 
v_load(dst + x + 6); + + v_dst0 += v_src0; + v_dst1 += v_src1; + v_dst2 += v_src2; + v_dst3 += v_src3; + + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + } + } + else + { + v_uint8x16 v_0 = v_setzero_u8(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_mask == v_0); + v_uint8x16 v_1src = v_load(src1 + x) & v_mask; + v_uint8x16 v_2src = v_load(src2 + x) & v_mask; + + v_uint16x8 v_1int, v_2int, dummy; + v_expand(v_1src, v_1int, dummy); + v_expand(v_2src, v_2int, dummy); + + v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1int, v_1int_0, v_1int_1); + v_expand(v_2int, v_2int_0, v_2int_1); + + v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); + v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); + v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); + v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); + v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); + v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + + v_dst0 += v_src0; + v_dst1 += v_src1; + v_dst2 += v_src2; + v_dst3 += v_src3; + + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_mask = v_load(mask + x); + v_mask = ~(v_mask == v_0); + v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); + v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); + v_1src0 = v_1src0 & v_mask; + v_1src1 = v_1src1 & v_mask; + v_1src2 = v_1src2 & v_mask; + v_2src0 = v_2src0 & v_mask; + v_2src1 = v_2src1 & v_mask; + v_2src2 = v_2src2 & v_mask; + + v_uint16x8 v_1int0, v_1int1, v_1int2, v_2int0, v_2int1, v_2int2, dummy; + v_expand(v_1src0, v_1int0, dummy); + v_expand(v_1src1, v_1int1, dummy); + v_expand(v_1src2, v_1int2, dummy); + v_expand(v_2src0, v_2int0, dummy); + v_expand(v_2src1, v_2int1, dummy); + v_expand(v_2src2, v_2int2, dummy); + + v_uint32x4 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21; + v_uint32x4 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21; + v_expand(v_1int0, v_1int00, v_1int01); + v_expand(v_1int1, v_1int10, v_1int11); + v_expand(v_1int2, v_1int20, v_1int21); + v_expand(v_2int0, v_2int00, v_2int01); + v_expand(v_2int1, v_2int10, v_2int11); + v_expand(v_2int2, v_2int20, v_2int21); + + v_float64x2 v_src00 = v_cvt_f64(v_reinterpret_as_s32(v_1int00)) * v_cvt_f64(v_reinterpret_as_s32(v_2int00)); + v_float64x2 v_src01 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int00)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int00)); + v_float64x2 v_src02 = v_cvt_f64(v_reinterpret_as_s32(v_1int01)) * v_cvt_f64(v_reinterpret_as_s32(v_2int01)); + v_float64x2 v_src03 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int01)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int01)); + v_float64x2 v_src10 = v_cvt_f64(v_reinterpret_as_s32(v_1int10)) * v_cvt_f64(v_reinterpret_as_s32(v_2int10)); + v_float64x2 v_src11 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int10)) * 
v_cvt_f64_high(v_reinterpret_as_s32(v_2int10)); + v_float64x2 v_src12 = v_cvt_f64(v_reinterpret_as_s32(v_1int11)) * v_cvt_f64(v_reinterpret_as_s32(v_2int11)); + v_float64x2 v_src13 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int11)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int11)); + v_float64x2 v_src20 = v_cvt_f64(v_reinterpret_as_s32(v_1int20)) * v_cvt_f64(v_reinterpret_as_s32(v_2int20)); + v_float64x2 v_src21 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int20)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int20)); + v_float64x2 v_src22 = v_cvt_f64(v_reinterpret_as_s32(v_1int21)) * v_cvt_f64(v_reinterpret_as_s32(v_2int21)); + v_float64x2 v_src23 = v_cvt_f64_high(v_reinterpret_as_s32(v_1int21)) * v_cvt_f64_high(v_reinterpret_as_s32(v_2int21)); + + v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13, v_dst20, v_dst21, v_dst22, v_dst23; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); + v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); + } + } + } + + accProd_general_(src1, src2, dst, mask, len, cn, x); +} + +void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_1src = v_load(src1 + x); + v_uint16x8 v_2src = v_load(src2 + x); + + v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1src, v_1int_0, v_1int_1); + v_expand(v_2src, v_2int_0, v_2int_1); + + v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); + v_int32x4 v_1int1 = v_reinterpret_as_s32(v_1int_1); + v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); + v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); + v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); + v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + + v_dst0 = v_dst0 + v_src0; + v_dst1 = v_dst1 + v_src1; + v_dst2 = v_dst2 + v_src2; + v_dst3 = v_dst3 + v_src3; + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + } + } + else + { + v_uint16x8 v_0 = v_setzero_u16(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_mask = v_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_uint16x8 v_1src = v_load(src1 + x); + v_uint16x8 v_2src = v_load(src2 + x); + v_1src = v_1src & v_mask; + v_2src = v_2src & v_mask; + + v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; + v_expand(v_1src, v_1int_0, v_1int_1); + v_expand(v_2src, v_2int_0, v_2int_1); + + v_int32x4 v_1int0 = v_reinterpret_as_s32(v_1int_0); + v_int32x4 v_1int1 = 
v_reinterpret_as_s32(v_1int_1); + v_int32x4 v_2int0 = v_reinterpret_as_s32(v_2int_0); + v_int32x4 v_2int1 = v_reinterpret_as_s32(v_2int_1); + + v_float64x2 v_src0 = v_cvt_f64(v_1int0) * v_cvt_f64(v_2int0); + v_float64x2 v_src1 = v_cvt_f64_high(v_1int0) * v_cvt_f64_high(v_2int0); + v_float64x2 v_src2 = v_cvt_f64(v_1int1) * v_cvt_f64(v_2int1); + v_float64x2 v_src3 = v_cvt_f64_high(v_1int1) * v_cvt_f64_high(v_2int1); + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + + v_dst0 = v_dst0 + v_src0; + v_dst1 = v_dst1 + v_src1; + v_dst2 = v_dst2 + v_src2; + v_dst3 = v_dst3 + v_src3; + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_mask = v_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_uint16x8 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); + v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); + v_1src0 = v_1src0 & v_mask; + v_1src1 = v_1src1 & v_mask; + v_1src2 = v_1src2 & v_mask; + v_2src0 = v_2src0 & v_mask; + v_2src1 = v_2src1 & v_mask; + v_2src2 = v_2src2 & v_mask; + + v_uint32x4 v_1int_00, v_1int_01, v_2int_00, v_2int_01; + v_uint32x4 v_1int_10, v_1int_11, v_2int_10, v_2int_11; + v_uint32x4 v_1int_20, v_1int_21, v_2int_20, v_2int_21; + v_expand(v_1src0, v_1int_00, v_1int_01); + v_expand(v_1src1, v_1int_10, v_1int_11); + v_expand(v_1src2, v_1int_20, v_1int_21); + v_expand(v_2src0, v_2int_00, v_2int_01); + v_expand(v_2src1, v_2int_10, v_2int_11); + v_expand(v_2src2, v_2int_20, v_2int_21); + + v_int32x4 v_1int00 = v_reinterpret_as_s32(v_1int_00); + v_int32x4 v_1int01 = v_reinterpret_as_s32(v_1int_01); + v_int32x4 v_1int10 = v_reinterpret_as_s32(v_1int_10); + v_int32x4 v_1int11 = v_reinterpret_as_s32(v_1int_11); + v_int32x4 v_1int20 = v_reinterpret_as_s32(v_1int_20); + v_int32x4 v_1int21 = v_reinterpret_as_s32(v_1int_21); + v_int32x4 v_2int00 = v_reinterpret_as_s32(v_2int_00); + v_int32x4 v_2int01 = v_reinterpret_as_s32(v_2int_01); + v_int32x4 v_2int10 = v_reinterpret_as_s32(v_2int_10); + v_int32x4 v_2int11 = v_reinterpret_as_s32(v_2int_11); + v_int32x4 v_2int20 = v_reinterpret_as_s32(v_2int_20); + v_int32x4 v_2int21 = v_reinterpret_as_s32(v_2int_21); + + v_float64x2 v_src00 = v_cvt_f64(v_1int00) * v_cvt_f64(v_2int00); + v_float64x2 v_src01 = v_cvt_f64_high(v_1int00) * v_cvt_f64_high(v_2int00); + v_float64x2 v_src02 = v_cvt_f64(v_1int01) * v_cvt_f64(v_2int01); + v_float64x2 v_src03 = v_cvt_f64_high(v_1int01) * v_cvt_f64_high(v_2int01); + v_float64x2 v_src10 = v_cvt_f64(v_1int10) * v_cvt_f64(v_2int10); + v_float64x2 v_src11 = v_cvt_f64_high(v_1int10) * v_cvt_f64_high(v_2int10); + v_float64x2 v_src12 = v_cvt_f64(v_1int11) * v_cvt_f64(v_2int11); + v_float64x2 v_src13 = v_cvt_f64_high(v_1int11) * v_cvt_f64_high(v_2int11); + v_float64x2 v_src20 = v_cvt_f64(v_1int20) * v_cvt_f64(v_2int20); + v_float64x2 v_src21 = v_cvt_f64_high(v_1int20) * v_cvt_f64_high(v_2int20); + v_float64x2 v_src22 = v_cvt_f64(v_1int21) * v_cvt_f64(v_2int21); + v_float64x2 v_src23 = v_cvt_f64_high(v_1int21) * v_cvt_f64_high(v_2int21); + + v_float64x2 v_dst00, v_dst01, v_dst02, v_dst03; + v_float64x2 v_dst10, v_dst11, v_dst12, v_dst13; + v_float64x2 v_dst20, v_dst21, v_dst22, v_dst23; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, 
v_dst20); + v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + v_load_deinterleave(dst + (x + 4) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + 6) * cn, v_dst03, v_dst13, v_dst23); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + v_store_interleave(dst + (x + 4) * cn, v_dst02 + v_src02, v_dst12 + v_src12, v_dst22 + v_src22); + v_store_interleave(dst + (x + 6) * cn, v_dst03 + v_src03, v_dst13 + v_src13, v_dst23 + v_src23); + } + } + } + + accProd_general_(src1, src2, dst, mask, len, cn, x); +} + +void accProd_simd_(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 4; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float32x4 v_1src = v_load(src1 + x); + v_float32x4 v_2src = v_load(src2 + x); + + v_float64x2 v_1src0 = v_cvt_f64(v_1src); + v_float64x2 v_1src1 = v_cvt_f64_high(v_1src); + v_float64x2 v_2src0 = v_cvt_f64(v_2src); + v_float64x2 v_2src1 = v_cvt_f64_high(v_2src); + + v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0)); + v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1)); + } + } + else + { + v_uint32x4 v_0 = v_setzero_u32(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 v_mask, dummy; + v_expand(stub, v_mask, dummy); + v_mask = ~(v_mask == v_0); + v_float32x4 v_1src = v_load(src1 + x); + v_float32x4 v_2src = v_load(src2 + x); + v_1src = v_1src & v_reinterpret_as_f32(v_mask); + v_2src = v_2src & v_reinterpret_as_f32(v_mask); + + v_float64x2 v_1src0 = v_cvt_f64(v_1src); + v_float64x2 v_1src1 = v_cvt_f64_high(v_1src); + v_float64x2 v_2src0 = v_cvt_f64(v_2src); + v_float64x2 v_2src1 = v_cvt_f64_high(v_2src); + + v_store(dst + x, v_load(dst + x) + (v_1src0 * v_2src0)); + v_store(dst + x + 2, v_load(dst + x + 2) + (v_1src1 * v_2src1)); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 v_mask, dummy; + v_expand(stub, v_mask, dummy); + v_mask = ~(v_mask == v_0); + v_float32x4 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; + v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); + v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); + v_1src0 = v_1src0 & v_reinterpret_as_f32(v_mask); + v_1src1 = v_1src1 & v_reinterpret_as_f32(v_mask); + v_1src2 = v_1src2 & v_reinterpret_as_f32(v_mask); + v_2src0 = v_2src0 & v_reinterpret_as_f32(v_mask); + v_2src1 = v_2src1 & v_reinterpret_as_f32(v_mask); + v_2src2 = v_2src2 & v_reinterpret_as_f32(v_mask); + + v_float64x2 v_src00 = v_cvt_f64(v_1src0) * v_cvt_f64(v_2src0); + v_float64x2 v_src01 = v_cvt_f64_high(v_1src0) * v_cvt_f64_high(v_2src0); + v_float64x2 v_src10 = v_cvt_f64(v_1src1) * v_cvt_f64(v_2src1); + v_float64x2 v_src11 = v_cvt_f64_high(v_1src1) * v_cvt_f64_high(v_2src1); + v_float64x2 v_src20 = v_cvt_f64(v_1src2) * v_cvt_f64(v_2src2); + v_float64x2 v_src21 = v_cvt_f64_high(v_1src2) * v_cvt_f64_high(v_2src2); + + v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + 
v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } + } + + accProd_general_(src1, src2, dst, mask, len, cn, x); +} + +void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 4; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float64x2 v_src00 = v_load(src1 + x); + v_float64x2 v_src01 = v_load(src1 + x + 2); + v_float64x2 v_src10 = v_load(src2 + x); + v_float64x2 v_src11 = v_load(src2 + x + 2); + + v_store(dst + x, v_load(dst + x) + (v_src00 * v_src10)); + v_store(dst + x + 2, v_load(dst + x + 2) + (v_src01 * v_src11)); + } + } + else + { + v_uint64x2 v_0 = v_setzero_u64(); + if (cn == 1) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 stub0, stub1; + v_expand(stub, stub0, stub1); + v_uint64x2 v_masku640, v_masku641; + v_expand(stub0, v_masku640, v_masku641); + v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + + v_float64x2 v_src00 = v_load(src1 + x); + v_float64x2 v_src01 = v_load(src1 + x + 2); + v_float64x2 v_src10 = v_load(src2 + x); + v_float64x2 v_src11 = v_load(src2 + x + 2); + + v_store(dst + x, v_load(dst + x) + ((v_src00 * v_src10) & v_mask0)); + v_store(dst + x + 2, v_load(dst + x + 2) + ((v_src01 * v_src11) & v_mask1)); + } + } + else if (cn == 3) + { + for (; x <= len - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 stub = v_load_expand(mask + x); + v_uint32x4 stub0, stub1; + v_expand(stub, stub0, stub1); + v_uint64x2 v_masku640, v_masku641; + v_expand(stub0, v_masku640, v_masku641); + v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); + v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); + + v_float64x2 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; + v_float64x2 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; + v_load_deinterleave(src1 + x * cn, v_1src00, v_1src10, v_1src20); + v_load_deinterleave(src1 + (x + 2) * cn, v_1src01, v_1src11, v_1src21); + v_load_deinterleave(src2 + x * cn, v_2src00, v_2src10, v_2src20); + v_load_deinterleave(src2 + (x + 2) * cn, v_2src01, v_2src11, v_2src21); + v_float64x2 v_src00 = (v_1src00 & v_mask0) * v_2src00; + v_float64x2 v_src01 = (v_1src01 & v_mask1) * v_2src01; + v_float64x2 v_src10 = (v_1src10 & v_mask0) * v_2src10; + v_float64x2 v_src11 = (v_1src11 & v_mask1) * v_2src11; + v_float64x2 v_src20 = (v_1src20 & v_mask0) * v_2src20; + v_float64x2 v_src21 = (v_1src21 & v_mask1) * v_2src21; + + v_float64x2 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn, v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + 2) * cn, v_dst01, v_dst11, v_dst21); + + v_store_interleave(dst + x * cn, v_dst00 + v_src00, v_dst10 + v_src10, v_dst20 + v_src20); + v_store_interleave(dst + (x + 2) * cn, v_dst01 + v_src01, v_dst11 + v_src11, v_dst21 + v_src21); + } + } + } + + accProd_general_(src1, src2, dst, mask, len, cn, x); +} +#else +void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const uchar* mask, int len, int cn) +{ + accProd_general_(src1, src2, dst, mask, len, cn, 0); +} + +void accProd_simd_(const ushort* src1, const ushort* src2, double* dst, const uchar* mask, int len, int cn) +{ + accProd_general_(src1, src2, dst, mask, len, cn, 0); +} + +void accProd_simd_(const float* src1, 
const float* src2, double* dst, const uchar* mask, int len, int cn) +{ + accProd_general_(src1, src2, dst, mask, len, cn, 0); +} + +void accProd_simd_(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) +{ + accProd_general_(src1, src2, dst, mask, len, cn, 0); +} +#endif + +// running weight accumulate optimized by universal intrinsic +void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; + const v_float32x4 v_alpha = v_setall_f32((float)alpha); + const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha)); + const int cVectorWidth = 16; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_src = v_load(src + x); + + v_uint16x8 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + + v_uint32x4 v_src00, v_src01, v_src10, v_src11; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + + v_float32x4 v_dst00 = v_load(dst + x); + v_float32x4 v_dst01 = v_load(dst + x + 4); + v_float32x4 v_dst10 = v_load(dst + x + 8); + v_float32x4 v_dst11 = v_load(dst + x + 12); + + v_dst00 = (v_dst00 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha); + v_dst01 = (v_dst01 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha); + v_dst10 = (v_dst10 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha); + v_dst11 = (v_dst11 * v_beta) + (v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha); + + v_store(dst + x, v_dst00); + v_store(dst + x + 4, v_dst01); + v_store(dst + x + 8, v_dst10); + v_store(dst + x + 12, v_dst11); + } + } + + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; + const v_float32x4 v_alpha = v_setall_f32((float)alpha); + const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha)); + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_int0, v_int1; + v_expand(v_src, v_int0, v_int1); + + v_float32x4 v_src0 = v_cvt_f32(v_reinterpret_as_s32(v_int0)); + v_float32x4 v_src1 = v_cvt_f32(v_reinterpret_as_s32(v_int1)); + v_src0 = v_src0 * v_alpha; + v_src1 = v_src1 * v_alpha; + + v_float32x4 v_dst0 = v_load(dst + x) * v_beta; + v_float32x4 v_dst1 = v_load(dst + x + 4) * v_beta; + + v_store(dst + x, v_dst0 + v_src0); + v_store(dst + x + 4, v_dst1 + v_src1); + } + } + + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; + const v_float32x4 v_alpha = v_setall_f32((float)alpha); + const v_float32x4 v_beta = v_setall_f32((float)(1.0f - alpha)); + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_load(src + x) * v_alpha))); + v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_load(src + x + 4) * v_alpha))); + } + } + + accW_general_(src, dst, mask, len, cn, alpha, x); +} +#if CV_SIMD128_64F +void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; + const v_float64x2 v_alpha = v_setall_f64(alpha); + const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - 
cVectorWidth; x += cVectorWidth) + { + v_uint8x16 v_src = v_load(src + x); + v_uint16x8 v_int, dummy; + v_expand(v_src, v_int, dummy); + + v_uint32x4 v_int_0, v_int_1; + v_expand(v_int, v_int_0, v_int_1); + + v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); + v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); + + v_float64x2 v_src0 = v_cvt_f64(v_int0); + v_float64x2 v_src1 = v_cvt_f64_high(v_int0); + v_float64x2 v_src2 = v_cvt_f64(v_int1); + v_float64x2 v_src3 = v_cvt_f64_high(v_int1); + + v_float64x2 v_dst0 = v_load(dst + x); + v_float64x2 v_dst1 = v_load(dst + x + 2); + v_float64x2 v_dst2 = v_load(dst + x + 4); + v_float64x2 v_dst3 = v_load(dst + x + 6); + + v_dst0 = (v_dst0 * v_beta) + (v_src0 * v_alpha); + v_dst1 = (v_dst1 * v_beta) + (v_src1 * v_alpha); + v_dst2 = (v_dst2 * v_beta) + (v_src2 * v_alpha); + v_dst3 = (v_dst3 * v_beta) + (v_src3 * v_alpha); + + v_store(dst + x, v_dst0); + v_store(dst + x + 2, v_dst1); + v_store(dst + x + 4, v_dst2); + v_store(dst + x + 6, v_dst3); + } + } + + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; + const v_float64x2 v_alpha = v_setall_f64(alpha); + const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16x8 v_src = v_load(src + x); + v_uint32x4 v_int_0, v_int_1; + v_expand(v_src, v_int_0, v_int_1); + + v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); + v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); + + v_float64x2 v_src00 = v_cvt_f64(v_int0); + v_float64x2 v_src01 = v_cvt_f64_high(v_int0); + v_float64x2 v_src10 = v_cvt_f64(v_int1); + v_float64x2 v_src11 = v_cvt_f64_high(v_int1); + + v_float64x2 v_dst00 = v_load(dst + x); + v_float64x2 v_dst01 = v_load(dst + x + 2); + v_float64x2 v_dst10 = v_load(dst + x + 4); + v_float64x2 v_dst11 = v_load(dst + x + 6); + + v_dst00 = (v_dst00 * v_beta) + (v_src00 * v_alpha); + v_dst01 = (v_dst01 * v_beta) + (v_src01 * v_alpha); + v_dst10 = (v_dst10 * v_beta) + (v_src10 * v_alpha); + v_dst11 = (v_dst11 * v_beta) + (v_src11 * v_alpha); + + v_store(dst + x, v_dst00); + v_store(dst + x + 2, v_dst01); + v_store(dst + x + 4, v_dst10); + v_store(dst + x + 6, v_dst11); + } + } + + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; + const v_float64x2 v_alpha = v_setall_f64(alpha); + const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float32x4 v_src0 = v_load(src + x); + v_float32x4 v_src1 = v_load(src + x + 4); + v_float64x2 v_src00 = v_cvt_f64(v_src0); + v_float64x2 v_src01 = v_cvt_f64_high(v_src0); + v_float64x2 v_src10 = v_cvt_f64(v_src1); + v_float64x2 v_src11 = v_cvt_f64_high(v_src1); + + v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src00 * v_alpha))); + v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src01 * v_alpha))); + v_store(dst + x + 4, ((v_load(dst + x + 4) * v_beta) + (v_src10 * v_alpha))); + v_store(dst + x + 6, ((v_load(dst + x + 6) * v_beta) + (v_src11 * v_alpha))); + } + } + + accW_general_(src, dst, mask, len, cn, alpha, x); +} + +void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + int x = 0; + const v_float64x2 v_alpha = 
v_setall_f64(alpha); + const v_float64x2 v_beta = v_setall_f64(1.0f - alpha); + const int cVectorWidth = 4; + + if (!mask) + { + int size = len * cn; + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_float64x2 v_src0 = v_load(src + x); + v_float64x2 v_src1 = v_load(src + x + 2); + + v_store(dst + x, ((v_load(dst + x) * v_beta) + (v_src0 * v_alpha))); + v_store(dst + x + 2, ((v_load(dst + x + 2) * v_beta) + (v_src1 * v_alpha))); + } + } + + accW_general_(src, dst, mask, len, cn, alpha, x); +} +#else +void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + accW_general_(src, dst, mask, len, cn, alpha, 0); +} + +void accW_simd_(const ushort* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + accW_general_(src, dst, mask, len, cn, alpha, 0); +} + +void accW_simd_(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + accW_general_(src, dst, mask, len, cn, alpha, 0); +} + +void accW_simd_(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha) +{ + accW_general_(src, dst, mask, len, cn, alpha, 0); +} +#endif // CV_SIMD128_64F +#endif // CV_SIMD128 +#if CV_AVX +// accumulate optimized by AVX +void acc_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256 v_dst = _mm256_loadu_ps(dst + x); + v_dst = _mm256_add_ps(v_src, v_dst); + _mm256_storeu_ps(dst + x, v_dst); + } + acc_general_(src, dst, mask, len, cn, x); + } + else + { + acc_simd_(src, dst, mask, len, cn); + } +} + +void acc_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 0)); + __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src, 1)); + __m256d v_dst0 = _mm256_loadu_pd(dst + x); + __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); + v_dst0 = _mm256_add_pd(v_src0, v_dst0); + v_dst1 = _mm256_add_pd(v_src1, v_dst1); + _mm256_storeu_pd(dst + x, v_dst0); + _mm256_storeu_pd(dst + x + 4, v_dst1); + } + acc_general_(src, dst, mask, len, cn, x); + } + else + { + acc_simd_(src, dst, mask, len, cn); + } +} + +void acc_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 4; + + if (!mask) + { + int size = len * cn; + for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + { + __m256d v_src = _mm256_loadu_pd(src + x); + __m256d v_dst = _mm256_loadu_pd(dst + x); + v_dst = _mm256_add_pd(v_dst, v_src); + _mm256_storeu_pd(dst + x, v_dst); + } + acc_general_(src, dst, mask, len, cn, x); + } + else + { + acc_simd_(src, dst, mask, len, cn); + } +} + +// square accumulate optimized by avx +void accSqr_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256 v_dst = _mm256_loadu_ps(dst + x); + v_src = _mm256_mul_ps(v_src, v_src); + v_dst = _mm256_add_ps(v_src, v_dst); + _mm256_storeu_ps(dst + x, v_dst); + } + accSqr_general_(src, dst, mask, len, cn, 
x); + } + else + { + accSqr_simd_(src, dst, mask, len, cn); + } +} + +void accSqr_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + { + __m256 v_src = _mm256_loadu_ps(src + x); + __m256d v_src0 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,0)); + __m256d v_src1 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src,1)); + __m256d v_dst0 = _mm256_loadu_pd(dst + x); + __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); + v_src0 = _mm256_mul_pd(v_src0, v_src0); + v_src1 = _mm256_mul_pd(v_src1, v_src1); + v_dst0 = _mm256_add_pd(v_src0, v_dst0); + v_dst1 = _mm256_add_pd(v_src1, v_dst1); + _mm256_storeu_pd(dst + x, v_dst0); + _mm256_storeu_pd(dst + x + 4, v_dst1); + } + accSqr_general_(src, dst, mask, len, cn, x); + } + else + { + accSqr_simd_(src, dst, mask, len, cn); + } +} + +void accSqr_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 4; + + if (!mask) + { + int size = len * cn; + for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + { + __m256d v_src = _mm256_loadu_pd(src + x); + __m256d v_dst = _mm256_loadu_pd(dst + x); + v_src = _mm256_mul_pd(v_src, v_src); + v_dst = _mm256_add_pd(v_dst, v_src); + _mm256_storeu_pd(dst + x, v_dst); + } + accSqr_general_(src, dst, mask, len, cn, x); + } + else + { + accSqr_simd_(src, dst, mask, len, cn); + } +} + +// product accumulate optimized by avx +void accProd_avx_32f(const float* src1, const float* src2, float* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + { + __m256 v_src0 = _mm256_loadu_ps(src1 + x); + __m256 v_src1 = _mm256_loadu_ps(src2 + x); + __m256 v_dst = _mm256_loadu_ps(dst + x); + __m256 v_src = _mm256_mul_ps(v_src0, v_src1); + v_dst = _mm256_add_ps(v_src, v_dst); + _mm256_storeu_ps(dst + x, v_dst); + } + accProd_general_(src1, src2, dst, mask, len, cn, x); + } + else + { + accProd_simd_(src1, src2, dst, mask, len, cn); + } +} + +void accProd_avx_32f64f(const float* src1, const float* src2, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 8; + + if (!mask) + { + int size = len * cn; + for ( ; x <= size - cVectorWidth ; x += cVectorWidth) + { + __m256 v_1src = _mm256_loadu_ps(src1 + x); + __m256 v_2src = _mm256_loadu_ps(src2 + x); + __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,0)); + __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_1src,1)); + __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,0)); + __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_2src,1)); + __m256d v_dst0 = _mm256_loadu_pd(dst + x); + __m256d v_dst1 = _mm256_loadu_pd(dst + x + 4); + __m256d v_src0 = _mm256_mul_pd(v_src00, v_src10); + __m256d v_src1 = _mm256_mul_pd(v_src01, v_src11); + v_dst0 = _mm256_add_pd(v_src0, v_dst0); + v_dst1 = _mm256_add_pd(v_src1, v_dst1); + _mm256_storeu_pd(dst + x, v_dst0); + _mm256_storeu_pd(dst + x + 4, v_dst1); + } + accProd_general_(src1, src2, dst, mask, len, cn, x); + } + else + { + accProd_simd_(src1, src2, dst, mask, len, cn); + } +} + +void accProd_avx_64f(const double* src1, const double* src2, double* dst, const uchar* mask, int len, int cn) +{ + int x = 0; + const int cVectorWidth = 4; + + if (!mask) + { + int size = len * cn; + for ( ; x <= size - cVectorWidth ; x += 
cVectorWidth)
+        {
+            __m256d v_src0 = _mm256_loadu_pd(src1 + x);
+            __m256d v_src1 = _mm256_loadu_pd(src2 + x);
+            __m256d v_dst = _mm256_loadu_pd(dst + x);
+            v_src0 = _mm256_mul_pd(v_src0, v_src1);
+            v_dst = _mm256_add_pd(v_dst, v_src0);
+            _mm256_storeu_pd(dst + x, v_dst);
+        }
+        accProd_general_(src1, src2, dst, mask, len, cn, x);
+    }
+    else
+    {
+        accProd_simd_(src1, src2, dst, mask, len, cn);
+    }
+}
+
+// running weight accumulate optimized by avx
+void accW_avx_32f(const float* src, float* dst, const uchar* mask, int len, int cn, double alpha)
+{
+    int x = 0;
+    const __m256 v_alpha = _mm256_set1_ps((float)alpha);
+    const __m256 v_beta = _mm256_set1_ps((float)(1.0f - alpha));
+    const int cVectorWidth = 16;
+
+    if (!mask)
+    {
+        int size = len * cn;
+        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
+        {
+            _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x), v_alpha)));
+            _mm256_storeu_ps(dst + x + 8, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + x + 8), v_beta), _mm256_mul_ps(_mm256_loadu_ps(src + x + 8), v_alpha)));
+        }
+        accW_general_(src, dst, mask, len, cn, alpha, x);
+    }
+    else
+    {
+        accW_simd_(src, dst, mask, len, cn, alpha);
+    }
+
+}
+
+void accW_avx_32f64f(const float* src, double* dst, const uchar* mask, int len, int cn, double alpha)
+{
+    int x = 0;
+    const __m256d v_alpha = _mm256_set1_pd(alpha);
+    const __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
+    const int cVectorWidth = 16;
+
+    if (!mask)
+    {
+        int size = len * cn;
+        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
+        {
+            __m256 v_src0 = _mm256_loadu_ps(src + x);
+            __m256 v_src1 = _mm256_loadu_ps(src + x + 8);
+            __m256d v_src00 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,0));
+            __m256d v_src01 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src0,1));
+            __m256d v_src10 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,0));
+            __m256d v_src11 = _mm256_cvtps_pd(_mm256_extractf128_ps(v_src1,1));
+
+            _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src00, v_alpha)));
+            _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src01, v_alpha)));
+            _mm256_storeu_pd(dst + x + 8, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 8), v_beta), _mm256_mul_pd(v_src10, v_alpha)));
+            _mm256_storeu_pd(dst + x + 12, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 12), v_beta), _mm256_mul_pd(v_src11, v_alpha)));
+        }
+        accW_general_(src, dst, mask, len, cn, alpha, x);
+    }
+    else
+    {
+        accW_simd_(src, dst, mask, len, cn, alpha);
+    }
+}
+
+void accW_avx_64f(const double* src, double* dst, const uchar* mask, int len, int cn, double alpha)
+{
+    int x = 0;
+    const __m256d v_alpha = _mm256_set1_pd(alpha);
+    const __m256d v_beta = _mm256_set1_pd(1.0f - alpha);
+    const int cVectorWidth = 8;
+
+    if (!mask)
+    {
+        int size = len * cn;
+        for ( ; x <= size - cVectorWidth ; x += cVectorWidth)
+        {
+            __m256d v_src0 = _mm256_loadu_pd(src + x);
+            __m256d v_src1 = _mm256_loadu_pd(src + x + 4);
+
+            _mm256_storeu_pd(dst + x, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x), v_beta), _mm256_mul_pd(v_src0, v_alpha)));
+            _mm256_storeu_pd(dst + x + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_loadu_pd(dst + x + 4), v_beta), _mm256_mul_pd(v_src1, v_alpha)));
+        }
+        accW_general_(src, dst, mask, len, cn, alpha, x);
+    }
+    else
+    {
+        accW_simd_(src, dst, mask, len, cn, alpha);
+    }
+}
+#endif
+#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+
+} // namespace cv
+
+/* End of file. */
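All of the masked universal-intrinsic paths above share one idiom: widen the 8-bit mask to the lane width, compare it against zero and bit-invert the result to get an all-ones/all-zeros lane mask, then AND that mask into the source lanes so that masked-out pixels contribute exactly zero to the unconditional vector add. A minimal sketch of that idiom for the single-channel ushort to float product-accumulate case follows; the function name and the scalar tail are illustrative stand-ins (the tail is what accProd_general_ is assumed to handle for the leftover pixels), not code taken from the patch.

#include "opencv2/core/hal/intrin.hpp"

#if CV_SIMD128
// Sketch: dst[i] += (float)src1[i] * src2[i] wherever mask[i] != 0 (cn == 1).
static void accProdMaskedSketch(const ushort* src1, const ushort* src2,
                                float* dst, const uchar* mask, int len)
{
    using namespace cv;
    int x = 0;
    const v_uint16x8 v_zero = v_setzero_u16();
    for (; x <= len - 8; x += 8)
    {
        // 0xFFFF in lanes where mask != 0, 0x0000 elsewhere
        v_uint16x8 v_mask = ~(v_load_expand(mask + x) == v_zero);

        // zero out masked-off pixels before widening and multiplying
        v_uint16x8 v_1src = v_load(src1 + x) & v_mask;
        v_uint16x8 v_2src = v_load(src2 + x) & v_mask;

        v_uint32x4 a0, a1, b0, b1;
        v_expand(v_1src, a0, a1);
        v_expand(v_2src, b0, b1);

        v_float32x4 p0 = v_cvt_f32(v_reinterpret_as_s32(a0)) * v_cvt_f32(v_reinterpret_as_s32(b0));
        v_float32x4 p1 = v_cvt_f32(v_reinterpret_as_s32(a1)) * v_cvt_f32(v_reinterpret_as_s32(b1));

        v_store(dst + x,     v_load(dst + x)     + p0);
        v_store(dst + x + 4, v_load(dst + x + 4) + p1);
    }
    for (; x < len; x++)   // scalar tail (assumed equivalent of accProd_general_)
        if (mask[x])
            dst[x] += (float)src1[x] * (float)src2[x];
}
#endif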
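The accW_* family computes the exponential running average dst = (1 - alpha)*dst + alpha*src. Unlike acc/accSqr/accProd, a masked-out pixel must leave dst completely untouched rather than merely add zero, which is why only the unmasked case is vectorized in accW_simd_ and every masked call falls through to accW_general_. A scalar sketch of the assumed reference behaviour (accW_general_ itself is not part of this hunk, so the helper below is illustrative only, assuming the usual OpenCV uchar typedef):

// Reference semantics only: mask is indexed per pixel, data has cn channels per pixel.
static void accWRefSketch(const float* src, float* dst, const uchar* mask,
                          int len, int cn, double alpha)
{
    const float a = (float)alpha, b = (float)(1.0 - alpha);
    for (int i = 0; i < len; i++)
        if (!mask || mask[i])
            for (int c = 0; c < cn; c++)
                dst[i*cn + c] = dst[i*cn + c] * b + src[i*cn + c] * a;
}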
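On how these symbols are consumed: the #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY guard and CV_CPU_OPTIMIZATION_NAMESPACE_END above belong to OpenCV's per-file CPU dispatch framework. The .simd.hpp translation unit is compiled once per requested ISA into its own namespace, and thin wrappers in the dispatching .cpp pick the best compiled variant at run time via CV_CPU_DISPATCH. Those wrappers are outside this hunk; a rough sketch of one is below, with the wrapper name accProd32f being an assumption rather than something taken from the patch.

// Dispatcher-side sketch (in the plain .cpp, after including the generated
// *.simd_declarations.hpp): CV_CPU_DISPATCH selects the best available build
// of accProd_simd_ (baseline, SSE2, AVX, NEON, ...) at run time.
namespace cv {

static void accProd32f(const float* src1, const float* src2, float* dst,
                       const uchar* mask, int len, int cn)
{
    CV_CPU_DISPATCH(accProd_simd_, (src1, src2, dst, mask, len, cn),
                    CV_CPU_DISPATCH_MODES_ALL);
}

} // namespace cv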