From 43d925609613f272bb5924d4e948e509c56ce2e2 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov
Date: Wed, 5 Sep 2018 16:17:43 +0300
Subject: [PATCH] Replaced core module calls to universal intrinsics with wide universal intrinsics

---
 modules/core/src/copy.cpp      |  30 +++---
 modules/core/src/lapack.cpp    |  89 ++++++++--------
 modules/core/src/mathfuncs.cpp | 121 +++++++++++-----------
 modules/core/src/matmul.cpp    | 180 +++++++++++++++------------------
 4 files changed, 204 insertions(+), 216 deletions(-)

diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 38264cc58f..98ab15d4ca 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -90,20 +90,21 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
     const uchar* src = (const uchar*)_src;
     uchar* dst = (uchar*)_dst;
     int x = 0;
-    #if CV_SIMD128
+    #if CV_SIMD
     {
-        v_uint8x16 v_zero = v_setzero_u8();
+        v_uint8 v_zero = vx_setzero_u8();
 
-        for( ; x <= size.width - 16; x += 16 )
+        for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
         {
-            v_uint8x16 v_src = v_load(src + x),
-                       v_dst = v_load(dst + x),
-                       v_nmask = v_load(mask + x) == v_zero;
+            v_uint8 v_src = vx_load(src + x),
+                    v_dst = vx_load(dst + x),
+                    v_nmask = vx_load(mask + x) == v_zero;
 
             v_dst = v_select(v_nmask, v_dst, v_src);
             v_store(dst + x, v_dst);
         }
     }
+    vx_cleanup();
     #endif
     for( ; x < size.width; x++ )
         if( mask[x] )
@@ -121,25 +122,26 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
     const ushort* src = (const ushort*)_src;
     ushort* dst = (ushort*)_dst;
     int x = 0;
-    #if CV_SIMD128
+    #if CV_SIMD
     {
-        v_uint8x16 v_zero = v_setzero_u8();
+        v_uint8 v_zero = vx_setzero_u8();
 
-        for( ; x <= size.width - 16; x += 16 )
+        for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
         {
-            v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + 8),
-                       v_dst1 = v_load(dst + x), v_dst2 = v_load(dst + x + 8);
+            v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + v_uint16::nlanes),
+                     v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + v_uint16::nlanes);
 
-            v_uint8x16 v_nmask1, v_nmask2;
-            v_uint8x16 v_nmask = v_load(mask + x) == v_zero;
+            v_uint8 v_nmask1, v_nmask2;
+            v_uint8 v_nmask = vx_load(mask + x) == v_zero;
             v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
 
             v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
             v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
             v_store(dst + x, v_dst1);
-            v_store(dst + x + 8, v_dst2);
+            v_store(dst + x + v_uint16::nlanes, v_dst2);
         }
     }
+    vx_cleanup();
     #endif
     for( ; x < size.width; x++ )
         if( mask[x] )
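The two copy.cpp hunks above replace the hard-coded 16-byte stride and the fixed-width v_uint8x16/v_uint16x8 types with ::nlanes-based strides and the width-agnostic v_uint8/v_uint16 aliases. A minimal standalone sketch of the same select-by-mask idiom follows; it is illustrative only, assumes OpenCV's <opencv2/core/hal/intrin.hpp> with CV_SIMD enabled, and the function name is made up.

// Illustrative sketch only -- not part of the patch.
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static void copyWhereMask(const uchar* src, const uchar* mask, uchar* dst, int len)
{
    int x = 0;
#if CV_SIMD
    v_uint8 v_zero = vx_setzero_u8();
    // v_uint8::nlanes is 16, 32 or 64 depending on the widest ISA the build dispatches to
    for( ; x <= len - v_uint8::nlanes; x += v_uint8::nlanes )
    {
        v_uint8 v_src = vx_load(src + x),
                v_dst = vx_load(dst + x),
                v_nmask = vx_load(mask + x) == v_zero;      // all-ones lanes where mask is 0
        v_store(dst + x, v_select(v_nmask, v_dst, v_src));  // keep dst where mask is 0, else take src
    }
    vx_cleanup();
#endif
    for( ; x < len; x++ )   // scalar tail for the last (len % nlanes) elements
        if( mask[x] )
            dst[x] = src[x];
}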
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index 3fe2d77083..649f6baac5 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -277,40 +277,42 @@ template<typename T> struct VBLAS
     int givensx(T*, T*, int, T, T, T*, T*) const { return 0; }
 };
 
-#if CV_SIMD128
+#if CV_SIMD
 template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
 {
-    if( n < 8 )
+    if( n < 2*v_float32::nlanes )
         return 0;
     int k = 0;
-    v_float32x4 s0 = v_setzero_f32();
-    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
+    v_float32 s0 = vx_setzero_f32();
+    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
     {
-        v_float32x4 a0 = v_load(a + k);
-        v_float32x4 b0 = v_load(b + k);
+        v_float32 a0 = vx_load(a + k);
+        v_float32 b0 = vx_load(b + k);
 
         s0 += a0 * b0;
     }
     *result = v_reduce_sum(s0);
+    vx_cleanup();
     return k;
 }
 
 
 template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, float s) const
 {
-    if( n < 4 )
+    if( n < v_float32::nlanes)
        return 0;
     int k = 0;
-    v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
-    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
+    v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
+    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
     {
-        v_float32x4 a0 = v_load(a + k);
-        v_float32x4 b0 = v_load(b + k);
-        v_float32x4 t0 = (a0 * c4) + (b0 * s4);
-        v_float32x4 t1 = (b0 * c4) - (a0 * s4);
+        v_float32 a0 = vx_load(a + k);
+        v_float32 b0 = vx_load(b + k);
+        v_float32 t0 = (a0 * c4) + (b0 * s4);
+        v_float32 t1 = (b0 * c4) - (a0 * s4);
 
         v_store(a + k, t0);
         v_store(b + k, t1);
     }
+    vx_cleanup();
     return k;
 }
 
@@ -318,17 +320,17 @@ template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, f
 template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c, float s,
                                             float* anorm, float* bnorm) const
 {
-    if( n < 4 )
+    if( n < v_float32::nlanes)
         return 0;
     int k = 0;
-    v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
-    v_float32x4 sa = v_setzero_f32(), sb = v_setzero_f32();
-    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
+    v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
+    v_float32 sa = vx_setzero_f32(), sb = vx_setzero_f32();
+    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
     {
-        v_float32x4 a0 = v_load(a + k);
-        v_float32x4 b0 = v_load(b + k);
-        v_float32x4 t0 = (a0 * c4) + (b0 * s4);
-        v_float32x4 t1 = (b0 * c4) - (a0 * s4);
+        v_float32 a0 = vx_load(a + k);
+        v_float32 b0 = vx_load(b + k);
+        v_float32 t0 = (a0 * c4) + (b0 * s4);
+        v_float32 t1 = (b0 * c4) - (a0 * s4);
 
         v_store(a + k, t0);
         v_store(b + k, t1);
 
         sa += t0 + t0;
@@ -336,26 +338,28 @@ template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c,
     }
     *anorm = v_reduce_sum(sa);
     *bnorm = v_reduce_sum(sb);
+    vx_cleanup();
     return k;
 }
 
 
-#if CV_SIMD128_64F
+#if CV_SIMD_64F
 template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n, double* result) const
 {
-    if( n < 4 )
+    if( n < 2*v_float64::nlanes )
         return 0;
     int k = 0;
-    v_float64x2 s0 = v_setzero_f64();
-    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
+    v_float64 s0 = vx_setzero_f64();
+    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
     {
-        v_float64x2 a0 = v_load(a + k);
-        v_float64x2 b0 = v_load(b + k);
+        v_float64 a0 = vx_load(a + k);
+        v_float64 b0 = vx_load(b + k);
 
         s0 += a0 * b0;
     }
     double sbuf[2];
     v_store(sbuf, s0);
     *result = sbuf[0] + sbuf[1];
+    vx_cleanup();
     return k;
 }
 
@@ -363,16 +367,17 @@ template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n
 template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double c, double s) const
 {
     int k = 0;
-    v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
-    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
+    v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
+    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
     {
-        v_float64x2 a0 = v_load(a + k);
-        v_float64x2 b0 = v_load(b + k);
-        v_float64x2 t0 = (a0 * c2) + (b0 * s2);
-        v_float64x2 t1 = (b0 * c2) - (a0 * s2);
+        v_float64 a0 = vx_load(a + k);
+        v_float64 b0 = vx_load(b + k);
+        v_float64 t0 = (a0 * c2) + (b0 * s2);
+        v_float64 t1 = (b0 * c2) - (a0 * s2);
 
         v_store(a + k, t0);
         v_store(b + k, t1);
     }
+    vx_cleanup();
     return k;
 }
@@ -381,14 +386,14 @@ template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double
                                              double* anorm, double* bnorm) const
 {
     int k = 0;
-    v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
-    v_float64x2 sa = v_setzero_f64(), sb = v_setzero_f64();
-    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
+    v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
+    v_float64 sa = vx_setzero_f64(), sb = vx_setzero_f64();
+    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
     {
-        v_float64x2 a0 = v_load(a + k);
-        v_float64x2 b0 = v_load(b + k);
-        v_float64x2 t0 = (a0 * c2) + (b0 * s2);
-        v_float64x2 t1 = (b0 * c2) - (a0 * s2);
+        v_float64 a0 = vx_load(a + k);
+        v_float64 b0 = vx_load(b + k);
+        v_float64 t0 = (a0 * c2) + (b0 * s2);
+        v_float64 t1 = (b0 * c2) - (a0 * s2);
 
         v_store(a + k, t0);
         v_store(b + k, t1);
 
         sa += t0 * t0;
@@ -401,8 +406,8 @@ template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double
     *bnorm = bbuf[0] + bbuf[1];
     return k;
 }
-#endif //CV_SIMD128_64F
-#endif //CV_SIMD128
+#endif //CV_SIMD_64F
+#endif //CV_SIMD
 
 template<typename _Tp> void
 JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,
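Every VBLAS hunk above follows the same accumulate-then-reduce pattern: load, per-lane arithmetic, then a horizontal reduction once the loop is done. A self-contained sketch of that pattern (illustrative only, under the same intrin.hpp/CV_SIMD assumptions; the function name is not from the patch):

// Illustrative sketch only -- not part of the patch.
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static float dotSketch(const float* a, const float* b, int n)
{
    int k = 0;
    float r = 0.f;
#if CV_SIMD
    v_float32 s = vx_setzero_f32();
    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
        s = v_muladd(vx_load(a + k), vx_load(b + k), s);  // per-lane a*b accumulation
    r = v_reduce_sum(s);  // horizontal sum works for any lane count, so no fixed-size buffer is needed
    vx_cleanup();
#endif
    for( ; k < n; k++ )   // scalar tail
        r += a[k] * b[k];
    return r;
}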
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index e8067b5128..ba7f688e10 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -606,17 +606,15 @@ void polarToCart( InputArray src1, InputArray src2,
         {
             k = 0;
 
-#if CV_SIMD128
-            if( hasSIMD128() )
+#if CV_SIMD
+            int cWidth = v_float32::nlanes;
+            for( ; k <= len - cWidth; k += cWidth )
             {
-                int cWidth = v_float32x4::nlanes;
-                for( ; k <= len - cWidth; k += cWidth )
-                {
-                    v_float32x4 v_m = v_load(mag + k);
-                    v_store(x + k, v_load(x + k) * v_m);
-                    v_store(y + k, v_load(y + k) * v_m);
-                }
+                v_float32 v_m = vx_load(mag + k);
+                v_store(x + k, vx_load(x + k) * v_m);
+                v_store(y + k, vx_load(y + k) * v_m);
             }
+            vx_cleanup();
 #endif
 
             for( ; k < len; k++ )
@@ -735,7 +733,7 @@ struct iPow_SIMD
     }
 };
 
-#if CV_SIMD128
+#if CV_SIMD
 
 template <>
 struct iPow_SIMD<uchar, int>
@@ -743,13 +741,13 @@ struct iPow_SIMD<uchar, int>
     int operator() ( const uchar * src, uchar * dst, int len, int power )
     {
         int i = 0;
-        v_uint32x4 v_1 = v_setall_u32(1u);
+        v_uint32 v_1 = vx_setall_u32(1u);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
         {
-            v_uint32x4 v_a1 = v_1, v_a2 = v_1;
-            v_uint16x8 v = v_load_expand(src + i);
-            v_uint32x4 v_b1, v_b2;
+            v_uint32 v_a1 = v_1, v_a2 = v_1;
+            v_uint16 v = vx_load_expand(src + i);
+            v_uint32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);
 
             int p = power;
@@ -771,6 +769,7 @@ struct iPow_SIMD<uchar, int>
             v = v_pack(v_a1, v_a2);
             v_pack_store(dst + i, v);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -782,13 +781,13 @@ struct iPow_SIMD<schar, int>
     int operator() ( const schar * src, schar * dst, int len, int power)
     {
         int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
         {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int16x8 v = v_load_expand(src + i);
-            v_int32x4 v_b1, v_b2;
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int16 v = vx_load_expand(src + i);
+            v_int32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);
 
             int p = power;
@@ -810,6 +809,7 @@ struct iPow_SIMD<schar, int>
             v = v_pack(v_a1, v_a2);
             v_pack_store(dst + i, v);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -821,13 +821,13 @@ struct iPow_SIMD<ushort, int>
     int operator() ( const ushort * src, ushort * dst, int len, int power)
    {
         int i = 0;
-        v_uint32x4 v_1 = v_setall_u32(1u);
+        v_uint32 v_1 = vx_setall_u32(1u);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
         {
-            v_uint32x4 v_a1 = v_1, v_a2 = v_1;
-            v_uint16x8 v = v_load(src + i);
-            v_uint32x4 v_b1, v_b2;
+            v_uint32 v_a1 = v_1, v_a2 = v_1;
+            v_uint16 v = vx_load(src + i);
+            v_uint32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);
 
             int p = power;
@@ -849,6 +849,7 @@ struct iPow_SIMD<ushort, int>
             v = v_pack(v_a1, v_a2);
             v_store(dst + i, v);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -860,13 +861,13 @@ struct iPow_SIMD<short, int>
     int operator() ( const short * src, short * dst, int len, int power)
     {
         int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
         {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int16x8 v = v_load(src + i);
-            v_int32x4 v_b1, v_b2;
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int16 v = vx_load(src + i);
+            v_int32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);
 
             int p = power;
@@ -888,6 +889,7 @@ struct iPow_SIMD<short, int>
             v = v_pack(v_a1, v_a2);
             v_store(dst + i, v);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -899,12 +901,12 @@ struct iPow_SIMD<int, int>
     int operator() ( const int * src, int * dst, int len, int power)
     {
         int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int32::nlanes*2; i += v_int32::nlanes*2)
         {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_int32::nlanes);
 
             int p = power;
             while( p > 1 )
@@ -923,8 +925,9 @@ struct iPow_SIMD<int, int>
             v_a2 *= v_b2;
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + 4, v_a2);
+            v_store(dst + i + v_int32::nlanes, v_a2);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -936,12 +939,12 @@ struct iPow_SIMD<float, float>
     int operator() ( const float * src, float * dst, int len, int power)
     {
         int i = 0;
-        v_float32x4 v_1 = v_setall_f32(1.f);
+        v_float32 v_1 = vx_setall_f32(1.f);
 
-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_float32::nlanes*2; i += v_float32::nlanes*2)
        {
-            v_float32x4 v_a1 = v_1, v_a2 = v_1;
-            v_float32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
+            v_float32 v_a1 = v_1, v_a2 = v_1;
+            v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float32::nlanes);
             int p = std::abs(power);
             if( power < 0 )
             {
@@ -965,26 +968,27 @@ struct iPow_SIMD<float, float>
             v_a2 *= v_b2;
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + 4, v_a2);
+            v_store(dst + i + v_float32::nlanes, v_a2);
         }
+        vx_cleanup();
 
         return i;
     }
 };
 
-#if CV_SIMD128_64F
+#if CV_SIMD_64F
 template <>
 struct iPow_SIMD<double, double>
 {
     int operator() ( const double * src, double * dst, int len, int power)
     {
         int i = 0;
-        v_float64x2 v_1 = v_setall_f64(1.);
+        v_float64 v_1 = vx_setall_f64(1.);
 
-        for ( ; i <= len - 4; i += 4)
+        for ( ; i <= len - v_float64::nlanes*2; i += v_float64::nlanes*2)
         {
-            v_float64x2 v_a1 = v_1, v_a2 = v_1;
-            v_float64x2 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 2);
+            v_float64 v_a1 = v_1, v_a2 = v_1;
+            v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float64::nlanes);
             int p = std::abs(power);
             if( power < 0 )
             {
@@ -1008,8 +1012,9 @@ struct iPow_SIMD<double, double>
             v_a2 *= v_b2;
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + 2, v_a2);
+            v_store(dst + i + v_float64::nlanes, v_a2);
         }
+        vx_cleanup();
 
         return i;
     }
@@ -1594,9 +1599,9 @@ void patchNaNs( InputOutputArray _a, double _val )
     Cv32suf val;
     val.f = (float)_val;
 
-#if CV_SIMD128
-    v_int32x4 v_mask1 = v_setall_s32(0x7fffffff), v_mask2 = v_setall_s32(0x7f800000);
-    v_int32x4 v_val = v_setall_s32(val.i);
+#if CV_SIMD
+    v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000);
+    v_int32 v_val = vx_setall_s32(val.i);
 #endif
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -1604,18 +1609,16 @@ void patchNaNs( InputOutputArray _a, double _val )
         int* tptr = ptrs[0];
         size_t j = 0;
 
-#if CV_SIMD128
-        if( hasSIMD128() )
+#if CV_SIMD
+        size_t cWidth = (size_t)v_int32::nlanes;
+        for ( ; j + cWidth <= len; j += cWidth)
         {
-            size_t cWidth = (size_t)v_int32x4::nlanes;
-            for ( ; j + cWidth <= len; j += cWidth)
-            {
-                v_int32x4 v_src = v_load(tptr + j);
-                v_int32x4 v_cmp_mask = v_mask2 < (v_src & v_mask1);
-                v_int32x4 v_dst = v_select(v_cmp_mask, v_val, v_src);
-                v_store(tptr + j, v_dst);
-            }
+            v_int32 v_src = vx_load(tptr + j);
+            v_int32 v_cmp_mask = v_mask2 < (v_src & v_mask1);
+            v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src);
+            v_store(tptr + j, v_dst);
         }
+        vx_cleanup();
 #endif
 
         for( ; j < len; j++ )
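The iPow_SIMD hunks only touch the setup and store lines; the unchanged middle of each specialization is an exponentiation-by-squaring loop applied to two vector accumulators at a time. For reference, its scalar form looks like this (illustrative sketch only, not taken from the patch):

// Illustrative sketch only -- the scalar equivalent of the loop the
// iPow_SIMD specializations run on v_a1/v_a2 and v_b1/v_b2.
static int ipowScalar(int base, unsigned power)
{
    int a = 1, b = base;
    while( power > 1 )
    {
        if( power & 1 )
            a *= b;     // fold the lowest exponent bit into the result
        b *= b;         // square the base
        power >>= 1;
    }
    if( power == 1 )
        a *= b;         // multiply in the last remaining factor
    return a;           // power == 0 falls through and returns 1
}

The patchNaNs hunks keep the same bit trick as before: a float is a NaN exactly when its bits masked with 0x7fffffff exceed 0x7f800000, so only the vector types and the loop stride change.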
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp
index 7cd89c6222..4f85e06140 100644
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -2310,18 +2310,12 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
 {
     float alpha = *_alpha;
     int i = 0;
-#if CV_SIMD128
-    if (hasSIMD128())
-    {
-        v_float32x4 v_alpha = v_setall_f32(alpha);
-        const int cWidth = v_float32x4::nlanes;
-        for (; i <= len - cWidth; i += cWidth)
-        {
-            v_float32x4 v_src1 = v_load(src1 + i);
-            v_float32x4 v_src2 = v_load(src2 + i);
-            v_store(dst + i, (v_src1 * v_alpha) + v_src2);
-        }
-    }
+#if CV_SIMD
+    v_float32 v_alpha = vx_setall_f32(alpha);
+    const int cWidth = v_float32::nlanes;
+    for (; i <= len - cWidth; i += cWidth)
+        v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i)));
+    vx_cleanup();
 #endif
     for (; i < len; i++)
         dst[i] = src1[i] * alpha + src2[i];
@@ -2333,22 +2327,12 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
 {
     double alpha = *_alpha;
     int i = 0;
-#if CV_SIMD128_64F
-    if (hasSIMD128())
-    {
-        v_float64x2 a2 = v_setall_f64(alpha);
-        const int cWidth = v_float64x2::nlanes;
-        for (; i <= len - cWidth * 2; i += cWidth * 2)
-        {
-            v_float64x2 x0, x1, y0, y1, t0, t1;
-            x0 = v_load(src1 + i); x1 = v_load(src1 + i + cWidth);
-            y0 = v_load(src2 + i); y1 = v_load(src2 + i + cWidth);
-            t0 = x0 * a2 + y0;
-            t1 = x1 * a2 + y1;
-            v_store(dst + i, t0);
-            v_store(dst + i + cWidth, t1);
-        }
-    }
+#if CV_SIMD_64F
+    v_float64 a2 = vx_setall_f64(alpha);
+    const int cWidth = v_float64::nlanes;
+    for (; i <= len - cWidth; i += cWidth)
+        v_store(dst + i, v_muladd(vx_load(src1 + i), a2, vx_load(src2 + i)));
+    vx_cleanup();
 #endif
     for (; i < len; i++)
         dst[i] = src1[i] * alpha + src2[i];
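Both scaleAdd kernels above now derive their stride from ::nlanes instead of a literal 4 or 2 and fold the multiply and add into v_muladd, which the backend can lower to an FMA instruction where one exists. The lane counts the wide aliases resolve to can be checked with a snippet like the following (illustrative only; the counts in the comments assume SSE/NEON, AVX2 and AVX-512 builds respectively):

// Illustrative sketch only -- prints the lane counts the current build dispatches to.
#include <cstdio>
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

int main()
{
#if CV_SIMD
    std::printf("v_uint8::nlanes   = %d\n", (int)v_uint8::nlanes);   // 16 / 32 / 64
    std::printf("v_float32::nlanes = %d\n", (int)v_float32::nlanes); // 4 / 8 / 16
#if CV_SIMD_64F
    std::printf("v_float64::nlanes = %d\n", (int)v_float64::nlanes); // 2 / 4 / 8
#endif
#else
    std::printf("universal intrinsics are disabled in this build\n");
#endif
    return 0;
}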
@@ -3025,42 +3009,40 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
 #endif
     int i = 0;
 
-#if CV_SIMD128
-    if (hasSIMD128())
+#if CV_SIMD
+    int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 15), blockSize;
+
+    while (i < len0)
     {
-        int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
+        blockSize = std::min(len0 - i, blockSize0);
+        v_int32 v_sum = vx_setzero_s32();
+        const int cWidth = v_uint16::nlanes;
 
-        while (i < len0)
+        int j = 0;
+        for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
         {
-            blockSize = std::min(len0 - i, blockSize0);
-            v_int32x4 v_sum = v_setzero_s32();
-            const int cWidth = v_uint16x8::nlanes;
+            v_uint16 v_src10, v_src20, v_src11, v_src21;
+            v_expand(vx_load(src1 + j), v_src10, v_src11);
+            v_expand(vx_load(src2 + j), v_src20, v_src21);
 
-            int j = 0;
-            for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
-            {
-                v_uint16x8 v_src10, v_src20, v_src11, v_src21;
-                v_expand(v_load(src1 + j), v_src10, v_src11);
-                v_expand(v_load(src2 + j), v_src20, v_src21);
-
-                v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
-                v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
-            }
-
-            for (; j <= blockSize - cWidth; j += cWidth)
-            {
-                v_int16x8 v_src10 = v_reinterpret_as_s16(v_load_expand(src1 + j));
-                v_int16x8 v_src20 = v_reinterpret_as_s16(v_load_expand(src2 + j));
+            v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
+            v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
+        }
 
-                v_sum += v_dotprod(v_src10, v_src20);
-            }
-            r += (double)v_reduce_sum(v_sum);
+        for (; j <= blockSize - cWidth; j += cWidth)
+        {
+            v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
+            v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));
 
-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
+            v_sum += v_dotprod(v_src10, v_src20);
         }
+        r += (double)v_reduce_sum(v_sum);
+
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
     }
+    vx_cleanup();
 #elif CV_NEON
     if( cv::checkHardwareSupport(CV_CPU_NEON) )
     {
@@ -3113,42 +3095,40 @@ static double dotProd_8s(const schar* src1, const schar* src2, int len)
     double r = 0.0;
     int i = 0;
 
-#if CV_SIMD128
-    if (hasSIMD128())
+#if CV_SIMD
+    int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 14), blockSize;
+
+    while (i < len0)
     {
-        int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
+        blockSize = std::min(len0 - i, blockSize0);
+        v_int32 v_sum = vx_setzero_s32();
+        const int cWidth = v_int16::nlanes;
 
-        while (i < len0)
+        int j = 0;
+        for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
         {
-            blockSize = std::min(len0 - i, blockSize0);
-            v_int32x4 v_sum = v_setzero_s32();
-            const int cWidth = v_int16x8::nlanes;
+            v_int16 v_src10, v_src20, v_src11, v_src21;
+            v_expand(vx_load(src1 + j), v_src10, v_src11);
+            v_expand(vx_load(src2 + j), v_src20, v_src21);
 
-            int j = 0;
-            for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
-            {
-                v_int16x8 v_src10, v_src20, v_src11, v_src21;
-                v_expand(v_load(src1 + j), v_src10, v_src11);
-                v_expand(v_load(src2 + j), v_src20, v_src21);
-
-                v_sum += v_dotprod(v_src10, v_src20);
-                v_sum += v_dotprod(v_src11, v_src21);
-            }
-
-            for (; j <= blockSize - cWidth; j += cWidth)
-            {
-                v_int16x8 v_src10 = v_load_expand(src1 + j);
-                v_int16x8 v_src20 = v_load_expand(src2 + j);
+            v_sum += v_dotprod(v_src10, v_src20);
+            v_sum += v_dotprod(v_src11, v_src21);
+        }
 
-                v_sum += v_dotprod(v_src10, v_src20);
-            }
-            r += (double)v_reduce_sum(v_sum);
+        for (; j <= blockSize - cWidth; j += cWidth)
+        {
+            v_int16 v_src10 = vx_load_expand(src1 + j);
+            v_int16 v_src20 = vx_load_expand(src2 + j);
 
-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
+            v_sum += v_dotprod(v_src10, v_src20);
         }
+        r += (double)v_reduce_sum(v_sum);
+
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
     }
+    vx_cleanup();
 #elif CV_NEON
     if( cv::checkHardwareSupport(CV_CPU_NEON) )
     {
@@ -3232,28 +3212,26 @@ static double dotProd_32f(const float* src1, const float* src2, int len)
 #endif
     int i = 0;
 
-#if CV_SIMD128
-    if (hasSIMD128())
-    {
-        int len0 = len & -4, blockSize0 = (1 << 13), blockSize;
+#if CV_SIMD
+    int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize;
 
-        while (i < len0)
-        {
-            blockSize = std::min(len0 - i, blockSize0);
-            v_float32x4 v_sum = v_setzero_f32();
+    while (i < len0)
+    {
+        blockSize = std::min(len0 - i, blockSize0);
+        v_float32 v_sum = vx_setzero_f32();
 
-            int j = 0;
-            int cWidth = v_float32x4::nlanes;
-            for (; j <= blockSize - cWidth; j += cWidth)
-                v_sum = v_muladd(v_load(src1 + j), v_load(src2 + j), v_sum);
+        int j = 0;
+        int cWidth = v_float32::nlanes;
+        for (; j <= blockSize - cWidth; j += cWidth)
+            v_sum = v_muladd(vx_load(src1 + j), vx_load(src2 + j), v_sum);
 
-            r += v_reduce_sum(v_sum);
+        r += v_reduce_sum(v_sum);
 
-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
-        }
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
     }
+    vx_cleanup();
 #endif
     return r + dotProd_(src1, src2, len - i);
 }
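The dotProd_8u/8s/32f hunks keep the original blocking scheme: products are accumulated in 32-bit integer (or float) lanes for at most blockSize elements, then flushed into the double accumulator r, which bounds both integer overflow and float rounding error; only the types and strides become width-agnostic. A condensed sketch of one 8-bit block (illustrative only, same intrin.hpp/CV_SIMD assumptions; the function name is made up):

// Illustrative sketch only -- one block of an 8-bit dot product, mirroring the
// second loop of the patched dotProd_8u. Each 32-bit lane accumulates roughly
// len / v_int32::nlanes products of values <= 255*255, so a block length around
// 2^15 stays well below INT32_MAX.
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static int dot8uBlockSketch(const uchar* src1, const uchar* src2, int len)
{
    int j = 0, sum = 0;
#if CV_SIMD
    v_int32 v_sum = vx_setzero_s32();
    for (; j <= len - v_uint16::nlanes; j += v_uint16::nlanes)
    {
        // widen u8 -> u16, reinterpret as s16 (values stay <= 255), then
        // v_dotprod multiplies lane pairs and adds them into 32-bit lanes
        v_int16 a = v_reinterpret_as_s16(vx_load_expand(src1 + j));
        v_int16 b = v_reinterpret_as_s16(vx_load_expand(src2 + j));
        v_sum += v_dotprod(a, b);
    }
    sum = v_reduce_sum(v_sum);
    vx_cleanup();
#endif
    for (; j < len; j++)    // scalar tail
        sum += src1[j] * src2[j];
    return sum;
}

For the full-length arrays the real kernels split the input into such blocks and add each partial sum into a double, exactly as the patched loops do.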