diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index dab82489f8..6c28b44f5b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -924,6 +924,9 @@ inline scalartype v_reduce_sum(const _Tpvec& a) \ return (scalartype)v_get0(res); \ } OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, float, f64, VTraits::vlanes()) +#endif #define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \ inline scalartype v_reduce_##func(const _Tpvec& a) \ diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 5709ec12e4..c5e561e26e 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -1332,7 +1332,7 @@ struct InRange_SIMD } }; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template <> struct InRange_SIMD @@ -1341,7 +1341,7 @@ struct InRange_SIMD uchar * dst, int len) const { int x = 0; - const int width = v_uint8::nlanes; + const int width = VTraits::vlanes(); for (; x <= len - width; x += width) { @@ -1349,7 +1349,7 @@ struct InRange_SIMD v_uint8 low = vx_load(src2 + x); v_uint8 high = vx_load(src3 + x); - v_store(dst + x, (values >= low) & (high >= values)); + v_store(dst + x, v_and(v_ge(values, low), v_ge(high, values))); } vx_cleanup(); return x; @@ -1363,7 +1363,7 @@ struct InRange_SIMD uchar * dst, int len) const { int x = 0; - const int width = v_int8::nlanes; + const int width = VTraits::vlanes(); for (; x <= len - width; x += width) { @@ -1371,7 +1371,7 @@ struct InRange_SIMD v_int8 low = vx_load(src2 + x); v_int8 high = vx_load(src3 + x); - v_store((schar*)(dst + x), (values >= low) & (high >= values)); + v_store((schar*)(dst + x), v_and(v_ge(values, low), v_ge(high, values))); } vx_cleanup(); return x; @@ -1385,7 +1385,7 @@ struct InRange_SIMD uchar * dst, int len) const { int x = 0; - const int width = v_uint16::nlanes * 2; + const int width = VTraits::vlanes() * 2; for (; x <= len - width; x += width) { @@ -1393,11 +1393,11 @@ struct InRange_SIMD v_uint16 low1 = vx_load(src2 + x); v_uint16 high1 = vx_load(src3 + x); - v_uint16 values2 = vx_load(src1 + x + v_uint16::nlanes); - v_uint16 low2 = vx_load(src2 + x + v_uint16::nlanes); - v_uint16 high2 = vx_load(src3 + x + v_uint16::nlanes); + v_uint16 values2 = vx_load(src1 + x + VTraits::vlanes()); + v_uint16 low2 = vx_load(src2 + x + VTraits::vlanes()); + v_uint16 high2 = vx_load(src3 + x + VTraits::vlanes()); - v_store(dst + x, v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2))); + v_store(dst + x, v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2)))); } vx_cleanup(); return x; @@ -1411,7 +1411,7 @@ struct InRange_SIMD uchar * dst, int len) const { int x = 0; - const int width = (int)v_int16::nlanes * 2; + const int width = (int)VTraits::vlanes() * 2; for (; x <= len - width; x += width) { @@ -1419,11 +1419,11 @@ struct InRange_SIMD v_int16 low1 = vx_load(src2 + x); v_int16 high1 = vx_load(src3 + x); - v_int16 values2 = vx_load(src1 + x + v_int16::nlanes); - v_int16 low2 = vx_load(src2 + x + v_int16::nlanes); - v_int16 high2 = vx_load(src3 + x + v_int16::nlanes); + v_int16 values2 = vx_load(src1 + x + VTraits::vlanes()); + v_int16 low2 = vx_load(src2 + x + 
VTraits::vlanes()); + v_int16 high2 = vx_load(src3 + x + VTraits::vlanes()); - v_store((schar*)(dst + x), v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2))); + v_store((schar*)(dst + x), v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2)))); } vx_cleanup(); return x; @@ -1437,7 +1437,7 @@ struct InRange_SIMD uchar * dst, int len) const { int x = 0; - const int width = (int)v_int32::nlanes * 2; + const int width = (int)VTraits::vlanes() * 2; for (; x <= len - width; x += width) { @@ -1445,11 +1445,11 @@ struct InRange_SIMD v_int32 low1 = vx_load(src2 + x); v_int32 high1 = vx_load(src3 + x); - v_int32 values2 = vx_load(src1 + x + v_int32::nlanes); - v_int32 low2 = vx_load(src2 + x + v_int32::nlanes); - v_int32 high2 = vx_load(src3 + x + v_int32::nlanes); + v_int32 values2 = vx_load(src1 + x + VTraits::vlanes()); + v_int32 low2 = vx_load(src2 + x + VTraits::vlanes()); + v_int32 high2 = vx_load(src3 + x + VTraits::vlanes()); - v_pack_store(dst + x, v_reinterpret_as_u16(v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)))); + v_pack_store(dst + x, v_reinterpret_as_u16(v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2))))); } vx_cleanup(); return x; @@ -1463,7 +1463,7 @@ struct InRange_SIMD uchar * dst, int len) const { int x = 0; - const int width = (int)v_float32::nlanes * 2; + const int width = (int)VTraits::vlanes() * 2; for (; x <= len - width; x += width) { @@ -1471,12 +1471,12 @@ struct InRange_SIMD v_float32 low1 = vx_load(src2 + x); v_float32 high1 = vx_load(src3 + x); - v_float32 values2 = vx_load(src1 + x + v_float32::nlanes); - v_float32 low2 = vx_load(src2 + x + v_float32::nlanes); - v_float32 high2 = vx_load(src3 + x + v_float32::nlanes); + v_float32 values2 = vx_load(src1 + x + VTraits::vlanes()); + v_float32 low2 = vx_load(src2 + x + VTraits::vlanes()); + v_float32 high2 = vx_load(src3 + x + VTraits::vlanes()); - v_pack_store(dst + x, v_pack(v_reinterpret_as_u32(values1 >= low1) & v_reinterpret_as_u32(high1 >= values1), - v_reinterpret_as_u32(values2 >= low2) & v_reinterpret_as_u32(high2 >= values2))); + v_pack_store(dst + x, v_pack(v_and(v_reinterpret_as_u32(v_ge(values1, low1)), v_reinterpret_as_u32(v_ge(high1, values1))), + v_and(v_reinterpret_as_u32(v_ge(values2, low2)), v_reinterpret_as_u32(v_ge(high2, values2))))); } vx_cleanup(); return x; diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp index 06ebfb7678..1c97e91fbe 100644 --- a/modules/core/src/arithm.simd.hpp +++ b/modules/core/src/arithm.simd.hpp @@ -219,7 +219,7 @@ template struct op_add { static inline Tvec r(const Tvec& a, const Tvec& b) - { return a + b; } + { return v_add(a, b); } static inline T1 r(T1 a, T1 b) { return c_add(a, b); } }; @@ -229,7 +229,7 @@ template struct op_sub { static inline Tvec r(const Tvec& a, const Tvec& b) - { return a - b; } + { return v_sub(a, b); } static inline T1 r(T1 a, T1 b) { return c_sub(a, b); } }; @@ -266,7 +266,7 @@ struct op_absdiff template<> struct op_absdiff { -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE static inline v_int8 r(const v_int8& a, const v_int8& b) { return v_absdiffs(a, b); } #endif @@ -276,7 +276,7 @@ struct op_absdiff template<> struct op_absdiff { -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE static inline v_int16 r(const v_int16& a, const v_int16& b) { return v_absdiffs(a, b); } #endif @@ -286,7 +286,7 @@ struct op_absdiff template<> struct 
op_absdiff { -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE static inline v_int32 r(const v_int32& a, const v_int32& b) { return v_reinterpret_as_s32(v_absdiff(a, b)); } #endif @@ -299,7 +299,7 @@ template struct op_or { static inline Tvec r(const Tvec& a, const Tvec& b) - { return a | b; } + { return v_or(a, b); } static inline T1 r(T1 a, T1 b) { return a | b; } }; @@ -307,7 +307,7 @@ template struct op_xor { static inline Tvec r(const Tvec& a, const Tvec& b) - { return a ^ b; } + { return v_xor(a, b); } static inline T1 r(T1 a, T1 b) { return a ^ b; } }; @@ -315,7 +315,7 @@ template struct op_and { static inline Tvec r(const Tvec& a, const Tvec& b) - { return a & b; } + { return v_and(a, b); } static inline T1 r(T1 a, T1 b) { return a & b; } }; @@ -324,14 +324,14 @@ struct op_not { // ignored b from loader level static inline Tvec r(const Tvec& a) - { return ~a; } + { return v_not(a); } static inline T1 r(T1 a, T1) { return ~a; } }; //////////////////////////// Loaders ///////////////////////////////// -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE template< template class OP, typename T1, typename Tvec> struct bin_loader @@ -396,13 +396,13 @@ template class OP, typename T1, typename Tv static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height) { typedef OP op; -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE typedef bin_loader ldr; - enum {wide_step = Tvec::nlanes}; + const int wide_step = VTraits::vlanes(); #if !CV_NEON && CV_SIMD_WIDTH == 16 - enum {wide_step_l = wide_step * 2}; + const int wide_step_l = wide_step * 2; #else - enum {wide_step_l = wide_step}; + const int wide_step_l = wide_step; #endif #endif // CV_SIMD @@ -414,7 +414,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, { int x = 0; - #if CV_SIMD + #if CV_SIMD || CV_SIMD_SCALABLE #if !CV_NEON && !CV_MSA if (is_aligned(src1, src2, dst)) { @@ -587,7 +587,7 @@ template struct op_cmplt { static inline Tvec r(const Tvec& a, const Tvec& b) - { return a < b; } + { return v_lt(a, b); } static inline uchar r(T1 a, T1 b) { return (uchar)-(int)(a < b); } }; @@ -596,7 +596,7 @@ template struct op_cmple { static inline Tvec r(const Tvec& a, const Tvec& b) - { return a <= b; } + { return v_le(a, b); } static inline uchar r(T1 a, T1 b) { return (uchar)-(int)(a <= b); } }; @@ -605,7 +605,7 @@ template struct op_cmpeq { static inline Tvec r(const Tvec& a, const Tvec& b) - { return a == b; } + { return v_eq(a, b); } static inline uchar r(T1 a, T1 b) { return (uchar)-(int)(a == b); } }; @@ -614,14 +614,14 @@ template struct op_cmpne { static inline Tvec r(const Tvec& a, const Tvec& b) - { return a != b; } + { return v_ne(a, b); } static inline uchar r(T1 a, T1 b) { return (uchar)-(int)(a != b); } }; //////////////////////////// Loaders ///////////////////////////////// -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE // todo: add support for RW alignment & stream template class OP, typename T1, typename Tvec> struct cmp_loader_n @@ -646,10 +646,10 @@ template class OP, typename T1, typename Tv struct cmp_loader_n { typedef OP op; - enum {step = Tvec::nlanes}; static inline void l(const T1* src1, const T1* src2, uchar* dst) { + const int step = VTraits::vlanes(); Tvec c0 = op::r(vx_load(src1), vx_load(src2)); Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step)); v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1))); @@ -660,10 +660,10 @@ template class OP, typename T1, typename Tv struct cmp_loader_n { typedef OP op; - 
enum {step = Tvec::nlanes}; static inline void l(const T1* src1, const T1* src2, uchar* dst) { + const int step = VTraits::vlanes(); v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2))); v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step))); v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2))); @@ -676,10 +676,10 @@ template class OP, typename T1, typename Tv struct cmp_loader_n { typedef OP op; - enum {step = Tvec::nlanes}; static inline void l(const T1* src1, const T1* src2, uchar* dst) { + const int step = VTraits::vlanes(); v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2))); v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step))); v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2))); @@ -701,9 +701,9 @@ template class OP, typename T1, typename Tv static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) { typedef OP op; -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE typedef cmp_loader_n ldr; - enum {wide_step = Tvec::nlanes * sizeof(T1)}; + const int wide_step = VTraits::vlanes() * sizeof(T1); #endif // CV_SIMD step1 /= sizeof(T1); @@ -713,7 +713,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, { int x = 0; - #if CV_SIMD + #if CV_SIMD || CV_SIMD_SCALABLE for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, src2 + x, dst + x); @@ -880,7 +880,7 @@ DEFINE_SIMD_ALL(cmp) //////////////////////////// Loaders /////////////////////////////// -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE // todo: add support for RW alignment & stream template class OP, typename T1, typename T2, typename Tvec> struct scalar_loader_n @@ -1013,10 +1013,10 @@ template class OP, typename T2 struct scalar_loader_n { typedef OP op; - enum {step = v_int32::nlanes}; static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst) { + const int step = VTraits::vlanes(); v_int32 v_src1 = vx_load(src1); v_int32 v_src2 = vx_load(src2); v_int32 v_src1s = vx_load(src1 + step); @@ -1043,6 +1043,7 @@ struct scalar_loader_n static inline void l(const int* src1, const T2* scalar, int* dst) { + const int step = VTraits::vlanes(); v_int32 v_src1 = vx_load(src1); v_int32 v_src1s = vx_load(src1 + step); @@ -1068,10 +1069,9 @@ template class OP, typename T2 struct scalar_loader_n { typedef OP op; - enum {step = v_float32::nlanes}; - static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst) { + const int step = VTraits::vlanes(); v_float32 v_src1 = vx_load(src1); v_float32 v_src2 = vx_load(src2); v_float32 v_src1s = vx_load(src1 + step); @@ -1086,6 +1086,7 @@ struct scalar_loader_n static inline void l(const float* src1, const T2* scalar, float* dst) { + const int step = VTraits::vlanes(); v_float32 v_src1 = vx_load(src1); v_float32 v_src1s = vx_load(src1 + step); @@ -1262,10 +1263,10 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste T1* dst, size_t step, int width, int height, const T2* scalar) { typedef OP op; -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE typedef scalar_loader_n ldr; - const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 : - sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes; + const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits::vlanes() * 2 : + sizeof(T1) == sizeof(uchar) ? 
VTraits::vlanes() / 2 : VTraits::vlanes(); #endif // CV_SIMD step1 /= sizeof(T1); @@ -1276,7 +1277,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste { int x = 0; - #if CV_SIMD + #if CV_SIMD || CV_SIMD_SCALABLE for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, src2 + x, scalar, dst + x); @@ -1308,10 +1309,10 @@ template class OP, typename T1 static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar) { typedef OP op; -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE typedef scalar_loader_n ldr; - const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 : - sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes; + const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits::vlanes() * 2 : + sizeof(T1) == sizeof(uchar) ? VTraits::vlanes() / 2 : VTraits::vlanes(); #endif // CV_SIMD step1 /= sizeof(T1); @@ -1321,7 +1322,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int { int x = 0; - #if CV_SIMD + #if CV_SIMD || CV_SIMD_SCALABLE for (; x <= width - wide_step; x += wide_step) { ldr::l(src1 + x, scalar, dst + x); @@ -1428,7 +1429,7 @@ template struct op_mul { static inline Tvec r(const Tvec& a, const Tvec& b) - { return a * b; } + { return v_mul(a, b); } static inline T1 r(T1 a, T1 b) { return saturate_cast(a * b); } }; @@ -1436,11 +1437,11 @@ struct op_mul template struct op_mul_scale { -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); - return v_scalar * a * b; + return v_mul(v_scalar , a , b); } #endif static inline T1 r(T1 a, T1 b, const T2* scalar) @@ -1456,7 +1457,7 @@ struct op_mul_scale static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); - return v_scalar * a * b; + return v_mul(v_mul(v_scalar, a), b); } #endif static inline double r(double a, double b, const double* scalar) @@ -1569,7 +1570,7 @@ template struct op_div_f { static inline Tvec r(const Tvec& a, const Tvec& b) - { return a / b; } + { return v_div(a, b); } static inline T1 r(T1 a, T1 b) { return a / b; } }; @@ -1577,16 +1578,16 @@ struct op_div_f template struct op_div_scale { -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); - return a * v_scalar / b; + return v_div(v_mul(a, v_scalar), b); } static inline Tvec pre(const Tvec& denom, const Tvec& res) { - const Tvec v_zero = vx_setall(0); - return v_select(denom == v_zero, v_zero, res); + const Tvec v_zero = vx_setall::lane_type>(0); + return v_select(v_eq(denom, v_zero), v_zero, res); } #endif static inline T1 r(T1 a, T1 denom, const T2* scalar) @@ -1599,11 +1600,11 @@ struct op_div_scale template<> struct op_div_scale { -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); - return a * v_scalar / b; + return v_div(v_mul(a, v_scalar), b); } #endif static inline float r(float a, float denom, const float* scalar) @@ -1617,7 +1618,7 @@ struct op_div_scale static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); - return a * v_scalar / b; + return v_div(v_mul(a, 
v_scalar), b); } #endif static inline double r(double a, double denom, const double* scalar) @@ -1685,7 +1686,7 @@ DEFINE_SIMD_ALL(div, div_loop) template struct op_add_scale { -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_alpha = vx_setall_f32(*scalar); @@ -1718,7 +1719,7 @@ struct op_add_scale template struct op_add_weighted { -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars) { const v_float32 v_alpha = vx_setall_f32(scalars[0]); @@ -1835,16 +1836,16 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d) template struct op_recip { -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE static inline v_float32 r(const v_float32& a, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); - return v_scalar / a; + return v_div(v_scalar, a); } static inline Tvec pre(const Tvec& denom, const Tvec& res) { - const Tvec v_zero = vx_setall(0); - return v_select(denom == v_zero, v_zero, res); + const Tvec v_zero = vx_setall::lane_type>(0); + return v_select(v_eq(denom, v_zero), v_zero, res); } #endif static inline T1 r(T1 denom, const T2* scalar) @@ -1857,11 +1858,11 @@ struct op_recip template<> struct op_recip { -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE static inline v_float32 r(const v_float32& a, const float* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); - return v_scalar / a; + return v_div(v_scalar, a); } #endif static inline float r(float denom, const float* scalar) @@ -1875,7 +1876,7 @@ struct op_recip static inline v_float64 r(const v_float64& a, const double* scalar) { const v_float64 v_scalar = vx_setall_f64(*scalar); - return v_scalar / a; + return v_div(v_scalar, a); } #endif static inline double r(double denom, const double* scalar) diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp index 4b9ddbb413..c689276218 100644 --- a/modules/core/src/convert.hpp +++ b/modules/core/src/convert.hpp @@ -11,7 +11,7 @@ namespace cv { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) static inline void vx_load_as(const uchar* ptr, v_float32& a) { a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); } @@ -62,7 +62,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_uint16& a, v_uint16& b) } static inline void vx_load_pair_as(const ushort* ptr, v_uint16& a, v_uint16& b) -{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); } +{ a = vx_load(ptr); b = vx_load(ptr + VTraits::vlanes()); } static inline void vx_load_pair_as(const uchar* ptr, v_int16& a, v_int16& b) { @@ -76,7 +76,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_int16& a, v_int16& b) { v_expand(vx_load(ptr), a, b); } static inline void vx_load_pair_as(const short* ptr, v_int16& a, v_int16& b) -{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); } +{ a = vx_load(ptr); b = vx_load(ptr + VTraits::vlanes()); } static inline void vx_load_pair_as(const uchar* ptr, v_int32& a, v_int32& b) { @@ -105,7 +105,7 @@ static inline void vx_load_pair_as(const short* ptr, v_int32& a, v_int32& b) static inline void vx_load_pair_as(const int* ptr, v_int32& a, v_int32& b) { a = vx_load(ptr); - b = vx_load(ptr + v_int32::nlanes); + b = vx_load(ptr + VTraits::vlanes()); } static inline void vx_load_pair_as(const uchar* ptr, v_float32& a, v_float32& b) @@ -142,18 +142,18 @@ static inline void vx_load_pair_as(const short* ptr, v_float32& a, v_float32& b) static inline void vx_load_pair_as(const int* ptr, 
v_float32& a, v_float32& b) { - v_int32 ia = vx_load(ptr), ib = vx_load(ptr + v_int32::nlanes); + v_int32 ia = vx_load(ptr), ib = vx_load(ptr + VTraits::vlanes()); a = v_cvt_f32(ia); b = v_cvt_f32(ib); } static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b) -{ a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); } +{ a = vx_load(ptr); b = vx_load(ptr + VTraits::vlanes()); } static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b) { a = vx_load_expand(ptr); - b = vx_load_expand(ptr + v_float32::nlanes); + b = vx_load_expand(ptr + VTraits::vlanes()); } static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b) @@ -169,7 +169,7 @@ static inline void v_store_pair_as(schar* ptr, const v_uint16& a, const v_uint16 } static inline void v_store_pair_as(ushort* ptr, const v_uint16& a, const v_uint16& b) -{ v_store(ptr, a); v_store(ptr + v_uint16::nlanes, b); } +{ v_store(ptr, a); v_store(ptr + VTraits::vlanes(), b); } static inline void v_store_pair_as(uchar* ptr, const v_int16& a, const v_int16& b) { v_store(ptr, v_pack_u(a, b)); } @@ -178,7 +178,7 @@ static inline void v_store_pair_as(schar* ptr, const v_int16& a, const v_int16& { v_store(ptr, v_pack(a, b)); } static inline void v_store_pair_as(short* ptr, const v_int16& a, const v_int16& b) -{ v_store(ptr, a); v_store(ptr + v_int16::nlanes, b); } +{ v_store(ptr, a); v_store(ptr + VTraits::vlanes(), b); } static inline void v_store_pair_as(uchar* ptr, const v_int32& a, const v_int32& b) { v_pack_u_store(ptr, v_pack(a, b)); } @@ -195,7 +195,7 @@ static inline void v_store_pair_as(short* ptr, const v_int32& a, const v_int32& static inline void v_store_pair_as(int* ptr, const v_int32& a, const v_int32& b) { v_store(ptr, a); - v_store(ptr + v_int32::nlanes, b); + v_store(ptr + VTraits::vlanes(), b); } static inline void v_store_pair_as(uchar* ptr, const v_float32& a, const v_float32& b) @@ -214,24 +214,24 @@ static inline void v_store_pair_as(int* ptr, const v_float32& a, const v_float32 { v_int32 ia = v_round(a), ib = v_round(b); v_store(ptr, ia); - v_store(ptr + v_int32::nlanes, ib); + v_store(ptr + VTraits::vlanes(), ib); } static inline void v_store_pair_as(float* ptr, const v_float32& a, const v_float32& b) -{ v_store(ptr, a); v_store(ptr + v_float32::nlanes, b); } +{ v_store(ptr, a); v_store(ptr + VTraits::vlanes(), b); } -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) static inline void vx_load_as(const double* ptr, v_float32& a) { - v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits::vlanes()); a = v_cvt_f32(v0, v1); } static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b) { - v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); - v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3); + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits::vlanes()); + v_float64 v2 = vx_load(ptr + VTraits::vlanes()*2), v3 = vx_load(ptr + VTraits::vlanes()*3); v_int32 iv0 = v_round(v0), iv1 = v_round(v1); v_int32 iv2 = v_round(v2), iv3 = v_round(v3); a = v_combine_low(iv0, iv1); @@ -240,8 +240,8 @@ static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b) static inline void vx_load_pair_as(const double* ptr, v_float32& a, v_float32& b) { - v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes); - v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + 
v_float64::nlanes*3); + v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits::vlanes()); + v_float64 v2 = vx_load(ptr + VTraits::vlanes()*2), v3 = vx_load(ptr + VTraits::vlanes()*3); a = v_cvt_f32(v0, v1); b = v_cvt_f32(v2, v3); } @@ -291,7 +291,7 @@ static inline void vx_load_pair_as(const float* ptr, v_float64& a, v_float64& b) static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b) { a = vx_load(ptr); - b = vx_load(ptr + v_float64::nlanes); + b = vx_load(ptr + VTraits::vlanes()); } static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b) @@ -305,7 +305,7 @@ static inline void v_store_as(double* ptr, const v_float32& a) { v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a); v_store(ptr, fa0); - v_store(ptr + v_float64::nlanes, fa1); + v_store(ptr + VTraits::vlanes(), fa1); } static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32& b) @@ -314,9 +314,9 @@ static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32& v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b); v_store(ptr, fa0); - v_store(ptr + v_float64::nlanes, fa1); - v_store(ptr + v_float64::nlanes*2, fb0); - v_store(ptr + v_float64::nlanes*3, fb1); + v_store(ptr + VTraits::vlanes(), fa1); + v_store(ptr + VTraits::vlanes()*2, fb0); + v_store(ptr + VTraits::vlanes()*3, fb1); } static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_float32& b) @@ -325,15 +325,15 @@ static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_floa v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b); v_store(ptr, fa0); - v_store(ptr + v_float64::nlanes, fa1); - v_store(ptr + v_float64::nlanes*2, fb0); - v_store(ptr + v_float64::nlanes*3, fb1); + v_store(ptr + VTraits::vlanes(), fa1); + v_store(ptr + VTraits::vlanes()*2, fb0); + v_store(ptr + VTraits::vlanes()*3, fb1); } static inline void v_store_pair_as(double* ptr, const v_float64& a, const v_float64& b) { v_store(ptr, a); - v_store(ptr + v_float64::nlanes, b); + v_store(ptr + VTraits::vlanes(), b); } static inline void v_store_pair_as(int* ptr, const v_float64& a, const v_float64& b) diff --git a/modules/core/src/convert.simd.hpp b/modules/core/src/convert.simd.hpp index 5154041b6d..c126450a13 100644 --- a/modules/core/src/convert.simd.hpp +++ b/modules/core/src/convert.simd.hpp @@ -39,8 +39,8 @@ void cvt16f32f( const float16_t* src, float* dst, int len ) { CV_INSTRUMENT_REGION(); int j = 0; -#if CV_SIMD - const int VECSZ = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for( ; j < len; j += VECSZ ) { if( j > len - VECSZ ) @@ -60,8 +60,8 @@ void cvt32f16f( const float* src, float16_t* dst, int len ) { CV_INSTRUMENT_REGION(); int j = 0; -#if CV_SIMD - const int VECSZ = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); for( ; j < len; j += VECSZ ) { if( j > len - VECSZ ) @@ -108,8 +108,8 @@ cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size ) for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { int j = 0; -#if CV_SIMD - const int VECSZ = _Twvec::nlanes*2; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits<_Twvec>::vlanes()*2; for( ; j < size.width; j += VECSZ ) { if( j > size.width - VECSZ ) @@ -139,8 +139,8 @@ cvt1_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size ) for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { int j = 0; -#if CV_SIMD - const int VECSZ = _Twvec::nlanes; +#if (CV_SIMD || 
CV_SIMD_SCALABLE) + const int VECSZ = VTraits<_Twvec>::vlanes(); for( ; j < size.width; j += VECSZ ) { if( j > size.width - VECSZ ) diff --git a/modules/core/src/convert_scale.simd.hpp b/modules/core/src/convert_scale.simd.hpp index 2c6d55462b..c79a33f1b1 100644 --- a/modules/core/src/convert_scale.simd.hpp +++ b/modules/core/src/convert_scale.simd.hpp @@ -22,9 +22,9 @@ template inline void cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size, float a, float b ) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b); - const int VECSZ = v_float32::nlanes*2; + const int VECSZ = VTraits::vlanes()*2; #endif sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); @@ -32,7 +32,7 @@ cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { int j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for( ; j < size.width; j += VECSZ ) { if( j > size.width - VECSZ ) @@ -58,9 +58,9 @@ template inline void cvtabs1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size, float a, float b ) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b); - const int VECSZ = v_float32::nlanes*2; + const int VECSZ = VTraits::vlanes()*2; #endif sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); @@ -68,7 +68,7 @@ cvtabs1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { int j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for( ; j < size.width; j += VECSZ ) { if( j > size.width - VECSZ ) @@ -92,9 +92,9 @@ template inline void cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size, float a, float b ) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b); - const int VECSZ = v_float32::nlanes*2; + const int VECSZ = VTraits::vlanes()*2; #endif sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); @@ -102,7 +102,7 @@ cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { int j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for( ; j < size.width; j += VECSZ ) { if( j > size.width - VECSZ ) @@ -128,9 +128,9 @@ template inline void cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size, float a, float b ) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b); - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); #endif sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); @@ -138,7 +138,7 @@ cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { int j = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) for( ; j < size.width; j += VECSZ ) { if( j > size.width - VECSZ ) @@ -163,9 +163,9 @@ template inline void cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size, double a, double b ) { -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) v_float64 va = vx_setall_f64(a), vb = vx_setall_f64(b); - const int VECSZ = v_float64::nlanes*2; + const int VECSZ = VTraits::vlanes()*2; #endif sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); @@ -173,7 +173,7 @@ cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) { int j = 0; -#if 
CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) for( ; j < size.width; j += VECSZ ) { if( j > size.width - VECSZ ) diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index 89948fb878..aab4fbd3f0 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -156,15 +156,15 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mste const uchar* src = (const uchar*)_src; uchar* dst = (uchar*)_dst; int x = 0; - #if CV_SIMD + #if (CV_SIMD || CV_SIMD_SCALABLE) { v_uint8 v_zero = vx_setzero_u8(); - for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes ) + for( ; x <= size.width - VTraits::vlanes(); x += VTraits::vlanes() ) { v_uint8 v_src = vx_load(src + x), v_dst = vx_load(dst + x), - v_nmask = vx_load(mask + x) == v_zero; + v_nmask = v_eq(vx_load(mask + x), v_zero); v_dst = v_select(v_nmask, v_dst, v_src); v_store(dst + x, v_dst); @@ -188,23 +188,23 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mst const ushort* src = (const ushort*)_src; ushort* dst = (ushort*)_dst; int x = 0; - #if CV_SIMD + #if (CV_SIMD || CV_SIMD_SCALABLE) { v_uint8 v_zero = vx_setzero_u8(); - for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes ) + for( ; x <= size.width - VTraits::vlanes(); x += VTraits::vlanes() ) { - v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + v_uint16::nlanes), - v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + v_uint16::nlanes); + v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + VTraits::vlanes()), + v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + VTraits::vlanes()); v_uint8 v_nmask1, v_nmask2; - v_uint8 v_nmask = vx_load(mask + x) == v_zero; + v_uint8 v_nmask = v_eq(vx_load(mask + x), v_zero); v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2); v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1); v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2); v_store(dst + x, v_dst1); - v_store(dst + x + v_uint16::nlanes, v_dst2); + v_store(dst + x + VTraits::vlanes(), v_dst2); } } vx_cleanup(); diff --git a/modules/core/src/count_non_zero.simd.hpp b/modules/core/src/count_non_zero.simd.hpp index 6994564127..ce7c75aa54 100644 --- a/modules/core/src/count_non_zero.simd.hpp +++ b/modules/core/src/count_non_zero.simd.hpp @@ -32,8 +32,8 @@ static int countNonZero_(const T* src, int len ) static int countNonZero8u( const uchar* src, int len ) { int i=0, nz = 0; -#if CV_SIMD - int len0 = len & -v_uint8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + int len0 = len & -VTraits::vlanes(); v_uint8 v_zero = vx_setzero_u8(); v_uint8 v_one = vx_setall_u8(1); @@ -42,20 +42,20 @@ static int countNonZero8u( const uchar* src, int len ) { v_uint16 v_sum16 = vx_setzero_u16(); int j = i; - while (j < std::min(len0, i + 65280 * v_uint16::nlanes)) + while (j < std::min(len0, i + 65280 * VTraits::vlanes())) { v_uint8 v_sum8 = vx_setzero_u8(); int k = j; - for (; k < std::min(len0, j + 255 * v_uint8::nlanes); k += v_uint8::nlanes) - v_sum8 += v_one & (vx_load(src + k) == v_zero); + for (; k < std::min(len0, j + 255 * VTraits::vlanes()); k += VTraits::vlanes()) + v_sum8 = v_add(v_sum8, v_and(v_one, v_eq(vx_load(src + k), v_zero))); v_uint16 part1, part2; v_expand(v_sum8, part1, part2); - v_sum16 += part1 + part2; + v_sum16 = v_add(v_sum16, v_add(part1, part2)); j = k; } v_uint32 part1, part2; v_expand(v_sum16, part1, part2); - v_sum32 += part1 + part2; + v_sum32 = v_add(v_sum32, v_add(part1, part2)); i = j; } nz = i - v_reduce_sum(v_sum32); @@ -69,8 +69,8 @@ static int 
countNonZero8u( const uchar* src, int len ) static int countNonZero16u( const ushort* src, int len ) { int i = 0, nz = 0; -#if CV_SIMD - int len0 = len & -v_int8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + int len0 = len & -VTraits::vlanes(); v_uint16 v_zero = vx_setzero_u16(); v_int8 v_one = vx_setall_s8(1); @@ -79,20 +79,20 @@ static int countNonZero16u( const ushort* src, int len ) { v_int16 v_sum16 = vx_setzero_s16(); int j = i; - while (j < std::min(len0, i + 32766 * v_int16::nlanes)) + while (j < std::min(len0, i + 32766 * VTraits::vlanes())) { v_int8 v_sum8 = vx_setzero_s8(); int k = j; - for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes) - v_sum8 += v_one & v_pack(v_reinterpret_as_s16(vx_load(src + k) == v_zero), v_reinterpret_as_s16(vx_load(src + k + v_uint16::nlanes) == v_zero)); + for (; k < std::min(len0, j + 127 * VTraits::vlanes()); k += VTraits::vlanes()) + v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_reinterpret_as_s16(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s16(v_eq(vx_load(src + k + VTraits::vlanes()), v_zero))))); v_int16 part1, part2; v_expand(v_sum8, part1, part2); - v_sum16 += part1 + part2; + v_sum16 = v_add(v_sum16, v_add(part1, part2)); j = k; } v_int32 part1, part2; v_expand(v_sum16, part1, part2); - v_sum32 += part1 + part2; + v_sum32 = v_add(v_sum32, v_add(part1, part2)); i = j; } nz = i - v_reduce_sum(v_sum32); @@ -104,8 +104,8 @@ static int countNonZero16u( const ushort* src, int len ) static int countNonZero32s( const int* src, int len ) { int i = 0, nz = 0; -#if CV_SIMD - int len0 = len & -v_int8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + int len0 = len & -VTraits::vlanes(); v_int32 v_zero = vx_setzero_s32(); v_int8 v_one = vx_setall_s8(1); @@ -114,23 +114,20 @@ static int countNonZero32s( const int* src, int len ) { v_int16 v_sum16 = vx_setzero_s16(); int j = i; - while (j < std::min(len0, i + 32766 * v_int16::nlanes)) + while (j < std::min(len0, i + 32766 * VTraits::vlanes())) { v_int8 v_sum8 = vx_setzero_s8(); int k = j; - for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes) - v_sum8 += v_one & v_pack( - v_pack(vx_load(src + k ) == v_zero, vx_load(src + k + v_int32::nlanes) == v_zero), - v_pack(vx_load(src + k + 2*v_int32::nlanes) == v_zero, vx_load(src + k + 3*v_int32::nlanes) == v_zero) - ); + for (; k < std::min(len0, j + 127 * VTraits::vlanes()); k += VTraits::vlanes()) + v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_eq(vx_load(src + k), v_zero), v_eq(vx_load(src + k + VTraits::vlanes()), v_zero)), v_pack(v_eq(vx_load(src + k + 2 * VTraits::vlanes()), v_zero), v_eq(vx_load(src + k + 3 * VTraits::vlanes()), v_zero))))); v_int16 part1, part2; v_expand(v_sum8, part1, part2); - v_sum16 += part1 + part2; + v_sum16 = v_add(v_sum16, v_add(part1, part2)); j = k; } v_int32 part1, part2; v_expand(v_sum16, part1, part2); - v_sum32 += part1 + part2; + v_sum32 = v_add(v_sum32, v_add(part1, part2)); i = j; } nz = i - v_reduce_sum(v_sum32); @@ -142,8 +139,8 @@ static int countNonZero32s( const int* src, int len ) static int countNonZero32f( const float* src, int len ) { int i = 0, nz = 0; -#if CV_SIMD - int len0 = len & -v_int8::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + int len0 = len & -VTraits::vlanes(); v_float32 v_zero = vx_setzero_f32(); v_int8 v_one = vx_setall_s8(1); @@ -152,23 +149,20 @@ static int countNonZero32f( const float* src, int len ) { v_int16 v_sum16 = vx_setzero_s16(); int j = i; - while (j < std::min(len0, i + 32766 * v_int16::nlanes)) + while (j < std::min(len0, i + 32766 * 
VTraits::vlanes())) { v_int8 v_sum8 = vx_setzero_s8(); int k = j; - for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes) - v_sum8 += v_one & v_pack( - v_pack(v_reinterpret_as_s32(vx_load(src + k ) == v_zero), v_reinterpret_as_s32(vx_load(src + k + v_float32::nlanes) == v_zero)), - v_pack(v_reinterpret_as_s32(vx_load(src + k + 2*v_float32::nlanes) == v_zero), v_reinterpret_as_s32(vx_load(src + k + 3*v_float32::nlanes) == v_zero)) - ); + for (; k < std::min(len0, j + 127 * VTraits::vlanes()); k += VTraits::vlanes()) + v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + VTraits::vlanes()), v_zero))), v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k + 2 * VTraits::vlanes()), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + 3 * VTraits::vlanes()), v_zero)))))); v_int16 part1, part2; v_expand(v_sum8, part1, part2); - v_sum16 += part1 + part2; + v_sum16 = v_add(v_sum16, v_add(part1, part2)); j = k; } v_int32 part1, part2; v_expand(v_sum16, part1, part2); - v_sum32 += part1 + part2; + v_sum32 = v_add(v_sum32, v_add(part1, part2)); i = j; } nz = i - v_reduce_sum(v_sum32); @@ -180,21 +174,21 @@ static int countNonZero32f( const float* src, int len ) static int countNonZero64f( const double* src, int len ) { int nz = 0, i = 0; -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) v_int64 sum1 = vx_setzero_s64(); v_int64 sum2 = vx_setzero_s64(); v_float64 zero = vx_setzero_f64(); - int step = v_float64::nlanes * 2; + int step = VTraits::vlanes() * 2; int len0 = len & -step; for(i = 0; i < len0; i += step ) { - sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero); - sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero); + sum1 = v_add(sum1, v_reinterpret_as_s64(v_eq(vx_load(&src[i]), zero))); + sum2 = v_add(sum2, v_reinterpret_as_s64(v_eq(vx_load(&src[i + step / 2]), zero))); } // N.B the value is incremented by -1 (0xF...F) for each value - nz = i + (int)v_reduce_sum(sum1 + sum2); + nz = i + (int)v_reduce_sum(v_add(sum1, sum2)); v_cleanup(); #endif return nz + countNonZero_(src + i, len - i); diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index a644fe15a7..43c6d07d58 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -274,22 +274,21 @@ template struct VBLAS { int dot(const T*, const T*, int, T*) const { return 0; } int givens(T*, T*, int, T, T) const { return 0; } - int givensx(T*, T*, int, T, T, T*, T*) const { return 0; } }; -#if CV_SIMD +#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE_64F template<> inline int VBLAS::dot(const float* a, const float* b, int n, float* result) const { - if( n < 2*v_float32::nlanes ) + if( n < 2*VTraits::vlanes() ) return 0; int k = 0; v_float32 s0 = vx_setzero_f32(); - for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes ) + for( ; k <= n - VTraits::vlanes(); k += VTraits::vlanes() ) { v_float32 a0 = vx_load(a + k); v_float32 b0 = vx_load(b + k); - s0 += a0 * b0; + s0 = v_add(s0, v_mul(a0, b0)); } *result = v_reduce_sum(s0); vx_cleanup(); @@ -299,16 +298,16 @@ template<> inline int VBLAS::dot(const float* a, const float* b, int n, f template<> inline int VBLAS::givens(float* a, float* b, int n, float c, float s) const { - if( n < v_float32::nlanes) + if( n < VTraits::vlanes()) return 0; int k = 0; v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s); - for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes ) + for( ; k <= n - VTraits::vlanes(); k += VTraits::vlanes() 
) { v_float32 a0 = vx_load(a + k); v_float32 b0 = vx_load(b + k); - v_float32 t0 = (a0 * c4) + (b0 * s4); - v_float32 t1 = (b0 * c4) - (a0 * s4); + v_float32 t0 = v_add(v_mul(a0, c4), v_mul(b0, s4)); + v_float32 t1 = v_sub(v_mul(b0, c4), v_mul(a0, s4)); v_store(a + k, t0); v_store(b + k, t1); } @@ -317,44 +316,19 @@ template<> inline int VBLAS::givens(float* a, float* b, int n, float c, f } -template<> inline int VBLAS::givensx(float* a, float* b, int n, float c, float s, - float* anorm, float* bnorm) const -{ - if( n < v_float32::nlanes) - return 0; - int k = 0; - v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s); - v_float32 sa = vx_setzero_f32(), sb = vx_setzero_f32(); - for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes ) - { - v_float32 a0 = vx_load(a + k); - v_float32 b0 = vx_load(b + k); - v_float32 t0 = (a0 * c4) + (b0 * s4); - v_float32 t1 = (b0 * c4) - (a0 * s4); - v_store(a + k, t0); - v_store(b + k, t1); - sa += t0 + t0; - sb += t1 + t1; - } - *anorm = v_reduce_sum(sa); - *bnorm = v_reduce_sum(sb); - vx_cleanup(); - return k; -} - -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) template<> inline int VBLAS::dot(const double* a, const double* b, int n, double* result) const { - if( n < 2*v_float64::nlanes ) + if( n < 2*VTraits::vlanes() ) return 0; int k = 0; v_float64 s0 = vx_setzero_f64(); - for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes ) + for( ; k <= n - VTraits::vlanes(); k += VTraits::vlanes() ) { v_float64 a0 = vx_load(a + k); v_float64 b0 = vx_load(b + k); - s0 += a0 * b0; + s0 = v_add(s0, v_mul(a0, b0)); } double sbuf[2]; v_store(sbuf, s0); @@ -368,12 +342,12 @@ template<> inline int VBLAS::givens(double* a, double* b, int n, double { int k = 0; v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s); - for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes ) + for( ; k <= n - VTraits::vlanes(); k += VTraits::vlanes() ) { v_float64 a0 = vx_load(a + k); v_float64 b0 = vx_load(b + k); - v_float64 t0 = (a0 * c2) + (b0 * s2); - v_float64 t1 = (b0 * c2) - (a0 * s2); + v_float64 t0 = v_add(v_mul(a0, c2), v_mul(b0, s2)); + v_float64 t1 = v_sub(v_mul(b0, c2), v_mul(a0, s2)); v_store(a + k, t0); v_store(b + k, t1); } @@ -382,30 +356,6 @@ template<> inline int VBLAS::givens(double* a, double* b, int n, double } -template<> inline int VBLAS::givensx(double* a, double* b, int n, double c, double s, - double* anorm, double* bnorm) const -{ - int k = 0; - v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s); - v_float64 sa = vx_setzero_f64(), sb = vx_setzero_f64(); - for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes ) - { - v_float64 a0 = vx_load(a + k); - v_float64 b0 = vx_load(b + k); - v_float64 t0 = (a0 * c2) + (b0 * s2); - v_float64 t1 = (b0 * c2) - (a0 * s2); - v_store(a + k, t0); - v_store(b + k, t1); - sa += t0 * t0; - sb += t1 * t1; - } - double abuf[2], bbuf[2]; - v_store(abuf, sa); - v_store(bbuf, sb); - *anorm = abuf[0] + abuf[1]; - *bnorm = bbuf[0] + bbuf[1]; - return k; -} #endif //CV_SIMD_64F #endif //CV_SIMD @@ -916,7 +866,7 @@ double invert( InputArray _src, OutputArray _dst, int method ) #if CV_SIMD128 const float d_32f = (float)d; const v_float32x4 d_vec(d_32f, -d_32f, -d_32f, d_32f); - v_float32x4 s0 = v_load_halves((const float*)srcdata, (const float*)(srcdata + srcstep)) * d_vec;//0123//3120 + v_float32x4 s0 = v_mul(v_load_halves((const float *)srcdata, (const float *)(srcdata + srcstep)), d_vec);//0123//3120 s0 = v_extract<3>(s0, v_combine_low(v_rotate_right<1>(s0), s0)); v_store_low((float*)dstdata, s0); 
v_store_high((float*)(dstdata + dststep), s0); @@ -942,10 +892,10 @@ double invert( InputArray _src, OutputArray _dst, int method ) d = 1./d; #if CV_SIMD128_64F v_float64x2 det = v_setall_f64(d); - v_float64x2 s0 = v_load((const double*)srcdata) * det; - v_float64x2 s1 = v_load((const double*)(srcdata+srcstep)) * det; + v_float64x2 s0 = v_mul(v_load((const double *)srcdata), det); + v_float64x2 s1 = v_mul(v_load((const double *)(srcdata + srcstep)), det); v_float64x2 sm = v_extract<1>(s1, s0);//30 - v_float64x2 ss = v_setall(0) - v_extract<1>(s0, s1);//12 + v_float64x2 ss = v_sub(v_setall(0), v_extract<1>(s0, s1));//12 v_store((double*)dstdata, v_combine_low(sm, ss));//31 v_store((double*)(dstdata + dststep), v_combine_high(ss, sm));//20 #else diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index 056be63a71..9e3a1dbad2 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -614,13 +614,13 @@ void polarToCart( InputArray src1, InputArray src2, { k = 0; -#if CV_SIMD - int cWidth = v_float32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + int cWidth = VTraits::vlanes(); for( ; k <= len - cWidth; k += cWidth ) { v_float32 v_m = vx_load(mag + k); - v_store(x + k, vx_load(x + k) * v_m); - v_store(y + k, vx_load(y + k) * v_m); + v_store(x + k, v_mul(vx_load(x + k), v_m)); + v_store(y + k, v_mul(vx_load(y + k), v_m)); } vx_cleanup(); #endif @@ -741,7 +741,7 @@ struct iPow_SIMD } }; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template <> struct iPow_SIMD @@ -751,7 +751,7 @@ struct iPow_SIMD int i = 0; v_uint32 v_1 = vx_setall_u32(1u); - for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes) + for ( ; i <= len - VTraits::vlanes(); i += VTraits::vlanes()) { v_uint32 v_a1 = v_1, v_a2 = v_1; v_uint16 v = vx_load_expand(src + i); @@ -763,16 +763,16 @@ struct iPow_SIMD { if (p & 1) { - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); } - v_b1 *= v_b1; - v_b2 *= v_b2; + v_b1 = v_mul(v_b1, v_b1); + v_b2 = v_mul(v_b2, v_b2); p >>= 1; } - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); v = v_pack(v_a1, v_a2); v_pack_store(dst + i, v); @@ -791,7 +791,7 @@ struct iPow_SIMD int i = 0; v_int32 v_1 = vx_setall_s32(1); - for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes) + for ( ; i <= len - VTraits::vlanes(); i += VTraits::vlanes()) { v_int32 v_a1 = v_1, v_a2 = v_1; v_int16 v = vx_load_expand(src + i); @@ -803,16 +803,16 @@ struct iPow_SIMD { if (p & 1) { - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); } - v_b1 *= v_b1; - v_b2 *= v_b2; + v_b1 = v_mul(v_b1, v_b1); + v_b2 = v_mul(v_b2, v_b2); p >>= 1; } - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); v = v_pack(v_a1, v_a2); v_pack_store(dst + i, v); @@ -831,7 +831,7 @@ struct iPow_SIMD int i = 0; v_uint32 v_1 = vx_setall_u32(1u); - for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes) + for ( ; i <= len - VTraits::vlanes(); i += VTraits::vlanes()) { v_uint32 v_a1 = v_1, v_a2 = v_1; v_uint16 v = vx_load(src + i); @@ -843,16 +843,16 @@ struct iPow_SIMD { if (p & 1) { - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); } - v_b1 *= v_b1; - v_b2 *= v_b2; + v_b1 = v_mul(v_b1, v_b1); + v_b2 = v_mul(v_b2, v_b2); p >>= 1; } - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); v = v_pack(v_a1, v_a2); v_store(dst + i, v); @@ -871,7 +871,7 @@ struct iPow_SIMD int i = 0; v_int32 
v_1 = vx_setall_s32(1); - for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes) + for ( ; i <= len - VTraits::vlanes(); i += VTraits::vlanes()) { v_int32 v_a1 = v_1, v_a2 = v_1; v_int16 v = vx_load(src + i); @@ -883,16 +883,16 @@ struct iPow_SIMD { if (p & 1) { - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); } - v_b1 *= v_b1; - v_b2 *= v_b2; + v_b1 = v_mul(v_b1, v_b1); + v_b2 = v_mul(v_b2, v_b2); p >>= 1; } - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); v = v_pack(v_a1, v_a2); v_store(dst + i, v); @@ -911,29 +911,29 @@ struct iPow_SIMD int i = 0; v_int32 v_1 = vx_setall_s32(1); - for ( ; i <= len - v_int32::nlanes*2; i += v_int32::nlanes*2) + for ( ; i <= len - VTraits::vlanes()*2; i += VTraits::vlanes()*2) { v_int32 v_a1 = v_1, v_a2 = v_1; - v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_int32::nlanes); + v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits::vlanes()); int p = power; while( p > 1 ) { if (p & 1) { - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); } - v_b1 *= v_b1; - v_b2 *= v_b2; + v_b1 = v_mul(v_b1, v_b1); + v_b2 = v_mul(v_b2, v_b2); p >>= 1; } - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); v_store(dst + i, v_a1); - v_store(dst + i + v_int32::nlanes, v_a2); + v_store(dst + i + VTraits::vlanes(), v_a2); } vx_cleanup(); @@ -949,34 +949,34 @@ struct iPow_SIMD int i = 0; v_float32 v_1 = vx_setall_f32(1.f); - for ( ; i <= len - v_float32::nlanes*2; i += v_float32::nlanes*2) + for ( ; i <= len - VTraits::vlanes()*2; i += VTraits::vlanes()*2) { v_float32 v_a1 = v_1, v_a2 = v_1; - v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float32::nlanes); + v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits::vlanes()); int p = std::abs(power); if( power < 0 ) { - v_b1 = v_1 / v_b1; - v_b2 = v_1 / v_b2; + v_b1 = v_div(v_1, v_b1); + v_b2 = v_div(v_1, v_b2); } while( p > 1 ) { if (p & 1) { - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); } - v_b1 *= v_b1; - v_b2 *= v_b2; + v_b1 = v_mul(v_b1, v_b1); + v_b2 = v_mul(v_b2, v_b2); p >>= 1; } - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); v_store(dst + i, v_a1); - v_store(dst + i + v_float32::nlanes, v_a2); + v_store(dst + i + VTraits::vlanes(), v_a2); } vx_cleanup(); @@ -984,7 +984,7 @@ struct iPow_SIMD } }; -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) template <> struct iPow_SIMD { @@ -993,34 +993,34 @@ struct iPow_SIMD int i = 0; v_float64 v_1 = vx_setall_f64(1.); - for ( ; i <= len - v_float64::nlanes*2; i += v_float64::nlanes*2) + for ( ; i <= len - VTraits::vlanes()*2; i += VTraits::vlanes()*2) { v_float64 v_a1 = v_1, v_a2 = v_1; - v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float64::nlanes); + v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits::vlanes()); int p = std::abs(power); if( power < 0 ) { - v_b1 = v_1 / v_b1; - v_b2 = v_1 / v_b2; + v_b1 = v_div(v_1, v_b1); + v_b2 = v_div(v_1, v_b2); } while( p > 1 ) { if (p & 1) { - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); } - v_b1 *= v_b1; - v_b2 *= v_b2; + v_b1 = v_mul(v_b1, v_b1); + v_b2 = v_mul(v_b2, v_b2); p >>= 1; } - v_a1 *= v_b1; - v_a2 *= v_b2; + v_a1 = v_mul(v_a1, v_b1); + v_a2 = v_mul(v_a2, v_b2); v_store(dst + i, v_a1); - v_store(dst + i + v_float64::nlanes, v_a2); + v_store(dst + i + 
VTraits::vlanes(), v_a2); } vx_cleanup(); @@ -1614,7 +1614,7 @@ void patchNaNs( InputOutputArray _a, double _val ) Cv32suf val; val.f = (float)_val; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000); v_int32 v_val = vx_setall_s32(val.i); #endif @@ -1624,12 +1624,12 @@ void patchNaNs( InputOutputArray _a, double _val ) int* tptr = ptrs[0]; size_t j = 0; -#if CV_SIMD - size_t cWidth = (size_t)v_int32::nlanes; +#if (CV_SIMD || CV_SIMD_SCALABLE) + size_t cWidth = (size_t)VTraits::vlanes(); for ( ; j + cWidth <= len; j += cWidth) { v_int32 v_src = vx_load(tptr + j); - v_int32 v_cmp_mask = v_mask2 < (v_src & v_mask1); + v_int32 v_cmp_mask = v_lt(v_mask2, v_and(v_src, v_mask1)); v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src); v_store(tptr + j, v_dst); } diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp index 5a7f36d12b..62aacc0d63 100644 --- a/modules/core/src/matmul.simd.hpp +++ b/modules/core/src/matmul.simd.hpp @@ -1454,7 +1454,7 @@ transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn ) static void transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn ) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) const int BITS = 10, SCALE = 1 << BITS; const float MAX_M = (float)(1 << (15 - BITS)); @@ -1485,7 +1485,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in v_int32 m10 = vx_setall_s32(m32[4]); v_int32 m11 = vx_setall_s32(m32[5]); int x = 0; - for (; x <= (len - v_uint8::nlanes) * nChannels; x += v_uint8::nlanes * nChannels) + for (; x <= (len - VTraits::vlanes()) * nChannels; x += VTraits::vlanes() * nChannels) { v_uint8 b, g, r; v_load_deinterleave(src + x, b, g, r); @@ -1499,20 +1499,20 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in v_int32 p1, p3; v_expand(bgl, p0, p2); v_expand(v_reinterpret_as_s16(rl), p1, p3); - dbl = v_rshr_pack(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3, - v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3); - dgl = v_rshr_pack(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7, - v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7); - drl = v_rshr_pack(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11, - v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11); + dbl = v_rshr_pack(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3), + v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), v_mul(p3, m2)), m3)); + dgl = v_rshr_pack(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7), + v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7)); + drl = v_rshr_pack(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11), + v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11)); v_expand(bgh, p0, p2); v_expand(v_reinterpret_as_s16(rh), p1, p3); - dbh = v_rshr_pack(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3, - v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3); - dgh = v_rshr_pack(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7, - v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7); - drh = v_rshr_pack(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11, - v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11); + dbh = v_rshr_pack(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3), + v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), 
v_mul(p3, m2)), m3)); + dgh = v_rshr_pack(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7), + v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7)); + drh = v_rshr_pack(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11), + v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11)); v_store_interleave(dst + x, v_pack_u(dbl, dbh), v_pack_u(dgl, dgh), v_pack_u(drl, drh)); } m32[1] = saturate_cast((m[3] + 0.5f)*SCALE); @@ -1537,7 +1537,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in static void transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn ) { -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) if( scn == 3 && dcn == 3 ) { int x = 0; @@ -1555,7 +1555,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, v_float32 m10 = vx_setall_f32(m[10]); v_float32 m11 = vx_setall_f32(m[11] - 32768.f); v_int16 delta = vx_setall_s16(-32768); - for (; x <= (len - v_uint16::nlanes)*3; x += v_uint16::nlanes*3) + for (; x <= (len - VTraits::vlanes())*3; x += VTraits::vlanes()*3) { v_uint16 b, g, r; v_load_deinterleave(src + x, b, g, r); @@ -1574,6 +1574,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, v_store_interleave(dst + x, v_reinterpret_as_u16(db), v_reinterpret_as_u16(dg), v_reinterpret_as_u16(dr)); } #endif +#if CV_SIMD128 v_float32x4 _m0l(m[0], m[4], m[ 8], 0.f); v_float32x4 _m1l(m[1], m[5], m[ 9], 0.f); v_float32x4 _m2l(m[2], m[6], m[10], 0.f); @@ -1587,6 +1588,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack( v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x ))), _m0h, _m1h, _m2h, _m3h)), v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta)))); +#endif //CV_SIMD128 for( ; x < len * 3; x += 3 ) { float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2]; @@ -1606,25 +1608,25 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, static void transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn ) { -#if CV_SIMD && !defined(__aarch64__) && !defined(_M_ARM64) +#if (CV_SIMD || CV_SIMD_SCALABLE) && !defined(__aarch64__) && !defined(_M_ARM64) int x = 0; if( scn == 3 && dcn == 3 ) { - int idx[v_float32::nlanes/2]; - for( int i = 0; i < v_float32::nlanes/4; i++ ) + int idx[VTraits::max_nlanes/2]; + for( int i = 0; i < VTraits::vlanes()/4; i++ ) { idx[i] = 3*i; - idx[i + v_float32::nlanes/4] = 0; + idx[i + VTraits::vlanes()/4] = 0; } float _m[] = { m[0], m[4], m[ 8], 0.f, m[1], m[5], m[ 9], 0.f, m[2], m[6], m[10], 0.f, m[3], m[7], m[11], 0.f }; - v_float32 m0 = vx_lut_quads(_m , idx + v_float32::nlanes/4); - v_float32 m1 = vx_lut_quads(_m + 4, idx + v_float32::nlanes/4); - v_float32 m2 = vx_lut_quads(_m + 8, idx + v_float32::nlanes/4); - v_float32 m3 = vx_lut_quads(_m + 12, idx + v_float32::nlanes/4); - for( ; x <= len*3 - v_float32::nlanes; x += 3*v_float32::nlanes/4 ) + v_float32 m0 = vx_lut_quads(_m , idx + VTraits::vlanes()/4); + v_float32 m1 = vx_lut_quads(_m + 4, idx + VTraits::vlanes()/4); + v_float32 m2 = vx_lut_quads(_m + 8, idx + VTraits::vlanes()/4); + v_float32 m3 = vx_lut_quads(_m + 12, idx + VTraits::vlanes()/4); + for( ; x <= len*3 - VTraits::vlanes(); x += 3*VTraits::vlanes()/4 ) v_store(dst + x, 
v_pack_triplets(v_matmuladd(vx_lut_quads(src + x, idx), m0, m1, m2, m3))); for( ; x < len*3; x += 3 ) { @@ -1641,8 +1643,8 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i if( scn == 4 && dcn == 4 ) { #if CV_SIMD_WIDTH > 16 - int idx[v_float32::nlanes/4]; - for( int i = 0; i < v_float32::nlanes/4; i++ ) + int idx[VTraits::max_nlanes/4]; + for( int i = 0; i < VTraits::vlanes()/4; i++ ) idx[i] = 0; float _m[] = { m[4], m[9], m[14], m[19] }; v_float32 m0 = vx_lut_quads(m , idx); @@ -1650,12 +1652,13 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i v_float32 m2 = vx_lut_quads(m+10, idx); v_float32 m3 = vx_lut_quads(m+15, idx); v_float32 m4 = vx_lut_quads(_m, idx); - for( ; x <= len*4 - v_float32::nlanes; x += v_float32::nlanes ) + for( ; x <= len*4 - VTraits::vlanes(); x += VTraits::vlanes() ) { v_float32 v_src = vx_load(src + x); - v_store(dst + x, v_reduce_sum4(v_src * m0, v_src * m1, v_src * m2, v_src * m3) + m4); + v_store(dst + x, v_add(v_reduce_sum4(v_mul(v_src, m0), v_mul(v_src, m1), v_mul(v_src, m2), v_mul(v_src, m3)), m4)); } #endif +#if CV_SIMD128 v_float32x4 _m0 = v_load(m ); v_float32x4 _m1 = v_load(m + 5); v_float32x4 _m2 = v_load(m + 10); @@ -1666,6 +1669,17 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i v_float32x4 v_src = v_load(src + x); v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4); } +#else // CV_SIMD_WIDTH >= 16 && !CV_SIMD128 + for( ; x < len*4; x += 4 ) + { + float v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3]; + float t0 = saturate_cast(m[0]*v0 + m[1]*v1 + m[ 2]*v2 + m[ 3]*v3 + m[ 4]); + float t1 = saturate_cast(m[5]*v0 + m[6]*v1 + m[ 7]*v2 + m[ 8]*v3 + m[ 9]); + float t2 = saturate_cast(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]); + float t3 = saturate_cast(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]); + dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; dst[x+3] = t3; + } +#endif vx_cleanup(); return; } @@ -1936,9 +1950,9 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst, { float alpha = *_alpha; int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v_alpha = vx_setall_f32(alpha); - const int cWidth = v_float32::nlanes; + const int cWidth = VTraits::vlanes(); for (; i <= len - cWidth; i += cWidth) v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i))); vx_cleanup(); @@ -1953,9 +1967,9 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst, { double alpha = *_alpha; int i = 0; -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) v_float64 a2 = vx_setall_f64(alpha); - const int cWidth = v_float64::nlanes; + const int cWidth = VTraits::vlanes(); for (; i <= len - cWidth; i += cWidth) v_store(dst + i, v_muladd(vx_load(src1 + i), a2, vx_load(src2 + i))); vx_cleanup(); @@ -2078,7 +2092,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double deltastep = deltastep ? 4 : 0; } -#if CV_SIMD_64F +#if CV_SIMD128_64F v_float64x2 v_scale = v_setall_f64(scale); #endif @@ -2090,7 +2104,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double for( j = i; j <= size.width - 4; j += 4 ) { -#if CV_SIMD_64F +#if CV_SIMD128_64F if (DataType::depth == CV_64F && DataType
<dT>::depth == CV_64F)
             {
                 v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
@@ -2150,7 +2164,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
         for( j = i; j <= size.width - 4; j += 4 )
         {
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
             if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
             {
                 v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
@@ -2227,7 +2241,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
             double s = 0;
             const sT *tsrc1 = src + i*srcstep;
             const sT *tsrc2 = src + j*srcstep;
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
             if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
             {
                 const double *v_tsrc1 = (double *)(tsrc1);
@@ -2280,7 +2294,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
                     delta_buf[2] = delta_buf[3] = tdelta2[0];
                     tdelta2 = delta_buf;
                 }
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
                 if (DataType<sT>::depth == CV_64F && DataType<dT>
::depth == CV_64F) { const double *v_tsrc2 = (double *)(tsrc2); @@ -2393,14 +2407,14 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len) double r = 0; int i = 0; -#if CV_SIMD - int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 15), blockSize; +#if (CV_SIMD || CV_SIMD_SCALABLE) + int len0 = len & -VTraits::vlanes(), blockSize0 = (1 << 15), blockSize; while (i < len0) { blockSize = std::min(len0 - i, blockSize0); v_uint32 v_sum = vx_setzero_u32(); - const int cWidth = v_uint16::nlanes; + const int cWidth = VTraits::vlanes(); int j = 0; for (; j <= blockSize - cWidth * 2; j += cWidth * 2) @@ -2414,7 +2428,7 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len) { v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j)); v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j)); - v_sum += v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20)); + v_sum = v_add(v_sum, v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20))); } r += (double)v_reduce_sum(v_sum); @@ -2433,14 +2447,14 @@ double dotProd_8s(const schar* src1, const schar* src2, int len) double r = 0.0; int i = 0; -#if CV_SIMD - int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 14), blockSize; +#if (CV_SIMD || CV_SIMD_SCALABLE) + int len0 = len & -VTraits::vlanes(), blockSize0 = (1 << 14), blockSize; while (i < len0) { blockSize = std::min(len0 - i, blockSize0); v_int32 v_sum = vx_setzero_s32(); - const int cWidth = v_int16::nlanes; + const int cWidth = VTraits::vlanes(); int j = 0; for (; j <= blockSize - cWidth * 2; j += cWidth * 2) @@ -2473,14 +2487,14 @@ double dotProd_16u(const ushort* src1, const ushort* src2, int len) double r = 0.0; int i = 0; -#if CV_SIMD - int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 24), blockSize; +#if (CV_SIMD || CV_SIMD_SCALABLE) + int len0 = len & -VTraits::vlanes(), blockSize0 = (1 << 24), blockSize; while (i < len0) { blockSize = std::min(len0 - i, blockSize0); v_uint64 v_sum = vx_setzero_u64(); - const int cWidth = v_uint16::nlanes; + const int cWidth = VTraits::vlanes(); int j = 0; for (; j <= blockSize - cWidth; j += cWidth) @@ -2505,14 +2519,14 @@ double dotProd_16s(const short* src1, const short* src2, int len) double r = 0.0; int i = 0; -#if CV_SIMD - int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 24), blockSize; +#if (CV_SIMD || CV_SIMD_SCALABLE) + int len0 = len & -VTraits::vlanes(), blockSize0 = (1 << 24), blockSize; while (i < len0) { blockSize = std::min(len0 - i, blockSize0); v_int64 v_sum = vx_setzero_s64(); - const int cWidth = v_int16::nlanes; + const int cWidth = VTraits::vlanes(); int j = 0; for (; j <= blockSize - cWidth; j += cWidth) @@ -2534,10 +2548,10 @@ double dotProd_16s(const short* src1, const short* src2, int len) double dotProd_32s(const int* src1, const int* src2, int len) { -#if CV_SIMD_64F +#if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F double r = .0; int i = 0; - const int step = v_int32::nlanes; + const int step = VTraits::vlanes(); v_float64 v_sum0 = vx_setzero_f64(); #if CV_SIMD_WIDTH == 16 const int wstep = step * 2; @@ -2572,8 +2586,8 @@ double dotProd_32f(const float* src1, const float* src2, int len) double r = 0.0; int i = 0; -#if CV_SIMD - int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize; +#if (CV_SIMD || CV_SIMD_SCALABLE) + int len0 = len & -VTraits::vlanes(), blockSize0 = (1 << 13), blockSize; while (i < len0) { @@ -2581,7 +2595,7 @@ double dotProd_32f(const float* src1, const float* src2, int len) v_float32 v_sum = vx_setzero_f32(); int j = 0; - int cWidth = 
v_float32::nlanes; + int cWidth = VTraits::vlanes(); #if CV_ENABLE_UNROLLED v_float32 v_sum1 = vx_setzero_f32(); @@ -2600,7 +2614,7 @@ double dotProd_32f(const float* src1, const float* src2, int len) vx_load(src2 + j + (cWidth * 3)), v_sum3); } - v_sum += v_sum1 + v_sum2 + v_sum3; + v_sum = v_add(v_sum, v_add(v_add(v_sum1, v_sum2), v_sum3)); #endif for (; j <= blockSize - cWidth; j += cWidth) diff --git a/modules/core/src/matrix_transform.cpp b/modules/core/src/matrix_transform.cpp index 7f1043fbbe..744ee69b0d 100644 --- a/modules/core/src/matrix_transform.cpp +++ b/modules/core/src/matrix_transform.cpp @@ -440,7 +440,7 @@ template CV_ALWAYS_INLINE void flipHoriz_double( const static void flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz ) { -#if CV_SIMD +#if CV_SIMD128 #if CV_STRONG_ALIGNMENT size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep; #endif @@ -563,7 +563,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, } #endif else -#endif // CV_SIMD +#endif // CV_SIMD128 { int i, j, limit = (int)(((size.width + 1)/2)*esz); AutoBuffer _tab(size.width*esz); @@ -596,12 +596,12 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, dst0 += dstep, dst1 -= dstep ) { int i = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) #if CV_STRONG_ALIGNMENT if (isAligned(src0, src1, dst0, dst1)) #endif { - for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH) + for (; i <= size.width - VTraits::vlanes(); i += VTraits::vlanes()) { v_int32 t0 = v_reinterpret_as_s32(vx_load(src0 + i)); v_int32 t1 = v_reinterpret_as_s32(vx_load(src1 + i)); @@ -612,7 +612,7 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, #if CV_STRONG_ALIGNMENT else { - for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH) + for (; i <= size.width - VTraits::vlanes(); i += VTraits::vlanes()) { v_uint8 t0 = vx_load(src0 + i); v_uint8 t1 = vx_load(src1 + i); diff --git a/modules/core/src/merge.simd.hpp b/modules/core/src/merge.simd.hpp index ad08dd8879..d67a117c7b 100644 --- a/modules/core/src/merge.simd.hpp +++ b/modules/core/src/merge.simd.hpp @@ -15,7 +15,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn); #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) /* The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following: on IA there are instructions movntps and such to which @@ -38,7 +38,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn); template static void vecmerge_( const T** src, T* dst, int len, int cn ) { - const int VECSZ = VecT::nlanes; + const int VECSZ = VTraits::vlanes(); int i, i0 = 0; const T* src0 = src[0]; const T* src1 = src[1]; @@ -173,8 +173,8 @@ merge_( const T** src, T* dst, int len, int cn ) void merge8u(const uchar** src, uchar* dst, int len, int cn ) { CV_INSTRUMENT_REGION(); -#if CV_SIMD - if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + if( len >= VTraits::vlanes() && 2 <= cn && cn <= 4 ) vecmerge_(src, dst, len, cn); else #endif @@ -184,8 +184,8 @@ void merge8u(const uchar** src, uchar* dst, int len, int cn ) void merge16u(const ushort** src, ushort* dst, int len, int cn ) { CV_INSTRUMENT_REGION(); -#if CV_SIMD - if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + if( len >= VTraits::vlanes() && 2 <= cn && cn <= 4 ) vecmerge_(src, dst, len, cn); else #endif @@ -195,8 +195,8 @@ void merge16u(const 
ushort** src, ushort* dst, int len, int cn ) void merge32s(const int** src, int* dst, int len, int cn ) { CV_INSTRUMENT_REGION(); -#if CV_SIMD - if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + if( len >= VTraits::vlanes() && 2 <= cn && cn <= 4 ) vecmerge_(src, dst, len, cn); else #endif @@ -206,8 +206,8 @@ void merge32s(const int** src, int* dst, int len, int cn ) void merge64s(const int64** src, int64* dst, int len, int cn ) { CV_INSTRUMENT_REGION(); -#if CV_SIMD - if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + if( len >= VTraits::vlanes() && 2 <= cn && cn <= 4 ) vecmerge_(src, dst, len, cn); else #endif diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp index 69da85f291..be68efddf0 100644 --- a/modules/core/src/norm.cpp +++ b/modules/core/src/norm.cpp @@ -63,25 +63,25 @@ int normHamming(const uchar* a, int n, int cellSize) return -1; int i = 0; int result = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_uint64 t = vx_setzero_u64(); if ( cellSize == 2) { v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55)); - for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes) + for(; i <= n - VTraits::vlanes(); i += VTraits::vlanes()) { v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i)); - t += v_popcount(v_reinterpret_as_u64((a0 | (a0 >> 1)) & mask)); + t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a0, v_shr<1>(a0)), mask)))); } } else // cellSize == 4 { v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11)); - for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes) + for(; i <= n - VTraits::vlanes(); i += VTraits::vlanes()) { v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i)); - v_uint16 a1 = a0 | (a0 >> 2); - t += v_popcount(v_reinterpret_as_u64((a1 | (a1 >> 1)) & mask)); + v_uint16 a1 = v_or(a0, v_shr<2>(a0)); + t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a1, v_shr<1>(a1)), mask)))); } } @@ -109,25 +109,25 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize) return -1; int i = 0; int result = 0; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_uint64 t = vx_setzero_u64(); if ( cellSize == 2) { v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55)); - for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes) + for(; i <= n - VTraits::vlanes(); i += VTraits::vlanes()) { - v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i)); - t += v_popcount(v_reinterpret_as_u64((ab0 | (ab0 >> 1)) & mask)); + v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i))); + t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab0, v_shr<1>(ab0)), mask)))); } } else // cellSize == 4 { v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11)); - for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes) + for(; i <= n - VTraits::vlanes(); i += VTraits::vlanes()) { - v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i)); - v_uint16 ab1 = ab0 | (ab0 >> 2); - t += v_popcount(v_reinterpret_as_u64((ab1 | (ab1 >> 1)) & mask)); + v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i))); + v_uint16 ab1 = v_or(ab0, v_shr<2>(ab0)); + t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab1, v_shr<1>(ab1)), mask)))); } } result += (int)v_reduce_sum(t); @@ -145,21 +145,21 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize) float normL2Sqr_(const float* a, const float* b, int n) { int j = 0; float d = 0.f; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v_d0 = vx_setzero_f32(), v_d1 = 
vx_setzero_f32(); v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32(); - for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes) + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { - v_float32 t0 = vx_load(a + j) - vx_load(b + j); - v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes); + v_float32 t0 = v_sub(vx_load(a + j), vx_load(b + j)); + v_float32 t1 = v_sub(vx_load(a + j + VTraits::vlanes()), vx_load(b + j + VTraits::vlanes())); v_d0 = v_muladd(t0, t0, v_d0); - v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes); + v_float32 t2 = v_sub(vx_load(a + j + 2 * VTraits::vlanes()), vx_load(b + j + 2 * VTraits::vlanes())); v_d1 = v_muladd(t1, t1, v_d1); - v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes); + v_float32 t3 = v_sub(vx_load(a + j + 3 * VTraits::vlanes()), vx_load(b + j + 3 * VTraits::vlanes())); v_d2 = v_muladd(t2, t2, v_d2); v_d3 = v_muladd(t3, t3, v_d3); } - d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3); + d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3)); #endif for( ; j < n; j++ ) { @@ -173,17 +173,17 @@ float normL2Sqr_(const float* a, const float* b, int n) float normL1_(const float* a, const float* b, int n) { int j = 0; float d = 0.f; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32(); v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32(); - for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes) + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) { - v_d0 += v_absdiff(vx_load(a + j), vx_load(b + j)); - v_d1 += v_absdiff(vx_load(a + j + v_float32::nlanes), vx_load(b + j + v_float32::nlanes)); - v_d2 += v_absdiff(vx_load(a + j + 2 * v_float32::nlanes), vx_load(b + j + 2 * v_float32::nlanes)); - v_d3 += v_absdiff(vx_load(a + j + 3 * v_float32::nlanes), vx_load(b + j + 3 * v_float32::nlanes)); + v_d0 = v_add(v_d0, v_absdiff(vx_load(a + j), vx_load(b + j))); + v_d1 = v_add(v_d1, v_absdiff(vx_load(a + j + VTraits::vlanes()), vx_load(b + j + VTraits::vlanes()))); + v_d2 = v_add(v_d2, v_absdiff(vx_load(a + j + 2 * VTraits::vlanes()), vx_load(b + j + 2 * VTraits::vlanes()))); + v_d3 = v_add(v_d3, v_absdiff(vx_load(a + j + 3 * VTraits::vlanes()), vx_load(b + j + 3 * VTraits::vlanes()))); } - d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3); + d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3)); #endif for( ; j < n; j++ ) d += std::abs(a[j] - b[j]); @@ -193,12 +193,12 @@ float normL1_(const float* a, const float* b, int n) int normL1_(const uchar* a, const uchar* b, int n) { int j = 0, d = 0; -#if CV_SIMD - for (; j <= n - 4 * v_uint8::nlanes; j += 4 * v_uint8::nlanes) +#if (CV_SIMD || CV_SIMD_SCALABLE) + for (; j <= n - 4 * VTraits::vlanes(); j += 4 * VTraits::vlanes()) d += v_reduce_sad(vx_load(a + j), vx_load(b + j)) + - v_reduce_sad(vx_load(a + j + v_uint8::nlanes), vx_load(b + j + v_uint8::nlanes)) + - v_reduce_sad(vx_load(a + j + 2 * v_uint8::nlanes), vx_load(b + j + 2 * v_uint8::nlanes)) + - v_reduce_sad(vx_load(a + j + 3 * v_uint8::nlanes), vx_load(b + j + 3 * v_uint8::nlanes)); + v_reduce_sad(vx_load(a + j + VTraits::vlanes()), vx_load(b + j + VTraits::vlanes())) + + v_reduce_sad(vx_load(a + j + 2 * VTraits::vlanes()), vx_load(b + j + 2 * VTraits::vlanes())) + + v_reduce_sad(vx_load(a + j + 3 * VTraits::vlanes()), vx_load(b + j + 3 * VTraits::vlanes())); #endif for( ; j < n; j++ ) d += std::abs(a[j] - b[j]); diff 
--git a/modules/core/src/split.simd.hpp b/modules/core/src/split.simd.hpp index 25e90c0063..88414161b8 100644 --- a/modules/core/src/split.simd.hpp +++ b/modules/core/src/split.simd.hpp @@ -15,12 +15,12 @@ void split64s(const int64* src, int64** dst, int len, int cn); #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) // see the comments for vecmerge_ in merge.cpp template static void vecsplit_( const T* src, T** dst, int len, int cn ) { - const int VECSZ = VecT::nlanes; + const int VECSZ = VTraits::vlanes(); int i, i0 = 0; T* dst0 = dst[0]; T* dst1 = dst[1]; @@ -177,8 +177,8 @@ split_( const T* src, T** dst, int len, int cn ) void split8u(const uchar* src, uchar** dst, int len, int cn ) { CV_INSTRUMENT_REGION(); -#if CV_SIMD - if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + if( len >= VTraits::vlanes() && 2 <= cn && cn <= 4 ) vecsplit_(src, dst, len, cn); else #endif @@ -188,8 +188,8 @@ void split8u(const uchar* src, uchar** dst, int len, int cn ) void split16u(const ushort* src, ushort** dst, int len, int cn ) { CV_INSTRUMENT_REGION(); -#if CV_SIMD - if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + if( len >= VTraits::vlanes() && 2 <= cn && cn <= 4 ) vecsplit_(src, dst, len, cn); else #endif @@ -199,8 +199,8 @@ void split16u(const ushort* src, ushort** dst, int len, int cn ) void split32s(const int* src, int** dst, int len, int cn ) { CV_INSTRUMENT_REGION(); -#if CV_SIMD - if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + if( len >= VTraits::vlanes() && 2 <= cn && cn <= 4 ) vecsplit_(src, dst, len, cn); else #endif @@ -210,8 +210,8 @@ void split32s(const int* src, int** dst, int len, int cn ) void split64s(const int64* src, int64** dst, int len, int cn ) { CV_INSTRUMENT_REGION(); -#if CV_SIMD - if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 ) +#if (CV_SIMD || CV_SIMD_SCALABLE) + if( len >= VTraits::vlanes() && 2 <= cn && cn <= 4 ) vecsplit_(src, dst, len, cn); else #endif diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp index 0592f84794..a5fb05476d 100644 --- a/modules/core/src/stat.simd.hpp +++ b/modules/core/src/stat.simd.hpp @@ -33,11 +33,11 @@ int normHamming(const uchar* a, int n) int i = 0; int result = 0; -#if CV_SIMD && CV_SIMD_WIDTH > 16 +#if (CV_SIMD || CV_SIMD_SCALABLE) { v_uint64 t = vx_setzero_u64(); - for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes) - t += v_popcount(v_reinterpret_as_u64(vx_load(a + i))); + for (; i <= n - VTraits::vlanes(); i += VTraits::vlanes()) + t = v_add(t, v_popcount(v_reinterpret_as_u64(vx_load(a + i)))); result = (int)v_reduce_sum(t); vx_cleanup(); } @@ -56,13 +56,6 @@ int normHamming(const uchar* a, int n) result += CV_POPCNT_U32(*(uint*)(a + i)); } } -#elif CV_SIMD - { - v_uint64x2 t = v_setzero_u64(); - for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) - t += v_popcount(v_reinterpret_as_u64(v_load(a + i))); - result += (int)v_reduce_sum(t); - } #endif #if CV_ENABLE_UNROLLED for(; i <= n - 4; i += 4) @@ -85,11 +78,11 @@ int normHamming(const uchar* a, const uchar* b, int n) int i = 0; int result = 0; -#if CV_SIMD && CV_SIMD_WIDTH > 16 +#if (CV_SIMD || CV_SIMD_SCALABLE) { v_uint64 t = vx_setzero_u64(); - for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes) - t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i))); + for (; i <= n - VTraits::vlanes(); i += VTraits::vlanes()) + t = v_add(t, 
v_popcount(v_reinterpret_as_u64(v_xor(vx_load(a + i), vx_load(b + i))))); result += (int)v_reduce_sum(t); } #endif @@ -107,13 +100,6 @@ int normHamming(const uchar* a, const uchar* b, int n) result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i)); } } -#elif CV_SIMD - { - v_uint64x2 t = v_setzero_u64(); - for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) - t += v_popcount(v_reinterpret_as_u64(v_load(a + i) ^ v_load(b + i))); - result += (int)v_reduce_sum(t); - } #endif #if CV_ENABLE_UNROLLED for(; i <= n - 4; i += 4) diff --git a/modules/core/src/sum.simd.hpp b/modules/core/src/sum.simd.hpp index 2232013b24..e20cd39b70 100644 --- a/modules/core/src/sum.simd.hpp +++ b/modules/core/src/sum.simd.hpp @@ -22,7 +22,7 @@ struct Sum_SIMD } }; -#if CV_SIMD +#if (CV_SIMD || CV_SIMD_SCALABLE) template <> struct Sum_SIMD @@ -36,41 +36,41 @@ struct Sum_SIMD int x = 0; v_uint32 v_sum = vx_setzero_u32(); - int len0 = len & -v_uint8::nlanes; + int len0 = len & -VTraits::vlanes(); while (x < len0) { - const int len_tmp = min(x + 256*v_uint16::nlanes, len0); + const int len_tmp = min(x + 256*VTraits::vlanes(), len0); v_uint16 v_sum16 = vx_setzero_u16(); - for (; x < len_tmp; x += v_uint8::nlanes) + for (; x < len_tmp; x += VTraits::vlanes()) { v_uint16 v_src0, v_src1; v_expand(vx_load(src0 + x), v_src0, v_src1); - v_sum16 += v_src0 + v_src1; + v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1)); } v_uint32 v_half0, v_half1; v_expand(v_sum16, v_half0, v_half1); - v_sum += v_half0 + v_half1; + v_sum = v_add(v_sum, v_add(v_half0, v_half1)); } - if (x <= len - v_uint16::nlanes) + if (x <= len - VTraits::vlanes()) { v_uint32 v_half0, v_half1; v_expand(vx_load_expand(src0 + x), v_half0, v_half1); - v_sum += v_half0 + v_half1; - x += v_uint16::nlanes; + v_sum = v_add(v_sum, v_add(v_half0, v_half1)); + x += VTraits::vlanes(); } - if (x <= len - v_uint32::nlanes) + if (x <= len - VTraits::vlanes()) { - v_sum += vx_load_expand_q(src0 + x); - x += v_uint32::nlanes; + v_sum = v_add(v_sum, vx_load_expand_q(src0 + x)); + x += VTraits::vlanes(); } if (cn == 1) *dst += v_reduce_sum(v_sum); else { - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits::max_nlanes]; v_store_aligned(ar, v_sum); - for (int i = 0; i < v_uint32::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) dst[i % cn] += ar[i]; } v_cleanup(); @@ -91,41 +91,41 @@ struct Sum_SIMD int x = 0; v_int32 v_sum = vx_setzero_s32(); - int len0 = len & -v_int8::nlanes; + int len0 = len & -VTraits::vlanes(); while (x < len0) { - const int len_tmp = min(x + 256*v_int16::nlanes, len0); + const int len_tmp = min(x + 256*VTraits::vlanes(), len0); v_int16 v_sum16 = vx_setzero_s16(); - for (; x < len_tmp; x += v_int8::nlanes) + for (; x < len_tmp; x += VTraits::vlanes()) { v_int16 v_src0, v_src1; v_expand(vx_load(src0 + x), v_src0, v_src1); - v_sum16 += v_src0 + v_src1; + v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1)); } v_int32 v_half0, v_half1; v_expand(v_sum16, v_half0, v_half1); - v_sum += v_half0 + v_half1; + v_sum = v_add(v_sum, v_add(v_half0, v_half1)); } - if (x <= len - v_int16::nlanes) + if (x <= len - VTraits::vlanes()) { v_int32 v_half0, v_half1; v_expand(vx_load_expand(src0 + x), v_half0, v_half1); - v_sum += v_half0 + v_half1; - x += v_int16::nlanes; + v_sum = v_add(v_sum, v_add(v_half0, v_half1)); + x += VTraits::vlanes(); } - if (x <= len - v_int32::nlanes) + if (x <= len - VTraits::vlanes()) { - v_sum += vx_load_expand_q(src0 + x); - x += v_int32::nlanes; + v_sum = v_add(v_sum, 
vx_load_expand_q(src0 + x)); + x += VTraits::vlanes(); } if (cn == 1) *dst += v_reduce_sum(v_sum); else { - int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes]; + int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits::max_nlanes]; v_store_aligned(ar, v_sum); - for (int i = 0; i < v_int32::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) dst[i % cn] += ar[i]; } v_cleanup(); @@ -146,25 +146,25 @@ struct Sum_SIMD int x = 0; v_uint32 v_sum = vx_setzero_u32(); - for (; x <= len - v_uint16::nlanes; x += v_uint16::nlanes) + for (; x <= len - VTraits::vlanes(); x += VTraits::vlanes()) { v_uint32 v_src0, v_src1; v_expand(vx_load(src0 + x), v_src0, v_src1); - v_sum += v_src0 + v_src1; + v_sum = v_add(v_sum, v_add(v_src0, v_src1)); } - if (x <= len - v_uint32::nlanes) + if (x <= len - VTraits::vlanes()) { - v_sum += vx_load_expand(src0 + x); - x += v_uint32::nlanes; + v_sum = v_add(v_sum, vx_load_expand(src0 + x)); + x += VTraits::vlanes(); } if (cn == 1) *dst += v_reduce_sum(v_sum); else { - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits::max_nlanes]; v_store_aligned(ar, v_sum); - for (int i = 0; i < v_uint32::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) dst[i % cn] += ar[i]; } v_cleanup(); @@ -185,25 +185,25 @@ struct Sum_SIMD int x = 0; v_int32 v_sum = vx_setzero_s32(); - for (; x <= len - v_int16::nlanes; x += v_int16::nlanes) + for (; x <= len - VTraits::vlanes(); x += VTraits::vlanes()) { v_int32 v_src0, v_src1; v_expand(vx_load(src0 + x), v_src0, v_src1); - v_sum += v_src0 + v_src1; + v_sum = v_add(v_sum, v_add(v_src0, v_src1)); } - if (x <= len - v_int32::nlanes) + if (x <= len - VTraits::vlanes()) { - v_sum += vx_load_expand(src0 + x); - x += v_int32::nlanes; + v_sum = v_add(v_sum, vx_load_expand(src0 + x)); + x += VTraits::vlanes(); } if (cn == 1) *dst += v_reduce_sum(v_sum); else { - int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes]; + int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits::max_nlanes]; v_store_aligned(ar, v_sum); - for (int i = 0; i < v_int32::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) dst[i % cn] += ar[i]; } v_cleanup(); @@ -212,7 +212,7 @@ struct Sum_SIMD } }; -#if CV_SIMD_64F +#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F) template <> struct Sum_SIMD { @@ -226,24 +226,24 @@ struct Sum_SIMD v_float64 v_sum0 = vx_setzero_f64(); v_float64 v_sum1 = vx_setzero_f64(); - for (; x <= len - 2 * v_int32::nlanes; x += 2 * v_int32::nlanes) + for (; x <= len - 2 * VTraits::vlanes(); x += 2 * VTraits::vlanes()) { v_int32 v_src0 = vx_load(src0 + x); - v_int32 v_src1 = vx_load(src0 + x + v_int32::nlanes); - v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1); - v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1); + v_int32 v_src1 = vx_load(src0 + x + VTraits::vlanes()); + v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1))); + v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1))); } #if CV_SIMD256 || CV_SIMD512 - double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes]; - v_store_aligned(ar, v_sum0 + v_sum1); - for (int i = 0; i < v_float64::nlanes; ++i) + double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits::max_nlanes]; + v_store_aligned(ar, v_add(v_sum0, v_sum1)); + for (int i = 0; i < VTraits::vlanes(); ++i) dst[i % cn] += ar[i]; #else - double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes]; + double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits::max_nlanes]; v_store_aligned(ar, v_sum0); - v_store_aligned(ar + 
v_float64::nlanes, v_sum1);
-        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
+        v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
+        for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
             dst[i % cn] += ar[i];
 #endif
         v_cleanup();
@@ -265,24 +265,24 @@ struct Sum_SIMD<float, double>
         v_float64 v_sum0 = vx_setzero_f64();
         v_float64 v_sum1 = vx_setzero_f64();
 
-        for (; x <= len - 2 * v_float32::nlanes; x += 2 * v_float32::nlanes)
+        for (; x <= len - 2 * VTraits<v_float32>::vlanes(); x += 2 * VTraits<v_float32>::vlanes())
         {
             v_float32 v_src0 = vx_load(src0 + x);
-            v_float32 v_src1 = vx_load(src0 + x + v_float32::nlanes);
-            v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
-            v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
+            v_float32 v_src1 = vx_load(src0 + x + VTraits<v_float32>::vlanes());
+            v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1)));
+            v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1)));
         }
 
 #if CV_SIMD256 || CV_SIMD512
-        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
-        v_store_aligned(ar, v_sum0 + v_sum1);
-        for (int i = 0; i < v_float64::nlanes; ++i)
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_float64>::max_nlanes];
+        v_store_aligned(ar, v_add(v_sum0, v_sum1));
+        for (int i = 0; i < VTraits<v_float64>::vlanes(); ++i)
             dst[i % cn] += ar[i];
 #else
-        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_float64>::max_nlanes];
         v_store_aligned(ar, v_sum0);
-        v_store_aligned(ar + v_float64::nlanes, v_sum1);
-        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
+        v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
+        for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
             dst[i % cn] += ar[i];
 #endif
         v_cleanup();
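
For reference, every hunk above applies the same migration pattern: `#if CV_SIMD` guards become `#if (CV_SIMD || CV_SIMD_SCALABLE)`, compile-time lane counts such as `v_float32::nlanes` become runtime calls to `VTraits<v_float32>::vlanes()`, and overloaded vector operators are replaced by named functions (`v_add`, `v_mul`, `v_ge`, `v_and`, ...), since the scalable RVV types support neither constant lane counts nor operator overloads. The sketch below is an illustrative example written against the public universal-intrinsics API in `opencv2/core/hal/intrin.hpp`; the function name `scaleAddExample` is made up for illustration and is not code from the changed files.

    #include <opencv2/core/hal/intrin.hpp>

    // Illustrative only: dst[i] = src1[i] * alpha + src2[i] for float arrays,
    // written in the scalable-friendly style used throughout this patch.
    static void scaleAddExample(const float* src1, const float* src2, float* dst,
                                int len, float alpha)
    {
        using namespace cv;
        int i = 0;
    #if (CV_SIMD || CV_SIMD_SCALABLE)
        const int vl = VTraits<v_float32>::vlanes();   // runtime lane count, replaces v_float32::nlanes
        v_float32 v_alpha = vx_setall_f32(alpha);
        for (; i <= len - vl; i += vl)
        {
            // named functions instead of overloaded operators (+, *, >=, & ...)
            v_float32 prod = v_mul(vx_load(src1 + i), v_alpha);
            v_store(dst + i, v_add(prod, vx_load(src2 + i)));
        }
        vx_cleanup();
    #endif
        for (; i < len; i++)                            // scalar tail
            dst[i] = src1[i] * alpha + src2[i];
    }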
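A second recurring detail is the stack-buffer sizing seen in the Sum_SIMD hunks: because `vlanes()` is not a compile-time constant when CV_SIMD_SCALABLE is enabled, fixed-size arrays are declared with the compile-time upper bound `VTraits<...>::max_nlanes`, while loops only touch the first `vlanes()` elements. The helper below is a minimal sketch of that idiom, assuming the same public API; `accumulatePerChannel` is an invented name, not part of the patch.

    #include <opencv2/core/hal/intrin.hpp>

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    // Illustrative only: scatter a v_uint32 accumulator into per-channel sums,
    // mirroring the tail handling in the Sum_SIMD specializations above.
    static void accumulatePerChannel(cv::v_uint32 v_sum, int* dst, int cn)
    {
        using namespace cv;
        // max_nlanes is a compile-time upper bound, so it can size the array;
        // vlanes() is the (possibly runtime) number of lanes actually stored.
        unsigned CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_uint32>::max_nlanes];
        v_store_aligned(buf, v_sum);
        for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
            dst[i % cn] += (int)buf[i];
    }
    #endif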