Merge pull request #23980 from hanliutong:rewrite-core

Rewrite Universal Intrinsic code by using the new API: Core module. #23980

The goal of this PR is to find all SIMD code blocks guarded by the `CV_SIMD` macro in the `opencv/modules/core` folder and rewrite them using the new Universal Intrinsic API.

The patch is almost auto-generated by using the [rewriter](https://github.com/hanliutong/rewriter), related PR #23885.
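
For context, here is a minimal before/after sketch of the kind of transformation the rewriter performs. The kernel (`scale_old`/`scale_new`) is hypothetical and not taken from the patch; it only contrasts the old operator/`::nlanes` style with the new `v_mul`/`VTraits<>::vlanes()` style.

```cpp
// Hypothetical helper (not from the patch). Assumes
// #include <opencv2/core/hal/intrin.hpp> and using namespace cv.

#if CV_SIMD                                   // old style: operators + compile-time lane count
static void scale_old(float* p, int len, float f)
{
    const v_float32 vf = vx_setall_f32(f);
    int i = 0;
    for (; i <= len - v_float32::nlanes; i += v_float32::nlanes)
        v_store(p + i, vx_load(p + i) * vf);  // operator* on vectors
    for (; i < len; i++)
        p[i] *= f;
}
#endif

#if CV_SIMD || CV_SIMD_SCALABLE               // new style: also valid for scalable (RVV) backends
static void scale_new(float* p, int len, float f)
{
    const v_float32 vf = vx_setall_f32(f);
    const int step = VTraits<v_float32>::vlanes();  // run-time lane count
    int i = 0;
    for (; i <= len - step; i += step)
        v_store(p + i, v_mul(vx_load(p + i), vf));  // named intrinsic instead of operator*
    for (; i < len; i++)
        p[i] *= f;
}
#endif
```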

Most of the files have been rewritten, but I marked this PR as a draft because the `CV_SIMD` macro also appears in the following files, and the reasons they have not been rewritten are:

1. ~~Code designed for fixed-size SIMD (`v_int16x8`, `v_float32x4`, etc.) needs to be rewritten manually.~~ Rewritten (an illustrative sketch of such fixed-size code follows this file list)
- ./modules/core/src/stat.simd.hpp
- ./modules/core/src/matrix_transform.cpp
- ./modules/core/src/matmul.simd.hpp
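
An illustrative sketch of what "fixed-size SIMD" means here (hypothetical code, not taken from these three files): the algorithm assumes exactly four lanes, so it cannot be mapped mechanically onto the sizeless `v_float32`, whose lane count is only known at run time via `VTraits<v_float32>::vlanes()`.

```cpp
// Hypothetical example of fixed-size code that requires a manual rewrite:
// a 4x4 transpose hard-codes the 128-bit, 4-lane layout.
#if CV_SIMD128
static void transpose4x4(const float* src, float* dst)
{
    v_float32x4 r0 = v_load(src);        // exactly 4 lanes assumed
    v_float32x4 r1 = v_load(src + 4);
    v_float32x4 r2 = v_load(src + 8);
    v_float32x4 r3 = v_load(src + 12);
    v_float32x4 t0, t1, t2, t3;
    v_transpose4x4(r0, r1, r2, r3, t0, t1, t2, t3);
    v_store(dst,      t0);
    v_store(dst + 4,  t1);
    v_store(dst + 8,  t2);
    v_store(dst + 12, t3);
}
#endif
```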

2. Vector types are wrapped in another class/struct, which the compiler does not support for variable-length backends, so the code cannot be rewritten directly (a possible workaround is sketched after the snippet below).
- ./modules/core/src/mathfuncs_core.simd.hpp 
```cpp
struct v_atan_f32
{
    explicit v_atan_f32(const float& scale)
    {
...
    }

    v_float32 compute(const v_float32& y, const v_float32& x)
    {
...
    }

...
    v_float32 val90; // a sizeless type cannot be used as a class member
    v_float32 val180;
    v_float32 val360;
    v_float32 s;
};
```
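
One possible workaround, shown as a hedged sketch with a hypothetical functor (`v_axpb_f32`, not the real atan code): keep only scalar members and broadcast them with `vx_setall_f32` inside `compute()`, so the sizeless type only lives in local variables. Whether the per-call broadcast cost is acceptable for `v_atan_f32` is part of what still needs discussion.

```cpp
// Hedged sketch of a possible pattern, not the merged solution:
// store scalars in the struct, materialize the vectors per call.
#if CV_SIMD || CV_SIMD_SCALABLE
struct v_axpb_f32
{
    v_axpb_f32(float a_, float b_) : a(a_), b(b_) {}

    v_float32 compute(const v_float32& x) const
    {
        const v_float32 va = vx_setall_f32(a);   // broadcast per call
        const v_float32 vb = vx_setall_f32(b);
        return v_fma(va, x, vb);                 // a*x + b
    }

    float a, b;   // scalar members are fine; v_float32 members would not be
};
#endif
```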

3. The API does not support the operation, or the interfaces do not match

- ./modules/core/src/norm.cpp 
Uses `v_popcount`; ~~waiting for #23966~~ Fixed
- ./modules/core/src/has_non_zero.simd.hpp
Uses an illegal Universal Intrinsic API: there is no logical `|` operation for float types. Further discussion is needed (a possible legal rewrite is sketched after the second snippet below).

```cpp
/** @brief Bitwise OR

Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
```

```cpp
#if CV_SIMD
    typedef v_float32 v_type;
    const v_type v_zero = vx_setzero_f32();
    constexpr const int unrollCount = 8;
    int step = v_type::nlanes * unrollCount;
    int len0 = len & -step;
    const float* srcSimdEnd = src+len0;

    int countSIMD = static_cast<int>((srcSimdEnd-src)/step);
    while(!res && countSIMD--)
    {
        v_type v0 = vx_load(src);
        src += v_type::nlanes;
        v_type v1 = vx_load(src);
        src += v_type::nlanes;
....
        src += v_type::nlanes;
        v0 |= v1; //Illegal ?
....
        //res = v_check_any(((v0 | v4) != v_zero));//beware : (NaN != 0) returns "false" since != is mapped to _CMP_NEQ_OQ and not _CMP_NEQ_UQ
        res = !v_check_all(((v0 | v4) == v_zero));
    }

    v_cleanup();
#endif
```
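
One legal formulation that might resolve this, pending the discussion (a sketch under that assumption, not the agreed fix; the helper name `anyNonZeroPair` is mine): perform the OR on the raw bit patterns in the unsigned-integer domain and reinterpret back to float for the zero comparison, which keeps the original semantics, including the NaN note in the comment above.

```cpp
// Hedged sketch: OR the bit patterns as u32 (legal), then compare as float
// so that -0.0 still counts as zero and NaN still counts as non-zero.
#if CV_SIMD || CV_SIMD_SCALABLE
static inline bool anyNonZeroPair(const v_float32& a, const v_float32& b)
{
    const v_float32 merged = v_reinterpret_as_f32(
        v_or(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)));
    return !v_check_all(v_eq(merged, vx_setzero_f32()));
}
#endif
```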

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
Commit 0dd7769bb1 (parent 3421b950ce), authored by HAN Liutong and committed via GitHub.
Files changed (17), with the number of changed lines in parentheses:

- modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp (3)
- modules/core/src/arithm.cpp (52)
- modules/core/src/arithm.simd.hpp (119)
- modules/core/src/convert.hpp (54)
- modules/core/src/convert.simd.hpp (16)
- modules/core/src/convert_scale.simd.hpp (30)
- modules/core/src/copy.cpp (18)
- modules/core/src/count_non_zero.simd.hpp (72)
- modules/core/src/lapack.cpp (88)
- modules/core/src/mathfuncs.cpp (138)
- modules/core/src/matmul.simd.hpp (128)
- modules/core/src/matrix_transform.cpp (10)
- modules/core/src/merge.simd.hpp (20)
- modules/core/src/norm.cpp (66)
- modules/core/src/split.simd.hpp (20)
- modules/core/src/stat.simd.hpp (26)
- modules/core/src/sum.simd.hpp (124)

@ -924,6 +924,9 @@ inline scalartype v_reduce_sum(const _Tpvec& a) \
return (scalartype)v_get0(res); \
}
OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, float, f64, VTraits<v_float64>::vlanes())
#endif
#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
inline scalartype v_reduce_##func(const _Tpvec& a) \

@ -1332,7 +1332,7 @@ struct InRange_SIMD
}
};
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template <>
struct InRange_SIMD<uchar>
@ -1341,7 +1341,7 @@ struct InRange_SIMD<uchar>
uchar * dst, int len) const
{
int x = 0;
const int width = v_uint8::nlanes;
const int width = VTraits<v_uint8>::vlanes();
for (; x <= len - width; x += width)
{
@ -1349,7 +1349,7 @@ struct InRange_SIMD<uchar>
v_uint8 low = vx_load(src2 + x);
v_uint8 high = vx_load(src3 + x);
v_store(dst + x, (values >= low) & (high >= values));
v_store(dst + x, v_and(v_ge(values, low), v_ge(high, values)));
}
vx_cleanup();
return x;
@ -1363,7 +1363,7 @@ struct InRange_SIMD<schar>
uchar * dst, int len) const
{
int x = 0;
const int width = v_int8::nlanes;
const int width = VTraits<v_int8>::vlanes();
for (; x <= len - width; x += width)
{
@ -1371,7 +1371,7 @@ struct InRange_SIMD<schar>
v_int8 low = vx_load(src2 + x);
v_int8 high = vx_load(src3 + x);
v_store((schar*)(dst + x), (values >= low) & (high >= values));
v_store((schar*)(dst + x), v_and(v_ge(values, low), v_ge(high, values)));
}
vx_cleanup();
return x;
@ -1385,7 +1385,7 @@ struct InRange_SIMD<ushort>
uchar * dst, int len) const
{
int x = 0;
const int width = v_uint16::nlanes * 2;
const int width = VTraits<v_uint16>::vlanes() * 2;
for (; x <= len - width; x += width)
{
@ -1393,11 +1393,11 @@ struct InRange_SIMD<ushort>
v_uint16 low1 = vx_load(src2 + x);
v_uint16 high1 = vx_load(src3 + x);
v_uint16 values2 = vx_load(src1 + x + v_uint16::nlanes);
v_uint16 low2 = vx_load(src2 + x + v_uint16::nlanes);
v_uint16 high2 = vx_load(src3 + x + v_uint16::nlanes);
v_uint16 values2 = vx_load(src1 + x + VTraits<v_uint16>::vlanes());
v_uint16 low2 = vx_load(src2 + x + VTraits<v_uint16>::vlanes());
v_uint16 high2 = vx_load(src3 + x + VTraits<v_uint16>::vlanes());
v_store(dst + x, v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
v_store(dst + x, v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2))));
}
vx_cleanup();
return x;
@ -1411,7 +1411,7 @@ struct InRange_SIMD<short>
uchar * dst, int len) const
{
int x = 0;
const int width = (int)v_int16::nlanes * 2;
const int width = (int)VTraits<v_int16>::vlanes() * 2;
for (; x <= len - width; x += width)
{
@ -1419,11 +1419,11 @@ struct InRange_SIMD<short>
v_int16 low1 = vx_load(src2 + x);
v_int16 high1 = vx_load(src3 + x);
v_int16 values2 = vx_load(src1 + x + v_int16::nlanes);
v_int16 low2 = vx_load(src2 + x + v_int16::nlanes);
v_int16 high2 = vx_load(src3 + x + v_int16::nlanes);
v_int16 values2 = vx_load(src1 + x + VTraits<v_int16>::vlanes());
v_int16 low2 = vx_load(src2 + x + VTraits<v_int16>::vlanes());
v_int16 high2 = vx_load(src3 + x + VTraits<v_int16>::vlanes());
v_store((schar*)(dst + x), v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
v_store((schar*)(dst + x), v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2))));
}
vx_cleanup();
return x;
@ -1437,7 +1437,7 @@ struct InRange_SIMD<int>
uchar * dst, int len) const
{
int x = 0;
const int width = (int)v_int32::nlanes * 2;
const int width = (int)VTraits<v_int32>::vlanes() * 2;
for (; x <= len - width; x += width)
{
@ -1445,11 +1445,11 @@ struct InRange_SIMD<int>
v_int32 low1 = vx_load(src2 + x);
v_int32 high1 = vx_load(src3 + x);
v_int32 values2 = vx_load(src1 + x + v_int32::nlanes);
v_int32 low2 = vx_load(src2 + x + v_int32::nlanes);
v_int32 high2 = vx_load(src3 + x + v_int32::nlanes);
v_int32 values2 = vx_load(src1 + x + VTraits<v_int32>::vlanes());
v_int32 low2 = vx_load(src2 + x + VTraits<v_int32>::vlanes());
v_int32 high2 = vx_load(src3 + x + VTraits<v_int32>::vlanes());
v_pack_store(dst + x, v_reinterpret_as_u16(v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2))));
v_pack_store(dst + x, v_reinterpret_as_u16(v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2)))));
}
vx_cleanup();
return x;
@ -1463,7 +1463,7 @@ struct InRange_SIMD<float>
uchar * dst, int len) const
{
int x = 0;
const int width = (int)v_float32::nlanes * 2;
const int width = (int)VTraits<v_float32>::vlanes() * 2;
for (; x <= len - width; x += width)
{
@ -1471,12 +1471,12 @@ struct InRange_SIMD<float>
v_float32 low1 = vx_load(src2 + x);
v_float32 high1 = vx_load(src3 + x);
v_float32 values2 = vx_load(src1 + x + v_float32::nlanes);
v_float32 low2 = vx_load(src2 + x + v_float32::nlanes);
v_float32 high2 = vx_load(src3 + x + v_float32::nlanes);
v_float32 values2 = vx_load(src1 + x + VTraits<v_float32>::vlanes());
v_float32 low2 = vx_load(src2 + x + VTraits<v_float32>::vlanes());
v_float32 high2 = vx_load(src3 + x + VTraits<v_float32>::vlanes());
v_pack_store(dst + x, v_pack(v_reinterpret_as_u32(values1 >= low1) & v_reinterpret_as_u32(high1 >= values1),
v_reinterpret_as_u32(values2 >= low2) & v_reinterpret_as_u32(high2 >= values2)));
v_pack_store(dst + x, v_pack(v_and(v_reinterpret_as_u32(v_ge(values1, low1)), v_reinterpret_as_u32(v_ge(high1, values1))),
v_and(v_reinterpret_as_u32(v_ge(values2, low2)), v_reinterpret_as_u32(v_ge(high2, values2)))));
}
vx_cleanup();
return x;

@ -219,7 +219,7 @@ template<typename T1, typename Tvec>
struct op_add
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a + b; }
{ return v_add(a, b); }
static inline T1 r(T1 a, T1 b)
{ return c_add(a, b); }
};
@ -229,7 +229,7 @@ template<typename T1, typename Tvec>
struct op_sub
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a - b; }
{ return v_sub(a, b); }
static inline T1 r(T1 a, T1 b)
{ return c_sub(a, b); }
};
@ -266,7 +266,7 @@ struct op_absdiff
template<>
struct op_absdiff<schar, v_int8>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_int8 r(const v_int8& a, const v_int8& b)
{ return v_absdiffs(a, b); }
#endif
@ -276,7 +276,7 @@ struct op_absdiff<schar, v_int8>
template<>
struct op_absdiff<short, v_int16>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_int16 r(const v_int16& a, const v_int16& b)
{ return v_absdiffs(a, b); }
#endif
@ -286,7 +286,7 @@ struct op_absdiff<short, v_int16>
template<>
struct op_absdiff<int, v_int32>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_int32 r(const v_int32& a, const v_int32& b)
{ return v_reinterpret_as_s32(v_absdiff(a, b)); }
#endif
@ -299,7 +299,7 @@ template<typename T1, typename Tvec>
struct op_or
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a | b; }
{ return v_or(a, b); }
static inline T1 r(T1 a, T1 b)
{ return a | b; }
};
@ -307,7 +307,7 @@ template<typename T1, typename Tvec>
struct op_xor
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a ^ b; }
{ return v_xor(a, b); }
static inline T1 r(T1 a, T1 b)
{ return a ^ b; }
};
@ -315,7 +315,7 @@ template<typename T1, typename Tvec>
struct op_and
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a & b; }
{ return v_and(a, b); }
static inline T1 r(T1 a, T1 b)
{ return a & b; }
};
@ -324,14 +324,14 @@ struct op_not
{
// ignored b from loader level
static inline Tvec r(const Tvec& a)
{ return ~a; }
{ return v_not(a); }
static inline T1 r(T1 a, T1)
{ return ~a; }
};
//////////////////////////// Loaders /////////////////////////////////
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
template< template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
struct bin_loader
@ -396,13 +396,13 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
{
typedef OP<T1, Tvec> op;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
typedef bin_loader<OP, T1, Tvec> ldr;
enum {wide_step = Tvec::nlanes};
const int wide_step = VTraits<Tvec>::vlanes();
#if !CV_NEON && CV_SIMD_WIDTH == 16
enum {wide_step_l = wide_step * 2};
const int wide_step_l = wide_step * 2;
#else
enum {wide_step_l = wide_step};
const int wide_step_l = wide_step;
#endif
#endif // CV_SIMD
@ -414,7 +414,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
{
int x = 0;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
#if !CV_NEON && !CV_MSA
if (is_aligned(src1, src2, dst))
{
@ -587,7 +587,7 @@ template<typename T1, typename Tvec>
struct op_cmplt
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a < b; }
{ return v_lt(a, b); }
static inline uchar r(T1 a, T1 b)
{ return (uchar)-(int)(a < b); }
};
@ -596,7 +596,7 @@ template<typename T1, typename Tvec>
struct op_cmple
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a <= b; }
{ return v_le(a, b); }
static inline uchar r(T1 a, T1 b)
{ return (uchar)-(int)(a <= b); }
};
@ -605,7 +605,7 @@ template<typename T1, typename Tvec>
struct op_cmpeq
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a == b; }
{ return v_eq(a, b); }
static inline uchar r(T1 a, T1 b)
{ return (uchar)-(int)(a == b); }
};
@ -614,14 +614,14 @@ template<typename T1, typename Tvec>
struct op_cmpne
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a != b; }
{ return v_ne(a, b); }
static inline uchar r(T1 a, T1 b)
{ return (uchar)-(int)(a != b); }
};
//////////////////////////// Loaders /////////////////////////////////
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
// todo: add support for RW alignment & stream
template<int nload, template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
struct cmp_loader_n
@ -646,10 +646,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
struct cmp_loader_n<sizeof(ushort), OP, T1, Tvec>
{
typedef OP<T1, Tvec> op;
enum {step = Tvec::nlanes};
static inline void l(const T1* src1, const T1* src2, uchar* dst)
{
const int step = VTraits<Tvec>::vlanes();
Tvec c0 = op::r(vx_load(src1), vx_load(src2));
Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step));
v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1)));
@ -660,10 +660,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
struct cmp_loader_n<sizeof(unsigned), OP, T1, Tvec>
{
typedef OP<T1, Tvec> op;
enum {step = Tvec::nlanes};
static inline void l(const T1* src1, const T1* src2, uchar* dst)
{
const int step = VTraits<Tvec>::vlanes();
v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2)));
v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step)));
v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
@ -676,10 +676,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
struct cmp_loader_n<sizeof(double), OP, T1, Tvec>
{
typedef OP<T1, Tvec> op;
enum {step = Tvec::nlanes};
static inline void l(const T1* src1, const T1* src2, uchar* dst)
{
const int step = VTraits<Tvec>::vlanes();
v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2)));
v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step)));
v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
@ -701,9 +701,9 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
typedef OP<T1, Tvec> op;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
typedef cmp_loader_n<sizeof(T1), OP, T1, Tvec> ldr;
enum {wide_step = Tvec::nlanes * sizeof(T1)};
const int wide_step = VTraits<Tvec>::vlanes() * sizeof(T1);
#endif // CV_SIMD
step1 /= sizeof(T1);
@ -713,7 +713,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
{
int x = 0;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, src2 + x, dst + x);
@ -880,7 +880,7 @@ DEFINE_SIMD_ALL(cmp)
//////////////////////////// Loaders ///////////////////////////////
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
// todo: add support for RW alignment & stream
template<int nload, template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
struct scalar_loader_n
@ -1013,10 +1013,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2
struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
{
typedef OP<int, T2, v_int32> op;
enum {step = v_int32::nlanes};
static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst)
{
const int step = VTraits<v_int32>::vlanes();
v_int32 v_src1 = vx_load(src1);
v_int32 v_src2 = vx_load(src2);
v_int32 v_src1s = vx_load(src1 + step);
@ -1043,6 +1043,7 @@ struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
static inline void l(const int* src1, const T2* scalar, int* dst)
{
const int step = VTraits<v_int32>::vlanes();
v_int32 v_src1 = vx_load(src1);
v_int32 v_src1s = vx_load(src1 + step);
@ -1068,10 +1069,9 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2
struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
{
typedef OP<float, T2, v_float32> op;
enum {step = v_float32::nlanes};
static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst)
{
const int step = VTraits<v_float32>::vlanes();
v_float32 v_src1 = vx_load(src1);
v_float32 v_src2 = vx_load(src2);
v_float32 v_src1s = vx_load(src1 + step);
@ -1086,6 +1086,7 @@ struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
static inline void l(const float* src1, const T2* scalar, float* dst)
{
const int step = VTraits<v_float32>::vlanes();
v_float32 v_src1 = vx_load(src1);
v_float32 v_src1s = vx_load(src1 + step);
@ -1262,10 +1263,10 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
T1* dst, size_t step, int width, int height, const T2* scalar)
{
typedef OP<T1, T2, Tvec> op;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
#endif // CV_SIMD
step1 /= sizeof(T1);
@ -1276,7 +1277,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
{
int x = 0;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, src2 + x, scalar, dst + x);
@ -1308,10 +1309,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1
static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
{
typedef OP<T1, T2, Tvec> op;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
#endif // CV_SIMD
step1 /= sizeof(T1);
@ -1321,7 +1322,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
{
int x = 0;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, scalar, dst + x);
@ -1428,7 +1429,7 @@ template<typename T1, typename Tvec>
struct op_mul
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a * b; }
{ return v_mul(a, b); }
static inline T1 r(T1 a, T1 b)
{ return saturate_cast<T1>(a * b); }
};
@ -1436,11 +1437,11 @@ struct op_mul
template<typename T1, typename T2, typename Tvec>
struct op_mul_scale
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar * a * b;
return v_mul(v_scalar , a , b);
}
#endif
static inline T1 r(T1 a, T1 b, const T2* scalar)
@ -1456,7 +1457,7 @@ struct op_mul_scale<double, double, v_float64>
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
return v_scalar * a * b;
return v_mul(v_mul(v_scalar, a), b);
}
#endif
static inline double r(double a, double b, const double* scalar)
@ -1569,7 +1570,7 @@ template<typename T1, typename Tvec>
struct op_div_f
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a / b; }
{ return v_div(a, b); }
static inline T1 r(T1 a, T1 b)
{ return a / b; }
};
@ -1577,16 +1578,16 @@ struct op_div_f
template<typename T1, typename T2, typename Tvec>
struct op_div_scale
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return a * v_scalar / b;
return v_div(v_mul(a, v_scalar), b);
}
static inline Tvec pre(const Tvec& denom, const Tvec& res)
{
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
return v_select(denom == v_zero, v_zero, res);
const Tvec v_zero = vx_setall<typename VTraits<Tvec>::lane_type>(0);
return v_select(v_eq(denom, v_zero), v_zero, res);
}
#endif
static inline T1 r(T1 a, T1 denom, const T2* scalar)
@ -1599,11 +1600,11 @@ struct op_div_scale
template<>
struct op_div_scale<float, float, v_float32>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return a * v_scalar / b;
return v_div(v_mul(a, v_scalar), b);
}
#endif
static inline float r(float a, float denom, const float* scalar)
@ -1617,7 +1618,7 @@ struct op_div_scale<double, double, v_float64>
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
return a * v_scalar / b;
return v_div(v_mul(a, v_scalar), b);
}
#endif
static inline double r(double a, double denom, const double* scalar)
@ -1685,7 +1686,7 @@ DEFINE_SIMD_ALL(div, div_loop)
template<typename T1, typename T2, typename Tvec>
struct op_add_scale
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_alpha = vx_setall_f32(*scalar);
@ -1718,7 +1719,7 @@ struct op_add_scale<double, double, v_float64>
template<typename T1, typename T2, typename Tvec>
struct op_add_weighted
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
{
const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@ -1835,16 +1836,16 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
template<typename T1, typename T2, typename Tvec>
struct op_recip
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar / a;
return v_div(v_scalar, a);
}
static inline Tvec pre(const Tvec& denom, const Tvec& res)
{
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
return v_select(denom == v_zero, v_zero, res);
const Tvec v_zero = vx_setall<typename VTraits<Tvec>::lane_type>(0);
return v_select(v_eq(denom, v_zero), v_zero, res);
}
#endif
static inline T1 r(T1 denom, const T2* scalar)
@ -1857,11 +1858,11 @@ struct op_recip
template<>
struct op_recip<float, float, v_float32>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar / a;
return v_div(v_scalar, a);
}
#endif
static inline float r(float denom, const float* scalar)
@ -1875,7 +1876,7 @@ struct op_recip<double, double, v_float64>
static inline v_float64 r(const v_float64& a, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
return v_scalar / a;
return v_div(v_scalar, a);
}
#endif
static inline double r(double denom, const double* scalar)

@ -11,7 +11,7 @@
namespace cv
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline void vx_load_as(const uchar* ptr, v_float32& a)
{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); }
@ -62,7 +62,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_uint16& a, v_uint16& b)
}
static inline void vx_load_pair_as(const ushort* ptr, v_uint16& a, v_uint16& b)
{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_uint16>::vlanes()); }
static inline void vx_load_pair_as(const uchar* ptr, v_int16& a, v_int16& b)
{
@ -76,7 +76,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_int16& a, v_int16& b)
{ v_expand(vx_load(ptr), a, b); }
static inline void vx_load_pair_as(const short* ptr, v_int16& a, v_int16& b)
{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_uint16>::vlanes()); }
static inline void vx_load_pair_as(const uchar* ptr, v_int32& a, v_int32& b)
{
@ -105,7 +105,7 @@ static inline void vx_load_pair_as(const short* ptr, v_int32& a, v_int32& b)
static inline void vx_load_pair_as(const int* ptr, v_int32& a, v_int32& b)
{
a = vx_load(ptr);
b = vx_load(ptr + v_int32::nlanes);
b = vx_load(ptr + VTraits<v_int32>::vlanes());
}
static inline void vx_load_pair_as(const uchar* ptr, v_float32& a, v_float32& b)
@ -142,18 +142,18 @@ static inline void vx_load_pair_as(const short* ptr, v_float32& a, v_float32& b)
static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b)
{
v_int32 ia = vx_load(ptr), ib = vx_load(ptr + v_int32::nlanes);
v_int32 ia = vx_load(ptr), ib = vx_load(ptr + VTraits<v_int32>::vlanes());
a = v_cvt_f32(ia);
b = v_cvt_f32(ib);
}
static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b)
{ a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); }
{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_float32>::vlanes()); }
static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
{
a = vx_load_expand(ptr);
b = vx_load_expand(ptr + v_float32::nlanes);
b = vx_load_expand(ptr + VTraits<v_float32>::vlanes());
}
static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b)
@ -169,7 +169,7 @@ static inline void v_store_pair_as(schar* ptr, const v_uint16& a, const v_uint16
}
static inline void v_store_pair_as(ushort* ptr, const v_uint16& a, const v_uint16& b)
{ v_store(ptr, a); v_store(ptr + v_uint16::nlanes, b); }
{ v_store(ptr, a); v_store(ptr + VTraits<v_uint16>::vlanes(), b); }
static inline void v_store_pair_as(uchar* ptr, const v_int16& a, const v_int16& b)
{ v_store(ptr, v_pack_u(a, b)); }
@ -178,7 +178,7 @@ static inline void v_store_pair_as(schar* ptr, const v_int16& a, const v_int16&
{ v_store(ptr, v_pack(a, b)); }
static inline void v_store_pair_as(short* ptr, const v_int16& a, const v_int16& b)
{ v_store(ptr, a); v_store(ptr + v_int16::nlanes, b); }
{ v_store(ptr, a); v_store(ptr + VTraits<v_int16>::vlanes(), b); }
static inline void v_store_pair_as(uchar* ptr, const v_int32& a, const v_int32& b)
{ v_pack_u_store(ptr, v_pack(a, b)); }
@ -195,7 +195,7 @@ static inline void v_store_pair_as(short* ptr, const v_int32& a, const v_int32&
static inline void v_store_pair_as(int* ptr, const v_int32& a, const v_int32& b)
{
v_store(ptr, a);
v_store(ptr + v_int32::nlanes, b);
v_store(ptr + VTraits<v_int32>::vlanes(), b);
}
static inline void v_store_pair_as(uchar* ptr, const v_float32& a, const v_float32& b)
@ -214,24 +214,24 @@ static inline void v_store_pair_as(int* ptr, const v_float32& a, const v_float32
{
v_int32 ia = v_round(a), ib = v_round(b);
v_store(ptr, ia);
v_store(ptr + v_int32::nlanes, ib);
v_store(ptr + VTraits<v_int32>::vlanes(), ib);
}
static inline void v_store_pair_as(float* ptr, const v_float32& a, const v_float32& b)
{ v_store(ptr, a); v_store(ptr + v_float32::nlanes, b); }
{ v_store(ptr, a); v_store(ptr + VTraits<v_float32>::vlanes(), b); }
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline void vx_load_as(const double* ptr, v_float32& a)
{
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
a = v_cvt_f32(v0, v1);
}
static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
{
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
v_float64 v2 = vx_load(ptr + VTraits<v_float64>::vlanes()*2), v3 = vx_load(ptr + VTraits<v_float64>::vlanes()*3);
v_int32 iv0 = v_round(v0), iv1 = v_round(v1);
v_int32 iv2 = v_round(v2), iv3 = v_round(v3);
a = v_combine_low(iv0, iv1);
@ -240,8 +240,8 @@ static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
static inline void vx_load_pair_as(const double* ptr, v_float32& a, v_float32& b)
{
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
v_float64 v2 = vx_load(ptr + VTraits<v_float64>::vlanes()*2), v3 = vx_load(ptr + VTraits<v_float64>::vlanes()*3);
a = v_cvt_f32(v0, v1);
b = v_cvt_f32(v2, v3);
}
@ -291,7 +291,7 @@ static inline void vx_load_pair_as(const float* ptr, v_float64& a, v_float64& b)
static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b)
{
a = vx_load(ptr);
b = vx_load(ptr + v_float64::nlanes);
b = vx_load(ptr + VTraits<v_float64>::vlanes());
}
static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
@ -305,7 +305,7 @@ static inline void v_store_as(double* ptr, const v_float32& a)
{
v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a);
v_store(ptr, fa0);
v_store(ptr + v_float64::nlanes, fa1);
v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
}
static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32& b)
@ -314,9 +314,9 @@ static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32&
v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
v_store(ptr, fa0);
v_store(ptr + v_float64::nlanes, fa1);
v_store(ptr + v_float64::nlanes*2, fb0);
v_store(ptr + v_float64::nlanes*3, fb1);
v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
v_store(ptr + VTraits<v_float64>::vlanes()*2, fb0);
v_store(ptr + VTraits<v_float64>::vlanes()*3, fb1);
}
static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_float32& b)
@ -325,15 +325,15 @@ static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_floa
v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
v_store(ptr, fa0);
v_store(ptr + v_float64::nlanes, fa1);
v_store(ptr + v_float64::nlanes*2, fb0);
v_store(ptr + v_float64::nlanes*3, fb1);
v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
v_store(ptr + VTraits<v_float64>::vlanes()*2, fb0);
v_store(ptr + VTraits<v_float64>::vlanes()*3, fb1);
}
static inline void v_store_pair_as(double* ptr, const v_float64& a, const v_float64& b)
{
v_store(ptr, a);
v_store(ptr + v_float64::nlanes, b);
v_store(ptr + VTraits<v_float64>::vlanes(), b);
}
static inline void v_store_pair_as(int* ptr, const v_float64& a, const v_float64& b)

@ -39,8 +39,8 @@ void cvt16f32f( const float16_t* src, float* dst, int len )
{
CV_INSTRUMENT_REGION();
int j = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<v_float32>::vlanes();
for( ; j < len; j += VECSZ )
{
if( j > len - VECSZ )
@ -60,8 +60,8 @@ void cvt32f16f( const float* src, float16_t* dst, int len )
{
CV_INSTRUMENT_REGION();
int j = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<v_float32>::vlanes();
for( ; j < len; j += VECSZ )
{
if( j > len - VECSZ )
@ -108,8 +108,8 @@ cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
const int VECSZ = _Twvec::nlanes*2;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<_Twvec>::vlanes()*2;
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -139,8 +139,8 @@ cvt1_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
const int VECSZ = _Twvec::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<_Twvec>::vlanes();
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )

@ -22,9 +22,9 @@ template<typename _Ts, typename _Td> inline void
cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
Size size, float a, float b )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
const int VECSZ = v_float32::nlanes*2;
const int VECSZ = VTraits<v_float32>::vlanes()*2;
#endif
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
@ -32,7 +32,7 @@ cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -58,9 +58,9 @@ template<typename _Ts, typename _Td> inline void
cvtabs1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
Size size, float a, float b )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
const int VECSZ = v_float32::nlanes*2;
const int VECSZ = VTraits<v_float32>::vlanes()*2;
#endif
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
@ -68,7 +68,7 @@ cvtabs1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -92,9 +92,9 @@ template<typename _Ts, typename _Td> inline void
cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
Size size, float a, float b )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
const int VECSZ = v_float32::nlanes*2;
const int VECSZ = VTraits<v_float32>::vlanes()*2;
#endif
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
@ -102,7 +102,7 @@ cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -128,9 +128,9 @@ template<typename _Ts, typename _Td> inline void
cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
Size size, float a, float b )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
const int VECSZ = v_float32::nlanes;
const int VECSZ = VTraits<v_float32>::vlanes();
#endif
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
@ -138,7 +138,7 @@ cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -163,9 +163,9 @@ template<typename _Ts, typename _Td> inline void
cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
Size size, double a, double b )
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
v_float64 va = vx_setall_f64(a), vb = vx_setall_f64(b);
const int VECSZ = v_float64::nlanes*2;
const int VECSZ = VTraits<v_float64>::vlanes()*2;
#endif
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
@ -173,7 +173,7 @@ cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )

@ -156,15 +156,15 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
const uchar* src = (const uchar*)_src;
uchar* dst = (uchar*)_dst;
int x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 v_zero = vx_setzero_u8();
for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
{
v_uint8 v_src = vx_load(src + x),
v_dst = vx_load(dst + x),
v_nmask = vx_load(mask + x) == v_zero;
v_nmask = v_eq(vx_load(mask + x), v_zero);
v_dst = v_select(v_nmask, v_dst, v_src);
v_store(dst + x, v_dst);
@ -188,23 +188,23 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
const ushort* src = (const ushort*)_src;
ushort* dst = (ushort*)_dst;
int x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 v_zero = vx_setzero_u8();
for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
{
v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + v_uint16::nlanes),
v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + v_uint16::nlanes);
v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + VTraits<v_uint16>::vlanes()),
v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + VTraits<v_uint16>::vlanes());
v_uint8 v_nmask1, v_nmask2;
v_uint8 v_nmask = vx_load(mask + x) == v_zero;
v_uint8 v_nmask = v_eq(vx_load(mask + x), v_zero);
v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
v_store(dst + x, v_dst1);
v_store(dst + x + v_uint16::nlanes, v_dst2);
v_store(dst + x + VTraits<v_uint16>::vlanes(), v_dst2);
}
}
vx_cleanup();

@ -32,8 +32,8 @@ static int countNonZero_(const T* src, int len )
static int countNonZero8u( const uchar* src, int len )
{
int i=0, nz = 0;
#if CV_SIMD
int len0 = len & -v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_uint8>::vlanes();
v_uint8 v_zero = vx_setzero_u8();
v_uint8 v_one = vx_setall_u8(1);
@ -42,20 +42,20 @@ static int countNonZero8u( const uchar* src, int len )
{
v_uint16 v_sum16 = vx_setzero_u16();
int j = i;
while (j < std::min(len0, i + 65280 * v_uint16::nlanes))
while (j < std::min(len0, i + 65280 * VTraits<v_uint16>::vlanes()))
{
v_uint8 v_sum8 = vx_setzero_u8();
int k = j;
for (; k < std::min(len0, j + 255 * v_uint8::nlanes); k += v_uint8::nlanes)
v_sum8 += v_one & (vx_load(src + k) == v_zero);
for (; k < std::min(len0, j + 255 * VTraits<v_uint8>::vlanes()); k += VTraits<v_uint8>::vlanes())
v_sum8 = v_add(v_sum8, v_and(v_one, v_eq(vx_load(src + k), v_zero)));
v_uint16 part1, part2;
v_expand(v_sum8, part1, part2);
v_sum16 += part1 + part2;
v_sum16 = v_add(v_sum16, v_add(part1, part2));
j = k;
}
v_uint32 part1, part2;
v_expand(v_sum16, part1, part2);
v_sum32 += part1 + part2;
v_sum32 = v_add(v_sum32, v_add(part1, part2));
i = j;
}
nz = i - v_reduce_sum(v_sum32);
@ -69,8 +69,8 @@ static int countNonZero8u( const uchar* src, int len )
static int countNonZero16u( const ushort* src, int len )
{
int i = 0, nz = 0;
#if CV_SIMD
int len0 = len & -v_int8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_int8>::vlanes();
v_uint16 v_zero = vx_setzero_u16();
v_int8 v_one = vx_setall_s8(1);
@ -79,20 +79,20 @@ static int countNonZero16u( const ushort* src, int len )
{
v_int16 v_sum16 = vx_setzero_s16();
int j = i;
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
{
v_int8 v_sum8 = vx_setzero_s8();
int k = j;
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
v_sum8 += v_one & v_pack(v_reinterpret_as_s16(vx_load(src + k) == v_zero), v_reinterpret_as_s16(vx_load(src + k + v_uint16::nlanes) == v_zero));
for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_reinterpret_as_s16(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s16(v_eq(vx_load(src + k + VTraits<v_uint16>::vlanes()), v_zero)))));
v_int16 part1, part2;
v_expand(v_sum8, part1, part2);
v_sum16 += part1 + part2;
v_sum16 = v_add(v_sum16, v_add(part1, part2));
j = k;
}
v_int32 part1, part2;
v_expand(v_sum16, part1, part2);
v_sum32 += part1 + part2;
v_sum32 = v_add(v_sum32, v_add(part1, part2));
i = j;
}
nz = i - v_reduce_sum(v_sum32);
@ -104,8 +104,8 @@ static int countNonZero16u( const ushort* src, int len )
static int countNonZero32s( const int* src, int len )
{
int i = 0, nz = 0;
#if CV_SIMD
int len0 = len & -v_int8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_int8>::vlanes();
v_int32 v_zero = vx_setzero_s32();
v_int8 v_one = vx_setall_s8(1);
@ -114,23 +114,20 @@ static int countNonZero32s( const int* src, int len )
{
v_int16 v_sum16 = vx_setzero_s16();
int j = i;
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
{
v_int8 v_sum8 = vx_setzero_s8();
int k = j;
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
v_sum8 += v_one & v_pack(
v_pack(vx_load(src + k ) == v_zero, vx_load(src + k + v_int32::nlanes) == v_zero),
v_pack(vx_load(src + k + 2*v_int32::nlanes) == v_zero, vx_load(src + k + 3*v_int32::nlanes) == v_zero)
);
for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_eq(vx_load(src + k), v_zero), v_eq(vx_load(src + k + VTraits<v_int32>::vlanes()), v_zero)), v_pack(v_eq(vx_load(src + k + 2 * VTraits<v_int32>::vlanes()), v_zero), v_eq(vx_load(src + k + 3 * VTraits<v_int32>::vlanes()), v_zero)))));
v_int16 part1, part2;
v_expand(v_sum8, part1, part2);
v_sum16 += part1 + part2;
v_sum16 = v_add(v_sum16, v_add(part1, part2));
j = k;
}
v_int32 part1, part2;
v_expand(v_sum16, part1, part2);
v_sum32 += part1 + part2;
v_sum32 = v_add(v_sum32, v_add(part1, part2));
i = j;
}
nz = i - v_reduce_sum(v_sum32);
@ -142,8 +139,8 @@ static int countNonZero32s( const int* src, int len )
static int countNonZero32f( const float* src, int len )
{
int i = 0, nz = 0;
#if CV_SIMD
int len0 = len & -v_int8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_int8>::vlanes();
v_float32 v_zero = vx_setzero_f32();
v_int8 v_one = vx_setall_s8(1);
@ -152,23 +149,20 @@ static int countNonZero32f( const float* src, int len )
{
v_int16 v_sum16 = vx_setzero_s16();
int j = i;
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
{
v_int8 v_sum8 = vx_setzero_s8();
int k = j;
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
v_sum8 += v_one & v_pack(
v_pack(v_reinterpret_as_s32(vx_load(src + k ) == v_zero), v_reinterpret_as_s32(vx_load(src + k + v_float32::nlanes) == v_zero)),
v_pack(v_reinterpret_as_s32(vx_load(src + k + 2*v_float32::nlanes) == v_zero), v_reinterpret_as_s32(vx_load(src + k + 3*v_float32::nlanes) == v_zero))
);
for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + VTraits<v_float32>::vlanes()), v_zero))), v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k + 2 * VTraits<v_float32>::vlanes()), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + 3 * VTraits<v_float32>::vlanes()), v_zero))))));
v_int16 part1, part2;
v_expand(v_sum8, part1, part2);
v_sum16 += part1 + part2;
v_sum16 = v_add(v_sum16, v_add(part1, part2));
j = k;
}
v_int32 part1, part2;
v_expand(v_sum16, part1, part2);
v_sum32 += part1 + part2;
v_sum32 = v_add(v_sum32, v_add(part1, part2));
i = j;
}
nz = i - v_reduce_sum(v_sum32);
@ -180,21 +174,21 @@ static int countNonZero32f( const float* src, int len )
static int countNonZero64f( const double* src, int len )
{
int nz = 0, i = 0;
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
v_int64 sum1 = vx_setzero_s64();
v_int64 sum2 = vx_setzero_s64();
v_float64 zero = vx_setzero_f64();
int step = v_float64::nlanes * 2;
int step = VTraits<v_float64>::vlanes() * 2;
int len0 = len & -step;
for(i = 0; i < len0; i += step )
{
sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero);
sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero);
sum1 = v_add(sum1, v_reinterpret_as_s64(v_eq(vx_load(&src[i]), zero)));
sum2 = v_add(sum2, v_reinterpret_as_s64(v_eq(vx_load(&src[i + step / 2]), zero)));
}
// N.B the value is incremented by -1 (0xF...F) for each value
nz = i + (int)v_reduce_sum(sum1 + sum2);
nz = i + (int)v_reduce_sum(v_add(sum1, sum2));
v_cleanup();
#endif
return nz + countNonZero_(src + i, len - i);

@ -274,22 +274,21 @@ template<typename T> struct VBLAS
{
int dot(const T*, const T*, int, T*) const { return 0; }
int givens(T*, T*, int, T, T) const { return 0; }
int givensx(T*, T*, int, T, T, T*, T*) const { return 0; }
};
#if CV_SIMD
#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE_64F
template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
{
if( n < 2*v_float32::nlanes )
if( n < 2*VTraits<v_float32>::vlanes() )
return 0;
int k = 0;
v_float32 s0 = vx_setzero_f32();
for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
for( ; k <= n - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
{
v_float32 a0 = vx_load(a + k);
v_float32 b0 = vx_load(b + k);
s0 += a0 * b0;
s0 = v_add(s0, v_mul(a0, b0));
}
*result = v_reduce_sum(s0);
vx_cleanup();
@ -299,16 +298,16 @@ template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, f
template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, float s) const
{
if( n < v_float32::nlanes)
if( n < VTraits<v_float32>::vlanes())
return 0;
int k = 0;
v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
for( ; k <= n - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
{
v_float32 a0 = vx_load(a + k);
v_float32 b0 = vx_load(b + k);
v_float32 t0 = (a0 * c4) + (b0 * s4);
v_float32 t1 = (b0 * c4) - (a0 * s4);
v_float32 t0 = v_add(v_mul(a0, c4), v_mul(b0, s4));
v_float32 t1 = v_sub(v_mul(b0, c4), v_mul(a0, s4));
v_store(a + k, t0);
v_store(b + k, t1);
}
@ -317,44 +316,19 @@ template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, f
}
template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c, float s,
float* anorm, float* bnorm) const
{
if( n < v_float32::nlanes)
return 0;
int k = 0;
v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
v_float32 sa = vx_setzero_f32(), sb = vx_setzero_f32();
for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
{
v_float32 a0 = vx_load(a + k);
v_float32 b0 = vx_load(b + k);
v_float32 t0 = (a0 * c4) + (b0 * s4);
v_float32 t1 = (b0 * c4) - (a0 * s4);
v_store(a + k, t0);
v_store(b + k, t1);
sa += t0 + t0;
sb += t1 + t1;
}
*anorm = v_reduce_sum(sa);
*bnorm = v_reduce_sum(sb);
vx_cleanup();
return k;
}
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n, double* result) const
{
if( n < 2*v_float64::nlanes )
if( n < 2*VTraits<v_float64>::vlanes() )
return 0;
int k = 0;
v_float64 s0 = vx_setzero_f64();
for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
for( ; k <= n - VTraits<v_float64>::vlanes(); k += VTraits<v_float64>::vlanes() )
{
v_float64 a0 = vx_load(a + k);
v_float64 b0 = vx_load(b + k);
s0 += a0 * b0;
s0 = v_add(s0, v_mul(a0, b0));
}
double sbuf[2];
v_store(sbuf, s0);
@ -368,12 +342,12 @@ template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double
{
int k = 0;
v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
for( ; k <= n - VTraits<v_float64>::vlanes(); k += VTraits<v_float64>::vlanes() )
{
v_float64 a0 = vx_load(a + k);
v_float64 b0 = vx_load(b + k);
v_float64 t0 = (a0 * c2) + (b0 * s2);
v_float64 t1 = (b0 * c2) - (a0 * s2);
v_float64 t0 = v_add(v_mul(a0, c2), v_mul(b0, s2));
v_float64 t1 = v_sub(v_mul(b0, c2), v_mul(a0, s2));
v_store(a + k, t0);
v_store(b + k, t1);
}
@ -382,30 +356,6 @@ template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double
}
template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double c, double s,
double* anorm, double* bnorm) const
{
int k = 0;
v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
v_float64 sa = vx_setzero_f64(), sb = vx_setzero_f64();
for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
{
v_float64 a0 = vx_load(a + k);
v_float64 b0 = vx_load(b + k);
v_float64 t0 = (a0 * c2) + (b0 * s2);
v_float64 t1 = (b0 * c2) - (a0 * s2);
v_store(a + k, t0);
v_store(b + k, t1);
sa += t0 * t0;
sb += t1 * t1;
}
double abuf[2], bbuf[2];
v_store(abuf, sa);
v_store(bbuf, sb);
*anorm = abuf[0] + abuf[1];
*bnorm = bbuf[0] + bbuf[1];
return k;
}
#endif //CV_SIMD_64F
#endif //CV_SIMD
@ -916,7 +866,7 @@ double invert( InputArray _src, OutputArray _dst, int method )
#if CV_SIMD128
const float d_32f = (float)d;
const v_float32x4 d_vec(d_32f, -d_32f, -d_32f, d_32f);
v_float32x4 s0 = v_load_halves((const float*)srcdata, (const float*)(srcdata + srcstep)) * d_vec;//0123//3120
v_float32x4 s0 = v_mul(v_load_halves((const float *)srcdata, (const float *)(srcdata + srcstep)), d_vec);//0123//3120
s0 = v_extract<3>(s0, v_combine_low(v_rotate_right<1>(s0), s0));
v_store_low((float*)dstdata, s0);
v_store_high((float*)(dstdata + dststep), s0);
@ -942,10 +892,10 @@ double invert( InputArray _src, OutputArray _dst, int method )
d = 1./d;
#if CV_SIMD128_64F
v_float64x2 det = v_setall_f64(d);
v_float64x2 s0 = v_load((const double*)srcdata) * det;
v_float64x2 s1 = v_load((const double*)(srcdata+srcstep)) * det;
v_float64x2 s0 = v_mul(v_load((const double *)srcdata), det);
v_float64x2 s1 = v_mul(v_load((const double *)(srcdata + srcstep)), det);
v_float64x2 sm = v_extract<1>(s1, s0);//30
v_float64x2 ss = v_setall<double>(0) - v_extract<1>(s0, s1);//12
v_float64x2 ss = v_sub(v_setall<double>(0), v_extract<1>(s0, s1));//12
v_store((double*)dstdata, v_combine_low(sm, ss));//31
v_store((double*)(dstdata + dststep), v_combine_high(ss, sm));//20
#else

@ -614,13 +614,13 @@ void polarToCart( InputArray src1, InputArray src2,
{
k = 0;
#if CV_SIMD
int cWidth = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int cWidth = VTraits<v_float32>::vlanes();
for( ; k <= len - cWidth; k += cWidth )
{
v_float32 v_m = vx_load(mag + k);
v_store(x + k, vx_load(x + k) * v_m);
v_store(y + k, vx_load(y + k) * v_m);
v_store(x + k, v_mul(vx_load(x + k), v_m));
v_store(y + k, v_mul(vx_load(y + k), v_m));
}
vx_cleanup();
#endif
@ -741,7 +741,7 @@ struct iPow_SIMD
}
};
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template <>
struct iPow_SIMD<uchar, int>
@ -751,7 +751,7 @@ struct iPow_SIMD<uchar, int>
int i = 0;
v_uint32 v_1 = vx_setall_u32(1u);
for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
for ( ; i <= len - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes())
{
v_uint32 v_a1 = v_1, v_a2 = v_1;
v_uint16 v = vx_load_expand(src + i);
@ -763,16 +763,16 @@ struct iPow_SIMD<uchar, int>
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v = v_pack(v_a1, v_a2);
v_pack_store(dst + i, v);
@ -791,7 +791,7 @@ struct iPow_SIMD<schar, int>
int i = 0;
v_int32 v_1 = vx_setall_s32(1);
for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
for ( ; i <= len - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
{
v_int32 v_a1 = v_1, v_a2 = v_1;
v_int16 v = vx_load_expand(src + i);
@ -803,16 +803,16 @@ struct iPow_SIMD<schar, int>
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v = v_pack(v_a1, v_a2);
v_pack_store(dst + i, v);
@ -831,7 +831,7 @@ struct iPow_SIMD<ushort, int>
int i = 0;
v_uint32 v_1 = vx_setall_u32(1u);
for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
for ( ; i <= len - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes())
{
v_uint32 v_a1 = v_1, v_a2 = v_1;
v_uint16 v = vx_load(src + i);
@ -843,16 +843,16 @@ struct iPow_SIMD<ushort, int>
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v = v_pack(v_a1, v_a2);
v_store(dst + i, v);
@ -871,7 +871,7 @@ struct iPow_SIMD<short, int>
int i = 0;
v_int32 v_1 = vx_setall_s32(1);
for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
for ( ; i <= len - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
{
v_int32 v_a1 = v_1, v_a2 = v_1;
v_int16 v = vx_load(src + i);
@ -883,16 +883,16 @@ struct iPow_SIMD<short, int>
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v = v_pack(v_a1, v_a2);
v_store(dst + i, v);
@ -911,29 +911,29 @@ struct iPow_SIMD<int, int>
int i = 0;
v_int32 v_1 = vx_setall_s32(1);
for ( ; i <= len - v_int32::nlanes*2; i += v_int32::nlanes*2)
for ( ; i <= len - VTraits<v_int32>::vlanes()*2; i += VTraits<v_int32>::vlanes()*2)
{
v_int32 v_a1 = v_1, v_a2 = v_1;
v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_int32::nlanes);
v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_int32>::vlanes());
int p = power;
while( p > 1 )
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v_store(dst + i, v_a1);
v_store(dst + i + v_int32::nlanes, v_a2);
v_store(dst + i + VTraits<v_int32>::vlanes(), v_a2);
}
vx_cleanup();
@ -949,34 +949,34 @@ struct iPow_SIMD<float, float>
int i = 0;
v_float32 v_1 = vx_setall_f32(1.f);
for ( ; i <= len - v_float32::nlanes*2; i += v_float32::nlanes*2)
for ( ; i <= len - VTraits<v_float32>::vlanes()*2; i += VTraits<v_float32>::vlanes()*2)
{
v_float32 v_a1 = v_1, v_a2 = v_1;
v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float32::nlanes);
v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_float32>::vlanes());
int p = std::abs(power);
if( power < 0 )
{
v_b1 = v_1 / v_b1;
v_b2 = v_1 / v_b2;
v_b1 = v_div(v_1, v_b1);
v_b2 = v_div(v_1, v_b2);
}
while( p > 1 )
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v_store(dst + i, v_a1);
v_store(dst + i + v_float32::nlanes, v_a2);
v_store(dst + i + VTraits<v_float32>::vlanes(), v_a2);
}
vx_cleanup();
@ -984,7 +984,7 @@ struct iPow_SIMD<float, float>
}
};
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template <>
struct iPow_SIMD<double, double>
{
@ -993,34 +993,34 @@ struct iPow_SIMD<double, double>
int i = 0;
v_float64 v_1 = vx_setall_f64(1.);
for ( ; i <= len - v_float64::nlanes*2; i += v_float64::nlanes*2)
for ( ; i <= len - VTraits<v_float64>::vlanes()*2; i += VTraits<v_float64>::vlanes()*2)
{
v_float64 v_a1 = v_1, v_a2 = v_1;
v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float64::nlanes);
v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_float64>::vlanes());
int p = std::abs(power);
if( power < 0 )
{
v_b1 = v_1 / v_b1;
v_b2 = v_1 / v_b2;
v_b1 = v_div(v_1, v_b1);
v_b2 = v_div(v_1, v_b2);
}
while( p > 1 )
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v_store(dst + i, v_a1);
v_store(dst + i + v_float64::nlanes, v_a2);
v_store(dst + i + VTraits<v_float64>::vlanes(), v_a2);
}
vx_cleanup();
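All of the iPow_SIMD specializations above vectorize the same square-and-multiply loop, and the rewrite only replaces the overloaded `*=` with `v_mul`. For reference, a minimal scalar sketch of what each lane computes (illustrative, not part of the patch; assumes `power >= 1`, as in the callers):

```cpp
// Scalar sketch of the square-and-multiply loop that iPow_SIMD vectorizes:
// acc plays the role of v_a (initialized to 1), base the role of v_b.
#include <cstdint>

static int64_t ipow_scalar(int64_t base, unsigned power) // assumes power >= 1
{
    int64_t acc = 1;
    while (power > 1)
    {
        if (power & 1)
            acc *= base;   // v_a = v_mul(v_a, v_b)
        base *= base;      // v_b = v_mul(v_b, v_b)
        power >>= 1;
    }
    return acc * base;     // final v_mul after the loop
}
```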
@ -1614,7 +1614,7 @@ void patchNaNs( InputOutputArray _a, double _val )
Cv32suf val;
val.f = (float)_val;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000);
v_int32 v_val = vx_setall_s32(val.i);
#endif
@ -1624,12 +1624,12 @@ void patchNaNs( InputOutputArray _a, double _val )
int* tptr = ptrs[0];
size_t j = 0;
#if CV_SIMD
size_t cWidth = (size_t)v_int32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
size_t cWidth = (size_t)VTraits<v_int32>::vlanes();
for ( ; j + cWidth <= len; j += cWidth)
{
v_int32 v_src = vx_load(tptr + j);
v_int32 v_cmp_mask = v_mask2 < (v_src & v_mask1);
v_int32 v_cmp_mask = v_lt(v_mask2, v_and(v_src, v_mask1));
v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src);
v_store(tptr + j, v_dst);
}
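The patchNaNs loop flags NaNs purely through their IEEE-754 bit pattern: it clears the sign bit and checks whether the result exceeds the exponent mask 0x7f800000, which the rewrite spells out as `v_and`, `v_lt` and `v_select`. A scalar sketch of the same test (illustrative only):

```cpp
// Scalar sketch of the NaN test behind patchNaNs: a float is NaN exactly when
// (bits & 0x7fffffff) > 0x7f800000, i.e. all exponent bits set and a non-zero
// mantissa (illustrative only).
#include <cstdint>
#include <cstring>

static float patch_nan_scalar(float x, float replacement)
{
    int32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));          // same reinterpretation as Cv32suf
    const bool is_nan = (bits & 0x7fffffff) > 0x7f800000;
    return is_nan ? replacement : x;               // v_select(v_cmp_mask, v_val, v_src)
}
```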

@ -1454,7 +1454,7 @@ transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
static void
transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int BITS = 10, SCALE = 1 << BITS;
const float MAX_M = (float)(1 << (15 - BITS));
@ -1485,7 +1485,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
v_int32 m10 = vx_setall_s32(m32[4]);
v_int32 m11 = vx_setall_s32(m32[5]);
int x = 0;
for (; x <= (len - v_uint8::nlanes) * nChannels; x += v_uint8::nlanes * nChannels)
for (; x <= (len - VTraits<v_uint8>::vlanes()) * nChannels; x += VTraits<v_uint8>::vlanes() * nChannels)
{
v_uint8 b, g, r;
v_load_deinterleave(src + x, b, g, r);
@ -1499,20 +1499,20 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
v_int32 p1, p3;
v_expand(bgl, p0, p2);
v_expand(v_reinterpret_as_s16(rl), p1, p3);
dbl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3,
v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3);
dgl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7,
v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7);
drl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
dbl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), v_mul(p3, m2)), m3));
dgl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7));
drl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11));
v_expand(bgh, p0, p2);
v_expand(v_reinterpret_as_s16(rh), p1, p3);
dbh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3,
v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3);
dgh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7,
v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7);
drh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
dbh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), v_mul(p3, m2)), m3));
dgh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7));
drh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11));
v_store_interleave(dst + x, v_pack_u(dbl, dbh), v_pack_u(dgl, dgh), v_pack_u(drl, drh));
}
m32[1] = saturate_cast<int>((m[3] + 0.5f)*SCALE);
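transform_8u works in 10-bit fixed point: the float coefficients are pre-scaled by `SCALE = 1 << BITS`, the per-pixel dot products are accumulated as integers, and `v_rshr_pack<BITS>` shifts the sums back with rounding before the saturating pack. A scalar sketch of that arithmetic for one output channel, with hypothetical coefficient names (`c0..c2`, `bias`) rather than the real `m32` layout:

```cpp
// Scalar sketch of the fixed-point arithmetic in transform_8u: coefficients
// are integers pre-scaled by 1 << BITS; the rounding shift mimics
// v_rshr_pack<BITS> and the clamp mimics the saturating v_pack_u
// (illustrative only; c0..c2 and bias are hypothetical names).
#include <algorithm>

static unsigned char transform_channel_fixed(int b, int g, int r,
                                             int c0, int c1, int c2, int bias)
{
    const int BITS = 10;
    int acc = c0 * b + c1 * g + c2 * r + bias;                  // integer dot product
    int shifted = (acc + (1 << (BITS - 1))) >> BITS;            // rounding shift right
    return (unsigned char)std::min(std::max(shifted, 0), 255);  // saturate to 8 bits
}
```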
@ -1537,7 +1537,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
static void
transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( scn == 3 && dcn == 3 )
{
int x = 0;
@ -1555,7 +1555,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
v_float32 m10 = vx_setall_f32(m[10]);
v_float32 m11 = vx_setall_f32(m[11] - 32768.f);
v_int16 delta = vx_setall_s16(-32768);
for (; x <= (len - v_uint16::nlanes)*3; x += v_uint16::nlanes*3)
for (; x <= (len - VTraits<v_uint16>::vlanes())*3; x += VTraits<v_uint16>::vlanes()*3)
{
v_uint16 b, g, r;
v_load_deinterleave(src + x, b, g, r);
@ -1574,6 +1574,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
v_store_interleave(dst + x, v_reinterpret_as_u16(db), v_reinterpret_as_u16(dg), v_reinterpret_as_u16(dr));
}
#endif
#if CV_SIMD128
v_float32x4 _m0l(m[0], m[4], m[ 8], 0.f);
v_float32x4 _m1l(m[1], m[5], m[ 9], 0.f);
v_float32x4 _m2l(m[2], m[6], m[10], 0.f);
@ -1587,6 +1588,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack(
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x ))), _m0h, _m1h, _m2h, _m3h)),
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta))));
#endif //CV_SIMD128
for( ; x < len * 3; x += 3 )
{
float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
@ -1606,25 +1608,25 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
static void
transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SIMD && !defined(__aarch64__) && !defined(_M_ARM64)
#if (CV_SIMD || CV_SIMD_SCALABLE) && !defined(__aarch64__) && !defined(_M_ARM64)
int x = 0;
if( scn == 3 && dcn == 3 )
{
int idx[v_float32::nlanes/2];
for( int i = 0; i < v_float32::nlanes/4; i++ )
int idx[VTraits<v_float32>::max_nlanes/2];
for( int i = 0; i < VTraits<v_float32>::vlanes()/4; i++ )
{
idx[i] = 3*i;
idx[i + v_float32::nlanes/4] = 0;
idx[i + VTraits<v_float32>::vlanes()/4] = 0;
}
float _m[] = { m[0], m[4], m[ 8], 0.f,
m[1], m[5], m[ 9], 0.f,
m[2], m[6], m[10], 0.f,
m[3], m[7], m[11], 0.f };
v_float32 m0 = vx_lut_quads(_m , idx + v_float32::nlanes/4);
v_float32 m1 = vx_lut_quads(_m + 4, idx + v_float32::nlanes/4);
v_float32 m2 = vx_lut_quads(_m + 8, idx + v_float32::nlanes/4);
v_float32 m3 = vx_lut_quads(_m + 12, idx + v_float32::nlanes/4);
for( ; x <= len*3 - v_float32::nlanes; x += 3*v_float32::nlanes/4 )
v_float32 m0 = vx_lut_quads(_m , idx + VTraits<v_float32>::vlanes()/4);
v_float32 m1 = vx_lut_quads(_m + 4, idx + VTraits<v_float32>::vlanes()/4);
v_float32 m2 = vx_lut_quads(_m + 8, idx + VTraits<v_float32>::vlanes()/4);
v_float32 m3 = vx_lut_quads(_m + 12, idx + VTraits<v_float32>::vlanes()/4);
for( ; x <= len*3 - VTraits<v_float32>::vlanes(); x += 3*VTraits<v_float32>::vlanes()/4 )
v_store(dst + x, v_pack_triplets(v_matmuladd(vx_lut_quads(src + x, idx), m0, m1, m2, m3)));
for( ; x < len*3; x += 3 )
{
@ -1641,8 +1643,8 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
if( scn == 4 && dcn == 4 )
{
#if CV_SIMD_WIDTH > 16
int idx[v_float32::nlanes/4];
for( int i = 0; i < v_float32::nlanes/4; i++ )
int idx[VTraits<v_float32>::max_nlanes/4];
for( int i = 0; i < VTraits<v_float32>::vlanes()/4; i++ )
idx[i] = 0;
float _m[] = { m[4], m[9], m[14], m[19] };
v_float32 m0 = vx_lut_quads(m , idx);
@ -1650,12 +1652,13 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
v_float32 m2 = vx_lut_quads(m+10, idx);
v_float32 m3 = vx_lut_quads(m+15, idx);
v_float32 m4 = vx_lut_quads(_m, idx);
for( ; x <= len*4 - v_float32::nlanes; x += v_float32::nlanes )
for( ; x <= len*4 - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes() )
{
v_float32 v_src = vx_load(src + x);
v_store(dst + x, v_reduce_sum4(v_src * m0, v_src * m1, v_src * m2, v_src * m3) + m4);
v_store(dst + x, v_add(v_reduce_sum4(v_mul(v_src, m0), v_mul(v_src, m1), v_mul(v_src, m2), v_mul(v_src, m3)), m4));
}
#endif
#if CV_SIMD128
v_float32x4 _m0 = v_load(m );
v_float32x4 _m1 = v_load(m + 5);
v_float32x4 _m2 = v_load(m + 10);
@ -1666,6 +1669,17 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
v_float32x4 v_src = v_load(src + x);
v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4);
}
#else // CV_SIMD_WIDTH >= 16 && !CV_SIMD128
for( ; x < len*4; x += 4 )
{
float v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3];
float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[ 2]*v2 + m[ 3]*v3 + m[ 4]);
float t1 = saturate_cast<float>(m[5]*v0 + m[6]*v1 + m[ 7]*v2 + m[ 8]*v3 + m[ 9]);
float t2 = saturate_cast<float>(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]);
float t3 = saturate_cast<float>(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]);
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; dst[x+3] = t3;
}
#endif
vx_cleanup();
return;
}
@ -1936,9 +1950,9 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
{
float alpha = *_alpha;
int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 v_alpha = vx_setall_f32(alpha);
const int cWidth = v_float32::nlanes;
const int cWidth = VTraits<v_float32>::vlanes();
for (; i <= len - cWidth; i += cWidth)
v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i)));
vx_cleanup();
@ -1953,9 +1967,9 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
{
double alpha = *_alpha;
int i = 0;
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
v_float64 a2 = vx_setall_f64(alpha);
const int cWidth = v_float64::nlanes;
const int cWidth = VTraits<v_float64>::vlanes();
for (; i <= len - cWidth; i += cWidth)
v_store(dst + i, v_muladd(vx_load(src1 + i), a2, vx_load(src2 + i)));
vx_cleanup();
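scaleAdd_32f and scaleAdd_64f show the mechanical part of the rewrite in its simplest form: the guard becomes `(CV_SIMD || CV_SIMD_SCALABLE)` and the lane count comes from `VTraits<>::vlanes()` instead of the `nlanes` member, which sizeless scalable vector types do not have. A standalone sketch of the same loop shape, assuming the universal intrinsics header exposes these names in the `cv` namespace:

```cpp
// Sketch of the VTraits-based loop used by scaleAdd_32f above
// (dst[i] = src1[i] * alpha + src2[i]); assumes OpenCV headers are available.
#include <opencv2/core/hal/intrin.hpp>

static void scale_add_sketch(const float* src1, const float* src2,
                             float* dst, int len, float alpha)
{
    using namespace cv;
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    v_float32 v_alpha = vx_setall_f32(alpha);
    const int cWidth = VTraits<v_float32>::vlanes();  // lane count known only at run time
    for (; i <= len - cWidth; i += cWidth)
        v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i)));
    vx_cleanup();
#endif
    for (; i < len; i++)                              // scalar tail
        dst[i] = src1[i] * alpha + src2[i];
}
```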
@ -2078,7 +2092,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
deltastep = deltastep ? 4 : 0;
}
#if CV_SIMD_64F
#if CV_SIMD128_64F
v_float64x2 v_scale = v_setall_f64(scale);
#endif
@ -2090,7 +2104,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
for( j = i; j <= size.width - 4; j += 4 )
{
#if CV_SIMD_64F
#if CV_SIMD128_64F
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
{
v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
@ -2150,7 +2164,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
for( j = i; j <= size.width - 4; j += 4 )
{
#if CV_SIMD_64F
#if CV_SIMD128_64F
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
{
v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
@ -2227,7 +2241,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
double s = 0;
const sT *tsrc1 = src + i*srcstep;
const sT *tsrc2 = src + j*srcstep;
#if CV_SIMD_64F
#if CV_SIMD128_64F
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
{
const double *v_tsrc1 = (double *)(tsrc1);
@ -2280,7 +2294,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
delta_buf[2] = delta_buf[3] = tdelta2[0];
tdelta2 = delta_buf;
}
#if CV_SIMD_64F
#if CV_SIMD128_64F
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
{
const double *v_tsrc2 = (double *)(tsrc2);
@ -2393,14 +2407,14 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
double r = 0;
int i = 0;
#if CV_SIMD
int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 15), blockSize;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_uint16>::vlanes(), blockSize0 = (1 << 15), blockSize;
while (i < len0)
{
blockSize = std::min(len0 - i, blockSize0);
v_uint32 v_sum = vx_setzero_u32();
const int cWidth = v_uint16::nlanes;
const int cWidth = VTraits<v_uint16>::vlanes();
int j = 0;
for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
@ -2414,7 +2428,7 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
{
v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));
v_sum += v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20));
v_sum = v_add(v_sum, v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20)));
}
r += (double)v_reduce_sum(v_sum);
@ -2433,14 +2447,14 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
double r = 0.0;
int i = 0;
#if CV_SIMD
int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 14), blockSize;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_int16>::vlanes(), blockSize0 = (1 << 14), blockSize;
while (i < len0)
{
blockSize = std::min(len0 - i, blockSize0);
v_int32 v_sum = vx_setzero_s32();
const int cWidth = v_int16::nlanes;
const int cWidth = VTraits<v_int16>::vlanes();
int j = 0;
for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
@ -2473,14 +2487,14 @@ double dotProd_16u(const ushort* src1, const ushort* src2, int len)
double r = 0.0;
int i = 0;
#if CV_SIMD
int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 24), blockSize;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_uint16>::vlanes(), blockSize0 = (1 << 24), blockSize;
while (i < len0)
{
blockSize = std::min(len0 - i, blockSize0);
v_uint64 v_sum = vx_setzero_u64();
const int cWidth = v_uint16::nlanes;
const int cWidth = VTraits<v_uint16>::vlanes();
int j = 0;
for (; j <= blockSize - cWidth; j += cWidth)
@ -2505,14 +2519,14 @@ double dotProd_16s(const short* src1, const short* src2, int len)
double r = 0.0;
int i = 0;
#if CV_SIMD
int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 24), blockSize;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_int16>::vlanes(), blockSize0 = (1 << 24), blockSize;
while (i < len0)
{
blockSize = std::min(len0 - i, blockSize0);
v_int64 v_sum = vx_setzero_s64();
const int cWidth = v_int16::nlanes;
const int cWidth = VTraits<v_int16>::vlanes();
int j = 0;
for (; j <= blockSize - cWidth; j += cWidth)
@ -2534,10 +2548,10 @@ double dotProd_16s(const short* src1, const short* src2, int len)
double dotProd_32s(const int* src1, const int* src2, int len)
{
#if CV_SIMD_64F
#if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F
double r = .0;
int i = 0;
const int step = v_int32::nlanes;
const int step = VTraits<v_int32>::vlanes();
v_float64 v_sum0 = vx_setzero_f64();
#if CV_SIMD_WIDTH == 16
const int wstep = step * 2;
@ -2572,8 +2586,8 @@ double dotProd_32f(const float* src1, const float* src2, int len)
double r = 0.0;
int i = 0;
#if CV_SIMD
int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_float32>::vlanes(), blockSize0 = (1 << 13), blockSize;
while (i < len0)
{
@ -2581,7 +2595,7 @@ double dotProd_32f(const float* src1, const float* src2, int len)
v_float32 v_sum = vx_setzero_f32();
int j = 0;
int cWidth = v_float32::nlanes;
int cWidth = VTraits<v_float32>::vlanes();
#if CV_ENABLE_UNROLLED
v_float32 v_sum1 = vx_setzero_f32();
@ -2600,7 +2614,7 @@ double dotProd_32f(const float* src1, const float* src2, int len)
vx_load(src2 + j + (cWidth * 3)), v_sum3);
}
v_sum += v_sum1 + v_sum2 + v_sum3;
v_sum = v_add(v_sum, v_add(v_add(v_sum1, v_sum2), v_sum3));
#endif
for (; j <= blockSize - cWidth; j += cWidth)
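The dotProd_* kernels keep their partial sums in narrow integer lanes, so they walk the data in blocks (`blockSize0`) small enough that those sums cannot overflow before being flushed into the double accumulator `r`. A scalar sketch of the blocked accumulation (illustrative only; the real kernels also align the length to a multiple of the lane count):

```cpp
// Scalar sketch of the blocked accumulation in dotProd_8u: partial sums live
// in a 32-bit accumulator and are flushed into a double every blockSize
// elements, before the 255*255 products could overflow it (illustrative only).
#include <algorithm>
#include <cstdint>

static double dot_prod_blocked(const uint8_t* a, const uint8_t* b, int len)
{
    const int blockSize0 = 1 << 15;          // mirrors dotProd_8u's block size
    double r = 0;
    int i = 0;
    while (i < len)
    {
        const int blockSize = std::min(len - i, blockSize0);
        uint32_t sum = 0;                    // narrow accumulator, like v_sum
        for (int j = 0; j < blockSize; j++)
            sum += (uint32_t)a[i + j] * b[i + j];
        r += (double)sum;                    // flush before overflow is possible
        i += blockSize;
    }
    return r;
}
```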

@ -440,7 +440,7 @@ template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const
static void
flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
#if CV_SIMD
#if CV_SIMD128
#if CV_STRONG_ALIGNMENT
size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
#endif
@ -563,7 +563,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
}
#endif
else
#endif // CV_SIMD
#endif // CV_SIMD128
{
int i, j, limit = (int)(((size.width + 1)/2)*esz);
AutoBuffer<int> _tab(size.width*esz);
@ -596,12 +596,12 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
dst0 += dstep, dst1 -= dstep )
{
int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
#if CV_STRONG_ALIGNMENT
if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
#endif
{
for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
for (; i <= size.width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_int32 t0 = v_reinterpret_as_s32(vx_load(src0 + i));
v_int32 t1 = v_reinterpret_as_s32(vx_load(src1 + i));
@ -612,7 +612,7 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
#if CV_STRONG_ALIGNMENT
else
{
for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
for (; i <= size.width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_uint8 t0 = vx_load(src0 + i);
v_uint8 t1 = vx_load(src1 + i);
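In flipVert the loop previously stepped by `CV_SIMD_WIDTH`, a compile-time byte count with no fixed value on scalable backends; it now steps by `VTraits<v_uint8>::vlanes()`, which is the vector width in bytes at run time. A minimal sketch of swapping one row pair with that step (not the exact flipVert body; `row_top`, `row_bottom` and `width` are illustrative names):

```cpp
// Sketch: swap two image rows a vector of bytes at a time; the step is the
// runtime byte width of the vector, VTraits<v_uint8>::vlanes() (illustrative only).
#include <opencv2/core/hal/intrin.hpp>
#include <utility>

static void swap_rows_sketch(unsigned char* row_top, unsigned char* row_bottom, int width)
{
    using namespace cv;
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    for (; i <= width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
    {
        v_uint8 t0 = vx_load(row_top + i);
        v_uint8 t1 = vx_load(row_bottom + i);
        v_store(row_top + i, t1);
        v_store(row_bottom + i, t0);
    }
    vx_cleanup();
#endif
    for (; i < width; i++)                   // scalar tail
        std::swap(row_top[i], row_bottom[i]);
}
```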

@ -15,7 +15,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
/*
The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
on IA there are instructions movntps and such to which
@ -38,7 +38,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn);
template<typename T, typename VecT> static void
vecmerge_( const T** src, T* dst, int len, int cn )
{
const int VECSZ = VecT::nlanes;
const int VECSZ = VTraits<VecT>::vlanes();
int i, i0 = 0;
const T* src0 = src[0];
const T* src1 = src[1];
@ -173,8 +173,8 @@ merge_( const T** src, T* dst, int len, int cn )
void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_uint8>::vlanes() && 2 <= cn && cn <= 4 )
vecmerge_<uchar, v_uint8>(src, dst, len, cn);
else
#endif
@ -184,8 +184,8 @@ void merge8u(const uchar** src, uchar* dst, int len, int cn )
void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_uint16>::vlanes() && 2 <= cn && cn <= 4 )
vecmerge_<ushort, v_uint16>(src, dst, len, cn);
else
#endif
@ -195,8 +195,8 @@ void merge16u(const ushort** src, ushort* dst, int len, int cn )
void merge32s(const int** src, int* dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_int32>::vlanes() && 2 <= cn && cn <= 4 )
vecmerge_<int, v_int32>(src, dst, len, cn);
else
#endif
@ -206,8 +206,8 @@ void merge32s(const int** src, int* dst, int len, int cn )
void merge64s(const int64** src, int64* dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_int64>::vlanes() && 2 <= cn && cn <= 4 )
vecmerge_<int64, v_int64>(src, dst, len, cn);
else
#endif
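vecmerge_ interleaves 2 to 4 planar channels into packed output, and the dispatchers above only take the vector path once a full vector fits (`len >= VTraits<...>::vlanes()`). A minimal 3-channel sketch using the interleaving store from the same API (not the exact vecmerge_ body):

```cpp
// Sketch: merge three planar uchar channels into interleaved output one
// vector at a time, with a scalar tail for the remainder (illustrative only).
#include <opencv2/core/hal/intrin.hpp>

static void merge3_sketch(const unsigned char* c0, const unsigned char* c1,
                          const unsigned char* c2, unsigned char* dst, int len)
{
    using namespace cv;
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int VECSZ = VTraits<v_uint8>::vlanes();
    for (; i <= len - VECSZ; i += VECSZ)
        v_store_interleave(dst + i * 3, vx_load(c0 + i), vx_load(c1 + i), vx_load(c2 + i));
    vx_cleanup();
#endif
    for (; i < len; i++)
    {
        dst[i * 3]     = c0[i];
        dst[i * 3 + 1] = c1[i];
        dst[i * 3 + 2] = c2[i];
    }
}
```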

@ -63,25 +63,25 @@ int normHamming(const uchar* a, int n, int cellSize)
return -1;
int i = 0;
int result = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_uint64 t = vx_setzero_u64();
if ( cellSize == 2)
{
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
t += v_popcount(v_reinterpret_as_u64((a0 | (a0 >> 1)) & mask));
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a0, v_shr<1>(a0)), mask))));
}
}
else // cellSize == 4
{
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
v_uint16 a1 = a0 | (a0 >> 2);
t += v_popcount(v_reinterpret_as_u64((a1 | (a1 >> 1)) & mask));
v_uint16 a1 = v_or(a0, v_shr<2>(a0));
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a1, v_shr<1>(a1)), mask))));
}
}
@ -109,25 +109,25 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
return -1;
int i = 0;
int result = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_uint64 t = vx_setzero_u64();
if ( cellSize == 2)
{
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
t += v_popcount(v_reinterpret_as_u64((ab0 | (ab0 >> 1)) & mask));
v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i)));
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab0, v_shr<1>(ab0)), mask))));
}
}
else // cellSize == 4
{
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
v_uint16 ab1 = ab0 | (ab0 >> 2);
t += v_popcount(v_reinterpret_as_u64((ab1 | (ab1 >> 1)) & mask));
v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i)));
v_uint16 ab1 = v_or(ab0, v_shr<2>(ab0));
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab1, v_shr<1>(ab1)), mask))));
}
}
result += (int)v_reduce_sum(t);
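The cellSize variants of normHamming count non-zero bit cells rather than set bits: for cellSize == 2, `(a | (a >> 1)) & 0x55...` leaves exactly one indicator bit per 2-bit cell before the popcount, and cellSize == 4 applies the same folding twice with the `0x11...` mask. A scalar sketch of the cellSize == 2 case (illustrative only):

```cpp
// Scalar sketch of the cellSize == 2 Hamming norm: every pair of bits is one
// cell and contributes 1 if either bit is set; folding with (x | x >> 1) & 0x55
// keeps one indicator bit per cell, which popcount then counts
// (illustrative only; the SIMD code does the same per 64-bit lane).
#include <bitset>
#include <cstdint>

static int norm_hamming_cell2(const uint8_t* a, int n)
{
    int result = 0;
    for (int i = 0; i < n; i++)
    {
        const unsigned folded = (a[i] | (a[i] >> 1)) & 0x55;
        result += (int)std::bitset<8>(folded).count();
    }
    return result;
}
```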
@ -145,21 +145,21 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
float normL2Sqr_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes())
{
v_float32 t0 = vx_load(a + j) - vx_load(b + j);
v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes);
v_float32 t0 = v_sub(vx_load(a + j), vx_load(b + j));
v_float32 t1 = v_sub(vx_load(a + j + VTraits<v_float32>::vlanes()), vx_load(b + j + VTraits<v_float32>::vlanes()));
v_d0 = v_muladd(t0, t0, v_d0);
v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes);
v_float32 t2 = v_sub(vx_load(a + j + 2 * VTraits<v_float32>::vlanes()), vx_load(b + j + 2 * VTraits<v_float32>::vlanes()));
v_d1 = v_muladd(t1, t1, v_d1);
v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes);
v_float32 t3 = v_sub(vx_load(a + j + 3 * VTraits<v_float32>::vlanes()), vx_load(b + j + 3 * VTraits<v_float32>::vlanes()));
v_d2 = v_muladd(t2, t2, v_d2);
v_d3 = v_muladd(t3, t3, v_d3);
}
d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3));
#endif
for( ; j < n; j++ )
{
@ -173,17 +173,17 @@ float normL2Sqr_(const float* a, const float* b, int n)
float normL1_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes())
{
v_d0 += v_absdiff(vx_load(a + j), vx_load(b + j));
v_d1 += v_absdiff(vx_load(a + j + v_float32::nlanes), vx_load(b + j + v_float32::nlanes));
v_d2 += v_absdiff(vx_load(a + j + 2 * v_float32::nlanes), vx_load(b + j + 2 * v_float32::nlanes));
v_d3 += v_absdiff(vx_load(a + j + 3 * v_float32::nlanes), vx_load(b + j + 3 * v_float32::nlanes));
v_d0 = v_add(v_d0, v_absdiff(vx_load(a + j), vx_load(b + j)));
v_d1 = v_add(v_d1, v_absdiff(vx_load(a + j + VTraits<v_float32>::vlanes()), vx_load(b + j + VTraits<v_float32>::vlanes())));
v_d2 = v_add(v_d2, v_absdiff(vx_load(a + j + 2 * VTraits<v_float32>::vlanes()), vx_load(b + j + 2 * VTraits<v_float32>::vlanes())));
v_d3 = v_add(v_d3, v_absdiff(vx_load(a + j + 3 * VTraits<v_float32>::vlanes()), vx_load(b + j + 3 * VTraits<v_float32>::vlanes())));
}
d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3));
#endif
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
@ -193,12 +193,12 @@ float normL1_(const float* a, const float* b, int n)
int normL1_(const uchar* a, const uchar* b, int n)
{
int j = 0, d = 0;
#if CV_SIMD
for (; j <= n - 4 * v_uint8::nlanes; j += 4 * v_uint8::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (; j <= n - 4 * VTraits<v_uint8>::vlanes(); j += 4 * VTraits<v_uint8>::vlanes())
d += v_reduce_sad(vx_load(a + j), vx_load(b + j)) +
v_reduce_sad(vx_load(a + j + v_uint8::nlanes), vx_load(b + j + v_uint8::nlanes)) +
v_reduce_sad(vx_load(a + j + 2 * v_uint8::nlanes), vx_load(b + j + 2 * v_uint8::nlanes)) +
v_reduce_sad(vx_load(a + j + 3 * v_uint8::nlanes), vx_load(b + j + 3 * v_uint8::nlanes));
v_reduce_sad(vx_load(a + j + VTraits<v_uint8>::vlanes()), vx_load(b + j + VTraits<v_uint8>::vlanes())) +
v_reduce_sad(vx_load(a + j + 2 * VTraits<v_uint8>::vlanes()), vx_load(b + j + 2 * VTraits<v_uint8>::vlanes())) +
v_reduce_sad(vx_load(a + j + 3 * VTraits<v_uint8>::vlanes()), vx_load(b + j + 3 * VTraits<v_uint8>::vlanes()));
#endif
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);

@ -15,12 +15,12 @@ void split64s(const int64* src, int64** dst, int len, int cn);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
// see the comments for vecmerge_ in merge.cpp
template<typename T, typename VecT> static void
vecsplit_( const T* src, T** dst, int len, int cn )
{
const int VECSZ = VecT::nlanes;
const int VECSZ = VTraits<VecT>::vlanes();
int i, i0 = 0;
T* dst0 = dst[0];
T* dst1 = dst[1];
@ -177,8 +177,8 @@ split_( const T* src, T** dst, int len, int cn )
void split8u(const uchar* src, uchar** dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_uint8>::vlanes() && 2 <= cn && cn <= 4 )
vecsplit_<uchar, v_uint8>(src, dst, len, cn);
else
#endif
@ -188,8 +188,8 @@ void split8u(const uchar* src, uchar** dst, int len, int cn )
void split16u(const ushort* src, ushort** dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_uint16>::vlanes() && 2 <= cn && cn <= 4 )
vecsplit_<ushort, v_uint16>(src, dst, len, cn);
else
#endif
@ -199,8 +199,8 @@ void split16u(const ushort* src, ushort** dst, int len, int cn )
void split32s(const int* src, int** dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_uint32>::vlanes() && 2 <= cn && cn <= 4 )
vecsplit_<int, v_int32>(src, dst, len, cn);
else
#endif
@ -210,8 +210,8 @@ void split32s(const int* src, int** dst, int len, int cn )
void split64s(const int64* src, int64** dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_int64>::vlanes() && 2 <= cn && cn <= 4 )
vecsplit_<int64, v_int64>(src, dst, len, cn);
else
#endif

@ -33,11 +33,11 @@ int normHamming(const uchar* a, int n)
int i = 0;
int result = 0;
#if CV_SIMD && CV_SIMD_WIDTH > 16
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint64 t = vx_setzero_u64();
for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
t = v_add(t, v_popcount(v_reinterpret_as_u64(vx_load(a + i))));
result = (int)v_reduce_sum(t);
vx_cleanup();
}
@ -56,13 +56,6 @@ int normHamming(const uchar* a, int n)
result += CV_POPCNT_U32(*(uint*)(a + i));
}
}
#elif CV_SIMD
{
v_uint64x2 t = v_setzero_u64();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
t += v_popcount(v_reinterpret_as_u64(v_load(a + i)));
result += (int)v_reduce_sum(t);
}
#endif
#if CV_ENABLE_UNROLLED
for(; i <= n - 4; i += 4)
@ -85,11 +78,11 @@ int normHamming(const uchar* a, const uchar* b, int n)
int i = 0;
int result = 0;
#if CV_SIMD && CV_SIMD_WIDTH > 16
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint64 t = vx_setzero_u64();
for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_xor(vx_load(a + i), vx_load(b + i)))));
result += (int)v_reduce_sum(t);
}
#endif
@ -107,13 +100,6 @@ int normHamming(const uchar* a, const uchar* b, int n)
result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
}
}
#elif CV_SIMD
{
v_uint64x2 t = v_setzero_u64();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
t += v_popcount(v_reinterpret_as_u64(v_load(a + i) ^ v_load(b + i)));
result += (int)v_reduce_sum(t);
}
#endif
#if CV_ENABLE_UNROLLED
for(; i <= n - 4; i += 4)

@ -22,7 +22,7 @@ struct Sum_SIMD
}
};
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template <>
struct Sum_SIMD<uchar, int>
@ -36,41 +36,41 @@ struct Sum_SIMD<uchar, int>
int x = 0;
v_uint32 v_sum = vx_setzero_u32();
int len0 = len & -v_uint8::nlanes;
int len0 = len & -VTraits<v_uint8>::vlanes();
while (x < len0)
{
const int len_tmp = min(x + 256*v_uint16::nlanes, len0);
const int len_tmp = min(x + 256*VTraits<v_uint16>::vlanes(), len0);
v_uint16 v_sum16 = vx_setzero_u16();
for (; x < len_tmp; x += v_uint8::nlanes)
for (; x < len_tmp; x += VTraits<v_uint8>::vlanes())
{
v_uint16 v_src0, v_src1;
v_expand(vx_load(src0 + x), v_src0, v_src1);
v_sum16 += v_src0 + v_src1;
v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
}
v_uint32 v_half0, v_half1;
v_expand(v_sum16, v_half0, v_half1);
v_sum += v_half0 + v_half1;
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
}
if (x <= len - v_uint16::nlanes)
if (x <= len - VTraits<v_uint16>::vlanes())
{
v_uint32 v_half0, v_half1;
v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
v_sum += v_half0 + v_half1;
x += v_uint16::nlanes;
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
x += VTraits<v_uint16>::vlanes();
}
if (x <= len - v_uint32::nlanes)
if (x <= len - VTraits<v_uint32>::vlanes())
{
v_sum += vx_load_expand_q(src0 + x);
x += v_uint32::nlanes;
v_sum = v_add(v_sum, vx_load_expand_q(src0 + x));
x += VTraits<v_uint32>::vlanes();
}
if (cn == 1)
*dst += v_reduce_sum(v_sum);
else
{
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_uint32>::max_nlanes];
v_store_aligned(ar, v_sum);
for (int i = 0; i < v_uint32::nlanes; ++i)
for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
dst[i % cn] += ar[i];
}
v_cleanup();
@ -91,41 +91,41 @@ struct Sum_SIMD<schar, int>
int x = 0;
v_int32 v_sum = vx_setzero_s32();
int len0 = len & -v_int8::nlanes;
int len0 = len & -VTraits<v_int8>::vlanes();
while (x < len0)
{
const int len_tmp = min(x + 256*v_int16::nlanes, len0);
const int len_tmp = min(x + 256*VTraits<v_int16>::vlanes(), len0);
v_int16 v_sum16 = vx_setzero_s16();
for (; x < len_tmp; x += v_int8::nlanes)
for (; x < len_tmp; x += VTraits<v_int8>::vlanes())
{
v_int16 v_src0, v_src1;
v_expand(vx_load(src0 + x), v_src0, v_src1);
v_sum16 += v_src0 + v_src1;
v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
}
v_int32 v_half0, v_half1;
v_expand(v_sum16, v_half0, v_half1);
v_sum += v_half0 + v_half1;
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
}
if (x <= len - v_int16::nlanes)
if (x <= len - VTraits<v_int16>::vlanes())
{
v_int32 v_half0, v_half1;
v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
v_sum += v_half0 + v_half1;
x += v_int16::nlanes;
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
x += VTraits<v_int16>::vlanes();
}
if (x <= len - v_int32::nlanes)
if (x <= len - VTraits<v_int32>::vlanes())
{
v_sum += vx_load_expand_q(src0 + x);
x += v_int32::nlanes;
v_sum = v_add(v_sum, vx_load_expand_q(src0 + x));
x += VTraits<v_int32>::vlanes();
}
if (cn == 1)
*dst += v_reduce_sum(v_sum);
else
{
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_int32>::max_nlanes];
v_store_aligned(ar, v_sum);
for (int i = 0; i < v_int32::nlanes; ++i)
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
dst[i % cn] += ar[i];
}
v_cleanup();
@ -146,25 +146,25 @@ struct Sum_SIMD<ushort, int>
int x = 0;
v_uint32 v_sum = vx_setzero_u32();
for (; x <= len - v_uint16::nlanes; x += v_uint16::nlanes)
for (; x <= len - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
{
v_uint32 v_src0, v_src1;
v_expand(vx_load(src0 + x), v_src0, v_src1);
v_sum += v_src0 + v_src1;
v_sum = v_add(v_sum, v_add(v_src0, v_src1));
}
if (x <= len - v_uint32::nlanes)
if (x <= len - VTraits<v_uint32>::vlanes())
{
v_sum += vx_load_expand(src0 + x);
x += v_uint32::nlanes;
v_sum = v_add(v_sum, vx_load_expand(src0 + x));
x += VTraits<v_uint32>::vlanes();
}
if (cn == 1)
*dst += v_reduce_sum(v_sum);
else
{
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_uint32>::max_nlanes];
v_store_aligned(ar, v_sum);
for (int i = 0; i < v_uint32::nlanes; ++i)
for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
dst[i % cn] += ar[i];
}
v_cleanup();
@ -185,25 +185,25 @@ struct Sum_SIMD<short, int>
int x = 0;
v_int32 v_sum = vx_setzero_s32();
for (; x <= len - v_int16::nlanes; x += v_int16::nlanes)
for (; x <= len - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
{
v_int32 v_src0, v_src1;
v_expand(vx_load(src0 + x), v_src0, v_src1);
v_sum += v_src0 + v_src1;
v_sum = v_add(v_sum, v_add(v_src0, v_src1));
}
if (x <= len - v_int32::nlanes)
if (x <= len - VTraits<v_int32>::vlanes())
{
v_sum += vx_load_expand(src0 + x);
x += v_int32::nlanes;
v_sum = v_add(v_sum, vx_load_expand(src0 + x));
x += VTraits<v_int32>::vlanes();
}
if (cn == 1)
*dst += v_reduce_sum(v_sum);
else
{
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_int32>::max_nlanes];
v_store_aligned(ar, v_sum);
for (int i = 0; i < v_int32::nlanes; ++i)
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
dst[i % cn] += ar[i];
}
v_cleanup();
@ -212,7 +212,7 @@ struct Sum_SIMD<short, int>
}
};
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template <>
struct Sum_SIMD<int, double>
{
@ -226,24 +226,24 @@ struct Sum_SIMD<int, double>
v_float64 v_sum0 = vx_setzero_f64();
v_float64 v_sum1 = vx_setzero_f64();
for (; x <= len - 2 * v_int32::nlanes; x += 2 * v_int32::nlanes)
for (; x <= len - 2 * VTraits<v_int32>::vlanes(); x += 2 * VTraits<v_int32>::vlanes())
{
v_int32 v_src0 = vx_load(src0 + x);
v_int32 v_src1 = vx_load(src0 + x + v_int32::nlanes);
v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
v_int32 v_src1 = vx_load(src0 + x + VTraits<v_int32>::vlanes());
v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1)));
v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1)));
}
#if CV_SIMD256 || CV_SIMD512
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
v_store_aligned(ar, v_sum0 + v_sum1);
for (int i = 0; i < v_float64::nlanes; ++i)
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_float64>::max_nlanes];
v_store_aligned(ar, v_add(v_sum0, v_sum1));
for (int i = 0; i < VTraits<v_float64>::vlanes(); ++i)
dst[i % cn] += ar[i];
#else
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_float64>::max_nlanes];
v_store_aligned(ar, v_sum0);
v_store_aligned(ar + v_float64::nlanes, v_sum1);
for (int i = 0; i < 2 * v_float64::nlanes; ++i)
v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
dst[i % cn] += ar[i];
#endif
v_cleanup();
@ -265,24 +265,24 @@ struct Sum_SIMD<float, double>
v_float64 v_sum0 = vx_setzero_f64();
v_float64 v_sum1 = vx_setzero_f64();
for (; x <= len - 2 * v_float32::nlanes; x += 2 * v_float32::nlanes)
for (; x <= len - 2 * VTraits<v_float32>::vlanes(); x += 2 * VTraits<v_float32>::vlanes())
{
v_float32 v_src0 = vx_load(src0 + x);
v_float32 v_src1 = vx_load(src0 + x + v_float32::nlanes);
v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
v_float32 v_src1 = vx_load(src0 + x + VTraits<v_float32>::vlanes());
v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1)));
v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1)));
}
#if CV_SIMD256 || CV_SIMD512
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
v_store_aligned(ar, v_sum0 + v_sum1);
for (int i = 0; i < v_float64::nlanes; ++i)
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_float64>::max_nlanes];
v_store_aligned(ar, v_add(v_sum0, v_sum1));
for (int i = 0; i < VTraits<v_float64>::vlanes(); ++i)
dst[i % cn] += ar[i];
#else
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_float64>::max_nlanes];
v_store_aligned(ar, v_sum0);
v_store_aligned(ar + v_float64::nlanes, v_sum1);
for (int i = 0; i < 2 * v_float64::nlanes; ++i)
v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
dst[i % cn] += ar[i];
#endif
v_cleanup();
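The multi-channel tails of Sum_SIMD illustrate one more recurring change: scratch arrays are now sized with the compile-time `VTraits<>::max_nlanes` while the loops still run over the runtime `vlanes()`, because a sizeless scalable vector cannot size a C array. A condensed sketch of that tail, mirroring the code above:

```cpp
// Sketch of the per-channel fold used by the Sum_SIMD specializations: store
// the lane sums into a scratch buffer sized by the compile-time max_nlanes,
// then add lane i into dst[i % cn] (illustrative only).
#include <opencv2/core/hal/intrin.hpp>
#include <cstdint>

#if (CV_SIMD || CV_SIMD_SCALABLE)
static void fold_lanes_sketch(cv::v_uint32 v_sum, int* dst, int cn)
{
    using namespace cv;
    uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_uint32>::max_nlanes];
    v_store_aligned(ar, v_sum);
    for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
        dst[i % cn] += (int)ar[i];
}
#endif
```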
