From 320c0bf419043e0179d47742f9f0ebd7885b5457 Mon Sep 17 00:00:00 2001
From: HAN Liutong
Date: Tue, 19 Sep 2023 20:12:52 +0800
Subject: [PATCH] Merge pull request #24166 from hanliutong:rewrite-remaining

Rewrite Universal Intrinsic code: ImgProc (CV_SIMD_WIDTH related Part) #24166

Related PRs: #24058, #24132.

The goal of this series of PRs is to rewrite the SIMD code blocks in the
opencv/modules/imgproc folder using the new Universal Intrinsic API.

This PR mainly modifies the code that uses the `CV_SIMD_WIDTH` macro. This
macro is sometimes used for loop-tail processing, as in `box_filter.simd.hpp`
and `morph.simd.hpp`:

```cpp
#if CV_SIMD
int i = 0;
for (; i <= n - v_uint16::nlanes; i += v_uint16::nlanes)
{
    // some universal intrinsic code
    // e.g. v_uint16
}
#if CV_SIMD_WIDTH > 16
for (; i <= n - v_uint16x8::nlanes; i += v_uint16x8::nlanes)
{
    // handle the loop tail with 128-bit SIMD
    // e.g. v_uint16x8
}
#endif // CV_SIMD_WIDTH
#endif // CV_SIMD
```

The core conflict is that the variable-length Universal Intrinsic backend
cannot use 128-bit fixed-length data structures such as `v_uint16x8`.
Therefore, this PR uses a scalar loop to handle the loop tail (a minimal
sketch of the rewritten pattern appears after the patch below).

This PR is marked as a draft because the modification of
`box_filter.simd.hpp` causes a compilation failure, which initially appears
to be an internal compiler error in GCC:

```bash
box_filter.simd.hpp:1162:5: internal compiler error: Segmentation fault
 1162 | }
      | ^
0xe03883 crash_signal
        /wafer/share/gcc/gcc/toplev.cc:314
0x7ff261c4251f ???
        ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0
0x6bde48 hash_set >::iterator::operator*()
        /wafer/share/gcc/gcc/hash-set.h:125
0x6bde48 extract_single_source
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:1184
0x6bde48 extract_single_source
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:1174
0x119ad9e pass_vsetvl::propagate_avl() const
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4087
0x119ceaf pass_vsetvl::execute(function*)
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4344
0x119ceaf pass_vsetvl::execute(function*)
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4325
Please submit a full bug report, with preprocessed source (by using -freport-bug).
Please include the complete backtrace with any bug report.
```

This PR compiles with Clang 16, and `opencv_test_imgproc` passes on QEMU.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 modules/imgproc/src/box_filter.simd.hpp | 316 ++++++++++++------------
 modules/imgproc/src/morph.simd.hpp      | 170 ++++++-------
 modules/imgproc/src/shapedescr.cpp      |  52 ++--
 3 files changed, 269 insertions(+), 269 deletions(-)

diff --git a/modules/imgproc/src/box_filter.simd.hpp b/modules/imgproc/src/box_filter.simd.hpp
index 4a4d205216..735935c04f 100644
--- a/modules/imgproc/src/box_filter.simd.hpp
+++ b/modules/imgproc/src/box_filter.simd.hpp
@@ -309,15 +309,15 @@ struct ColumnSum :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
@@ -339,37 +339,37 @@ struct ColumnSum :
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+            for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
-                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));
 
                 v_uint16 v_dst = v_pack(v_s0d, v_s01d);
                 v_pack_store(D + i, v_dst);
 
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
-                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
 
                 v_uint16x8 v_dst = v_pack(v_s0d, v_s01d);
                 v_pack_store(D + i, v_dst);
 
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -383,29 +383,29 @@ struct ColumnSum :
         else
         {
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
                 v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
                 v_pack_store(D + i, v_dst);
 
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
                 v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
                 v_pack_store(D + i, v_dst);
 
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -480,15 +480,15 @@ public BaseColumnFilter
         {
             const ushort* Sp = (const ushort*)src[0];
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes )
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
@@ -510,27 +510,27 @@ public BaseColumnFilter
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_uint32 _ds4 = vx_setall_u32((unsigned)ds);
             v_uint16 _dd8 = vx_setall_u16((ushort)dd);
 
-            for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes )
+            for( ; i <= width-VTraits<v_uint8>::vlanes(); i+=VTraits<v_uint8>::vlanes() )
             {
                 v_uint16 _sm0 = vx_load(Sm + i);
-                v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes);
+                v_uint16 _sm1 = vx_load(Sm + i + VTraits<v_uint16>::vlanes());
 
                 v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i));
-                v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes));
+                v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + VTraits<v_uint16>::vlanes()), vx_load(Sp + i + VTraits<v_uint16>::vlanes()));
 
                 v_uint32 _s00, _s01, _s10, _s11;
 
-                v_expand(_s0 + _dd8, _s00, _s01);
-                v_expand(_s1 + _dd8, _s10, _s11);
+                v_expand(v_add(_s0, _dd8), _s00, _s01);
+                v_expand(v_add(_s1, _dd8), _s10, _s11);
 
-                _s00 = v_shr(_s00*_ds4);
-                _s01 = v_shr(_s01*_ds4);
-                _s10 = v_shr(_s10*_ds4);
-                _s11 = v_shr(_s11*_ds4);
+                _s00 = v_shr(v_mul(_s00, _ds4));
+                _s01 = v_shr(v_mul(_s01, _ds4));
+                _s10 = v_shr(v_mul(_s10, _ds4));
+                _s11 = v_shr(v_mul(_s11, _ds4));
 
                 v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
                 v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
@@ -540,9 +540,9 @@ public BaseColumnFilter
                 v_store(D + i, v_pack_u(r0, r1));
 
                 v_store(SUM + i, _s0);
-                v_store(SUM + i + v_uint16::nlanes, _s1);
+                v_store(SUM + i + VTraits<v_uint16>::vlanes(), _s1);
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
 
             v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
             v_uint16x8 dd8 = v_setall_u16((ushort)dd);
@@ -556,13 +556,13 @@ public BaseColumnFilter
 
                 v_uint32x4 _s00, _s01, _s10, _s11;
 
-                v_expand(_s0 + dd8, _s00, _s01);
-                v_expand(_s1 + dd8, _s10, _s11);
+                v_expand(v_add(_s0, dd8), _s00, _s01);
+                v_expand(v_add(_s1, dd8), _s10, _s11);
 
-                _s00 = v_shr(_s00*ds4);
-                _s01 = v_shr(_s01*ds4);
-                _s10 = v_shr(_s10*ds4);
-                _s11 = v_shr(_s11*ds4);
+                _s00 = v_shr(v_mul(_s00, ds4));
+                _s01 = v_shr(v_mul(_s01, ds4));
+                _s10 = v_shr(v_mul(_s10, ds4));
+                _s11 = v_shr(v_mul(_s11, ds4));
 
                 v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
                 v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
@@ -643,15 +643,15 @@ struct ColumnSum :
         {
             const int* Sp = (const int*)src[0];
             i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
            for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
            {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
            }
 #endif
 #endif
@@ -673,33 +673,33 @@ struct ColumnSum :
         if( haveScale )
         {
             i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+            for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
-                v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
-                v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale);
+                v_int32 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), _v_scale));
+                v_int32 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), _v_scale));
 
                 v_store(D + i, v_pack(v_s0d, v_s01d));
 
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
-                v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
-                v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale);
+                v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
+                v_int32x4 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), v_scale));
 
                 v_store(D + i, v_pack(v_s0d, v_s01d));
 
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -713,27 +713,27 @@ struct ColumnSum :
         else
         {
             i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
+            for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
                 v_store(D + i, v_pack(v_s0, v_s01));
 
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
                 v_store(D + i, v_pack(v_s0, v_s01));
 
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -792,15 +792,15 @@ struct ColumnSum :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
            {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
            }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
            {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
            }
 #endif
 #endif
@@ -822,33 +822,33 @@ struct ColumnSum :
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+            for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
-                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+                v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+                v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));
 
                 v_store(D + i, v_pack(v_s0d, v_s01d));
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
-                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+                v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+                v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
 
                 v_store(D + i, v_pack(v_s0d, v_s01d));
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -862,27 +862,27 @@ struct ColumnSum :
         else
         {
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
 
                 v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
 
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
-                v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+                v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
 
                 v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
 
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
-                v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+                v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
             }
 #endif
 #endif
@@ -939,15 +939,15 @@ struct ColumnSum :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
             }
 #endif
 #endif
@@ -969,25 +969,25 @@ struct ColumnSum :
         if( haveScale )
         {
             int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+            for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_int32 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), _v_scale));
 
                 v_store(D + i, v_s0d);
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
 
                 v_store(D + i, v_s0d);
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
@@ -1001,21 +1001,21 @@ struct ColumnSum :
         else
         {
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
 
                 v_store(D + i, v_s0);
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
 
                 v_store(D + i, v_s0);
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
@@ -1073,15 +1073,15 @@ struct ColumnSum :
         {
             const int* Sp = (const int*)src[0];
             int i = 0;
-#if CV_SIMD
-            for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
            {
-                v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+                v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
            }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
            {
-                v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+                v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
            }
 #endif
 #endif
@@ -1105,21 +1105,21 @@ struct ColumnSum :
         {
             int i = 0;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_float32 _v_scale = vx_setall_f32((float)_scale);
-            for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+            for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-                v_store(D + i, v_cvt_f32(v_s0) * _v_scale);
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+                v_store(D + i, v_mul(v_cvt_f32(v_s0), _v_scale));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             v_float32x4 v_scale = v_setall_f32((float)_scale);
             for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-                v_store(D + i, v_cvt_f32(v_s0) * v_scale);
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+                v_store(D + i, v_mul(v_cvt_f32(v_s0), v_scale));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
@@ -1134,19 +1134,19 @@ struct ColumnSum :
         {
             int i = 0;
 
-#if CV_SIMD
-            for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
             {
-                v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+                v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
                 v_store(D + i, v_cvt_f32(v_s0));
-                v_store(SUM + i, v_s0 - vx_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
             }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
             for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
             {
-                v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+                v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
                 v_store(D + i, v_cvt_f32(v_s0));
-                v_store(SUM + i, v_s0 - v_load(Sm + i));
+                v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
             }
 #endif
 #endif
diff --git a/modules/imgproc/src/morph.simd.hpp b/modules/imgproc/src/morph.simd.hpp
index 9b3023f8f0..ef813dccec 100644
--- a/modules/imgproc/src/morph.simd.hpp
+++ b/modules/imgproc/src/morph.simd.hpp
@@ -106,12 +106,12 @@ struct MorphNoVec
     int operator()(uchar**, int, uchar*, int) const { return 0; }
 };
 
-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
 
 template<class VecUpdate> struct MorphRowVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar* src, uchar* dst, int width, int cn) const
     {
@@ -121,52 +121,52 @@ template<class VecUpdate> struct MorphRowVec
         width *= cn;
         VecUpdate updateOp;
 
-        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+        for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
         {
             vtype s0 = vx_load((const stype*)src + i);
-            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
-            vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes);
-            vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes);
+            vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
+            vtype s2 = vx_load((const stype*)src + i + 2*VTraits<vtype>::vlanes());
+            vtype s3 = vx_load((const stype*)src + i + 3*VTraits<vtype>::vlanes());
             for (k = cn; k < _ksize; k += cn)
             {
                 s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
-                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes));
-                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
+                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*VTraits<vtype>::vlanes()));
+                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*VTraits<vtype>::vlanes()));
             }
             v_store((stype*)dst + i, s0);
-            v_store((stype*)dst + i + vtype::nlanes, s1);
-            v_store((stype*)dst + i + 2*vtype::nlanes, s2);
-            v_store((stype*)dst + i + 3*vtype::nlanes, s3);
+            v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+            v_store((stype*)dst + i + 2*VTraits<vtype>::vlanes(), s2);
+            v_store((stype*)dst + i + 3*VTraits<vtype>::vlanes(), s3);
         }
-        if( i <= width - 2*vtype::nlanes )
+        if( i <= width - 2*VTraits<vtype>::vlanes() )
         {
             vtype s0 = vx_load((const stype*)src + i);
-            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
+            vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
             for( k = cn; k < _ksize; k += cn )
             {
                 s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
             }
             v_store((stype*)dst + i, s0);
-            v_store((stype*)dst + i + vtype::nlanes, s1);
-            i += 2*vtype::nlanes;
+            v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+            i += 2*VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes )
+        if( i <= width - VTraits<vtype>::vlanes() )
         {
             vtype s = vx_load((const stype*)src + i);
             for( k = cn; k < _ksize; k += cn )
                 s = updateOp(s, vx_load((const stype*)src + i + k));
             v_store((stype*)dst + i, s);
-            i += vtype::nlanes;
+            i += VTraits<vtype>::vlanes();
         }
-        if( i <= width - vtype::nlanes/2 )
+        if( i <= width - VTraits<vtype>::vlanes()/2 )
         {
             vtype s = vx_load_low((const stype*)src + i);
             for( k = cn; k < _ksize; k += cn )
                 s = updateOp(s, vx_load_low((const stype*)src + i + k));
             v_store_low((stype*)dst + i, s);
-            i += vtype::nlanes/2;
+            i += VTraits<vtype>::vlanes()/2;
         }
 
         return i - i % cn;
@@ -179,7 +179,7 @@ template<class VecUpdate> struct MorphRowVec
 template<class VecUpdate> struct MorphColumnVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
     {
@@ -189,7 +189,7 @@ template<class VecUpdate> struct MorphColumnVec
         VecUpdate updateOp;
 
         for( i = 0; i < count + ksize - 1; i++ )
-            CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 );
+            CV_Assert( ((size_t)_src[i] & (VTraits<vtype>::vlanes()-1)) == 0 );
 
         const stype** src = (const stype**)_src;
         stype* dst = (stype*)_dst;
@@ -197,58 +197,58 @@ template<class VecUpdate> struct MorphColumnVec
 
         for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
         {
-            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+            for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
             {
                 const stype* sptr = src[1] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+                vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+                vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());
 
                 for( k = 2; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
                 }
 
                 sptr = src[0] + i;
                 v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-                v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+                v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                v_store(dst + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+                v_store(dst + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));
 
                 sptr = src[k] + i;
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-                v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+                v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                v_store(dst + dststep + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+                v_store(dst + dststep + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));
             }
-            if( i <= width - 2*vtype::nlanes )
+            if( i <= width - 2*VTraits<vtype>::vlanes() )
             {
                 const stype* sptr = src[1] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
 
                 for( k = 2; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
                 }
 
                 sptr = src[0] + i;
                 v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
 
                 sptr = src[k] + i;
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-                i += 2*vtype::nlanes;
+                v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+                i += 2*VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes )
+            if( i <= width - VTraits<vtype>::vlanes() )
             {
                 vtype s0 = vx_load_aligned(src[1] + i);
@@ -257,9 +257,9 @@ template<class VecUpdate> struct MorphColumnVec
 
                 v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i)));
                 v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i)));
-                i += vtype::nlanes;
+                i += VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes/2 )
+            if( i <= width - VTraits<vtype>::vlanes()/2 )
             {
                 vtype s0 = vx_load_low(src[1] + i);
@@ -268,66 +268,66 @@ template<class VecUpdate> struct MorphColumnVec
 
                 v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i)));
                 v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i)));
-                i += vtype::nlanes/2;
+                i += VTraits<vtype>::vlanes()/2;
             }
         }
 
         for( ; count > 0; count--, dst += dststep, src++ )
        {
-            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+            for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
             {
                 const stype* sptr = src[0] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+                vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+                vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());
 
                 for( k = 1; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
                 }
 
                 v_store(dst + i, s0);
-                v_store(dst + i + vtype::nlanes, s1);
-                v_store(dst + i + 2*vtype::nlanes, s2);
-                v_store(dst + i + 3*vtype::nlanes, s3);
+                v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+                v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+                v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
             }
-            if( i <= width - 2*vtype::nlanes )
+            if( i <= width - 2*VTraits<vtype>::vlanes() )
             {
                 const stype* sptr = src[0] + i;
                 vtype s0 = vx_load_aligned(sptr);
-                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
 
                 for( k = 1; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
                     s0 = updateOp(s0, vx_load_aligned(sptr));
-                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
                 }
 
                 v_store(dst + i, s0);
-                v_store(dst + i + vtype::nlanes, s1);
-                i += 2*vtype::nlanes;
+                v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+                i += 2*VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes )
+            if( i <= width - VTraits<vtype>::vlanes() )
             {
                 vtype s0 = vx_load_aligned(src[0] + i);
                 for( k = 1; k < _ksize; k++ )
                     s0 = updateOp(s0, vx_load_aligned(src[k] + i));
                 v_store(dst + i, s0);
-                i += vtype::nlanes;
+                i += VTraits<vtype>::vlanes();
             }
-            if( i <= width - vtype::nlanes/2 )
+            if( i <= width - VTraits<vtype>::vlanes()/2 )
             {
                 vtype s0 = vx_load_low(src[0] + i);
                 for( k = 1; k < _ksize; k++ )
                     s0 = updateOp(s0, vx_load_low(src[k] + i));
                 v_store_low(dst + i, s0);
-                i += vtype::nlanes/2;
+                i += VTraits<vtype>::vlanes()/2;
             }
         }
 
@@ -341,7 +341,7 @@ template<class VecUpdate> struct MorphColumnVec
 template<class VecUpdate> struct MorphVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     int operator()(uchar** _src, int nz, uchar* _dst, int width) const
     {
         CV_INSTRUMENT_REGION();
@@ -351,56 +351,56 @@ template<class VecUpdate> struct MorphVec
         int i, k;
         VecUpdate updateOp;
 
-        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+        for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
         {
             const stype* sptr = src[0] + i;
             vtype s0 = vx_load(sptr);
-            vtype s1 = vx_load(sptr + vtype::nlanes);
-            vtype s2 = vx_load(sptr + 2*vtype::nlanes);
-            vtype s3 = vx_load(sptr + 3*vtype::nlanes);
+            vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
+            vtype s2 = vx_load(sptr + 2*VTraits<vtype>::vlanes());
+            vtype s3 = vx_load(sptr + 3*VTraits<vtype>::vlanes());
 
             for( k = 1; k < nz; k++ )
             {
                 sptr = src[k] + i;
                 s0 = updateOp(s0, vx_load(sptr));
-                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
-                s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes));
-                s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes));
+                s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
+                s2 = updateOp(s2, vx_load(sptr + 2*VTraits<vtype>::vlanes()));
+                s3 = updateOp(s3, vx_load(sptr + 3*VTraits<vtype>::vlanes()));
            }
             v_store(dst + i, s0);
-            v_store(dst + i + vtype::nlanes, s1);
-            v_store(dst + i + 2*vtype::nlanes, s2);
-            v_store(dst + i + 3*vtype::nlanes, s3);
+            v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+            v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+            v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
         }
-        if( i <= width - 2*vtype::nlanes )
+        if( i <= width - 2*VTraits<vtype>::vlanes() )
        {
            const stype* sptr = src[0] + i;
            vtype s0 = vx_load(sptr);
-            vtype s1 = vx_load(sptr + vtype::nlanes);
+            vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
            for( k = 1; k < nz; k++ )
            {
                sptr = src[k] + i;
                s0 = updateOp(s0, vx_load(sptr));
-                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
+                s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
            }
            v_store(dst + i, s0);
-            v_store(dst + i + vtype::nlanes, s1);
-            i += 2*vtype::nlanes;
+            v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+            i += 2*VTraits<vtype>::vlanes();
        }
-        if( i <= width - vtype::nlanes )
+        if( i <= width - VTraits<vtype>::vlanes() )
        {
            vtype s0 = vx_load(src[0] + i);
            for( k = 1; k < nz; k++ )
                s0 = updateOp(s0, vx_load(src[k] + i));
            v_store(dst + i, s0);
-            i += vtype::nlanes;
+            i += VTraits<vtype>::vlanes();
        }
-        if( i <= width - vtype::nlanes/2 )
+        if( i <= width - VTraits<vtype>::vlanes()/2 )
        {
            vtype s0 = vx_load_low(src[0] + i);
            for( k = 1; k < nz; k++ )
                s0 = updateOp(s0, vx_load_low(src[k] + i));
            v_store_low(dst + i, s0);
-            i += vtype::nlanes/2;
+            i += VTraits<vtype>::vlanes()/2;
        }
         return i;
     }
diff --git a/modules/imgproc/src/shapedescr.cpp b/modules/imgproc/src/shapedescr.cpp
index 4c73910e27..194e5a0862 100644
--- a/modules/imgproc/src/shapedescr.cpp
+++ b/modules/imgproc/src/shapedescr.cpp
@@ -879,14 +879,14 @@ static Rect pointSetBoundingRect( const Mat& points )
     if( npoints == 0 )
         return Rect();
 
-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, loop tail related.
     const int64_t* pts = points.ptr<int64_t>();
 
     if( !is_float )
     {
        v_int32 minval, maxval;
        minval = maxval = v_reinterpret_as_s32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-        for( i = 1; i <= npoints - v_int32::nlanes/2; i+= v_int32::nlanes/2 )
+        for( i = 1; i <= npoints - VTraits<v_int32>::vlanes()/2; i+= VTraits<v_int32>::vlanes()/2 )
        {
            v_int32 ptXY2 = v_reinterpret_as_s32(vx_load(pts + i));
            minval = v_min(ptXY2, minval);
@@ -894,22 +894,22 @@ static Rect pointSetBoundingRect( const Mat& points )
        }
        minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
        maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
-        if( i <= npoints - v_int32::nlanes/4 )
+        if( i <= npoints - VTraits<v_int32>::vlanes()/4 )
        {
            v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
            minval = v_min(ptXY, minval);
            maxval = v_max(ptXY, maxval);
-            i += v_int64::nlanes/2;
+            i += VTraits<v_int64>::vlanes()/2;
        }
-        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
+        for(int j = 16; j < VTraits::vlanes(); j*=2)
        {
            minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
            maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
        }
-        xmin = minval.get0();
-        xmax = maxval.get0();
-        ymin = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))).get0();
-        ymax = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))).get0();
+        xmin = v_get0(minval);
+        xmax = v_get0(maxval);
+        ymin = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
+        ymax = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
 #if CV_SIMD_WIDTH > 16
        if( i < npoints )
        {
@@ -921,18 +921,18 @@ static Rect pointSetBoundingRect( const Mat& points )
                minval2 = v_min(ptXY, minval2);
                maxval2 = v_max(ptXY, maxval2);
            }
-            xmin = min(xmin, minval2.get0());
-            xmax = max(xmax, maxval2.get0());
-            ymin = min(ymin, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2))).get0());
-            ymax = max(ymax, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0());
+            xmin = min(xmin, v_get0(minval2));
+            xmax = max(xmax, v_get0(maxval2));
+            ymin = min(ymin, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2)))));
+            ymax = max(ymax, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2)))));
        }
-#endif
+#endif // CV_SIMD
    }
    else
    {
        v_float32 minval, maxval;
        minval = maxval = v_reinterpret_as_f32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-        for( i = 1; i <= npoints - v_float32::nlanes/2; i+= v_float32::nlanes/2 )
+        for( i = 1; i <= npoints - VTraits<v_float32>::vlanes()/2; i+= VTraits<v_float32>::vlanes()/2 )
        {
            v_float32 ptXY2 = v_reinterpret_as_f32(vx_load(pts + i));
            minval = v_min(ptXY2, minval);
@@ -940,22 +940,22 @@ static Rect pointSetBoundingRect( const Mat& points )
        }
        minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
        maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
-        if( i <= npoints - v_float32::nlanes/4 )
+        if( i <= npoints - VTraits<v_float32>::vlanes()/4 )
        {
            v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
            minval = v_min(ptXY, minval);
            maxval = v_max(ptXY, maxval);
-            i += v_float32::nlanes/4;
+            i += VTraits<v_float32>::vlanes()/4;
        }
-        for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
+        for(int j = 16; j < VTraits::vlanes(); j*=2)
        {
            minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
            maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
        }
-        xmin = cvFloor(minval.get0());
-        xmax = cvFloor(maxval.get0());
-        ymin = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))).get0());
-        ymax = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))).get0());
+        xmin = cvFloor(v_get0(minval));
+        xmax = cvFloor(v_get0(maxval));
+        ymin = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))));
+        ymax = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))));
 #if CV_SIMD_WIDTH > 16
        if( i < npoints )
        {
@@ -967,10 +967,10 @@ static Rect pointSetBoundingRect( const Mat& points )
                minval2 = v_min(ptXY, minval2);
                maxval2 = v_max(ptXY, maxval2);
            }
-            xmin = min(xmin, cvFloor(minval2.get0()));
-            xmax = max(xmax, cvFloor(maxval2.get0()));
-            ymin = min(ymin, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))).get0()));
-            ymax = max(ymax, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0()));
+            xmin = min(xmin, cvFloor(v_get0(minval2)));
+            xmax = max(xmax, cvFloor(v_get0(maxval2)));
+            ymin = min(ymin, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))))));
+            ymax = max(ymax, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))))));
        }
 #endif
    }
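
For reference, here is the minimal sketch of the rewritten loop-tail pattern mentioned above. This is illustrative only: the function `add_rows` and its variable names are assumed, not taken from the patch. Lane counts come from `VTraits<T>::vlanes()` at run time, and the old 128-bit fixed-size tail under `#if CV_SIMD_WIDTH > 16` becomes a plain scalar loop, which is the only form a variable-length backend (such as RVV) allows:

```cpp
// Illustrative sketch of the new-style loop-tail pattern (assumed names).
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

void add_rows(const int* a, const int* b, int* dst, int width)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    // VTraits<v_int32>::vlanes() is the runtime lane count; a scalable
    // backend has no compile-time v_int32::nlanes constant to use here.
    const int step = VTraits<v_int32>::vlanes();
    for (; i <= width - step; i += step)
    {
        // v_add replaces operator+, which the scalable backends do not
        // overload for vector types.
        v_store(dst + i, v_add(vx_load(a + i), vx_load(b + i)));
    }
    // No "#if CV_SIMD_WIDTH > 16" block with v_int32x4 here: fixed
    // 128-bit types cannot be mixed with a variable-length backend.
#endif
    for (; i < width; i++)   // scalar loop tail
        dst[i] = a[i] + b[i];
}
```

With a fixed-width backend this should compile to essentially the same code as before; with a scalable backend the same source builds unchanged because nothing in it names a fixed-width vector type.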