Merge pull request #24166 from hanliutong:rewrite-remaining

Rewrite Universal Intrinsic code: ImgProc (CV_SIMD_WIDTH related Part) #24166

Related PRs: #24058, #24132. The goal of this series of PRs is to rewrite the SIMD code blocks in the opencv/modules/imgproc folder using the new Universal Intrinsic API.

This PR mainly focuses on code that uses the `CV_SIMD_WIDTH` macro. This macro is sometimes used for loop tail processing, for example in `box_filter.simd.hpp` and `morph.simd.hpp`:

```cpp
#if CV_SIMD
int i = 0;
for (; i <= n - v_uint16::nlanes; i += v_uint16::nlanes) {
    // some universal intrinsic code
    // e.g. v_uint16
}
#if CV_SIMD_WIDTH > 16
for (; i <= n - v_uint16x8::nlanes; i += v_uint16x8::nlanes) {
    // handle the loop tail with 128-bit SIMD
    // e.g. v_uint16x8
}
#endif // CV_SIMD_WIDTH
#endif // CV_SIMD
```
The main difficulty is that the variable-length (scalable) Universal Intrinsic backends cannot use 128-bit fixed-length types such as `v_uint16x8`, so the fixed-width tail loop above cannot be kept. Therefore, this PR uses a scalar loop to handle the loop tail instead.
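For illustration, a minimal sketch of the rewritten pattern (the bound `n` and the loop bodies are placeholders, not code from this patch):

```cpp
int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
// The lane count is queried through VTraits at run time, so this loop
// also works for variable-length backends such as RVV.
for (; i <= n - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes()) {
    // same universal intrinsic body, with operators spelled as functions,
    // e.g. v_add(a, b) instead of a + b
}
#endif // CV_SIMD || CV_SIMD_SCALABLE
// The former 128-bit fixed-width tail loop is gone; remaining elements
// fall through to the scalar tail.
for (; i < n; i++) {
    // scalar code
}
```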

This PR is marked as draft because the changes to `box_filter.simd.hpp` trigger a compilation failure. The failure appears to be an internal compiler error (ICE) in GCC's RISC-V `vsetvl` pass:

```bash
box_filter.simd.hpp:1162:5: internal compiler error: Segmentation fault
 1162 |     }
      |     ^
0xe03883 crash_signal
        /wafer/share/gcc/gcc/toplev.cc:314
0x7ff261c4251f ???
        ./signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c:0
0x6bde48 hash_set<rtl_ssa::set_info*, false, default_hash_traits<rtl_ssa::set_info*> >::iterator::operator*()
        /wafer/share/gcc/gcc/hash-set.h:125
0x6bde48 extract_single_source
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:1184
0x6bde48 extract_single_source
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:1174
0x119ad9e pass_vsetvl::propagate_avl() const
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4087
0x119ceaf pass_vsetvl::execute(function*)
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4344
0x119ceaf pass_vsetvl::execute(function*)
        /wafer/share/gcc/gcc/config/riscv/riscv-vsetvl.cc:4325
Please submit a full bug report, with preprocessed source (by using -freport-bug).
Please include the complete backtrace with any bug report.
```

This PR compiles with Clang 16, and `opencv_test_imgproc` passes on QEMU.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
Files changed:

1. `modules/imgproc/src/box_filter.simd.hpp` (316 changed lines)
2. `modules/imgproc/src/morph.simd.hpp` (170 changed lines)
3. `modules/imgproc/src/shapedescr.cpp` (52 changed lines)
**modules/imgproc/src/box_filter.simd.hpp**

```diff
@@ -309,15 +309,15 @@ struct ColumnSum<int, uchar> :
 {
     const int* Sp = (const int*)src[0];
     int i = 0;
-#if CV_SIMD
-    for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
     }
 #endif
 #endif
@@ -339,37 +339,37 @@ struct ColumnSum<int, uchar> :
 if( haveScale )
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 _v_scale = vx_setall_f32((float)_scale);
-    for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+    for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
-        v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-        v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+        v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+        v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));
         v_uint16 v_dst = v_pack(v_s0d, v_s01d);
         v_pack_store(D + i, v_dst);
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_float32x4 v_scale = v_setall_f32((float)_scale);
     for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
-        v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-        v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+        v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+        v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
         v_uint16x8 v_dst = v_pack(v_s0d, v_s01d);
         v_pack_store(D + i, v_dst);
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
     }
 #endif
 #endif
@@ -383,29 +383,29 @@ struct ColumnSum<int, uchar> :
 else
 {
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
         v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
         v_pack_store(D + i, v_dst);
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
         v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01));
         v_pack_store(D + i, v_dst);
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
     }
 #endif
 #endif
@@ -480,15 +480,15 @@ public BaseColumnFilter
 {
     const ushort* Sp = (const ushort*)src[0];
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes() )
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes )
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
     }
 #endif
 #endif
@@ -510,27 +510,27 @@ public BaseColumnFilter
 if( haveScale )
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_uint32 _ds4 = vx_setall_u32((unsigned)ds);
     v_uint16 _dd8 = vx_setall_u16((ushort)dd);
-    for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes )
+    for( ; i <= width-VTraits<v_uint8>::vlanes(); i+=VTraits<v_uint8>::vlanes() )
     {
         v_uint16 _sm0 = vx_load(Sm + i);
-        v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes);
+        v_uint16 _sm1 = vx_load(Sm + i + VTraits<v_uint16>::vlanes());
         v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i));
-        v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes));
+        v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + VTraits<v_uint16>::vlanes()), vx_load(Sp + i + VTraits<v_uint16>::vlanes()));
         v_uint32 _s00, _s01, _s10, _s11;
-        v_expand(_s0 + _dd8, _s00, _s01);
-        v_expand(_s1 + _dd8, _s10, _s11);
+        v_expand(v_add(_s0, _dd8), _s00, _s01);
+        v_expand(v_add(_s1, _dd8), _s10, _s11);
-        _s00 = v_shr<SHIFT>(_s00*_ds4);
-        _s01 = v_shr<SHIFT>(_s01*_ds4);
-        _s10 = v_shr<SHIFT>(_s10*_ds4);
-        _s11 = v_shr<SHIFT>(_s11*_ds4);
+        _s00 = v_shr<SHIFT>(v_mul(_s00, _ds4));
+        _s01 = v_shr<SHIFT>(v_mul(_s01, _ds4));
+        _s10 = v_shr<SHIFT>(v_mul(_s10, _ds4));
+        _s11 = v_shr<SHIFT>(v_mul(_s11, _ds4));
         v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
         v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
@@ -540,9 +540,9 @@ public BaseColumnFilter
         v_store(D + i, v_pack_u(r0, r1));
         v_store(SUM + i, _s0);
-        v_store(SUM + i + v_uint16::nlanes, _s1);
+        v_store(SUM + i + VTraits<v_uint16>::vlanes(), _s1);
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
     v_uint16x8 dd8 = v_setall_u16((ushort)dd);
@@ -556,13 +556,13 @@ public BaseColumnFilter
         v_uint32x4 _s00, _s01, _s10, _s11;
-        v_expand(_s0 + dd8, _s00, _s01);
-        v_expand(_s1 + dd8, _s10, _s11);
+        v_expand(v_add(_s0, dd8), _s00, _s01);
+        v_expand(v_add(_s1, dd8), _s10, _s11);
-        _s00 = v_shr<SHIFT>(_s00*ds4);
-        _s01 = v_shr<SHIFT>(_s01*ds4);
-        _s10 = v_shr<SHIFT>(_s10*ds4);
-        _s11 = v_shr<SHIFT>(_s11*ds4);
+        _s00 = v_shr<SHIFT>(v_mul(_s00, ds4));
+        _s01 = v_shr<SHIFT>(v_mul(_s01, ds4));
+        _s10 = v_shr<SHIFT>(v_mul(_s10, ds4));
+        _s11 = v_shr<SHIFT>(v_mul(_s11, ds4));
         v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
         v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
@@ -643,15 +643,15 @@ struct ColumnSum<int, short> :
 {
     const int* Sp = (const int*)src[0];
     i = 0;
-#if CV_SIMD
-    for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
     }
 #endif
 #endif
@@ -673,33 +673,33 @@ struct ColumnSum<int, short> :
 if( haveScale )
 {
     i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 _v_scale = vx_setall_f32((float)_scale);
-    for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+    for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
-        v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
-        v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale);
+        v_int32 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), _v_scale));
+        v_int32 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), _v_scale));
         v_store(D + i, v_pack(v_s0d, v_s01d));
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_float32x4 v_scale = v_setall_f32((float)_scale);
    for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
-        v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
-        v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale);
+        v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
+        v_int32x4 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), v_scale));
         v_store(D + i, v_pack(v_s0d, v_s01d));
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
     }
 #endif
 #endif
@@ -713,27 +713,27 @@ struct ColumnSum<int, short> :
 else
 {
     i = 0;
-#if CV_SIMD
-    for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes )
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
+    for( ; i <= width-VTraits<v_int16>::vlanes(); i+=VTraits<v_int16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
         v_store(D + i, v_pack(v_s0, v_s01));
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
         v_store(D + i, v_pack(v_s0, v_s01));
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
     }
 #endif
 #endif
@@ -792,15 +792,15 @@ struct ColumnSum<int, ushort> :
 {
     const int* Sp = (const int*)src[0];
     int i = 0;
-#if CV_SIMD
-    for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
     }
 #endif
 #endif
@@ -822,33 +822,33 @@ struct ColumnSum<int, ushort> :
 if( haveScale )
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 _v_scale = vx_setall_f32((float)_scale);
-    for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+    for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
-        v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale));
-        v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale));
+        v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), _v_scale)));
+        v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), _v_scale)));
         v_store(D + i, v_pack(v_s0d, v_s01d));
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_float32x4 v_scale = v_setall_f32((float)_scale);
     for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
-        v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale));
-        v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale));
+        v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale)));
+        v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale)));
         v_store(D + i, v_pack(v_s0d, v_s01d));
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
    }
 #endif
 #endif
@@ -862,27 +862,27 @@ struct ColumnSum<int, ushort> :
 else
 {
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width-VTraits<v_uint16>::vlanes(); i+=VTraits<v_uint16>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s01 = v_add(vx_load(SUM + i + VTraits<v_int32>::vlanes()), vx_load(Sp + i + VTraits<v_int32>::vlanes()));
         v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
-        v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
+        v_store(SUM + i + VTraits<v_int32>::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits<v_int32>::vlanes())));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes));
         v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)));
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
-        v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
+        v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes)));
     }
 #endif
 #endif
@@ -939,15 +939,15 @@ struct ColumnSum<int, int> :
 {
     const int* Sp = (const int*)src[0];
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
    for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
     }
 #endif
 #endif
@@ -969,25 +969,25 @@ struct ColumnSum<int, int> :
 if( haveScale )
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 _v_scale = vx_setall_f32((float)_scale);
-    for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+    for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_int32 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), _v_scale));
         v_store(D + i, v_s0d);
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_float32x4 v_scale = v_setall_f32((float)_scale);
     for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale));
         v_store(D + i, v_s0d);
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
     }
 #endif
 #endif
@@ -1001,21 +1001,21 @@ struct ColumnSum<int, int> :
 else
 {
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
         v_store(D + i, v_s0);
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
         v_store(D + i, v_s0);
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
     }
 #endif
 #endif
@@ -1073,15 +1073,15 @@ struct ColumnSum<int, float> :
 {
     const int* Sp = (const int*)src[0];
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width - VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i));
+        v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i));
+        v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i)));
    }
 #endif
 #endif
@@ -1105,21 +1105,21 @@ struct ColumnSum<int, float> :
 {
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 _v_scale = vx_setall_f32((float)_scale);
-    for (; i <= width - v_int32::nlanes; i += v_int32::nlanes)
+    for (; i <= width - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
-        v_store(D + i, v_cvt_f32(v_s0) * _v_scale);
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
+        v_store(D + i, v_mul(v_cvt_f32(v_s0), _v_scale));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     v_float32x4 v_scale = v_setall_f32((float)_scale);
     for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes)
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
-        v_store(D + i, v_cvt_f32(v_s0) * v_scale);
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
+        v_store(D + i, v_mul(v_cvt_f32(v_s0), v_scale));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
    }
 #endif
 #endif
@@ -1134,19 +1134,19 @@ struct ColumnSum<int, float> :
 {
     int i = 0;
-#if CV_SIMD
-    for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for( ; i <= width-VTraits<v_int32>::vlanes(); i+=VTraits<v_int32>::vlanes() )
     {
-        v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i);
+        v_int32 v_s0 = v_add(vx_load(SUM + i), vx_load(Sp + i));
         v_store(D + i, v_cvt_f32(v_s0));
-        v_store(SUM + i, v_s0 - vx_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i)));
     }
-#if CV_SIMD_WIDTH > 16
+#if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16
     for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes )
     {
-        v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i);
+        v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i));
         v_store(D + i, v_cvt_f32(v_s0));
-        v_store(SUM + i, v_s0 - v_load(Sm + i));
+        v_store(SUM + i, v_sub(v_s0, v_load(Sm + i)));
     }
 #endif
 #endif
```

**modules/imgproc/src/morph.simd.hpp**

```diff
@@ -106,12 +106,12 @@ struct MorphNoVec
     int operator()(uchar**, int, uchar*, int) const { return 0; }
 };
-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, GCC 13 related
 template<class VecUpdate> struct MorphRowVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar* src, uchar* dst, int width, int cn) const
     {
@@ -121,52 +121,52 @@ template<class VecUpdate> struct MorphRowVec
 width *= cn;
 VecUpdate updateOp;
-for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
 {
     vtype s0 = vx_load((const stype*)src + i);
-    vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
-    vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes);
-    vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes);
+    vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
+    vtype s2 = vx_load((const stype*)src + i + 2*VTraits<vtype>::vlanes());
+    vtype s3 = vx_load((const stype*)src + i + 3*VTraits<vtype>::vlanes());
     for (k = cn; k < _ksize; k += cn)
     {
         s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-        s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
-        s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes));
-        s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes));
+        s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
+        s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*VTraits<vtype>::vlanes()));
+        s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*VTraits<vtype>::vlanes()));
     }
     v_store((stype*)dst + i, s0);
-    v_store((stype*)dst + i + vtype::nlanes, s1);
-    v_store((stype*)dst + i + 2*vtype::nlanes, s2);
-    v_store((stype*)dst + i + 3*vtype::nlanes, s3);
+    v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+    v_store((stype*)dst + i + 2*VTraits<vtype>::vlanes(), s2);
+    v_store((stype*)dst + i + 3*VTraits<vtype>::vlanes(), s3);
 }
-if( i <= width - 2*vtype::nlanes )
+if( i <= width - 2*VTraits<vtype>::vlanes() )
 {
     vtype s0 = vx_load((const stype*)src + i);
-    vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
+    vtype s1 = vx_load((const stype*)src + i + VTraits<vtype>::vlanes());
     for( k = cn; k < _ksize; k += cn )
     {
         s0 = updateOp(s0, vx_load((const stype*)src + i + k));
-        s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
+        s1 = updateOp(s1, vx_load((const stype*)src + i + k + VTraits<vtype>::vlanes()));
     }
     v_store((stype*)dst + i, s0);
-    v_store((stype*)dst + i + vtype::nlanes, s1);
-    i += 2*vtype::nlanes;
+    v_store((stype*)dst + i + VTraits<vtype>::vlanes(), s1);
+    i += 2*VTraits<vtype>::vlanes();
 }
-if( i <= width - vtype::nlanes )
+if( i <= width - VTraits<vtype>::vlanes() )
 {
     vtype s = vx_load((const stype*)src + i);
     for( k = cn; k < _ksize; k += cn )
         s = updateOp(s, vx_load((const stype*)src + i + k));
     v_store((stype*)dst + i, s);
-    i += vtype::nlanes;
+    i += VTraits<vtype>::vlanes();
 }
-if( i <= width - vtype::nlanes/2 )
+if( i <= width - VTraits<vtype>::vlanes()/2 )
 {
     vtype s = vx_load_low((const stype*)src + i);
     for( k = cn; k < _ksize; k += cn )
         s = updateOp(s, vx_load_low((const stype*)src + i + k));
     v_store_low((stype*)dst + i, s);
-    i += vtype::nlanes/2;
+    i += VTraits<vtype>::vlanes()/2;
 }
 return i - i % cn;
@@ -179,7 +179,7 @@ template<class VecUpdate> struct MorphRowVec
 template<class VecUpdate> struct MorphColumnVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
     {
@@ -189,7 +189,7 @@ template<class VecUpdate> struct MorphColumnVec
 VecUpdate updateOp;
 for( i = 0; i < count + ksize - 1; i++ )
-    CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 );
+    CV_Assert( ((size_t)_src[i] & (VTraits<v_uint8>::vlanes()-1)) == 0 );
 const stype** src = (const stype**)_src;
 stype* dst = (stype*)_dst;
@@ -197,58 +197,58 @@ template<class VecUpdate> struct MorphColumnVec
 for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
 {
-    for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+    for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
     {
         const stype* sptr = src[1] + i;
         vtype s0 = vx_load_aligned(sptr);
-        vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-        vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-        vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+        vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+        vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+        vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());
         for( k = 2; k < _ksize; k++ )
         {
             sptr = src[k] + i;
             s0 = updateOp(s0, vx_load_aligned(sptr));
-            s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-            s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-            s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+            s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+            s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+            s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
         }
         sptr = src[0] + i;
         v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-        v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-        v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-        v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+        v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+        v_store(dst + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+        v_store(dst + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));
         sptr = src[k] + i;
         v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-        v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-        v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
-        v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
+        v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+        v_store(dst + dststep + i + 2*VTraits<vtype>::vlanes(), updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes())));
+        v_store(dst + dststep + i + 3*VTraits<vtype>::vlanes(), updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes())));
     }
-    if( i <= width - 2*vtype::nlanes )
+    if( i <= width - 2*VTraits<vtype>::vlanes() )
     {
         const stype* sptr = src[1] + i;
         vtype s0 = vx_load_aligned(sptr);
-        vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+        vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
         for( k = 2; k < _ksize; k++ )
        {
             sptr = src[k] + i;
             s0 = updateOp(s0, vx_load_aligned(sptr));
-            s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+            s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
         }
         sptr = src[0] + i;
         v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
-        v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+        v_store(dst + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
         sptr = src[k] + i;
         v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
-        v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
-        i += 2*vtype::nlanes;
+        v_store(dst + dststep + i + VTraits<vtype>::vlanes(), updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes())));
+        i += 2*VTraits<vtype>::vlanes();
     }
-    if( i <= width - vtype::nlanes )
+    if( i <= width - VTraits<vtype>::vlanes() )
     {
         vtype s0 = vx_load_aligned(src[1] + i);
@@ -257,9 +257,9 @@ template<class VecUpdate> struct MorphColumnVec
         v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i)));
         v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i)));
-        i += vtype::nlanes;
+        i += VTraits<vtype>::vlanes();
     }
-    if( i <= width - vtype::nlanes/2 )
+    if( i <= width - VTraits<vtype>::vlanes()/2 )
     {
         vtype s0 = vx_load_low(src[1] + i);
@@ -268,66 +268,66 @@ template<class VecUpdate> struct MorphColumnVec
         v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i)));
         v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i)));
-        i += vtype::nlanes/2;
+        i += VTraits<vtype>::vlanes()/2;
     }
 }
 for( ; count > 0; count--, dst += dststep, src++ )
 {
-    for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
+    for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes())
     {
         const stype* sptr = src[0] + i;
         vtype s0 = vx_load_aligned(sptr);
-        vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
-        vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
-        vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);
+        vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
+        vtype s2 = vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes());
+        vtype s3 = vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes());
         for( k = 1; k < _ksize; k++ )
         {
            sptr = src[k] + i;
            s0 = updateOp(s0, vx_load_aligned(sptr));
-           s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
-           s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
-           s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
+           s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
+           s2 = updateOp(s2, vx_load_aligned(sptr + 2*VTraits<vtype>::vlanes()));
+           s3 = updateOp(s3, vx_load_aligned(sptr + 3*VTraits<vtype>::vlanes()));
         }
         v_store(dst + i, s0);
-        v_store(dst + i + vtype::nlanes, s1);
-        v_store(dst + i + 2*vtype::nlanes, s2);
-        v_store(dst + i + 3*vtype::nlanes, s3);
+        v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+        v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+        v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
     }
-    if( i <= width - 2*vtype::nlanes )
+    if( i <= width - 2*VTraits<vtype>::vlanes() )
     {
         const stype* sptr = src[0] + i;
         vtype s0 = vx_load_aligned(sptr);
-        vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+        vtype s1 = vx_load_aligned(sptr + VTraits<vtype>::vlanes());
         for( k = 1; k < _ksize; k++ )
         {
            sptr = src[k] + i;
            s0 = updateOp(s0, vx_load_aligned(sptr));
-           s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+           s1 = updateOp(s1, vx_load_aligned(sptr + VTraits<vtype>::vlanes()));
        }
         v_store(dst + i, s0);
-        v_store(dst + i + vtype::nlanes, s1);
-        i += 2*vtype::nlanes;
+        v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+        i += 2*VTraits<vtype>::vlanes();
     }
-    if( i <= width - vtype::nlanes )
+    if( i <= width - VTraits<vtype>::vlanes() )
     {
         vtype s0 = vx_load_aligned(src[0] + i);
         for( k = 1; k < _ksize; k++ )
             s0 = updateOp(s0, vx_load_aligned(src[k] + i));
         v_store(dst + i, s0);
-        i += vtype::nlanes;
+        i += VTraits<vtype>::vlanes();
     }
-    if( i <= width - vtype::nlanes/2 )
+    if( i <= width - VTraits<vtype>::vlanes()/2 )
    {
         vtype s0 = vx_load_low(src[0] + i);
         for( k = 1; k < _ksize; k++ )
             s0 = updateOp(s0, vx_load_low(src[k] + i));
         v_store_low(dst + i, s0);
-        i += vtype::nlanes/2;
+        i += VTraits<vtype>::vlanes()/2;
     }
 }
@@ -341,7 +341,7 @@ template<class VecUpdate> struct MorphColumnVec
 template<class VecUpdate> struct MorphVec
 {
     typedef typename VecUpdate::vtype vtype;
-    typedef typename vtype::lane_type stype;
+    typedef typename VTraits<vtype>::lane_type stype;
     int operator()(uchar** _src, int nz, uchar* _dst, int width) const
     {
         CV_INSTRUMENT_REGION();
@@ -351,56 +351,56 @@ template<class VecUpdate> struct MorphVec
 int i, k;
 VecUpdate updateOp;
-for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
+for( i = 0; i <= width - 4*VTraits<vtype>::vlanes(); i += 4*VTraits<vtype>::vlanes() )
 {
     const stype* sptr = src[0] + i;
     vtype s0 = vx_load(sptr);
-    vtype s1 = vx_load(sptr + vtype::nlanes);
-    vtype s2 = vx_load(sptr + 2*vtype::nlanes);
-    vtype s3 = vx_load(sptr + 3*vtype::nlanes);
+    vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
+    vtype s2 = vx_load(sptr + 2*VTraits<vtype>::vlanes());
+    vtype s3 = vx_load(sptr + 3*VTraits<vtype>::vlanes());
     for( k = 1; k < nz; k++ )
     {
         sptr = src[k] + i;
         s0 = updateOp(s0, vx_load(sptr));
-        s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
-        s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes));
-        s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes));
+        s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
+        s2 = updateOp(s2, vx_load(sptr + 2*VTraits<vtype>::vlanes()));
+        s3 = updateOp(s3, vx_load(sptr + 3*VTraits<vtype>::vlanes()));
     }
     v_store(dst + i, s0);
-    v_store(dst + i + vtype::nlanes, s1);
-    v_store(dst + i + 2*vtype::nlanes, s2);
-    v_store(dst + i + 3*vtype::nlanes, s3);
+    v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+    v_store(dst + i + 2*VTraits<vtype>::vlanes(), s2);
+    v_store(dst + i + 3*VTraits<vtype>::vlanes(), s3);
 }
-if( i <= width - 2*vtype::nlanes )
+if( i <= width - 2*VTraits<vtype>::vlanes() )
 {
     const stype* sptr = src[0] + i;
     vtype s0 = vx_load(sptr);
-    vtype s1 = vx_load(sptr + vtype::nlanes);
+    vtype s1 = vx_load(sptr + VTraits<vtype>::vlanes());
     for( k = 1; k < nz; k++ )
     {
         sptr = src[k] + i;
         s0 = updateOp(s0, vx_load(sptr));
-        s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
+        s1 = updateOp(s1, vx_load(sptr + VTraits<vtype>::vlanes()));
     }
     v_store(dst + i, s0);
-    v_store(dst + i + vtype::nlanes, s1);
-    i += 2*vtype::nlanes;
+    v_store(dst + i + VTraits<vtype>::vlanes(), s1);
+    i += 2*VTraits<vtype>::vlanes();
 }
-if( i <= width - vtype::nlanes )
+if( i <= width - VTraits<vtype>::vlanes() )
 {
     vtype s0 = vx_load(src[0] + i);
     for( k = 1; k < nz; k++ )
         s0 = updateOp(s0, vx_load(src[k] + i));
     v_store(dst + i, s0);
-    i += vtype::nlanes;
+    i += VTraits<vtype>::vlanes();
 }
-if( i <= width - vtype::nlanes/2 )
+if( i <= width - VTraits<vtype>::vlanes()/2 )
 {
     vtype s0 = vx_load_low(src[0] + i);
     for( k = 1; k < nz; k++ )
         s0 = updateOp(s0, vx_load_low(src[k] + i));
     v_store_low(dst + i, s0);
-    i += vtype::nlanes/2;
+    i += VTraits<vtype>::vlanes()/2;
 }
 return i;
 }
```

**modules/imgproc/src/shapedescr.cpp**

```diff
@@ -879,14 +879,14 @@ static Rect pointSetBoundingRect( const Mat& points )
 if( npoints == 0 )
     return Rect();
-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE, loop tail related.
 const int64_t* pts = points.ptr<int64_t>();
 if( !is_float )
 {
     v_int32 minval, maxval;
     minval = maxval = v_reinterpret_as_s32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-    for( i = 1; i <= npoints - v_int32::nlanes/2; i+= v_int32::nlanes/2 )
+    for( i = 1; i <= npoints - VTraits<v_int32>::vlanes()/2; i+= VTraits<v_int32>::vlanes()/2 )
     {
         v_int32 ptXY2 = v_reinterpret_as_s32(vx_load(pts + i));
         minval = v_min(ptXY2, minval);
@@ -894,22 +894,22 @@ static Rect pointSetBoundingRect( const Mat& points )
     }
     minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
     maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
-    if( i <= npoints - v_int32::nlanes/4 )
+    if( i <= npoints - VTraits<v_int32>::vlanes()/4 )
     {
         v_int32 ptXY = v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
         minval = v_min(ptXY, minval);
         maxval = v_max(ptXY, maxval);
-        i += v_int64::nlanes/2;
+        i += VTraits<v_int64>::vlanes()/2;
     }
-    for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
+    for(int j = 16; j < VTraits<v_uint8>::vlanes(); j*=2)
     {
         minval = v_min(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
         maxval = v_max(v_reinterpret_as_s32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
     }
-    xmin = minval.get0();
-    xmax = maxval.get0();
-    ymin = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))).get0();
-    ymax = v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))).get0();
+    xmin = v_get0(minval);
+    xmax = v_get0(maxval);
+    ymin = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval))));
+    ymax = v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval))));
 #if CV_SIMD_WIDTH > 16
     if( i < npoints )
     {
@@ -921,18 +921,18 @@ static Rect pointSetBoundingRect( const Mat& points )
             minval2 = v_min(ptXY, minval2);
             maxval2 = v_max(ptXY, maxval2);
         }
-        xmin = min(xmin, minval2.get0());
-        xmax = max(xmax, maxval2.get0());
-        ymin = min(ymin, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2))).get0());
-        ymax = max(ymax, v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0());
+        xmin = min(xmin, v_get0(minval2));
+        xmax = max(xmax, v_get0(maxval2));
+        ymin = min(ymin, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(minval2)))));
+        ymax = max(ymax, v_get0(v_reinterpret_as_s32(v_expand_high(v_reinterpret_as_u32(maxval2)))));
     }
-#endif
+#endif // CV_SIMD
 }
 else
 {
     v_float32 minval, maxval;
     minval = maxval = v_reinterpret_as_f32(vx_setall_s64(*pts)); //min[0]=pt.x, min[1]=pt.y, min[2]=pt.x, min[3]=pt.y
-    for( i = 1; i <= npoints - v_float32::nlanes/2; i+= v_float32::nlanes/2 )
+    for( i = 1; i <= npoints - VTraits<v_float32>::vlanes()/2; i+= VTraits<v_float32>::vlanes()/2 )
     {
         v_float32 ptXY2 = v_reinterpret_as_f32(vx_load(pts + i));
         minval = v_min(ptXY2, minval);
@@ -940,22 +940,22 @@ static Rect pointSetBoundingRect( const Mat& points )
     }
     minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
     maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
-    if( i <= npoints - v_float32::nlanes/4 )
+    if( i <= npoints - VTraits<v_float32>::vlanes()/4 )
     {
         v_float32 ptXY = v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(vx_load_low(pts + i))));
         minval = v_min(ptXY, minval);
         maxval = v_max(ptXY, maxval);
-        i += v_float32::nlanes/4;
+        i += VTraits<v_float32>::vlanes()/4;
     }
-    for(int j = 16; j < CV_SIMD_WIDTH; j*=2)
+    for(int j = 16; j < VTraits<v_uint8>::vlanes(); j*=2)
     {
         minval = v_min(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(minval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))));
         maxval = v_max(v_reinterpret_as_f32(v_expand_low(v_reinterpret_as_u32(maxval))), v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))));
     }
-    xmin = cvFloor(minval.get0());
-    xmax = cvFloor(maxval.get0());
-    ymin = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval))).get0());
-    ymax = cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval))).get0());
+    xmin = cvFloor(v_get0(minval));
+    xmax = cvFloor(v_get0(maxval));
+    ymin = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval)))));
+    ymax = cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval)))));
 #if CV_SIMD_WIDTH > 16
     if( i < npoints )
     {
@@ -967,10 +967,10 @@ static Rect pointSetBoundingRect( const Mat& points )
             minval2 = v_min(ptXY, minval2);
             maxval2 = v_max(ptXY, maxval2);
         }
-        xmin = min(xmin, cvFloor(minval2.get0()));
-        xmax = max(xmax, cvFloor(maxval2.get0()));
-        ymin = min(ymin, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))).get0()));
-        ymax = max(ymax, cvFloor(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))).get0()));
+        xmin = min(xmin, cvFloor(v_get0(minval2)));
+        xmax = max(xmax, cvFloor(v_get0(maxval2)));
+        ymin = min(ymin, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(minval2))))));
+        ymax = max(ymax, cvFloor(v_get0(v_reinterpret_as_f32(v_expand_high(v_reinterpret_as_u32(maxval2))))));
     }
 #endif
 }
```
