Merge pull request #24301 from hanliutong:rewrite-stereo-sift

Rewrite Universal Intrinsic code: features2d and calib3d module. #24301

The goal of this series of PRs is to modify the SIMD code blocks guarded by the CV_SIMD macro, rewriting them with the new Universal Intrinsic API.
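
For illustration, here is a minimal sketch (not taken from the patch; the function and buffer names are hypothetical) of the kind of mechanical transformation applied throughout the diff below: the `#if CV_SIMD` guard is widened to also cover scalable backends, the compile-time lane count `v_int16::nlanes` becomes the `VTraits<v_int16>::vlanes()` query, and operator overloads are replaced by named intrinsics such as `v_add`. It assumes the Universal Intrinsics header `opencv2/core/hal/intrin.hpp`.
```cpp
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Before: fixed-width style. nlanes is a compile-time constant and the
// arithmetic relies on operator overloads, so this only builds with CV_SIMD.
#if CV_SIMD
static void add_s16_old(const short* a, const short* b, short* dst, int n)
{
    int i = 0;
    for (; i <= n - v_int16::nlanes; i += v_int16::nlanes)
        v_store(dst + i, vx_load(a + i) + vx_load(b + i));
    for (; i < n; i++)                      // scalar tail
        dst[i] = (short)(a[i] + b[i]);
}
#endif

// After: the lane count is queried via VTraits and the sum is written with
// the named intrinsic v_add, so the same loop also builds for scalable
// backends (CV_SIMD_SCALABLE), e.g. RVV.
#if (CV_SIMD || CV_SIMD_SCALABLE)
static void add_s16_new(const short* a, const short* b, short* dst, int n)
{
    int i = 0;
    for (; i <= n - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
        v_store(dst + i, v_add(vx_load(a + i), vx_load(b + i)));
    for (; i < n; i++)                      // scalar tail
        dst[i] = (short)(a[i] + b[i]);
}
#endif
```
The diff below applies exactly this pattern to the stereo matching and SIFT loops.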

This PR covers the features2d and calib3d modules.

Tested with Clang 16 and QEMU v7.0.0. `AP3P.ctheta1p_nan_23607` failed because of a small calculation error, but this patch does not touch the relevant code, and the failure always reproduces on QEMU regardless of whether the patch is applied. I think we can ignore it.
```
[ RUN      ] AP3P.ctheta1p_nan_23607
/home/hanliutong/project/opencv/modules/calib3d/test/test_solvepnp_ransac.cpp:2319: Failure
Expected: (cvtest::norm(res.colRange(0, 2), expected, NORM_INF)) <= (3e-16), actual: 3.33067e-16 vs 3e-16
[  FAILED  ] AP3P.ctheta1p_nan_23607 (26 ms)

...

[==========] 148 tests from 64 test cases ran. (1147114 ms total)
[  PASSED  ] 147 tests.
[  FAILED  ] 1 test, listed below:
[  FAILED  ] AP3P.ctheta1p_nan_23607
```

Note: 2 test cases fail with GCC 13.2.1 even without this patch; it seems something is wrong with the RVV part on GCC.
```
[----------] Global test environment tear-down
[==========] 148 tests from 64 test cases ran. (1511399 ms total)
[  PASSED  ] 146 tests.
[  FAILED  ] 2 tests, listed below:
[  FAILED  ] Calib3d_StereoSGBM.regression
[  FAILED  ] Calib3d_StereoSGBM_HH4.regression
```

The patch is partially auto-generated using the [rewriter](https://github.com/hanliutong/rewriter).

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
Changed files:
1. modules/calib3d/src/stereobm.cpp (218 changes)
2. modules/calib3d/src/stereosgbm.cpp (310 changes)
3. modules/features2d/src/sift.simd.hpp (110 changes)

@ -231,13 +231,13 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
x = 1;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_int16 ftz = vx_setall_s16((short) ftzero);
v_int16 ftz2 = vx_setall_s16((short)(ftzero*2));
v_int16 z = vx_setzero_s16();
for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes)
for(; x <= (size.width - 1) - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
{
v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1));
v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1));
@ -248,13 +248,13 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1));
v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1));
v_int16 d0 = s00 - s01;
v_int16 d1 = s10 - s11;
v_int16 d2 = s20 - s21;
v_int16 d3 = s30 - s31;
v_int16 d0 = v_sub(s00, s01);
v_int16 d1 = v_sub(s10, s11);
v_int16 d2 = v_sub(s20, s21);
v_int16 d3 = v_sub(s30, s31);
v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(v_add(v_add(v_add(v_add(d0, d1), d1), d2), ftz), ftz2), z));
v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(v_add(v_add(v_add(v_add(d1, d2), d2), d3), ftz), ftz2), z));
v_pack_store(dptr0 + x, v0);
v_pack_store(dptr1 + x, v1);
@ -277,10 +277,10 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
{
uchar* dptr = dst.ptr<uchar>(y);
x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 val0_16 = vx_setall_u8(val0);
for(; x <= size.width-v_uint8::nlanes; x+=v_uint8::nlanes)
for(; x <= size.width-VTraits<v_uint8>::vlanes(); x+=VTraits<v_uint8>::vlanes())
v_store(dptr + x, val0_16);
}
#endif
@ -356,7 +356,7 @@ public:
for (size_t i = 0; i < nstripes; ++i)
{
// 1D: [1][ ndisp ][1]
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (params.useShorts())
area.allocate(sad_short[i], ndisp + 2);
else
@ -364,7 +364,7 @@ public:
area.allocate(sad[i], ndisp + 2);
// 2D: [ wsz/2 + 1 ][ height ][ wsz/2 + 1 ] * [ ndisp ]
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (params.useShorts())
area.allocate(hsad_short[i], (height + wsz + 2) * ndisp);
else
@ -390,7 +390,7 @@ public:
}
};
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template <typename dType>
static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
Mat& disp, Mat& cost, const StereoBMParams& state,
@ -422,8 +422,8 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
short costbuf = 0;
int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
const uchar * tab = bufX.tab;
short v_seq[v_int16::nlanes];
for (short i = 0; i < v_int16::nlanes; ++i)
short v_seq[VTraits<v_int16>::max_nlanes];
for (short i = 0; i < VTraits<v_int16>::vlanes(); ++i)
v_seq[i] = i;
ushort *sad = bufX.sad_short[bufNum] + 1;
@ -446,19 +446,19 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
{
int lval = lptr[0];
v_uint8 lv = vx_setall_u8((uchar)lval);
for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
for( d = 0; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
{
v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
v_store(cbuf + d, diff);
v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff));
v_store(hsad + d, v_add(vx_load(hsad + d), v_expand_low(diff)));
v_store(hsad + d + VTraits<v_uint16>::vlanes(), v_add(vx_load(hsad + d + VTraits<v_uint16>::vlanes()), v_expand_high(diff)));
}
if( d <= ndisp - v_uint16::nlanes )
if( d <= ndisp - VTraits<v_uint16>::vlanes() )
{
v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
v_store_low(cbuf + d, diff);
v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
d += v_uint16::nlanes;
v_store(hsad + d, v_add(vx_load(hsad + d), v_expand_low(diff)));
d += VTraits<v_uint16>::vlanes();
}
for( ; d < ndisp; d++ )
{
@ -496,20 +496,20 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
{
int lval = lptr[0];
v_uint8 lv = vx_setall_u8((uchar)lval);
for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
for( d = 0; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
{
v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d));
v_store(cbuf + d, diff);
v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs)));
v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs)));
v_store(hsad + d, v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d), v_expand_low(diff))), v_expand_low(cbs))));
v_store(hsad + d + VTraits<v_uint16>::vlanes(), v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d + VTraits<v_uint16>::vlanes()), v_expand_high(diff))), v_expand_high(cbs))));
}
if( d <= ndisp - v_uint16::nlanes)
if( d <= ndisp - VTraits<v_uint16>::vlanes())
{
v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
v_store_low(cbuf + d, diff);
v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d)));
d += v_uint16::nlanes;
v_store(hsad + d, v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d), v_expand_low(diff))), vx_load_expand((schar *)cbuf_sub + d))));
d += VTraits<v_uint16>::vlanes();
}
for( ; d < ndisp; d++ )
{
@ -533,20 +533,20 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
hsad = hsad0 + (1 - dy0)*ndisp;
for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
{
for( d = 0; d <= ndisp-2*v_uint16::nlanes; d += 2*v_uint16::nlanes )
for( d = 0; d <= ndisp-2*VTraits<v_uint16>::vlanes(); d += 2*VTraits<v_uint16>::vlanes() )
{
v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes));
v_store(sad + d, v_add(vx_load(sad + d), vx_load(hsad + d)));
v_store(sad + d + VTraits<v_uint16>::vlanes(), v_add(vx_load(sad + d + VTraits<v_uint16>::vlanes()), vx_load(hsad + d + VTraits<v_uint16>::vlanes())));
}
if( d <= ndisp-v_uint16::nlanes )
if( d <= ndisp-VTraits<v_uint16>::vlanes() )
{
v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
d += v_uint16::nlanes;
v_store(sad + d, v_add(vx_load(sad + d), vx_load(hsad + d)));
d += VTraits<v_uint16>::vlanes();
}
if( d <= ndisp-v_uint16::nlanes/2 )
if( d <= ndisp-VTraits<v_uint16>::vlanes()/2 )
{
v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d));
d += v_uint16::nlanes/2;
v_store_low(sad + d, v_add(vx_load_low(sad + d), vx_load_low(hsad + d)));
d += VTraits<v_uint16>::vlanes()/2;
}
for( ; d < ndisp; d++ )
sad[d] = sad[d] + hsad[d];
@ -564,29 +564,29 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
v_int16 minsad8 = vx_setall_s16(SHRT_MAX);
v_int16 mind8 = vx_setall_s16(0);
for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
for( d = 0; d <= ndisp - 2*VTraits<v_int16>::vlanes(); d += 2*VTraits<v_int16>::vlanes() )
{
v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
v_int16 sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d)), v_reinterpret_as_s16(vx_load(hsad_sub + d))), v_reinterpret_as_s16(vx_load(sad + d)));
v_store(sad + d, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)d)));
minsad8 = v_min(minsad8, sad8);
sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes));
v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)(d+v_int16::nlanes)));
sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d + VTraits<v_int16>::vlanes())), v_reinterpret_as_s16(vx_load(hsad_sub + d + VTraits<v_int16>::vlanes()))), v_reinterpret_as_s16(vx_load(sad + d + VTraits<v_int16>::vlanes())));
v_store(sad + d + VTraits<v_int16>::vlanes(), v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)(d + VTraits<v_int16>::vlanes()))));
minsad8 = v_min(minsad8, sad8);
}
if( d <= ndisp - v_int16::nlanes )
if( d <= ndisp - VTraits<v_int16>::vlanes() )
{
v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
v_int16 sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d)), v_reinterpret_as_s16(vx_load(hsad_sub + d))), v_reinterpret_as_s16(vx_load(sad + d)));
v_store(sad + d, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)d)));
minsad8 = v_min(minsad8, sad8);
d += v_int16::nlanes;
d += VTraits<v_int16>::vlanes();
}
minsad = v_reduce_min(minsad8);
v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8);
mind = v_reduce_min(((mind8+vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
v_int16 v_mask = (v_eq(vx_setall_s16((short)minsad), minsad8));
mind = v_reduce_min(v_or(v_and(v_add(mind8, vx_load(v_seq)), v_mask), v_and(vx_setall_s16(32767), v_not(v_mask))));
for( ; d < ndisp; d++ )
{
int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d];
@ -610,34 +610,34 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
int thresh = minsad + (minsad * uniquenessRatio/100);
v_int32 thresh4 = vx_setall_s32(thresh + 1);
v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1);
v_int32 dd_4 = vx_setall_s32(v_int32::nlanes);
v_int32 dd_4 = vx_setall_s32(VTraits<v_int32>::vlanes());
v_int32 d4 = vx_load_expand(v_seq);
for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes )
for( d = 0; d <= ndisp - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
{
v_int32 sad4_l, sad4_h;
v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h);
if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
if( v_check_any(v_and(v_gt(thresh4, sad4_l), v_or(v_gt(d1, d4), v_gt(d4, d2)))) )
break;
d4 += dd_4;
if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) )
d4 = v_add(d4, dd_4);
if( v_check_any(v_and(v_gt(thresh4, sad4_h), v_or(v_gt(d1, d4), v_gt(d4, d2)))) )
break;
d4 += dd_4;
d4 = v_add(d4, dd_4);
}
if( d <= ndisp - v_int16::nlanes )
if( d <= ndisp - VTraits<v_int16>::vlanes() )
{
dptr[y*dstep] = FILTERED;
continue;
}
if( d <= ndisp - v_int32::nlanes )
if( d <= ndisp - VTraits<v_int32>::vlanes() )
{
v_int32 sad4_l = vx_load_expand((short*)sad + d);
if (v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))))
if (v_check_any(v_and(v_gt(thresh4, sad4_l), v_or(v_gt(d1, d4), v_gt(d4, d2)))))
{
dptr[y*dstep] = FILTERED;
continue;
}
d += v_int16::nlanes;
d += VTraits<v_int16>::vlanes();
}
for( ; d < ndisp; d++ )
{
@ -699,11 +699,11 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
const uchar * tab = bufX.tab;
#if CV_SIMD
int v_seq[v_int32::nlanes];
for (int i = 0; i < v_int32::nlanes; ++i)
#if (CV_SIMD || CV_SIMD_SCALABLE)
int v_seq[VTraits<v_int32>::max_nlanes];
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
v_seq[i] = i;
v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(VTraits<v_int32>::vlanes());
#endif
int *sad = bufX.sad[bufNum] + 1;
@ -725,17 +725,17 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
{
int lval = lptr[0];
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 lv = vx_setall_u8((uchar)lval);
for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
for( ; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
{
v_uint8 rv = vx_load(rptr + d);
v_int32 hsad_0 = vx_load(hsad + d);
v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
v_int32 hsad_1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
v_int32 hsad_2 = vx_load(hsad + d + 2*VTraits<v_int32>::vlanes());
v_int32 hsad_3 = vx_load(hsad + d + 3*VTraits<v_int32>::vlanes());
v_uint8 diff = v_absdiff(lv, rv);
v_store(cbuf + d, diff);
@ -745,15 +745,15 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
v_expand(diff0, diff00, diff01);
v_expand(diff1, diff10, diff11);
hsad_0 += v_reinterpret_as_s32(diff00);
hsad_1 += v_reinterpret_as_s32(diff01);
hsad_2 += v_reinterpret_as_s32(diff10);
hsad_3 += v_reinterpret_as_s32(diff11);
hsad_0 = v_add(hsad_0, v_reinterpret_as_s32(diff00));
hsad_1 = v_add(hsad_1, v_reinterpret_as_s32(diff01));
hsad_2 = v_add(hsad_2, v_reinterpret_as_s32(diff10));
hsad_3 = v_add(hsad_3, v_reinterpret_as_s32(diff11));
v_store(hsad + d, hsad_0);
v_store(hsad + d + v_int32::nlanes, hsad_1);
v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
v_store(hsad + d + VTraits<v_int32>::vlanes(), hsad_1);
v_store(hsad + d + 2*VTraits<v_int32>::vlanes(), hsad_2);
v_store(hsad + d + 3*VTraits<v_int32>::vlanes(), hsad_3);
}
}
#endif
@ -793,16 +793,16 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
{
int lval = lptr[0];
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 lv = vx_setall_u8((uchar)lval);
for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
for( ; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
{
v_uint8 rv = vx_load(rptr + d);
v_int32 hsad_0 = vx_load(hsad + d);
v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
v_int32 hsad_1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
v_int32 hsad_2 = vx_load(hsad + d + 2*VTraits<v_int32>::vlanes());
v_int32 hsad_3 = vx_load(hsad + d + 3*VTraits<v_int32>::vlanes());
v_uint8 cbs = vx_load(cbuf_sub + d);
v_uint8 diff = v_absdiff(lv, rv);
v_store(cbuf + d, diff);
@ -816,19 +816,19 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01);
v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11);
v_int32 diff_0 = diff00 - cbs00;
v_int32 diff_1 = diff01 - cbs01;
v_int32 diff_2 = diff10 - cbs10;
v_int32 diff_3 = diff11 - cbs11;
hsad_0 += diff_0;
hsad_1 += diff_1;
hsad_2 += diff_2;
hsad_3 += diff_3;
v_int32 diff_0 = v_sub(diff00, cbs00);
v_int32 diff_1 = v_sub(diff01, cbs01);
v_int32 diff_2 = v_sub(diff10, cbs10);
v_int32 diff_3 = v_sub(diff11, cbs11);
hsad_0 = v_add(hsad_0, diff_0);
hsad_1 = v_add(hsad_1, diff_1);
hsad_2 = v_add(hsad_2, diff_2);
hsad_3 = v_add(hsad_3, diff_3);
v_store(hsad + d, hsad_0);
v_store(hsad + d + v_int32::nlanes, hsad_1);
v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
v_store(hsad + d + VTraits<v_int32>::vlanes(), hsad_1);
v_store(hsad + d + 2*VTraits<v_int32>::vlanes(), hsad_2);
v_store(hsad + d + 3*VTraits<v_int32>::vlanes(), hsad_3);
}
}
#endif
@ -855,18 +855,18 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
{
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
for( d = 0; d <= ndisp-2*v_int32::nlanes; d += 2*v_int32::nlanes )
for( d = 0; d <= ndisp-2*VTraits<v_int32>::vlanes(); d += 2*VTraits<v_int32>::vlanes() )
{
v_int32 s0 = vx_load(sad + d);
v_int32 s1 = vx_load(sad + d + v_int32::nlanes);
v_int32 s1 = vx_load(sad + d + VTraits<v_int32>::vlanes());
v_int32 t0 = vx_load(hsad + d);
v_int32 t1 = vx_load(hsad + d + v_int32::nlanes);
s0 += t0;
s1 += t1;
v_int32 t1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
s0 = v_add(s0, t0);
s1 = v_add(s1, t1);
v_store(sad + d, s0);
v_store(sad + d + v_int32::nlanes, s1);
v_store(sad + d + VTraits<v_int32>::vlanes(), s1);
}
}
#endif
@ -884,30 +884,30 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_int32 minsad4 = vx_setall_s32(INT_MAX);
v_int32 mind4 = vx_setall_s32(0), d4 = d0_4;
for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
for( ; d <= ndisp - 2*VTraits<v_int32>::vlanes(); d += 2*VTraits<v_int32>::vlanes() )
{
v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d);
v_int32 sad4 = v_sub(v_add(vx_load(sad + d), vx_load(hsad + d)), vx_load(hsad_sub + d));
v_store(sad + d, sad4);
mind4 = v_select(minsad4 > sad4, d4, mind4);
mind4 = v_select(v_gt(minsad4, sad4), d4, mind4);
minsad4 = v_min(minsad4, sad4);
d4 += dd_4;
d4 = v_add(d4, dd_4);
sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes);
v_store(sad + d + v_int32::nlanes, sad4);
mind4 = v_select(minsad4 > sad4, d4, mind4);
sad4 = v_sub(v_add(vx_load(sad + d + VTraits<v_int32>::vlanes()), vx_load(hsad + d + VTraits<v_int32>::vlanes())), vx_load(hsad_sub + d + VTraits<v_int32>::vlanes()));
v_store(sad + d + VTraits<v_int32>::vlanes(), sad4);
mind4 = v_select(v_gt(minsad4, sad4), d4, mind4);
minsad4 = v_min(minsad4, sad4);
d4 += dd_4;
d4 = v_add(d4, dd_4);
}
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[VTraits<v_int32>::max_nlanes], mind_buf[VTraits<v_int32>::max_nlanes];
v_store(minsad_buf, minsad4);
v_store(mind_buf, mind4);
for (int i = 0; i < v_int32::nlanes; ++i)
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; }
}
#endif
@ -1102,7 +1102,7 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
Mat disp_i = disp->rowRange(row0, row1);
Mat cost_i = state.disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (state.useShorts())
{
if( disp_i.type() == CV_16S)

@ -123,7 +123,7 @@ struct StereoSGBMParams
int mode;
};
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
#if CV_SIMD_WIDTH == 16
static inline v_int16 vx_setseq_s16()
{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7); }
@ -136,10 +136,10 @@ static inline v_int16 vx_setseq_s16()
#else
struct vseq_s16
{
short data[v_int16::nlanes];
short data[VTraits<v_int16>::max_nlanes];
vseq_s16()
{
for (int i = 0; i < v_int16::nlanes; i++)
for (int i = 0; i < VTraits<v_int16>::vlanes(); i++)
data[i] = i;
}
};
@ -153,8 +153,8 @@ static inline v_int16 vx_setseq_s16()
static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_val, short &min_pos)
{
min_val = v_reduce_min(val);
v_int16 v_mask = (vx_setall_s16(min_val) == val);
min_pos = v_reduce_min(((pos+vx_setseq_s16()) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
v_int16 v_mask = (v_eq(vx_setall_s16(min_val), val));
min_pos = v_reduce_min(v_or(v_and(v_add(pos, vx_setseq_s16()), v_mask), v_and(vx_setall_s16(SHRT_MAX), v_not(v_mask))));
}
#endif
@ -270,26 +270,26 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
int u1 = std::max(ul, ur); u1 = std::max(u1, u);
int d = minD;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_uint8 _u = vx_setall_u8((uchar)u), _u0 = vx_setall_u8((uchar)u0);
v_uint8 _u1 = vx_setall_u8((uchar)u1);
for( ; d <= maxD - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
for( ; d <= maxD - 2*VTraits<v_int16>::vlanes(); d += 2*VTraits<v_int16>::vlanes() )
{
v_uint8 _v = vx_load(prow2 + width-x-1 + d);
v_uint8 _v0 = vx_load(buffer + width-x-1 + d);
v_uint8 _v1 = vx_load(buffer + width-x-1 + d + width2);
v_uint8 c0 = v_max(_u - _v1, _v0 - _u);
v_uint8 c1 = v_max(_v - _u1, _u0 - _v);
v_uint8 c0 = v_max(v_sub(_u, _v1), v_sub(_v0, _u));
v_uint8 c1 = v_max(v_sub(_v, _u1), v_sub(_u0, _v));
v_uint8 diff = v_min(c0, c1);
v_int16 _c0 = vx_load_aligned(cost + x*D + d);
v_int16 _c1 = vx_load_aligned(cost + x*D + d + v_int16::nlanes);
v_int16 _c1 = vx_load_aligned(cost + x*D + d + VTraits<v_int16>::vlanes());
v_uint16 diff1,diff2;
v_expand(diff,diff1,diff2);
v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
v_store_aligned(cost + x*D + d + v_int16::nlanes, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
v_store_aligned(cost + x*D + d, v_add(_c0, v_reinterpret_as_s16(v_shr(diff1, diff_scale))));
v_store_aligned(cost + x*D + d + VTraits<v_int16>::vlanes(), v_add(_c1, v_reinterpret_as_s16(v_shr(diff2, diff_scale))));
}
#endif
for( ; d < maxD; d++ )
@ -555,13 +555,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() );
memset(hsumAdd, 0, Da*sizeof(CostType));
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 h_scale = vx_setall_s16((short)SW2 + 1);
for( d = 0; d < Da; d += v_int16::nlanes )
for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
{
v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale;
v_int16 v_hsumAdd = v_mul(vx_load_aligned(mem.pixDiff + d), h_scale);
for( x = Da; x <= SW2*Da; x += Da )
v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d);
v_hsumAdd = v_add(v_hsumAdd, vx_load_aligned(mem.pixDiff + x + d));
v_store_aligned(hsumAdd + d, v_hsumAdd);
}
#else
@ -578,9 +578,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
v_store_aligned(C + d, vx_load_aligned(Cprev + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
v_store_aligned(C + d, v_sub(v_add(vx_load_aligned(Cprev + d), vx_load_aligned(hsumAdd + d)), vx_load_aligned(hsumSub + d)));
#else
for (d = 0; d < D; d++)
C[d] = (CostType)(Cprev[d] + hsumAdd[d] - hsumSub[d]);
@ -590,12 +590,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
{
const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes )
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
{
v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
v_int16 hv = v_add(v_sub(vx_load_aligned(hsumAdd + x - Da + d), vx_load_aligned(pixSub + d)), vx_load_aligned(pixAdd + d));
v_store_aligned(hsumAdd + x + d, hv);
v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
v_store_aligned(C + x + d, v_add(v_sub(vx_load_aligned(Cprev + x + d), vx_load_aligned(hsumSub + x + d)), hv));
}
#else
for( d = 0; d < D; d++ )
@ -608,10 +608,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
}
else
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
for (d = 0; d < Da; d += v_int16::nlanes)
v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
v_store_aligned(C + d, v_add(vx_load_aligned(C + d), v_mul(vx_load_aligned(hsumAdd + d), v_scale)));
#else
int scale = k == 0 ? SH2 + 1 : 1;
for (d = 0; d < D; d++)
@ -622,12 +622,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
{
v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
v_store_aligned(hsumAdd + x + d, hv);
v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
}
#else
for( d = 0; d < D; d++ )
@ -646,9 +646,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
{
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for (x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
v_store_aligned(C + x, v_add(v_sub(vx_load_aligned(Cprev + x), vx_load_aligned(hsumSub + x)), vx_load_aligned(hsumAdd + x)));
#else
for (x = 0; x < width1*Da; x++)
C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
@ -656,9 +656,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
}
else
{
#if CV_SIMD
for (x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
#else
for (x = 0; x < width1*Da; x++)
C[x] = (CostType)(C[x] + hsumAdd[x]);
@ -714,7 +714,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
CostType* minL = mem.getMinLr(lrID, x);
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _P1 = vx_setall_s16((short)P1);
v_int16 _delta0 = vx_setall_s16((short)delta0);
@ -726,31 +726,31 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
v_int16 _minL2 = vx_setall_s16((short)MAX_COST);
v_int16 _minL3 = vx_setall_s16((short)MAX_COST);
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 Spd = vx_load_aligned(Sp + d);
v_int16 L;
L = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), v_add(vx_load(Lr_p0 + d - 1), _P1)), v_add(vx_load(Lr_p0 + d + 1), _P1)), _delta0), _delta0), Cpd);
v_store_aligned(Lr_p + d, L);
_minL0 = v_min(_minL0, L);
Spd += L;
Spd = v_add(Spd, L);
L = v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), vx_load(Lr_p1 + d - 1) + _P1), vx_load(Lr_p1 + d + 1) + _P1), _delta1) - _delta1 + Cpd;
L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), v_add(vx_load(Lr_p1 + d - 1), _P1)), v_add(vx_load(Lr_p1 + d + 1), _P1)), _delta1), _delta1), Cpd);
v_store_aligned(Lr_p + d + Dlra, L);
_minL1 = v_min(_minL1, L);
Spd += L;
Spd = v_add(Spd, L);
L = v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), vx_load(Lr_p2 + d - 1) + _P1), vx_load(Lr_p2 + d + 1) + _P1), _delta2) - _delta2 + Cpd;
L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), v_add(vx_load(Lr_p2 + d - 1), _P1)), v_add(vx_load(Lr_p2 + d + 1), _P1)), _delta2), _delta2), Cpd);
v_store_aligned(Lr_p + d + Dlra*2, L);
_minL2 = v_min(_minL2, L);
Spd += L;
Spd = v_add(Spd, L);
L = v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), vx_load(Lr_p3 + d - 1) + _P1), vx_load(Lr_p3 + d + 1) + _P1), _delta3) - _delta3 + Cpd;
L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), v_add(vx_load(Lr_p3 + d - 1), _P1)), v_add(vx_load(Lr_p3 + d + 1), _P1)), _delta3), _delta3), Cpd);
v_store_aligned(Lr_p + d + Dlra*3, L);
_minL3 = v_min(_minL3, L);
Spd += L;
Spd = v_add(Spd, L);
v_store_aligned(Sp + d, Spd);
}
@ -769,7 +769,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
t0 = v_min(t0, t1);
t0 = v_min(t0, v_rotate_right<4>(t0));
#if CV_SIMD_WIDTH == 32
CostType buf[v_int16::nlanes];
CostType buf[VTraits<v_int16>::max_nlanes];
v_store_low(buf, v_min(t0, v_rotate_right<8>(t0)));
minL[0] = buf[0];
minL[1] = buf[1];
@ -817,10 +817,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
if( pass == npasses )
{
x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
v_int16 v_max_cost = vx_setall_s16(MAX_COST);
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes() )
{
v_store(disp1ptr + x, v_inv_dist);
v_store(mem.disp2ptr + x, v_inv_dist);
@ -850,23 +850,23 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
d = 0;
int delta0 = P2 + *mem.getMinLr(lrID, x + 1);
int minL0 = MAX_COST;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _P1 = vx_setall_s16((short)P1);
v_int16 _delta0 = vx_setall_s16((short)delta0);
v_int16 _minL0 = vx_setall_s16((short)MAX_COST);
v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L0 = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
v_int16 L0 = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), v_add(vx_load(Lr_p0 + d - 1), _P1)), v_add(vx_load(Lr_p0 + d + 1), _P1)), _delta0), _delta0), Cpd);
v_store_aligned(Lr_p + d, L0);
_minL0 = v_min(_minL0, L0);
L0 += vx_load_aligned(Sp + d);
L0 = v_add(L0, vx_load_aligned(Sp + d));
v_store_aligned(Sp + d, L0);
_bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
_bestDisp = v_select(v_gt(_minS, L0), vx_setall_s16((short)d), _bestDisp);
_minS = v_min(_minS, L0);
}
minL0 = (CostType)v_reduce_min(_minL0);
@ -891,12 +891,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
else
{
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
for( ; d <= D - v_int16::nlanes; d+= v_int16::nlanes )
for( ; d <= D - VTraits<v_int16>::vlanes(); d+= VTraits<v_int16>::vlanes() )
{
v_int16 L0 = vx_load_aligned(Sp + d);
_bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
_bestDisp = v_select(v_gt(_minS, L0), vx_setall_s16((short)d), _bestDisp);
_minS = v_min( L0, _minS );
}
min_pos(_minS, _bestDisp, minS, bestDisp);
@ -1039,9 +1039,9 @@ struct CalcVerticalSums: public ParallelLoopBody
for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da )
{
int xbord = x <= 0 ? 0 : (x > (width1 - 1)*Da ? (width1 - 1)*Da : x);
#if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes )
v_store_aligned(hsumAdd + x1*Da + d, vx_load_aligned(hsumAdd + x1*Da + d) + vx_load_aligned(pixDiff + xbord + d));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
v_store_aligned(hsumAdd + x1*Da + d, v_add(vx_load_aligned(hsumAdd + x1 * this->Da + d), vx_load_aligned(pixDiff + xbord + d)));
#else
for( d = 0; d < D; d++ )
hsumAdd[x1*Da + d] = (CostType)(hsumAdd[x1*Da + d] + pixDiff[xbord + d]);
@ -1052,9 +1052,9 @@ struct CalcVerticalSums: public ParallelLoopBody
{
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes )
v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
v_store_aligned(C + x1*Da + d, v_sub(v_add(vx_load_aligned(Cprev + x1 * this->Da + d), vx_load_aligned(hsumAdd + x1 * this->Da + d)), vx_load_aligned(hsumSub + x1 * this->Da + d)));
#else
for( d = 0; d < D; d++ )
C[x1*Da + d] = (CostType)(Cprev[x1*Da + d] + hsumAdd[x1*Da + d] - hsumSub[x1*Da + d]);
@ -1064,12 +1064,12 @@ struct CalcVerticalSums: public ParallelLoopBody
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes )
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
{
v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
v_int16 hv = v_add(v_sub(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixSub + d)), vx_load_aligned(pixAdd + d));
v_store_aligned(hsumAdd + x + d, hv);
v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
v_store_aligned(C + x + d, v_add(v_sub(vx_load_aligned(Cprev + x + d), vx_load_aligned(hsumSub + x + d)), hv));
}
#else
for( d = 0; d < D; d++ )
@ -1082,10 +1082,10 @@ struct CalcVerticalSums: public ParallelLoopBody
}
else
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
for (d = 0; d < Da; d += v_int16::nlanes)
v_store_aligned(C + x1*Da + d, vx_load_aligned(C + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) * v_scale);
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
v_store_aligned(C + x1*Da + d, v_add(vx_load_aligned(C + x1 * this->Da + d), v_mul(vx_load_aligned(hsumAdd + x1 * this->Da + d), v_scale)));
#else
int scale = k == 0 ? SH2 + 1 : 1;
for (d = 0; d < D; d++)
@ -1095,12 +1095,12 @@ struct CalcVerticalSums: public ParallelLoopBody
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
{
v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
v_store_aligned(hsumAdd + x + d, hv);
v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
}
#else
for( d = 0; d < D; d++ )
@ -1120,9 +1120,9 @@ struct CalcVerticalSums: public ParallelLoopBody
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( x = x1*Da; x < x2*Da; x += VTraits<v_int16>::vlanes() )
v_store_aligned(C + x, v_add(v_sub(vx_load_aligned(Cprev + x), vx_load_aligned(hsumSub + x)), vx_load_aligned(hsumAdd + x)));
#else
for( x = x1*Da; x < x2*Da; x++ )
C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
@ -1131,9 +1131,9 @@ struct CalcVerticalSums: public ParallelLoopBody
else*/
if(y == 0)
{
#if CV_SIMD
for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( x = x1*Da; x < x2*Da; x += VTraits<v_int16>::vlanes() )
v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
#else
for( x = x1*Da; x < x2*Da; x++ )
C[x] = (CostType)(C[x] + hsumAdd[x]);
@ -1167,19 +1167,19 @@ struct CalcVerticalSums: public ParallelLoopBody
CostType& minL = *(mem.getMinLr(lrID, x));
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _P1 = vx_setall_s16((short)P1);
v_int16 _delta = vx_setall_s16((short)delta);
v_int16 _minL = vx_setall_s16((short)MAX_COST);
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);
v_store_aligned(Lr_p + d, L);
_minL = v_min(_minL, L);
v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
v_store_aligned(Sp + d, v_add(vx_load_aligned(Sp + d), L));
}
minL = v_reduce_min(_minL);
#else
@ -1264,10 +1264,10 @@ struct CalcHorizontalSums: public ParallelLoopBody
CostType* S = mem.getSBuf(y);
x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
v_int16 v_max_cost = vx_setall_s16(MAX_COST);
for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
{
v_store(disp1ptr + x, v_inv_dist);
v_store(disp2ptr + x, v_inv_dist);
@ -1304,19 +1304,19 @@ struct CalcHorizontalSums: public ParallelLoopBody
CostType* Sp = S + x*Da;
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _P1 = vx_setall_s16((short)P1);
v_int16 _delta = vx_setall_s16((short)delta);
v_int16 _minL = vx_setall_s16((short)MAX_COST);
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes)
for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes())
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);
v_store(Lr_p + d, L);
_minL = v_min(_minL, L);
v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
v_store_aligned(Sp + d, v_add(vx_load_aligned(Sp + d), L));
}
minLr = v_reduce_min(_minL);
#else
@ -1349,22 +1349,22 @@ struct CalcHorizontalSums: public ParallelLoopBody
minLr = MAX_COST;
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _P1 = vx_setall_s16((short)P1);
v_int16 _delta = vx_setall_s16((short)delta);
v_int16 _minL = vx_setall_s16((short)MAX_COST);
v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);
v_store(Lr_p + d, L);
_minL = v_min(_minL, L);
L += vx_load_aligned(Sp + d);
L = v_add(L, vx_load_aligned(Sp + d));
v_store_aligned(Sp + d, L);
_bestDisp = v_select(_minS > L, vx_setall_s16((short)d), _bestDisp);
_bestDisp = v_select(v_gt(_minS, L), vx_setall_s16((short)d), _bestDisp);
_minS = v_min( L, _minS );
}
minLr = v_reduce_min(_minL);
@ -1581,8 +1581,8 @@ struct SGBM3WayMainLoop : public ParallelLoopBody
utils::BufferArea aux_area;
PixType* clipTab;
#if CV_SIMD
short idx_row[v_int16::nlanes];
#if (CV_SIMD || CV_SIMD_SCALABLE)
short idx_row[VTraits<v_int16>::max_nlanes];
#endif
SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap);
void operator () (const Range& range) const CV_OVERRIDE;
@ -1637,8 +1637,8 @@ SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
#if CV_SIMD
for(short i = 0; i < v_int16::nlanes; ++i)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for(short i = 0; i < VTraits<v_int16>::vlanes(); ++i)
idx_row[i] = i;
#endif
}
@ -1659,13 +1659,13 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
{
calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab + TAB_OFS );
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1);
for (d = 0; d < Da; d += v_int16::nlanes)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
{
v_int16 hsA = vx_load_aligned(pixDiff + d) * sw2_1;
v_int16 hsA = v_mul(vx_load_aligned(pixDiff + d), sw2_1);
for (x = Da; x <= SW2 * Da; x += Da)
hsA += vx_load_aligned(pixDiff + x + d);
hsA = v_add(hsA, vx_load_aligned(pixDiff + x + d));
v_store_aligned(hsumAdd + d, hsA);
}
#else
@ -1681,9 +1681,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
{
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
v_store_aligned(C + d, v_sub(v_add(vx_load_aligned(C + d), vx_load_aligned(hsumAdd + d)), vx_load_aligned(hsumSub + d)));
#else
for (d = 0; d < D; d++)
C[d] = (CostType)(C[d] + hsumAdd[d] - hsumSub[d]);
@ -1693,13 +1693,13 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 hv_reg;
for( d = 0; d < Da; d+=v_int16::nlanes )
for( d = 0; d < Da; d+=VTraits<v_int16>::vlanes() )
{
hv_reg = vx_load_aligned(hsumAdd+x-Da+d) + vx_load_aligned(pixAdd+d) - vx_load_aligned(pixSub+d);
hv_reg = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
v_store_aligned(hsumAdd+x+d,hv_reg);
v_store_aligned(C+x+d,vx_load_aligned(C+x+d)+hv_reg-vx_load_aligned(hsumSub+x+d));
v_store_aligned(C+x+d,v_sub(v_add(vx_load_aligned(C + x + d), hv_reg), vx_load_aligned(hsumSub + x + d)));
}
#else
for( d = 0; d < D; d++ )
@ -1712,10 +1712,10 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
}
else
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 v_scale = vx_setall_s16(k == src_start_idx ? (short)SH2 + 1 : 1);
for (d = 0; d < Da; d += v_int16::nlanes)
v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
v_store_aligned(C + d, v_add(vx_load_aligned(C + d), v_mul(vx_load_aligned(hsumAdd + d), v_scale)));
#else
int scale = k == src_start_idx ? SH2 + 1 : 1;
for (d = 0; d < D; d++)
@ -1725,12 +1725,12 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
{
v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
v_store_aligned(hsumAdd + x + d, hv);
v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
}
#else
for (d = 0; d < D; d++)
@ -1748,9 +1748,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
if( y > src_start_idx )
{
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
#if CV_SIMD
for( x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
v_store_aligned(C + x, v_sub(v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)), vx_load_aligned(hsumSub + x)));
#else
for( x = 0; x < width1*Da; x++ )
C[x] = (CostType)(C[x] + hsumAdd[x] - hsumSub[x]);
@ -1758,9 +1758,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
}
else
{
#if CV_SIMD
for( x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
#else
for( x = 0; x < width1*Da; x++ )
C[x] = (CostType)(C[x] + hsumAdd[x]);
@ -1781,7 +1781,7 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
CostType *costs = mem.curCostVolumeLine - Da + x;
CostType& topMinCost = mem.vertPassMin[x/Da];
int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
v_int16 leftMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
@ -1798,18 +1798,18 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
v_int16 src_shifted_left,src_shifted_right;
v_int16 res;
for(;i<Da-v_int16::nlanes;i+= v_int16::nlanes)
for(;i<Da-VTraits<v_int16>::vlanes();i+= VTraits<v_int16>::vlanes())
{
//process leftBuf:
//lookahead load:
src2 = vx_load_aligned(leftBuf_prev+i+v_int16::nlanes);
src2 = vx_load_aligned(leftBuf_prev+i+VTraits<v_int16>::vlanes());
//get shifted versions of the current block and add P1:
src_shifted_left = v_rotate_left<1> (src1_leftBuf,src0_leftBuf);
src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2 );
// process and save current block:
res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_leftBuf, leftMinCostP2_reg)), leftMinCostP2_reg));
leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
v_store_aligned(leftBuf+i, res);
@ -1819,14 +1819,14 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
//process topBuf:
//lookahead load:
src2 = vx_load_aligned(topBuf+i+v_int16::nlanes);
src2 = vx_load_aligned(topBuf+i+VTraits<v_int16>::vlanes());
//get shifted versions of the current block and add P1:
src_shifted_left = v_rotate_left<1> (src1_topBuf,src0_topBuf);
src_shifted_right = v_rotate_right<1> (src1_topBuf,src2 );
// process and save current block:
res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_topBuf, topMinCostP2_reg)), topMinCostP2_reg));
topMinCost_new_reg = v_min(topMinCost_new_reg,res);
v_store_aligned(topBuf+i, res);
@ -1843,17 +1843,17 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
src_shifted_left = v_rotate_left<1> (src1_leftBuf,src0_leftBuf);
src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2 );
res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
res = v_add(vx_load_aligned(costs + this->Da - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_leftBuf, leftMinCostP2_reg)), leftMinCostP2_reg));
leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res));
v_store_aligned(leftBuf+Da-v_int16::nlanes, res);
v_store_aligned(leftBuf+Da-VTraits<v_int16>::vlanes(), res);
//process topBuf:
src_shifted_left = v_rotate_left<1> (src1_topBuf,src0_topBuf);
src_shifted_right = v_rotate_right<1> (src1_topBuf,src2 );
res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
res = v_add(vx_load_aligned(costs + this->Da - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_topBuf, topMinCostP2_reg)), topMinCostP2_reg));
topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res));
v_store_aligned(topBuf+Da-v_int16::nlanes, res);
v_store_aligned(topBuf+Da-VTraits<v_int16>::vlanes(), res);
}
else
{
@ -1904,7 +1904,7 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
CostType* leftBuf = mem.horPassCostVolume + x;
int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
v_int16 rightMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
@ -1919,27 +1919,27 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
v_int16 min_sum_cost_reg = vx_setall_s16(SHRT_MAX);
v_int16 min_sum_pos_reg = vx_setall_s16(0);
for(;i<Da-v_int16::nlanes;i+=v_int16::nlanes)
for(;i<Da-VTraits<v_int16>::vlanes();i+=VTraits<v_int16>::vlanes())
{
//lookahead load:
src2 = vx_load_aligned(rightBuf+i+v_int16::nlanes);
src2 = vx_load_aligned(rightBuf+i+VTraits<v_int16>::vlanes());
//get shifted versions of the current block and add P1:
src_shifted_left = v_rotate_left<1> (src1_rightBuf,src0_rightBuf);
src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2 );
// process and save current block:
res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_rightBuf, rightMinCostP2_reg)), rightMinCostP2_reg));
rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
v_store_aligned(rightBuf+i, res);
// compute and save total cost:
res = res + vx_load_aligned(leftBuf+i) + vx_load_aligned(topBuf+i);
res = v_add(v_add(res, vx_load_aligned(leftBuf + i)), vx_load_aligned(topBuf + i));
v_store_aligned(leftBuf+i, res);
// track disparity value with the minimum cost:
min_sum_cost_reg = v_min(min_sum_cost_reg,res);
min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)i) - min_sum_pos_reg));
min_sum_pos_reg = v_add(min_sum_pos_reg, v_and(v_eq(min_sum_cost_reg, res), v_sub(vx_setall_s16((short)i), min_sum_pos_reg)));
//update src:
src0_rightBuf = src1_rightBuf;
@ -1953,15 +1953,15 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
src_shifted_left = v_rotate_left<1> (src1_rightBuf,src0_rightBuf);
src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2 );
res = vx_load_aligned(costs+D-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
res = v_add(vx_load_aligned(costs + this->D - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_rightBuf, rightMinCostP2_reg)), rightMinCostP2_reg));
rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res));
v_store_aligned(rightBuf+D-v_int16::nlanes, res);
v_store_aligned(rightBuf+D-VTraits<v_int16>::vlanes(), res);
res = res + vx_load_aligned(leftBuf+D-v_int16::nlanes) + vx_load_aligned(topBuf+D-v_int16::nlanes);
v_store_aligned(leftBuf+D-v_int16::nlanes, res);
res = v_add(v_add(res, vx_load_aligned(leftBuf + this->D - VTraits<v_int16>::vlanes())), vx_load_aligned(topBuf + this->D - VTraits<v_int16>::vlanes()));
v_store_aligned(leftBuf+D-VTraits<v_int16>::vlanes(), res);
min_sum_cost_reg = v_min(min_sum_cost_reg,res);
min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)(D-v_int16::nlanes)) - min_sum_pos_reg));
min_sum_pos_reg = v_add(min_sum_pos_reg, v_and(v_eq(min_sum_cost_reg, res), v_sub(vx_setall_s16((short)(this->D - VTraits<v_int16>::vlanes())), min_sum_pos_reg)));
min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp);
}
else
@ -2070,40 +2070,40 @@ void SGBM3WayMainLoop::impl(const Range& range) const
if(uniquenessRatio>0)
{
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
horPassCostVolume+=x;
int thresh = (100*min_cost)/(100-uniquenessRatio);
v_int16 thresh_reg = vx_setall_s16((short)(thresh+1));
v_int16 d1 = vx_setall_s16((short)(best_d-1));
v_int16 d2 = vx_setall_s16((short)(best_d+1));
v_int16 eight_reg = vx_setall_s16((short)v_int16::nlanes);
v_int16 eight_reg = vx_setall_s16((short)VTraits<v_int16>::vlanes());
v_int16 cur_d = vx_load(idx_row);
v_int16 mask;
for( ; d <= D - 2*v_int16::nlanes; d+=2*v_int16::nlanes )
for( ; d <= D - 2*VTraits<v_int16>::vlanes(); d+=2*VTraits<v_int16>::vlanes() )
{
mask = (vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
cur_d = cur_d+eight_reg;
mask = v_and(v_lt(vx_load_aligned(horPassCostVolume + d), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)));
cur_d = v_add(cur_d, eight_reg);
if( v_check_any(mask) )
break;
mask = (vx_load_aligned(horPassCostVolume + d + v_int16::nlanes) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
cur_d = cur_d+eight_reg;
mask = v_and(v_lt(vx_load_aligned(horPassCostVolume + d + VTraits<v_int16>::vlanes()), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)));
cur_d = v_add(cur_d, eight_reg);
if( v_check_any(mask) )
break;
}
if( d <= D - 2*v_int16::nlanes )
if( d <= D - 2*VTraits<v_int16>::vlanes() )
{
horPassCostVolume-=x;
continue;
}
if( d <= D - v_int16::nlanes )
if( d <= D - VTraits<v_int16>::vlanes() )
{
if( v_check_any((vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ((cur_d < d1) | (cur_d > d2))) )
if( v_check_any(v_and(v_lt(vx_load_aligned(horPassCostVolume + d), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)))) )
{
horPassCostVolume-=x;
continue;
}
d+=v_int16::nlanes;
d+=VTraits<v_int16>::vlanes();
}
horPassCostVolume-=x;
#endif

@ -210,24 +210,24 @@ float calcOrientationHist(
cv::hal::magnitude32f(X, Y, Mag, len);
k = 0;
#if CV_SIMD
const int vecsize = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vecsize = VTraits<v_float32>::vlanes();
v_float32 nd360 = vx_setall_f32(n/360.f);
v_int32 __n = vx_setall_s32(n);
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[vecsize];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[VTraits<v_float32>::max_nlanes];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[VTraits<v_float32>::max_nlanes];
for( ; k <= len - vecsize; k += vecsize )
{
v_float32 w = vx_load_aligned( W + k );
v_float32 mag = vx_load_aligned( Mag + k );
v_float32 ori = vx_load_aligned( Ori + k );
v_int32 bin = v_round( nd360 * ori );
v_int32 bin = v_round( v_mul(nd360, ori) );
bin = v_select(bin >= __n, bin - __n, bin);
bin = v_select(bin < vx_setzero_s32(), bin + __n, bin);
bin = v_select(v_ge(bin, __n), v_sub(bin, __n), bin);
bin = v_select(v_lt(bin, vx_setzero_s32()), v_add(bin, __n), bin);
w = w * mag;
w = v_mul(w, mag);
v_store_aligned(bin_buf, bin);
v_store_aligned(w_mul_mag_buf, w);
for(int vi = 0; vi < vecsize; vi++)
@ -253,19 +253,19 @@ float calcOrientationHist(
temphist[n+1] = temphist[1];
i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 d_1_16 = vx_setall_f32(1.f/16.f);
v_float32 d_4_16 = vx_setall_f32(4.f/16.f);
v_float32 d_6_16 = vx_setall_f32(6.f/16.f);
for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes )
for( ; i <= n - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
{
v_float32 tn2 = vx_load_aligned(temphist + i-2);
v_float32 tn1 = vx_load(temphist + i-1);
v_float32 t0 = vx_load(temphist + i);
v_float32 t1 = vx_load(temphist + i+1);
v_float32 t2 = vx_load(temphist + i+2);
v_float32 _hist = v_fma(tn2 + t2, d_1_16,
v_fma(tn1 + t1, d_4_16, t0 * d_6_16));
v_float32 _hist = v_fma(v_add(tn2, t2), d_1_16,
v_fma(v_add(tn1, t1), d_4_16, v_mul(t0, d_6_16)));
v_store(hist + i, _hist);
}
#endif
@ -452,8 +452,8 @@ public:
const sift_wt* nextptr = next.ptr<sift_wt>(r);
int c = SIFT_IMG_BORDER;
#if CV_SIMD && !(DoG_TYPE_SHORT)
const int vecsize = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE) && !(DoG_TYPE_SHORT)
const int vecsize = VTraits<v_float32>::vlanes();
for( ; c <= cols-SIFT_IMG_BORDER - vecsize; c += vecsize)
{
v_float32 val = vx_load(&currptr[c]);
@ -464,7 +464,7 @@ public:
v_float32 vmin,vmax;
v_float32 cond = v_abs(val) > vx_setall_f32((float)threshold);
v_float32 cond = v_gt(v_abs(val), vx_setall_f32((float)this->threshold));
if (!v_check_any(cond))
{
continue;
@ -477,10 +477,10 @@ public:
vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
v_float32 condp = cond & (val > vx_setall_f32(0)) & (val >= vmax);
v_float32 condm = cond & (val < vx_setall_f32(0)) & (val <= vmin);
v_float32 condp = v_and(v_and(cond, v_gt(val, vx_setall_f32(0))), v_ge(val, vmax));
v_float32 condm = v_and(v_and(cond, v_lt(val, vx_setall_f32(0))), v_le(val, vmin));
cond = condp | condm;
cond = v_or(condp, condm);
if (!v_check_any(cond))
{
continue;
@ -493,10 +493,10 @@ public:
vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
condp &= (val >= vmax);
condm &= (val <= vmin);
condp = v_and(condp, v_ge(val, vmax));
condm = v_and(condm, v_le(val, vmin));
cond = condp | condm;
cond = v_or(condp, condm);
if (!v_check_any(cond))
{
continue;
@ -515,10 +515,10 @@ public:
vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
condp &= (val >= v_max(vmax,max_middle));
condm &= (val <= v_min(vmin,min_middle));
condp = v_and(condp, v_ge(val, v_max(vmax, max_middle)));
condm = v_and(condm, v_le(val, v_min(vmin, min_middle)));
cond = condp | condm;
cond = v_or(condp, condm);
if (!v_check_any(cond))
{
continue;
@ -777,11 +777,11 @@ void calcSIFTDescriptor(
cv::hal::exp32f(W, W, len);
k = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
const int vecsize = v_float32::nlanes;
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[vecsize];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*vecsize];
const int vecsize = VTraits<v_float32>::vlanes();
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[VTraits<v_float32>::max_nlanes];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*VTraits<v_float32>::max_nlanes];
const v_float32 __ori = vx_setall_f32(ori);
const v_float32 __bins_per_rad = vx_setall_f32(bins_per_rad);
const v_int32 __n = vx_setall_s32(n);
@ -792,28 +792,28 @@ void calcSIFTDescriptor(
{
v_float32 rbin = vx_load_aligned(RBin + k);
v_float32 cbin = vx_load_aligned(CBin + k);
v_float32 obin = (vx_load_aligned(Ori + k) - __ori) * __bins_per_rad;
v_float32 mag = vx_load_aligned(Mag + k) * vx_load_aligned(W + k);
v_float32 obin = v_mul(v_sub(vx_load_aligned(Ori + k), __ori), __bins_per_rad);
v_float32 mag = v_mul(vx_load_aligned(Mag + k), vx_load_aligned(W + k));
v_int32 r0 = v_floor(rbin);
v_int32 c0 = v_floor(cbin);
v_int32 o0 = v_floor(obin);
rbin -= v_cvt_f32(r0);
cbin -= v_cvt_f32(c0);
obin -= v_cvt_f32(o0);
o0 = v_select(o0 < vx_setzero_s32(), o0 + __n, o0);
o0 = v_select(o0 >= __n, o0 - __n, o0);
v_float32 v_r1 = mag*rbin, v_r0 = mag - v_r1;
v_float32 v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
v_float32 v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
v_float32 v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
v_float32 v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
v_int32 idx = v_muladd(v_muladd(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
rbin = v_sub(rbin, v_cvt_f32(r0));
cbin = v_sub(cbin, v_cvt_f32(c0));
obin = v_sub(obin, v_cvt_f32(o0));
o0 = v_select(v_lt(o0, vx_setzero_s32()), v_add(o0, __n), o0);
o0 = v_select(v_ge(o0, __n), v_sub(o0, __n), o0);
v_float32 v_r1 = v_mul(mag, rbin), v_r0 = v_sub(mag, v_r1);
v_float32 v_rc11 = v_mul(v_r1, cbin), v_rc10 = v_sub(v_r1, v_rc11);
v_float32 v_rc01 = v_mul(v_r0, cbin), v_rc00 = v_sub(v_r0, v_rc01);
v_float32 v_rco111 = v_mul(v_rc11, obin), v_rco110 = v_sub(v_rc11, v_rco111);
v_float32 v_rco101 = v_mul(v_rc10, obin), v_rco100 = v_sub(v_rc10, v_rco101);
v_float32 v_rco011 = v_mul(v_rc01, obin), v_rco010 = v_sub(v_rc01, v_rco011);
v_float32 v_rco001 = v_mul(v_rc00, obin), v_rco000 = v_sub(v_rc00, v_rco001);
v_int32 idx = v_muladd(v_muladd(v_add(r0, __1), __d_plus_2, v_add(c0, __1)), __n_plus_2, o0);
v_store_aligned(idx_buf, idx);
v_store_aligned(rco_buf, v_rco000);
@ -894,11 +894,11 @@ void calcSIFTDescriptor(
float nrm2 = 0;
len = d*d*n;
k = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_float32 __nrm2 = vx_setzero_f32();
v_float32 __rawDst;
for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
for( ; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
{
__rawDst = vx_load_aligned(rawDst + k);
__nrm2 = v_fma(__rawDst, __rawDst, __nrm2);
@ -949,15 +949,15 @@ void calcSIFTDescriptor(
if( dstMat.type() == CV_32F )
{
float* dst = dstMat.ptr<float>(row);
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 __dst;
v_float32 __min = vx_setzero_f32();
v_float32 __max = vx_setall_f32(255.0f); // max of uchar
v_float32 __nrm2 = vx_setall_f32(nrm2);
for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
for( k = 0; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
{
__dst = vx_load_aligned(rawDst + k);
__dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
__dst = v_min(v_max(v_cvt_f32(v_round(v_mul(__dst, __nrm2))), __min), __max);
v_store(dst + k, __dst);
}
#endif
@ -976,16 +976,16 @@ if( dstMat.type() == CV_32F )
else // CV_8U
{
uint8_t* dst = dstMat.ptr<uint8_t>(row);
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 __dst0, __dst1;
v_uint16 __pack01;
v_float32 __nrm2 = vx_setall_f32(nrm2);
for( k = 0; k <= len - v_float32::nlanes * 2; k += v_float32::nlanes * 2 )
for( k = 0; k <= len - VTraits<v_float32>::vlanes() * 2; k += VTraits<v_float32>::vlanes() * 2 )
{
__dst0 = vx_load_aligned(rawDst + k);
__dst1 = vx_load_aligned(rawDst + k + v_float32::nlanes);
__dst1 = vx_load_aligned(rawDst + k + VTraits<v_float32>::vlanes());
__pack01 = v_pack_u(v_round(__dst0 * __nrm2), v_round(__dst1 * __nrm2));
__pack01 = v_pack_u(v_round(v_mul(__dst0, __nrm2)), v_round(v_mul(__dst1, __nrm2)));
v_pack_store(dst + k, __pack01);
}
#endif
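
The sift.simd.hpp hunks also change how aligned scratch buffers are sized: since `VTraits<T>::vlanes()` is no longer a compile-time constant under scalable backends, array dimensions use the compile-time upper bound `VTraits<T>::max_nlanes` instead. A short sketch of that pattern (a hypothetical `roundToInt` helper, not taken from the patch, assuming only `<opencv2/core/hal/intrin.hpp>`):

```cpp
// Illustrative sketch of the max_nlanes scratch-buffer pattern; not part of the patch.
#include <opencv2/core/hal/intrin.hpp>
#include <cmath>

using namespace cv;

static void roundToInt(const float* src, int* dst, int n)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int step = VTraits<v_float32>::vlanes();                            // run-time value
    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_int32>::max_nlanes];     // compile-time bound
    for (; i <= n - step; i += step)
    {
        v_store_aligned(buf, v_round(vx_load(src + i)));   // spill lanes to scalar scratch
        for (int j = 0; j < step; j++)
            dst[i + j] = buf[j];
    }
#endif
    for (; i < n; i++)
        dst[i] = (int)std::lround(src[i]);
}
```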
