diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp
index 1ac6f6560b..625196ea63 100644
--- a/modules/calib3d/src/stereobm.cpp
+++ b/modules/calib3d/src/stereobm.cpp
@@ -231,13 +231,13 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
         dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
         x = 1;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         {
             v_int16 ftz = vx_setall_s16((short) ftzero);
            v_int16 ftz2 = vx_setall_s16((short)(ftzero*2));
            v_int16 z = vx_setzero_s16();

-            for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes)
+            for(; x <= (size.width - 1) - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
             {
                 v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1));
                 v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1));
@@ -248,13 +248,13 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
                 v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1));
                 v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1));

-                v_int16 d0 = s00 - s01;
-                v_int16 d1 = s10 - s11;
-                v_int16 d2 = s20 - s21;
-                v_int16 d3 = s30 - s31;
+                v_int16 d0 = v_sub(s00, s01);
+                v_int16 d1 = v_sub(s10, s11);
+                v_int16 d2 = v_sub(s20, s21);
+                v_int16 d3 = v_sub(s30, s31);

-                v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
-                v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
+                v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(v_add(v_add(v_add(v_add(d0, d1), d1), d2), ftz), ftz2), z));
+                v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(v_add(v_add(v_add(v_add(d1, d2), d2), d3), ftz), ftz2), z));

                 v_pack_store(dptr0 + x, v0);
                 v_pack_store(dptr1 + x, v1);
@@ -277,10 +277,10 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
     {
         uchar* dptr = dst.ptr<uchar>(y);
         x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         {
             v_uint8 val0_16 = vx_setall_u8(val0);
-            for(; x <= size.width-v_uint8::nlanes; x+=v_uint8::nlanes)
+            for(; x <= size.width-VTraits<v_uint8>::vlanes(); x+=VTraits<v_uint8>::vlanes())
                 v_store(dptr + x, val0_16);
         }
 #endif
@@ -356,7 +356,7 @@ public:
         for (size_t i = 0; i < nstripes; ++i)
         {
             // 1D: [1][ ndisp ][1]
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             if (params.useShorts())
                 area.allocate(sad_short[i], ndisp + 2);
             else
@@ -364,7 +364,7 @@ public:
                 area.allocate(sad[i], ndisp + 2);

             // 2D: [ wsz/2 + 1 ][ height ][ wsz/2 + 1 ] * [ ndisp ]
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             if (params.useShorts())
                 area.allocate(hsad_short[i], (height + wsz + 2) * ndisp);
             else
@@ -390,7 +390,7 @@ public:
     }
 };

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 template <typename mType>
 static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
                                              Mat& disp, Mat& cost, const StereoBMParams& state,
     short costbuf = 0;
     int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
     const uchar * tab = bufX.tab;

-    short v_seq[v_int16::nlanes];
-    for (short i = 0; i < v_int16::nlanes; ++i)
+    short v_seq[VTraits<v_int16>::max_nlanes];
+    for (short i = 0; i < VTraits<v_int16>::vlanes(); ++i)
         v_seq[i] = i;

     ushort *sad = bufX.sad_short[bufNum] + 1;
@@ -446,19 +446,19 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             v_uint8 lv = vx_setall_u8((uchar)lval);
-            for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+            for( d = 0; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
             {
                 v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
                 v_store(cbuf + d, diff);
-                v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
-                v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff));
+                v_store(hsad + d, v_add(vx_load(hsad + d), v_expand_low(diff)));
+                v_store(hsad + d + VTraits<v_uint16>::vlanes(), v_add(vx_load(hsad + d + VTraits<v_uint16>::vlanes()), v_expand_high(diff)));
             }
-            if( d <= ndisp - v_uint16::nlanes )
+            if( d <= ndisp - VTraits<v_uint16>::vlanes() )
             {
                 v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
                 v_store_low(cbuf + d, diff);
-                v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
-                d += v_uint16::nlanes;
+                v_store(hsad + d, v_add(vx_load(hsad + d), v_expand_low(diff)));
+                d += VTraits<v_uint16>::vlanes();
             }
             for( ; d < ndisp; d++ )
             {
@@ -496,20 +496,20 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             v_uint8 lv = vx_setall_u8((uchar)lval);
-            for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+            for( d = 0; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
             {
                 v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
                 v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d));
                 v_store(cbuf + d, diff);
-                v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs)));
-                v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs)));
+                v_store(hsad + d, v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d), v_expand_low(diff))), v_expand_low(cbs))));
+                v_store(hsad + d + VTraits<v_uint16>::vlanes(), v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d + VTraits<v_uint16>::vlanes()), v_expand_high(diff))), v_expand_high(cbs))));
             }
-            if( d <= ndisp - v_uint16::nlanes)
+            if( d <= ndisp - VTraits<v_uint16>::vlanes())
             {
                 v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
                 v_store_low(cbuf + d, diff);
-                v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d)));
-                d += v_uint16::nlanes;
+                v_store(hsad + d, v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d), v_expand_low(diff))), vx_load_expand((schar *)cbuf_sub + d))));
+                d += VTraits<v_uint16>::vlanes();
             }
             for( ; d < ndisp; d++ )
             {
@@ -533,20 +533,20 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
         hsad = hsad0 + (1 - dy0)*ndisp;
         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
         {
-            for( d = 0; d <= ndisp-2*v_uint16::nlanes; d += 2*v_uint16::nlanes )
+            for( d = 0; d <= ndisp-2*VTraits<v_uint16>::vlanes(); d += 2*VTraits<v_uint16>::vlanes() )
             {
-                v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
-                v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes));
+                v_store(sad + d, v_add(vx_load(sad + d), vx_load(hsad + d)));
+                v_store(sad + d + VTraits<v_uint16>::vlanes(), v_add(vx_load(sad + d + VTraits<v_uint16>::vlanes()), vx_load(hsad + d + VTraits<v_uint16>::vlanes())));
             }
-            if( d <= ndisp-v_uint16::nlanes )
+            if( d <= ndisp-VTraits<v_uint16>::vlanes() )
             {
-                v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
-                d += v_uint16::nlanes;
+                v_store(sad + d, v_add(vx_load(sad + d), vx_load(hsad + d)));
+                d += VTraits<v_uint16>::vlanes();
             }
-            if( d <= ndisp-v_uint16::nlanes/2 )
+            if( d <= ndisp-VTraits<v_uint16>::vlanes()/2 )
             {
-                v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d));
-                d += v_uint16::nlanes/2;
+                v_store_low(sad + d, v_add(vx_load_low(sad + d), vx_load_low(hsad + d)));
+                d += VTraits<v_uint16>::vlanes()/2;
             }
             for( ; d < ndisp; d++ )
                 sad[d] = sad[d] + hsad[d];
@@ -564,29 +564,29 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
             v_int16 minsad8 = vx_setall_s16(SHRT_MAX);
             v_int16 mind8 = vx_setall_s16(0);

-            for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
+            for( d = 0; d <= ndisp - 2*VTraits<v_int16>::vlanes(); d += 2*VTraits<v_int16>::vlanes() )
             {
-                v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
+                v_int16 sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d)), v_reinterpret_as_s16(vx_load(hsad_sub + d))), v_reinterpret_as_s16(vx_load(sad + d)));
                 v_store(sad + d, v_reinterpret_as_u16(sad8));
-                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
+                mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)d)));
                 minsad8 = v_min(minsad8, sad8);

-                sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes));
-                v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8));
-                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)(d+v_int16::nlanes)));
+                sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d + VTraits<v_int16>::vlanes())), v_reinterpret_as_s16(vx_load(hsad_sub + d + VTraits<v_int16>::vlanes()))), v_reinterpret_as_s16(vx_load(sad + d + VTraits<v_int16>::vlanes())));
+                v_store(sad + d + VTraits<v_int16>::vlanes(), v_reinterpret_as_u16(sad8));
+                mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)(d + VTraits<v_int16>::vlanes()))));
                 minsad8 = v_min(minsad8, sad8);
             }
-            if( d <= ndisp - v_int16::nlanes )
+            if( d <= ndisp - VTraits<v_int16>::vlanes() )
             {
-                v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
+                v_int16 sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d)), v_reinterpret_as_s16(vx_load(hsad_sub + d))), v_reinterpret_as_s16(vx_load(sad + d)));
                 v_store(sad + d, v_reinterpret_as_u16(sad8));
-                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
+                mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)d)));
                 minsad8 = v_min(minsad8, sad8);
-                d += v_int16::nlanes;
+                d += VTraits<v_int16>::vlanes();
             }
             minsad = v_reduce_min(minsad8);
-            v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8);
-            mind = v_reduce_min(((mind8+vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
+            v_int16 v_mask = (v_eq(vx_setall_s16((short)minsad), minsad8));
+            mind = v_reduce_min(v_or(v_and(v_add(mind8, vx_load(v_seq)), v_mask), v_and(vx_setall_s16(32767), v_not(v_mask))));
             for( ; d < ndisp; d++ )
             {
                 int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d];
@@ -610,34 +610,34 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
                 int thresh = minsad + (minsad * uniquenessRatio/100);
                 v_int32 thresh4 = vx_setall_s32(thresh + 1);
                 v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1);
-                v_int32 dd_4 = vx_setall_s32(v_int32::nlanes);
+                v_int32 dd_4 = vx_setall_s32(VTraits<v_int32>::vlanes());
                 v_int32 d4 = vx_load_expand(v_seq);

-                for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes )
+                for( d = 0; d <= ndisp - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                 {
                     v_int32 sad4_l, sad4_h;
                     v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h);
-                    if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
+                    if( v_check_any(v_and(v_gt(thresh4, sad4_l), v_or(v_gt(d1, d4), v_gt(d4, d2)))) )
                         break;
-                    d4 += dd_4;
-                    if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) )
+                    d4 = v_add(d4, dd_4);
+                    if( v_check_any(v_and(v_gt(thresh4, sad4_h), v_or(v_gt(d1, d4), v_gt(d4, d2)))) )
                         break;
-                    d4 += dd_4;
+                    d4 = v_add(d4, dd_4);
                 }
-                if( d <= ndisp - v_int16::nlanes )
+                if( d <= ndisp - VTraits<v_int16>::vlanes() )
                 {
                     dptr[y*dstep] = FILTERED;
                     continue;
                 }
-                if( d <= ndisp - v_int32::nlanes )
+                if( d <= ndisp - VTraits<v_int32>::vlanes() )
                 {
                     v_int32 sad4_l = vx_load_expand((short*)sad + d);
-                    if (v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))))
+                    if (v_check_any(v_and(v_gt(thresh4, sad4_l), v_or(v_gt(d1, d4), v_gt(d4, d2)))))
                     {
                         dptr[y*dstep] = FILTERED;
                         continue;
                     }
-                    d += v_int16::nlanes;
+                    d += VTraits<v_int16>::vlanes();
                 }
                 for( ; d < ndisp; d++ )
                 {
@@ -699,11 +699,11 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
     int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
     const uchar * tab = bufX.tab;

-#if CV_SIMD
-    int v_seq[v_int32::nlanes];
-    for (int i = 0; i < v_int32::nlanes; ++i)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int v_seq[VTraits<v_int32>::max_nlanes];
+    for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
         v_seq[i] = i;
-    v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
+    v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(VTraits<v_int32>::vlanes());
 #endif

     int *sad = bufX.sad[bufNum] + 1;
@@ -725,17 +725,17 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
                 v_uint8 lv = vx_setall_u8((uchar)lval);

-                for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+                for( ; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
                 {
                     v_uint8 rv = vx_load(rptr + d);
                     v_int32 hsad_0 = vx_load(hsad + d);
-                    v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
-                    v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
-                    v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
+                    v_int32 hsad_1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
+                    v_int32 hsad_2 = vx_load(hsad + d + 2*VTraits<v_int32>::vlanes());
+                    v_int32 hsad_3 = vx_load(hsad + d + 3*VTraits<v_int32>::vlanes());
                     v_uint8 diff = v_absdiff(lv, rv);
                     v_store(cbuf + d, diff);
@@ -745,15 +745,15 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
                     v_expand(diff0, diff00, diff01);
                     v_expand(diff1, diff10, diff11);

-                    hsad_0 += v_reinterpret_as_s32(diff00);
-                    hsad_1 += v_reinterpret_as_s32(diff01);
-                    hsad_2 += v_reinterpret_as_s32(diff10);
-                    hsad_3 += v_reinterpret_as_s32(diff11);
+                    hsad_0 = v_add(hsad_0, v_reinterpret_as_s32(diff00));
+                    hsad_1 = v_add(hsad_1, v_reinterpret_as_s32(diff01));
+                    hsad_2 = v_add(hsad_2, v_reinterpret_as_s32(diff10));
+                    hsad_3 = v_add(hsad_3, v_reinterpret_as_s32(diff11));

                     v_store(hsad + d, hsad_0);
-                    v_store(hsad + d + v_int32::nlanes, hsad_1);
-                    v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
-                    v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
+                    v_store(hsad + d + VTraits<v_int32>::vlanes(), hsad_1);
+                    v_store(hsad + d + 2*VTraits<v_int32>::vlanes(), hsad_2);
+                    v_store(hsad + d + 3*VTraits<v_int32>::vlanes(), hsad_3);
                 }
             }
 #endif
@@ -793,16 +793,16 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
                 v_uint8 lv = vx_setall_u8((uchar)lval);
-                for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+                for( ; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
                 {
                     v_uint8 rv = vx_load(rptr + d);
                     v_int32 hsad_0 = vx_load(hsad + d);
-                    v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
-                    v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
-                    v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
+                    v_int32 hsad_1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
+                    v_int32 hsad_2 = vx_load(hsad + d + 2*VTraits<v_int32>::vlanes());
+                    v_int32 hsad_3 = vx_load(hsad + d + 3*VTraits<v_int32>::vlanes());
                     v_uint8 cbs = vx_load(cbuf_sub + d);
                     v_uint8 diff = v_absdiff(lv, rv);
                     v_store(cbuf + d, diff);
@@ -816,19 +816,19 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
                     v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01);
                     v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11);

-                    v_int32 diff_0 = diff00 - cbs00;
-                    v_int32 diff_1 = diff01 - cbs01;
-                    v_int32 diff_2 = diff10 - cbs10;
-                    v_int32 diff_3 = diff11 - cbs11;
-                    hsad_0 += diff_0;
-                    hsad_1 += diff_1;
-                    hsad_2 += diff_2;
-                    hsad_3 += diff_3;
+                    v_int32 diff_0 = v_sub(diff00, cbs00);
+                    v_int32 diff_1 = v_sub(diff01, cbs01);
+                    v_int32 diff_2 = v_sub(diff10, cbs10);
+                    v_int32 diff_3 = v_sub(diff11, cbs11);
+                    hsad_0 = v_add(hsad_0, diff_0);
+                    hsad_1 = v_add(hsad_1, diff_1);
+                    hsad_2 = v_add(hsad_2, diff_2);
+                    hsad_3 = v_add(hsad_3, diff_3);

                     v_store(hsad + d, hsad_0);
-                    v_store(hsad + d + v_int32::nlanes, hsad_1);
-                    v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
-                    v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
+                    v_store(hsad + d + VTraits<v_int32>::vlanes(), hsad_1);
+                    v_store(hsad + d + 2*VTraits<v_int32>::vlanes(), hsad_2);
+                    v_store(hsad + d + 3*VTraits<v_int32>::vlanes(), hsad_3);
                 }
             }
 #endif
@@ -855,18 +855,18 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
         {
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
-                for( d = 0; d <= ndisp-2*v_int32::nlanes; d += 2*v_int32::nlanes )
+                for( d = 0; d <= ndisp-2*VTraits<v_int32>::vlanes(); d += 2*VTraits<v_int32>::vlanes() )
                 {
                     v_int32 s0 = vx_load(sad + d);
-                    v_int32 s1 = vx_load(sad + d + v_int32::nlanes);
+                    v_int32 s1 = vx_load(sad + d + VTraits<v_int32>::vlanes());
                     v_int32 t0 = vx_load(hsad + d);
-                    v_int32 t1 = vx_load(hsad + d + v_int32::nlanes);
-                    s0 += t0;
-                    s1 += t1;
+                    v_int32 t1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
+                    s0 = v_add(s0, t0);
+                    s1 = v_add(s1, t1);
                     v_store(sad + d, s0);
-                    v_store(sad + d + v_int32::nlanes, s1);
+                    v_store(sad + d + VTraits<v_int32>::vlanes(), s1);
                 }
             }
 #endif
@@ -884,30 +884,30 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
             hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
             hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
                 v_int32 minsad4 = vx_setall_s32(INT_MAX);
                 v_int32 mind4 = vx_setall_s32(0), d4 = d0_4;

-                for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
+                for( ; d <= ndisp - 2*VTraits<v_int32>::vlanes(); d += 2*VTraits<v_int32>::vlanes() )
                 {
-                    v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d);
+                    v_int32 sad4 = v_sub(v_add(vx_load(sad + d), vx_load(hsad + d)), vx_load(hsad_sub + d));
                     v_store(sad + d, sad4);
-                    mind4 = v_select(minsad4 > sad4, d4, mind4);
+                    mind4 = v_select(v_gt(minsad4, sad4), d4, mind4);
                     minsad4 = v_min(minsad4, sad4);
-                    d4 += dd_4;
+                    d4 = v_add(d4, dd_4);

-                    sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes);
-                    v_store(sad + d + v_int32::nlanes, sad4);
-                    mind4 = v_select(minsad4 > sad4, d4, mind4);
+                    sad4 = v_sub(v_add(vx_load(sad + d + VTraits<v_int32>::vlanes()), vx_load(hsad + d + VTraits<v_int32>::vlanes())), vx_load(hsad_sub + d + VTraits<v_int32>::vlanes()));
+                    v_store(sad + d + VTraits<v_int32>::vlanes(), sad4);
+                    mind4 = v_select(v_gt(minsad4, sad4), d4, mind4);
                     minsad4 = v_min(minsad4, sad4);
-                    d4 += dd_4;
+                    d4 = v_add(d4, dd_4);
                 }

-                int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes];
+                int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[VTraits<v_int32>::max_nlanes], mind_buf[VTraits<v_int32>::max_nlanes];
                 v_store(minsad_buf, minsad4);
                 v_store(mind_buf, mind4);
-                for (int i = 0; i < v_int32::nlanes; ++i)
+                for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
                     if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; }
             }
 #endif
@@ -1102,7 +1102,7 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
         Mat disp_i = disp->rowRange(row0, row1);
         Mat cost_i = state.disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         if (state.useShorts())
         {
             if( disp_i.type() == CV_16S)
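Reviewer note on the pattern applied throughout this patch: CV_SIMD_SCALABLE targets (RISC-V RVV) use sizeless vector types, so the compile-time constant v_int16::nlanes becomes the runtime call VTraits<v_int16>::vlanes(), stack arrays are sized with the constexpr upper bound VTraits<...>::max_nlanes, and overloaded operators (+, -, &, >) become named intrinsics (v_add, v_sub, v_and, v_gt). Below is a minimal standalone sketch of the same rewrite on a SAD-style accumulation loop; it assumes only the public universal-intrinsics API from opencv2/core/hal/intrin.hpp, and my_sad_row is an illustrative name, not code from this patch.

    // Minimal sketch (not part of the patch): SAD accumulation written
    // against the scalable universal-intrinsics API.
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    static void my_sad_row(const uchar* a, const uchar* b, ushort* acc, int n)
    {
        int i = 0;
        // vlanes() is a runtime value under CV_SIMD_SCALABLE (e.g. RVV), so
        // it is queried instead of the old compile-time v_uint8::nlanes.
        for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
        {
            v_uint8 diff = v_absdiff(vx_load(a + i), vx_load(b + i));
            // Named intrinsics replace the old overloaded operators:
            // acc += diff (widened to 16 bit), formerly written with operator+.
            v_store(acc + i, v_add(vx_load(acc + i), v_expand_low(diff)));
            v_store(acc + i + VTraits<v_uint16>::vlanes(),
                    v_add(vx_load(acc + i + VTraits<v_uint16>::vlanes()),
                          v_expand_high(diff)));
        }
        for (; i < n; i++) // scalar tail
            acc[i] = (ushort)(acc[i] + (a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]));
    }
    #endif

The same source compiles unchanged for fixed-width backends (SSE/AVX/NEON), where vlanes() folds to a constant, which is why the patch can share one code path for both.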
diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index bc19612b5a..75f6f32564 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -123,7 +123,7 @@ struct StereoSGBMParams
     int mode;
 };

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 #if CV_SIMD_WIDTH == 16
 static inline v_int16 vx_setseq_s16()
 { return v_int16(0, 1, 2, 3, 4, 5, 6, 7); }
@@ -136,10 +136,10 @@ static inline v_int16 vx_setseq_s16()
 #else
 struct vseq_s16
 {
-    short data[v_int16::nlanes];
+    short data[VTraits<v_int16>::max_nlanes];
     vseq_s16()
     {
-        for (int i = 0; i < v_int16::nlanes; i++)
+        for (int i = 0; i < VTraits<v_int16>::vlanes(); i++)
             data[i] = i;
     }
 };
@@ -153,8 +153,8 @@ static inline v_int16 vx_setseq_s16()
 static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_val, short &min_pos)
 {
     min_val = v_reduce_min(val);
-    v_int16 v_mask = (vx_setall_s16(min_val) == val);
-    min_pos = v_reduce_min(((pos+vx_setseq_s16()) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
+    v_int16 v_mask = (v_eq(vx_setall_s16(min_val), val));
+    min_pos = v_reduce_min(v_or(v_and(v_add(pos, vx_setseq_s16()), v_mask), v_and(vx_setall_s16(SHRT_MAX), v_not(v_mask))));
 }
 #endif
@@ -270,26 +270,26 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
             int u1 = std::max(ul, ur); u1 = std::max(u1, u);

             int d = minD;
-    #if CV_SIMD
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
             v_uint8 _u  = vx_setall_u8((uchar)u), _u0 = vx_setall_u8((uchar)u0);
             v_uint8 _u1 = vx_setall_u8((uchar)u1);

-            for( ; d <= maxD - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
+            for( ; d <= maxD - 2*VTraits<v_int16>::vlanes(); d += 2*VTraits<v_int16>::vlanes() )
             {
                 v_uint8 _v  = vx_load(prow2  + width-x-1 + d);
                 v_uint8 _v0 = vx_load(buffer + width-x-1 + d);
                 v_uint8 _v1 = vx_load(buffer + width-x-1 + d + width2);
-                v_uint8 c0 = v_max(_u - _v1, _v0 - _u);
-                v_uint8 c1 = v_max(_v - _u1, _u0 - _v);
+                v_uint8 c0 = v_max(v_sub(_u, _v1), v_sub(_v0, _u));
+                v_uint8 c1 = v_max(v_sub(_v, _u1), v_sub(_u0, _v));
                 v_uint8 diff = v_min(c0, c1);

                 v_int16 _c0 = vx_load_aligned(cost + x*D + d);
-                v_int16 _c1 = vx_load_aligned(cost + x*D + d + v_int16::nlanes);
+                v_int16 _c1 = vx_load_aligned(cost + x*D + d + VTraits<v_int16>::vlanes());

                 v_uint16 diff1,diff2;
                 v_expand(diff,diff1,diff2);
-                v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
-                v_store_aligned(cost + x*D + d + v_int16::nlanes, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
+                v_store_aligned(cost + x*D + d, v_add(_c0, v_reinterpret_as_s16(v_shr(diff1, diff_scale))));
+                v_store_aligned(cost + x*D + d + VTraits<v_int16>::vlanes(), v_add(_c1, v_reinterpret_as_s16(v_shr(diff2, diff_scale))));
             }
     #endif
             for( ; d < maxD; d++ )
@@ -555,13 +555,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() );

                 memset(hsumAdd, 0, Da*sizeof(CostType));
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 h_scale = vx_setall_s16((short)SW2 + 1);
-                for( d = 0; d < Da; d += v_int16::nlanes )
+                for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
                 {
-                    v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale;
+                    v_int16 v_hsumAdd = v_mul(vx_load_aligned(mem.pixDiff + d), h_scale);
                     for( x = Da; x <= SW2*Da; x += Da )
-                        v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d);
+                        v_hsumAdd = v_add(v_hsumAdd, vx_load_aligned(mem.pixDiff + x + d));
                     v_store_aligned(hsumAdd + d, v_hsumAdd);
                 }
 #else
@@ -578,9 +578,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                     const CostType* Cprev = mem.getCBuf(y - 1);

-#if CV_SIMD
-                    for (d = 0; d < Da; d += v_int16::nlanes)
-                        v_store_aligned(C + d, vx_load_aligned(Cprev + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                    for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                        v_store_aligned(C + d, v_sub(v_add(vx_load_aligned(Cprev + d), vx_load_aligned(hsumAdd + d)), vx_load_aligned(hsumSub + d)));
 #else
                     for (d = 0; d < D; d++)
                         C[d] = (CostType)(Cprev[d] + hsumAdd[d] - hsumSub[d]);
@@ -590,12 +590,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     {
                         const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                         const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
-                        for( d = 0; d < Da; d += v_int16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                        for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
                         {
-                            v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
+                            v_int16 hv = v_add(v_sub(vx_load_aligned(hsumAdd + x - Da + d), vx_load_aligned(pixSub + d)), vx_load_aligned(pixAdd + d));
                             v_store_aligned(hsumAdd + x + d, hv);
-                            v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
+                            v_store_aligned(C + x + d, v_add(v_sub(vx_load_aligned(Cprev + x + d), vx_load_aligned(hsumSub + x + d)), hv));
                         }
 #else
                         for( d = 0; d < D; d++ )
@@ -608,10 +608,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 }
                 else
                 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
-                    for (d = 0; d < Da; d += v_int16::nlanes)
-                        v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
+                    for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                        v_store_aligned(C + d, v_add(vx_load_aligned(C + d), v_mul(vx_load_aligned(hsumAdd + d), v_scale)));
 #else
                     int scale = k == 0 ? SH2 + 1 : 1;
                     for (d = 0; d < D; d++)
@@ -622,12 +622,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     {
                         const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                         const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);

-#if CV_SIMD
-                        for (d = 0; d < Da; d += v_int16::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                        for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
                         {
-                            v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
+                            v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                             v_store_aligned(hsumAdd + x + d, hv);
-                            v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
+                            v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
                         }
 #else
                         for( d = 0; d < D; d++ )
@@ -646,9 +646,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
             {
                 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                 const CostType* Cprev = mem.getCBuf(y - 1);
-#if CV_SIMD
-                for (x = 0; x < width1*Da; x += v_int16::nlanes)
-                    v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+                    v_store_aligned(C + x, v_add(v_sub(vx_load_aligned(Cprev + x), vx_load_aligned(hsumSub + x)), vx_load_aligned(hsumAdd + x)));
 #else
                 for (x = 0; x < width1*Da; x++)
                     C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
@@ -656,9 +656,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
             }
             else
             {
-#if CV_SIMD
-                for (x = 0; x < width1*Da; x += v_int16::nlanes)
-                    v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+                    v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
 #else
                 for (x = 0; x < width1*Da; x++)
                     C[x] = (CostType)(C[x] + hsumAdd[x]);
@@ -714,7 +714,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 CostType* minL = mem.getMinLr(lrID, x);
                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 _P1 = vx_setall_s16((short)P1);

                 v_int16 _delta0 = vx_setall_s16((short)delta0);
@@ -726,31 +726,31 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 v_int16 _minL2 = vx_setall_s16((short)MAX_COST);
                 v_int16 _minL3 = vx_setall_s16((short)MAX_COST);

-                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
                     v_int16 Spd = vx_load_aligned(Sp + d);
                     v_int16 L;

-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), v_add(vx_load(Lr_p0 + d - 1), _P1)), v_add(vx_load(Lr_p0 + d + 1), _P1)), _delta0), _delta0), Cpd);
                     v_store_aligned(Lr_p + d, L);
                     _minL0 = v_min(_minL0, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);

-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), vx_load(Lr_p1 + d - 1) + _P1), vx_load(Lr_p1 + d + 1) + _P1), _delta1) - _delta1 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), v_add(vx_load(Lr_p1 + d - 1), _P1)), v_add(vx_load(Lr_p1 + d + 1), _P1)), _delta1), _delta1), Cpd);
                     v_store_aligned(Lr_p + d + Dlra, L);
                     _minL1 = v_min(_minL1, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);

-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), vx_load(Lr_p2 + d - 1) + _P1), vx_load(Lr_p2 + d + 1) + _P1), _delta2) - _delta2 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), v_add(vx_load(Lr_p2 + d - 1), _P1)), v_add(vx_load(Lr_p2 + d + 1), _P1)), _delta2), _delta2), Cpd);
                     v_store_aligned(Lr_p + d + Dlra*2, L);
                     _minL2 = v_min(_minL2, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);

-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), vx_load(Lr_p3 + d - 1) + _P1), vx_load(Lr_p3 + d + 1) + _P1), _delta3) - _delta3 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), v_add(vx_load(Lr_p3 + d - 1), _P1)), v_add(vx_load(Lr_p3 + d + 1), _P1)), _delta3), _delta3), Cpd);
                     v_store_aligned(Lr_p + d + Dlra*3, L);
                     _minL3 = v_min(_minL3, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);

                     v_store_aligned(Sp + d, Spd);
                 }
@@ -769,7 +769,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 t0 = v_min(t0, t1);
                 t0 = v_min(t0, v_rotate_right<4>(t0));
 #if CV_SIMD_WIDTH == 32
-                CostType buf[v_int16::nlanes];
+                CostType buf[VTraits<v_int16>::max_nlanes];
                 v_store_low(buf, v_min(t0, v_rotate_right<8>(t0)));
                 minL[0] = buf[0];
                 minL[1] = buf[1];
@@ -817,10 +817,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
             if( pass == npasses )
             {
                 x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
                 v_int16 v_max_cost = vx_setall_s16(MAX_COST);

-                for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
+                for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes() )
                 {
                     v_store(disp1ptr + x, v_inv_dist);
                     v_store(mem.disp2ptr + x, v_inv_dist);
@@ -850,23 +850,23 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     d = 0;
                     int delta0 = P2 + *mem.getMinLr(lrID, x + 1);
                     int minL0 = MAX_COST;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_int16 _P1 = vx_setall_s16((short)P1);
                     v_int16 _delta0 = vx_setall_s16((short)delta0);

                     v_int16 _minL0 = vx_setall_s16((short)MAX_COST);
                     v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);

-                    for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                    for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                     {
                         v_int16 Cpd = vx_load_aligned(Cp + d);
-                        v_int16 L0 = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
+                        v_int16 L0 = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), v_add(vx_load(Lr_p0 + d - 1), _P1)), v_add(vx_load(Lr_p0 + d + 1), _P1)), _delta0), _delta0), Cpd);

                         v_store_aligned(Lr_p + d, L0);
                         _minL0 = v_min(_minL0, L0);

-                        L0 += vx_load_aligned(Sp + d);
+                        L0 = v_add(L0, vx_load_aligned(Sp + d));
                         v_store_aligned(Sp + d, L0);

-                        _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
+                        _bestDisp = v_select(v_gt(_minS, L0), vx_setall_s16((short)d), _bestDisp);
                         _minS = v_min(_minS, L0);
                     }
                     minL0 = (CostType)v_reduce_min(_minL0);
@@ -891,12 +891,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 else
                 {
                     d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);

-                    for( ; d <= D - v_int16::nlanes; d+= v_int16::nlanes )
+                    for( ; d <= D - VTraits<v_int16>::vlanes(); d+= VTraits<v_int16>::vlanes() )
                     {
                         v_int16 L0 = vx_load_aligned(Sp + d);
-                        _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
+                        _bestDisp = v_select(v_gt(_minS, L0), vx_setall_s16((short)d), _bestDisp);
                         _minS = v_min( L0, _minS );
                     }
                     min_pos(_minS, _bestDisp, minS, bestDisp);
@@ -1039,9 +1039,9 @@ struct CalcVerticalSums: public ParallelLoopBody
                 for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da )
                 {
                     int xbord = x <= 0 ? 0 : (x > (width1 - 1)*Da ? (width1 - 1)*Da : x);
-#if CV_SIMD
-                    for( d = 0; d < Da; d += v_int16::nlanes )
-                        v_store_aligned(hsumAdd + x1*Da + d, vx_load_aligned(hsumAdd + x1*Da + d) + vx_load_aligned(pixDiff + xbord + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                    for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
+                        v_store_aligned(hsumAdd + x1*Da + d, v_add(vx_load_aligned(hsumAdd + x1 * this->Da + d), vx_load_aligned(pixDiff + xbord + d)));
 #else
                     for( d = 0; d < D; d++ )
                         hsumAdd[x1*Da + d] = (CostType)(hsumAdd[x1*Da + d] + pixDiff[xbord + d]);
@@ -1052,9 +1052,9 @@ struct CalcVerticalSums: public ParallelLoopBody
                 {
                     const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                     const CostType* Cprev = mem.getCBuf(y - 1);
-#if CV_SIMD
-                    for( d = 0; d < Da; d += v_int16::nlanes )
-                        v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                    for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
+                        v_store_aligned(C + x1*Da + d, v_sub(v_add(vx_load_aligned(Cprev + x1 * this->Da + d), vx_load_aligned(hsumAdd + x1 * this->Da + d)), vx_load_aligned(hsumSub + x1 * this->Da + d)));
 #else
                     for( d = 0; d < D; d++ )
                         C[x1*Da + d] = (CostType)(Cprev[x1*Da + d] + hsumAdd[x1*Da + d] - hsumSub[x1*Da + d]);
@@ -1064,12 +1064,12 @@ struct CalcVerticalSums: public ParallelLoopBody
                         const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                         const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);

-#if CV_SIMD
-                        for( d = 0; d < Da; d += v_int16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                        for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
                         {
-                            v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
+                            v_int16 hv = v_add(v_sub(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixSub + d)), vx_load_aligned(pixAdd + d));
                             v_store_aligned(hsumAdd + x + d, hv);
-                            v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
+                            v_store_aligned(C + x + d, v_add(v_sub(vx_load_aligned(Cprev + x + d), vx_load_aligned(hsumSub + x + d)), hv));
                         }
 #else
                         for( d = 0; d < D; d++ )
@@ -1082,10 +1082,10 @@ struct CalcVerticalSums: public ParallelLoopBody
                 }
                 else
                 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
-                    for (d = 0; d < Da; d += v_int16::nlanes)
-                        v_store_aligned(C + x1*Da + d, vx_load_aligned(C + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) * v_scale);
+                    for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                        v_store_aligned(C + x1*Da + d, v_add(vx_load_aligned(C + x1 * this->Da + d), v_mul(vx_load_aligned(hsumAdd + x1 * this->Da + d), v_scale)));
 #else
                     int scale = k == 0 ? SH2 + 1 : 1;
                     for (d = 0; d < D; d++)
@@ -1095,12 +1095,12 @@ struct CalcVerticalSums: public ParallelLoopBody
                     {
                         const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                         const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
-                        for (d = 0; d < Da; d += v_int16::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                        for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
                         {
-                            v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
+                            v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                             v_store_aligned(hsumAdd + x + d, hv);
-                            v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
+                            v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
                         }
 #else
                         for( d = 0; d < D; d++ )
@@ -1120,9 +1120,9 @@ struct CalcVerticalSums: public ParallelLoopBody
                 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                 const CostType* Cprev = mem.getCBuf(y - 1);

-#if CV_SIMD
-                for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
-                    v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( x = x1*Da; x < x2*Da; x += VTraits<v_int16>::vlanes() )
+                    v_store_aligned(C + x, v_add(v_sub(vx_load_aligned(Cprev + x), vx_load_aligned(hsumSub + x)), vx_load_aligned(hsumAdd + x)));
 #else
                 for( x = x1*Da; x < x2*Da; x++ )
                     C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
@@ -1131,9 +1131,9 @@ struct CalcVerticalSums: public ParallelLoopBody
             else*/
             if(y == 0)
             {
-#if CV_SIMD
-                for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
-                    v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( x = x1*Da; x < x2*Da; x += VTraits<v_int16>::vlanes() )
+                    v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
 #else
                 for( x = x1*Da; x < x2*Da; x++ )
                     C[x] = (CostType)(C[x] + hsumAdd[x]);
@@ -1167,19 +1167,19 @@ struct CalcVerticalSums: public ParallelLoopBody
                 CostType& minL = *(mem.getMinLr(lrID, x));
                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 _P1 = vx_setall_s16((short)P1);
                 v_int16 _delta = vx_setall_s16((short)delta);

                 v_int16 _minL = vx_setall_s16((short)MAX_COST);

-                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
+                    v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);

                     v_store_aligned(Lr_p + d, L);
                     _minL = v_min(_minL, L);
-                    v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
+                    v_store_aligned(Sp + d, v_add(vx_load_aligned(Sp + d), L));
                 }
                 minL = v_reduce_min(_minL);
 #else
@@ -1264,10 +1264,10 @@ struct CalcHorizontalSums: public ParallelLoopBody
             CostType* S = mem.getSBuf(y);

             x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
             v_int16 v_max_cost = vx_setall_s16(MAX_COST);

-            for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
+            for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
             {
                 v_store(disp1ptr + x, v_inv_dist);
                 v_store(disp2ptr + x, v_inv_dist);
@@ -1304,19 +1304,19 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 CostType* Sp = S + x*Da;

                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 _P1 = vx_setall_s16((short)P1);
                 v_int16 _delta = vx_setall_s16((short)delta);

                 v_int16 _minL = vx_setall_s16((short)MAX_COST);

-                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes)
+                for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes())
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
+                    v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);

                     v_store(Lr_p + d, L);
                     _minL = v_min(_minL, L);
-                    v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
+                    v_store_aligned(Sp + d, v_add(vx_load_aligned(Sp + d), L));
                 }
                 minLr = v_reduce_min(_minL);
 #else
@@ -1349,22 +1349,22 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 minLr = MAX_COST;

                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 _P1 = vx_setall_s16((short)P1);
                 v_int16 _delta = vx_setall_s16((short)delta);

                 v_int16 _minL = vx_setall_s16((short)MAX_COST);
                 v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);

-                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
+                    v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);

                     v_store(Lr_p + d, L);
                     _minL = v_min(_minL, L);
-                    L += vx_load_aligned(Sp + d);
+                    L = v_add(L, vx_load_aligned(Sp + d));
                     v_store_aligned(Sp + d, L);

-                    _bestDisp = v_select(_minS > L, vx_setall_s16((short)d), _bestDisp);
+                    _bestDisp = v_select(v_gt(_minS, L), vx_setall_s16((short)d), _bestDisp);
                     _minS = v_min( L, _minS );
                 }
                 minLr = v_reduce_min(_minL);
@@ -1581,8 +1581,8 @@ struct SGBM3WayMainLoop : public ParallelLoopBody
     utils::BufferArea aux_area;
     PixType* clipTab;

-#if CV_SIMD
-    short idx_row[v_int16::nlanes];
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    short idx_row[VTraits<v_int16>::max_nlanes];
 #endif
     SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap);
     void operator () (const Range& range) const CV_OVERRIDE;
@@ -1637,8 +1637,8 @@ SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
     uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
     disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;

-#if CV_SIMD
-    for(short i = 0; i < v_int16::nlanes; ++i)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for(short i = 0; i < VTraits<v_int16>::vlanes(); ++i)
         idx_row[i] = i;
 #endif
 }
@@ -1659,13 +1659,13 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
     {
         calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab + TAB_OFS );

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1);
-        for (d = 0; d < Da; d += v_int16::nlanes)
+        for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
         {
-            v_int16 hsA = vx_load_aligned(pixDiff + d) * sw2_1;
+            v_int16 hsA = v_mul(vx_load_aligned(pixDiff + d), sw2_1);
             for (x = Da; x <= SW2 * Da; x += Da)
-                hsA += vx_load_aligned(pixDiff + x + d);
+                hsA = v_add(hsA, vx_load_aligned(pixDiff + x + d));
             v_store_aligned(hsumAdd + d, hsA);
         }
 #else
@@ -1681,9 +1681,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
         {
             const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));

-#if CV_SIMD
-            for (d = 0; d < Da; d += v_int16::nlanes)
-                v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                v_store_aligned(C + d, v_sub(v_add(vx_load_aligned(C + d), vx_load_aligned(hsumAdd + d)), vx_load_aligned(hsumSub + d)));
 #else
             for (d = 0; d < D; d++)
                 C[d] = (CostType)(C[d] + hsumAdd[d] - hsumSub[d]);
@@ -1693,13 +1693,13 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
             {
                 const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 hv_reg;
-                for( d = 0; d < Da; d+=v_int16::nlanes )
+                for( d = 0; d < Da; d+=VTraits<v_int16>::vlanes() )
                 {
-                    hv_reg = vx_load_aligned(hsumAdd+x-Da+d) + vx_load_aligned(pixAdd+d) - vx_load_aligned(pixSub+d);
+                    hv_reg = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                     v_store_aligned(hsumAdd+x+d,hv_reg);
-                    v_store_aligned(C+x+d,vx_load_aligned(C+x+d)+hv_reg-vx_load_aligned(hsumSub+x+d));
+                    v_store_aligned(C+x+d,v_sub(v_add(vx_load_aligned(C + x + d), hv_reg), vx_load_aligned(hsumSub + x + d)));
                 }
 #else
                 for( d = 0; d < D; d++ )
@@ -1712,10 +1712,10 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
         }
         else
         {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_int16 v_scale = vx_setall_s16(k == src_start_idx ? (short)SH2 + 1 : 1);
-            for (d = 0; d < Da; d += v_int16::nlanes)
-                v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
+            for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                v_store_aligned(C + d, v_add(vx_load_aligned(C + d), v_mul(vx_load_aligned(hsumAdd + d), v_scale)));
 #else
             int scale = k == src_start_idx ? SH2 + 1 : 1;
             for (d = 0; d < D; d++)
@@ -1725,12 +1725,12 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
             {
                 const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
-                for (d = 0; d < Da; d += v_int16::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
                 {
-                    v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
+                    v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                     v_store_aligned(hsumAdd + x + d, hv);
-                    v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
+                    v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
                 }
 #else
                 for (d = 0; d < D; d++)
@@ -1748,9 +1748,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
     if( y > src_start_idx )
     {
        const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
-#if CV_SIMD
-        for( x = 0; x < width1*Da; x += v_int16::nlanes)
-            v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for( x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+            v_store_aligned(C + x, v_sub(v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)), vx_load_aligned(hsumSub + x)));
 #else
         for( x = 0; x < width1*Da; x++ )
             C[x] = (CostType)(C[x] + hsumAdd[x] - hsumSub[x]);
@@ -1758,9 +1758,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
     }
     else
     {
-#if CV_SIMD
-        for( x = 0; x < width1*Da; x += v_int16::nlanes)
-            v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for( x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+            v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
 #else
         for( x = 0; x < width1*Da; x++ )
             C[x] = (CostType)(C[x] + hsumAdd[x]);
@@ -1781,7 +1781,7 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
     CostType *costs = mem.curCostVolumeLine - Da + x;
     CostType& topMinCost = mem.vertPassMin[x/Da];
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));

     v_int16 leftMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
@@ -1798,18 +1798,18 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
     v_int16 src_shifted_left,src_shifted_right;
     v_int16 res;

-    for(;i<Da-v_int16::nlanes;i+= v_int16::nlanes)
+    for(;i<Da-VTraits<v_int16>::vlanes();i+= VTraits<v_int16>::vlanes())
     {
         //process leftBuf:
         //lookahead load:
-        src2 = vx_load_aligned(leftBuf_prev+i+v_int16::nlanes);
+        src2 = vx_load_aligned(leftBuf_prev+i+VTraits<v_int16>::vlanes());

         //get shifted versions of the current block and add P1:
         src_shifted_left  = v_rotate_left<1>  (src1_leftBuf,src0_leftBuf);
         src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2        );

         // process and save current block:
-        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_leftBuf, leftMinCostP2_reg)), leftMinCostP2_reg));
         leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
         v_store_aligned(leftBuf+i, res);
@@ -1819,14 +1819,14 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,

         //process topBuf:
         //lookahead load:
-        src2 = vx_load_aligned(topBuf+i+v_int16::nlanes);
+        src2 = vx_load_aligned(topBuf+i+VTraits<v_int16>::vlanes());

         //get shifted versions of the current block and add P1:
         src_shifted_left  = v_rotate_left<1>  (src1_topBuf,src0_topBuf);
         src_shifted_right = v_rotate_right<1> (src1_topBuf,src2       );

         // process and save current block:
-        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_topBuf, topMinCostP2_reg)), topMinCostP2_reg));
         topMinCost_new_reg = v_min(topMinCost_new_reg,res);
         v_store_aligned(topBuf+i, res);
@@ -1843,17 +1843,17 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
     src_shifted_left  = v_rotate_left<1>  (src1_leftBuf,src0_leftBuf);
     src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2        );

-    res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
+    res = v_add(vx_load_aligned(costs + this->Da - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_leftBuf, leftMinCostP2_reg)), leftMinCostP2_reg));
     leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res));
-    v_store_aligned(leftBuf+Da-v_int16::nlanes, res);
+    v_store_aligned(leftBuf+Da-VTraits<v_int16>::vlanes(), res);

     //process topBuf:
     src_shifted_left  = v_rotate_left<1>  (src1_topBuf,src0_topBuf);
     src_shifted_right = v_rotate_right<1> (src1_topBuf,src2       );

-    res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
+    res = v_add(vx_load_aligned(costs + this->Da - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_topBuf, topMinCostP2_reg)), topMinCostP2_reg));
     topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res));
-    v_store_aligned(topBuf+Da-v_int16::nlanes, res);
+    v_store_aligned(topBuf+Da-VTraits<v_int16>::vlanes(), res);
 }
 else
 {
@@ -1904,7 +1904,7 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
     CostType* leftBuf = mem.horPassCostVolume + x;

     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));

     v_int16 rightMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
@@ -1919,27 +1919,27 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
     v_int16 min_sum_cost_reg = vx_setall_s16(SHRT_MAX);
     v_int16 min_sum_pos_reg = vx_setall_s16(0);

-    for(;i<D-v_int16::nlanes;i+=v_int16::nlanes)
+    for(;i<D-VTraits<v_int16>::vlanes();i+=VTraits<v_int16>::vlanes())
     {
         //lookahead load:
-        src2 = vx_load_aligned(rightBuf+i+v_int16::nlanes);
+        src2 = vx_load_aligned(rightBuf+i+VTraits<v_int16>::vlanes());

         //get shifted versions of the current block and add P1:
         src_shifted_left  = v_rotate_left<1>  (src1_rightBuf,src0_rightBuf);
         src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2         );

         // process and save current block:
-        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_rightBuf, rightMinCostP2_reg)), rightMinCostP2_reg));
         rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
         v_store_aligned(rightBuf+i, res);

         // compute and save total cost:
-        res = res + vx_load_aligned(leftBuf+i) + vx_load_aligned(topBuf+i);
+        res = v_add(v_add(res, vx_load_aligned(leftBuf + i)), vx_load_aligned(topBuf + i));
         v_store_aligned(leftBuf+i, res);

         // track disparity value with the minimum cost:
         min_sum_cost_reg = v_min(min_sum_cost_reg,res);
-        min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)i) - min_sum_pos_reg));
+        min_sum_pos_reg = v_add(min_sum_pos_reg, v_and(v_eq(min_sum_cost_reg, res), v_sub(vx_setall_s16((short)i), min_sum_pos_reg)));

         //update src:
         src0_rightBuf = src1_rightBuf;
@@ -1953,15 +1953,15 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
     src_shifted_left  = v_rotate_left<1>  (src1_rightBuf,src0_rightBuf);
     src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2         );

-    res = vx_load_aligned(costs+D-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
+    res = v_add(vx_load_aligned(costs + this->D - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_rightBuf, rightMinCostP2_reg)), rightMinCostP2_reg));
     rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res));
-    v_store_aligned(rightBuf+D-v_int16::nlanes, res);
+    v_store_aligned(rightBuf+D-VTraits<v_int16>::vlanes(), res);

-    res = res + vx_load_aligned(leftBuf+D-v_int16::nlanes) + vx_load_aligned(topBuf+D-v_int16::nlanes);
-    v_store_aligned(leftBuf+D-v_int16::nlanes, res);
+    res = v_add(v_add(res, vx_load_aligned(leftBuf + this->D - VTraits<v_int16>::vlanes())), vx_load_aligned(topBuf + this->D - VTraits<v_int16>::vlanes()));
+    v_store_aligned(leftBuf+D-VTraits<v_int16>::vlanes(), res);

     min_sum_cost_reg = v_min(min_sum_cost_reg,res);
-    min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)(D-v_int16::nlanes)) - min_sum_pos_reg));
+    min_sum_pos_reg = v_add(min_sum_pos_reg, v_and(v_eq(min_sum_cost_reg, res), v_sub(vx_setall_s16((short)(this->D - VTraits<v_int16>::vlanes())), min_sum_pos_reg)));
     min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp);
 }
 else
@@ -2070,40 +2070,40 @@ void SGBM3WayMainLoop::impl(const Range& range) const
                 if(uniquenessRatio>0)
                 {
                     d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     horPassCostVolume+=x;
                     int thresh = (100*min_cost)/(100-uniquenessRatio);
                     v_int16 thresh_reg = vx_setall_s16((short)(thresh+1));
                     v_int16 d1 = vx_setall_s16((short)(best_d-1));
                     v_int16 d2 = vx_setall_s16((short)(best_d+1));
-                    v_int16 eight_reg = vx_setall_s16((short)v_int16::nlanes);
+                    v_int16 eight_reg = vx_setall_s16((short)VTraits<v_int16>::vlanes());
                     v_int16 cur_d = vx_load(idx_row);
                     v_int16 mask;

-                    for( ; d <= D - 2*v_int16::nlanes; d+=2*v_int16::nlanes )
+                    for( ; d <= D - 2*VTraits<v_int16>::vlanes(); d+=2*VTraits<v_int16>::vlanes() )
                     {
-                        mask = (vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
-                        cur_d = cur_d+eight_reg;
+                        mask = v_and(v_lt(vx_load_aligned(horPassCostVolume + d), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)));
+                        cur_d = v_add(cur_d, eight_reg);
                         if( v_check_any(mask) )
                             break;
-                        mask = (vx_load_aligned(horPassCostVolume + d + v_int16::nlanes) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
-                        cur_d = cur_d+eight_reg;
+                        mask = v_and(v_lt(vx_load_aligned(horPassCostVolume + d + VTraits<v_int16>::vlanes()), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)));
+                        cur_d = v_add(cur_d, eight_reg);
                         if( v_check_any(mask) )
                             break;
                     }
-                    if( d <= D - 2*v_int16::nlanes )
+                    if( d <= D - 2*VTraits<v_int16>::vlanes() )
                     {
                         horPassCostVolume-=x;
                         continue;
                     }
-                    if( d <= D - v_int16::nlanes )
+                    if( d <= D - VTraits<v_int16>::vlanes() )
                     {
-                        if( v_check_any((vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ((cur_d < d1) | (cur_d > d2))) )
+                        if( v_check_any(v_and(v_lt(vx_load_aligned(horPassCostVolume + d), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)))) )
                         {
                             horPassCostVolume-=x;
                             continue;
                         }
-                        d+=v_int16::nlanes;
+                        d+=VTraits<v_int16>::vlanes();
                     }
                     horPassCostVolume-=x;
 #endif
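Reviewer note: min_pos() and the uniqueness checks above lean on the masked compare/select idiom, where an all-ones comparison mask gates candidate indices. A standalone sketch of that argmin idiom follows, under the same assumptions as before (public universal-intrinsics API only, n a multiple of the lane count); argmin_s16 is an illustrative name, not a function in this patch.

    // Hypothetical sketch of the masked horizontal-argmin idiom.
    #include <climits>
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    static int argmin_s16(const short* data, int n)
    {
        const int step = VTraits<v_int16>::vlanes();
        short seq[VTraits<v_int16>::max_nlanes];   // 0,1,2,... lane ids
        for (short j = 0; j < (short)step; ++j)
            seq[j] = j;

        v_int16 minv = vx_setall_s16(SHRT_MAX);
        v_int16 mini = vx_setall_s16(0);
        for (int i = 0; i < n; i += step)
        {
            v_int16 v = vx_load(data + i);
            // v_gt yields an all-ones lane mask; v_and keeps the candidate
            // block index only where a new minimum appears
            // (was: (minv > v) & vx_setall_s16((short)i)).
            mini = v_max(mini, v_and(v_gt(minv, v), vx_setall_s16((short)i)));
            minv = v_min(minv, v);
        }
        short best = v_reduce_min(minv);
        v_int16 mask = v_eq(vx_setall_s16(best), minv);
        // Keep block+lane offset where the minimum lives, SHRT_MAX elsewhere,
        // then reduce (was: ((mini+seq) & mask) | (SHRT_MAX & ~mask)).
        return v_reduce_min(v_or(v_and(v_add(mini, vx_load(seq)), mask),
                                 v_and(vx_setall_s16(SHRT_MAX), v_not(mask))));
    }
    #endif

The operator forms on the right of each comment are what the pre-patch code used; they cannot be overloaded for sizeless RVV vector types, which is the whole motivation for the v_eq/v_and/v_or/v_not spellings.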
diff --git a/modules/features2d/src/sift.simd.hpp b/modules/features2d/src/sift.simd.hpp
index 8589a0225c..2c5cf9f997 100644
--- a/modules/features2d/src/sift.simd.hpp
+++ b/modules/features2d/src/sift.simd.hpp
@@ -210,24 +210,24 @@ float calcOrientationHist(
     cv::hal::magnitude32f(X, Y, Mag, len);

     k = 0;
-#if CV_SIMD
-    const int vecsize = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int vecsize = VTraits<v_float32>::vlanes();
     v_float32 nd360 = vx_setall_f32(n/360.f);
     v_int32 __n = vx_setall_s32(n);
-    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize];
-    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[vecsize];
+    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[VTraits<v_float32>::max_nlanes];
+    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[VTraits<v_float32>::max_nlanes];

     for( ; k <= len - vecsize; k += vecsize )
     {
         v_float32 w = vx_load_aligned( W + k );
         v_float32 mag = vx_load_aligned( Mag + k );
         v_float32 ori = vx_load_aligned( Ori + k );
-        v_int32 bin = v_round( nd360 * ori );
+        v_int32 bin = v_round( v_mul(nd360, ori) );

-        bin = v_select(bin >= __n, bin - __n, bin);
-        bin = v_select(bin < vx_setzero_s32(), bin + __n, bin);
+        bin = v_select(v_ge(bin, __n), v_sub(bin, __n), bin);
+        bin = v_select(v_lt(bin, vx_setzero_s32()), v_add(bin, __n), bin);

-        w = w * mag;
+        w = v_mul(w, mag);
         v_store_aligned(bin_buf, bin);
         v_store_aligned(w_mul_mag_buf, w);
         for(int vi = 0; vi < vecsize; vi++)
@@ -253,19 +253,19 @@ float calcOrientationHist(
     temphist[n+1] = temphist[1];

     i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 d_1_16 = vx_setall_f32(1.f/16.f);
     v_float32 d_4_16 = vx_setall_f32(4.f/16.f);
     v_float32 d_6_16 = vx_setall_f32(6.f/16.f);
-    for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes )
+    for( ; i <= n - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
     {
         v_float32 tn2 = vx_load_aligned(temphist + i-2);
         v_float32 tn1 = vx_load(temphist + i-1);
         v_float32 t0 = vx_load(temphist + i);
         v_float32 t1 = vx_load(temphist + i+1);
         v_float32 t2 = vx_load(temphist + i+2);
-        v_float32 _hist = v_fma(tn2 + t2, d_1_16,
-            v_fma(tn1 + t1, d_4_16, t0 * d_6_16));
+        v_float32 _hist = v_fma(v_add(tn2, t2), d_1_16,
+            v_fma(v_add(tn1, t1), d_4_16, v_mul(t0, d_6_16)));
         v_store(hist + i, _hist);
     }
 #endif
@@ -452,8 +452,8 @@ public:
         const sift_wt* nextptr = next.ptr<sift_wt>(r);
         int c = SIFT_IMG_BORDER;

-#if CV_SIMD && !(DoG_TYPE_SHORT)
-        const int vecsize = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE) && !(DoG_TYPE_SHORT)
+        const int vecsize = VTraits<v_float32>::vlanes();
         for( ; c <= cols-SIFT_IMG_BORDER - vecsize; c += vecsize)
         {
             v_float32 val = vx_load(&currptr[c]);
@@ -464,7 +464,7 @@ public:

             v_float32 vmin,vmax;

-            v_float32 cond = v_abs(val) > vx_setall_f32((float)threshold);
+            v_float32 cond = v_gt(v_abs(val), vx_setall_f32((float)this->threshold));
             if (!v_check_any(cond))
             {
                 continue;
@@ -477,10 +477,10 @@ public:
             vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
             vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));

-            v_float32 condp = cond & (val > vx_setall_f32(0)) & (val >= vmax);
-            v_float32 condm = cond & (val < vx_setall_f32(0)) & (val <= vmin);
+            v_float32 condp = v_and(v_and(cond, v_gt(val, vx_setall_f32(0))), v_ge(val, vmax));
+            v_float32 condm = v_and(v_and(cond, v_lt(val, vx_setall_f32(0))), v_le(val, vmin));

-            cond = condp | condm;
+            cond = v_or(condp, condm);
             if (!v_check_any(cond))
             {
                 continue;
@@ -493,10 +493,10 @@ public:
             vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
             vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));

-            condp &= (val >= vmax);
-            condm &= (val <= vmin);
+            condp = v_and(condp, v_ge(val, vmax));
+            condm = v_and(condm, v_le(val, vmin));

-            cond = condp | condm;
+            cond = v_or(condp, condm);
             if (!v_check_any(cond))
             {
                 continue;
@@ -515,10 +515,10 @@ public:
             vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
             vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));

-            condp &= (val >= v_max(vmax,max_middle));
-            condm &= (val <= v_min(vmin,min_middle));
+            condp = v_and(condp, v_ge(val, v_max(vmax, max_middle)));
+            condm = v_and(condm, v_le(val, v_min(vmin, min_middle)));

-            cond = condp | condm;
+            cond = v_or(condp, condm);
             if (!v_check_any(cond))
             {
                 continue;
@@ -777,11 +777,11 @@ void calcSIFTDescriptor(
     cv::hal::exp32f(W, W, len);

     k = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
-        const int vecsize = v_float32::nlanes;
-        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[vecsize];
-        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*vecsize];
+        const int vecsize = VTraits<v_float32>::vlanes();
+        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[VTraits<v_float32>::max_nlanes];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*VTraits<v_float32>::max_nlanes];
         const v_float32 __ori  = vx_setall_f32(ori);
         const v_float32 __bins_per_rad = vx_setall_f32(bins_per_rad);
         const v_int32 __n = vx_setall_s32(n);
@@ -792,28 +792,28 @@ void calcSIFTDescriptor(
         {
             v_float32 rbin = vx_load_aligned(RBin + k);
             v_float32 cbin = vx_load_aligned(CBin + k);
-            v_float32 obin = (vx_load_aligned(Ori + k) - __ori) * __bins_per_rad;
-            v_float32 mag = vx_load_aligned(Mag + k) * vx_load_aligned(W + k);
+            v_float32 obin = v_mul(v_sub(vx_load_aligned(Ori + k), __ori), __bins_per_rad);
+            v_float32 mag = v_mul(vx_load_aligned(Mag + k), vx_load_aligned(W + k));

             v_int32 r0 = v_floor(rbin);
             v_int32 c0 = v_floor(cbin);
             v_int32 o0 = v_floor(obin);
-            rbin -= v_cvt_f32(r0);
-            cbin -= v_cvt_f32(c0);
-            obin -= v_cvt_f32(o0);
-
-            o0 = v_select(o0 < vx_setzero_s32(), o0 + __n, o0);
-            o0 = v_select(o0 >= __n, o0 - __n, o0);
-
-            v_float32 v_r1 = mag*rbin, v_r0 = mag - v_r1;
-            v_float32 v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
-            v_float32 v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
-            v_float32 v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
-            v_float32 v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
-            v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
-            v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
-
-            v_int32 idx = v_muladd(v_muladd(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
+            rbin = v_sub(rbin, v_cvt_f32(r0));
+            cbin = v_sub(cbin, v_cvt_f32(c0));
+            obin = v_sub(obin, v_cvt_f32(o0));
+
+            o0 = v_select(v_lt(o0, vx_setzero_s32()), v_add(o0, __n), o0);
+            o0 = v_select(v_ge(o0, __n), v_sub(o0, __n), o0);
+
+            v_float32 v_r1 = v_mul(mag, rbin), v_r0 = v_sub(mag, v_r1);
+            v_float32 v_rc11 = v_mul(v_r1, cbin), v_rc10 = v_sub(v_r1, v_rc11);
+            v_float32 v_rc01 = v_mul(v_r0, cbin), v_rc00 = v_sub(v_r0, v_rc01);
+            v_float32 v_rco111 = v_mul(v_rc11, obin), v_rco110 = v_sub(v_rc11, v_rco111);
+            v_float32 v_rco101 = v_mul(v_rc10, obin), v_rco100 = v_sub(v_rc10, v_rco101);
+            v_float32 v_rco011 = v_mul(v_rc01, obin), v_rco010 = v_sub(v_rc01, v_rco011);
+            v_float32 v_rco001 = v_mul(v_rc00, obin), v_rco000 = v_sub(v_rc00, v_rco001);
+
+            v_int32 idx = v_muladd(v_muladd(v_add(r0, __1), __d_plus_2, v_add(c0, __1)), __n_plus_2, o0);
             v_store_aligned(idx_buf, idx);

             v_store_aligned(rco_buf, v_rco000);
@@ -894,11 +894,11 @@ void calcSIFTDescriptor(
     float nrm2 = 0;
     len = d*d*n;
     k = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
         v_float32 __nrm2 = vx_setzero_f32();
         v_float32 __rawDst;
-        for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
+        for( ; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
         {
             __rawDst = vx_load_aligned(rawDst + k);
             __nrm2 = v_fma(__rawDst, __rawDst, __nrm2);
@@ -949,15 +949,15 @@ void calcSIFTDescriptor(
 if( dstMat.type() == CV_32F )
 {
     float* dst = dstMat.ptr<float>(row);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 __dst;
     v_float32 __min = vx_setzero_f32();
     v_float32 __max = vx_setall_f32(255.0f); // max of uchar
     v_float32 __nrm2 = vx_setall_f32(nrm2);
-    for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
+    for( k = 0; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
     {
         __dst = vx_load_aligned(rawDst + k);
-        __dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
+        __dst = v_min(v_max(v_cvt_f32(v_round(v_mul(__dst, __nrm2))), __min), __max);
         v_store(dst + k, __dst);
     }
 #endif
@@ -976,16 +976,16 @@ if( dstMat.type() == CV_32F )
 else // CV_8U
 {
     uint8_t* dst = dstMat.ptr<uint8_t>(row);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 __dst0, __dst1;
     v_uint16 __pack01;
     v_float32 __nrm2 = vx_setall_f32(nrm2);
-    for( k = 0; k <= len - v_float32::nlanes * 2; k += v_float32::nlanes * 2 )
+    for( k = 0; k <= len - VTraits<v_float32>::vlanes() * 2; k += VTraits<v_float32>::vlanes() * 2 )
     {
         __dst0 = vx_load_aligned(rawDst + k);
-        __dst1 = vx_load_aligned(rawDst + k + v_float32::nlanes);
+        __dst1 = vx_load_aligned(rawDst + k + VTraits<v_float32>::vlanes());

-        __pack01 = v_pack_u(v_round(__dst0 * __nrm2), v_round(__dst1 * __nrm2));
+        __pack01 = v_pack_u(v_round(v_mul(__dst0, __nrm2)), v_round(v_mul(__dst1, __nrm2)));
         v_pack_store(dst + k, __pack01);
     }
 #endif
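Reviewer note: the sift.simd.hpp hunks follow the same two rules as the calib3d ones: float arithmetic goes through named intrinsics (v_mul, v_add, v_fma), and stack scratch buffers are declared with the compile-time VTraits<...>::max_nlanes while loops step by the runtime vlanes(). A minimal sketch of that scratch-buffer pattern, assuming only the public universal-intrinsics API; weighted_hist and its parameters are illustrative, not code from this patch.

    // Hypothetical sketch of the scratch-buffer pattern from calcOrientationHist().
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    static void weighted_hist(const float* w, const float* mag, const int* bin,
                              float* hist, int len)
    {
        const int vecsize = VTraits<v_float32>::vlanes();   // runtime lane count
        // max_nlanes is a constexpr upper bound, so the array is legal C++
        // even when vlanes() is only known at run time (e.g. RVV).
        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_float32>::max_nlanes];

        int k = 0;
        for (; k <= len - vecsize; k += vecsize)
        {
            v_float32 vw = v_mul(vx_load(w + k), vx_load(mag + k)); // was: w * mag
            v_store_aligned(buf, vw);
            for (int vi = 0; vi < vecsize; vi++)   // scalar scatter into bins
                hist[bin[k + vi]] += buf[vi];
        }
        for (; k < len; k++)                       // scalar tail
            hist[bin[k]] += w[k] * mag[k];
    }
    #endif

Sizing such buffers with vecsize, as the pre-patch code did, is a variable-length array once vlanes() stops being a compile-time constant, which is why every declaration in the patch switches to max_nlanes.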