diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp
index 1ac6f6560b..625196ea63 100644
--- a/modules/calib3d/src/stereobm.cpp
+++ b/modules/calib3d/src/stereobm.cpp
@@ -231,13 +231,13 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
         dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
         x = 1;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         {
             v_int16 ftz = vx_setall_s16((short) ftzero);
            v_int16 ftz2 = vx_setall_s16((short)(ftzero*2));
            v_int16 z = vx_setzero_s16();

-            for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes)
+            for(; x <= (size.width - 1) - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
             {
                 v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1));
                 v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1));
@@ -248,13 +248,13 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
                 v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1));
                 v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1));

-                v_int16 d0 = s00 - s01;
-                v_int16 d1 = s10 - s11;
-                v_int16 d2 = s20 - s21;
-                v_int16 d3 = s30 - s31;
+                v_int16 d0 = v_sub(s00, s01);
+                v_int16 d1 = v_sub(s10, s11);
+                v_int16 d2 = v_sub(s20, s21);
+                v_int16 d3 = v_sub(s30, s31);

-                v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
-                v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
+                v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(v_add(v_add(v_add(v_add(d0, d1), d1), d2), ftz), ftz2), z));
+                v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(v_add(v_add(v_add(v_add(d1, d2), d2), d3), ftz), ftz2), z));

                 v_pack_store(dptr0 + x, v0);
                 v_pack_store(dptr1 + x, v1);
@@ -277,10 +277,10 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
     {
         uchar* dptr = dst.ptr<uchar>(y);
         x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         {
             v_uint8 val0_16 = vx_setall_u8(val0);
-            for(; x <= size.width-v_uint8::nlanes; x+=v_uint8::nlanes)
+            for(; x <= size.width-VTraits<v_uint8>::vlanes(); x+=VTraits<v_uint8>::vlanes())
                 v_store(dptr + x, val0_16);
         }
 #endif
@@ -356,7 +356,7 @@ public:
         for (size_t i = 0; i < nstripes; ++i)
         {
             // 1D: [1][ ndisp ][1]
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             if (params.useShorts())
                 area.allocate(sad_short[i], ndisp + 2);
             else
@@ -364,7 +364,7 @@ public:
                 area.allocate(sad[i], ndisp + 2);

             // 2D: [ wsz/2 + 1 ][ height ][ wsz/2 + 1 ] * [ ndisp ]
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             if (params.useShorts())
                 area.allocate(hsad_short[i], (height + wsz + 2) * ndisp);
             else
@@ -390,7 +390,7 @@ public:
     }
 };

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 template <typename mType>
 static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
                                              Mat& disp, Mat& cost, const StereoBMParams& state,
     short costbuf = 0;
     int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
     const uchar * tab = bufX.tab;

-    short v_seq[v_int16::nlanes];
-    for (short i = 0; i < v_int16::nlanes; ++i)
+    short v_seq[VTraits<v_int16>::max_nlanes];
+    for (short i = 0; i < VTraits<v_int16>::vlanes(); ++i)
         v_seq[i] = i;

     ushort *sad = bufX.sad_short[bufNum] + 1;
@@ -446,19 +446,19 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             v_uint8 lv = vx_setall_u8((uchar)lval);
-            for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+            for( d = 0; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
             {
                 v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
                 v_store(cbuf + d, diff);
-                v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
-                v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff));
+                v_store(hsad + d, v_add(vx_load(hsad + d), v_expand_low(diff)));
+                v_store(hsad + d + VTraits<v_uint16>::vlanes(), v_add(vx_load(hsad + d + VTraits<v_uint16>::vlanes()), v_expand_high(diff)));
             }
-            if( d <= ndisp - v_uint16::nlanes )
+            if( d <= ndisp - VTraits<v_uint16>::vlanes() )
             {
                 v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
                 v_store_low(cbuf + d, diff);
-                v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
-                d += v_uint16::nlanes;
+                v_store(hsad + d, v_add(vx_load(hsad + d), v_expand_low(diff)));
+                d += VTraits<v_uint16>::vlanes();
             }
             for( ; d < ndisp; d++ )
             {
@@ -496,20 +496,20 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             v_uint8 lv = vx_setall_u8((uchar)lval);
-            for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+            for( d = 0; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
             {
                 v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
                 v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d));
                 v_store(cbuf + d, diff);
-                v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs)));
-                v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs)));
+                v_store(hsad + d, v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d), v_expand_low(diff))), v_expand_low(cbs))));
+                v_store(hsad + d + VTraits<v_uint16>::vlanes(), v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d + VTraits<v_uint16>::vlanes()), v_expand_high(diff))), v_expand_high(cbs))));
             }
-            if( d <= ndisp - v_uint16::nlanes)
+            if( d <= ndisp - VTraits<v_uint16>::vlanes())
             {
                 v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
                 v_store_low(cbuf + d, diff);
-                v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d)));
-                d += v_uint16::nlanes;
+                v_store(hsad + d, v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d), v_expand_low(diff))), vx_load_expand((schar *)cbuf_sub + d))));
+                d += VTraits<v_uint16>::vlanes();
             }
             for( ; d < ndisp; d++ )
             {
@@ -533,20 +533,20 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
         hsad = hsad0 + (1 - dy0)*ndisp;
         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
         {
-            for( d = 0; d <= ndisp-2*v_uint16::nlanes; d += 2*v_uint16::nlanes )
+            for( d = 0; d <= ndisp-2*VTraits<v_uint16>::vlanes(); d += 2*VTraits<v_uint16>::vlanes() )
             {
-                v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
-                v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes));
+                v_store(sad + d, v_add(vx_load(sad + d), vx_load(hsad + d)));
+                v_store(sad + d + VTraits<v_uint16>::vlanes(), v_add(vx_load(sad + d + VTraits<v_uint16>::vlanes()), vx_load(hsad + d + VTraits<v_uint16>::vlanes())));
             }
-            if( d <= ndisp-v_uint16::nlanes )
+            if( d <= ndisp-VTraits<v_uint16>::vlanes() )
             {
-                v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
-                d += v_uint16::nlanes;
+                v_store(sad + d, v_add(vx_load(sad + d), vx_load(hsad + d)));
+                d += VTraits<v_uint16>::vlanes();
             }
-            if( d <= ndisp-v_uint16::nlanes/2 )
+            if( d <= ndisp-VTraits<v_uint16>::vlanes()/2 )
             {
-                v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d));
-                d += v_uint16::nlanes/2;
+                v_store_low(sad + d, v_add(vx_load_low(sad + d), vx_load_low(hsad + d)));
+                d += VTraits<v_uint16>::vlanes()/2;
             }
             for( ; d < ndisp; d++ )
                 sad[d] = sad[d] + hsad[d];
@@ -564,29 +564,29 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
             v_int16 minsad8 = vx_setall_s16(SHRT_MAX);
             v_int16 mind8 = vx_setall_s16(0);

-            for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
+            for( d = 0; d <= ndisp - 2*VTraits<v_int16>::vlanes(); d += 2*VTraits<v_int16>::vlanes() )
             {
-                v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
+                v_int16 sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d)), v_reinterpret_as_s16(vx_load(hsad_sub + d))), v_reinterpret_as_s16(vx_load(sad + d)));
                 v_store(sad + d, v_reinterpret_as_u16(sad8));
-                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
+                mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)d)));
                 minsad8 = v_min(minsad8, sad8);

-                sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes));
-                v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8));
-                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)(d+v_int16::nlanes)));
+                sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d + VTraits<v_int16>::vlanes())), v_reinterpret_as_s16(vx_load(hsad_sub + d + VTraits<v_int16>::vlanes()))), v_reinterpret_as_s16(vx_load(sad + d + VTraits<v_int16>::vlanes())));
+                v_store(sad + d + VTraits<v_int16>::vlanes(), v_reinterpret_as_u16(sad8));
+                mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)(d + VTraits<v_int16>::vlanes()))));
                 minsad8 = v_min(minsad8, sad8);
             }
-            if( d <= ndisp - v_int16::nlanes )
+            if( d <= ndisp - VTraits<v_int16>::vlanes() )
             {
-                v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
+                v_int16 sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d)), v_reinterpret_as_s16(vx_load(hsad_sub + d))), v_reinterpret_as_s16(vx_load(sad + d)));
                 v_store(sad + d, v_reinterpret_as_u16(sad8));
-                mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
+                mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)d)));
                 minsad8 = v_min(minsad8, sad8);
-                d += v_int16::nlanes;
+                d += VTraits<v_int16>::vlanes();
             }
             minsad = v_reduce_min(minsad8);
-            v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8);
-            mind = v_reduce_min(((mind8+vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
+            v_int16 v_mask = (v_eq(vx_setall_s16((short)minsad), minsad8));
+            mind = v_reduce_min(v_or(v_and(v_add(mind8, vx_load(v_seq)), v_mask), v_and(vx_setall_s16(32767), v_not(v_mask))));
             for( ; d < ndisp; d++ )
             {
                 int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d];
@@ -610,34 +610,34 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
                 int thresh = minsad + (minsad * uniquenessRatio/100);
                 v_int32 thresh4 = vx_setall_s32(thresh + 1);
                 v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1);
-                v_int32 dd_4 = vx_setall_s32(v_int32::nlanes);
+                v_int32 dd_4 = vx_setall_s32(VTraits<v_int32>::vlanes());
                 v_int32 d4 = vx_load_expand(v_seq);

-                for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes )
+                for( d = 0; d <= ndisp - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                 {
                     v_int32 sad4_l, sad4_h;
                     v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h);
-                    if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
+                    if( v_check_any(v_and(v_gt(thresh4, sad4_l), v_or(v_gt(d1, d4), v_gt(d4, d2)))) )
                         break;
-                    d4 += dd_4;
-                    if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) )
+                    d4 = v_add(d4, dd_4);
+                    if( v_check_any(v_and(v_gt(thresh4, sad4_h), v_or(v_gt(d1, d4), v_gt(d4, d2)))) )
                         break;
-                    d4 += dd_4;
+                    d4 = v_add(d4, dd_4);
                 }
-                if( d <= ndisp - v_int16::nlanes )
+                if( d <= ndisp - VTraits<v_int16>::vlanes() )
                 {
                     dptr[y*dstep] = FILTERED;
                     continue;
                 }
-                if( d <= ndisp - v_int32::nlanes )
+                if( d <= ndisp - VTraits<v_int32>::vlanes() )
                 {
                     v_int32 sad4_l = vx_load_expand((short*)sad + d);
-                    if (v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))))
+                    if (v_check_any(v_and(v_gt(thresh4, sad4_l), v_or(v_gt(d1, d4), v_gt(d4, d2)))))
                     {
                         dptr[y*dstep] = FILTERED;
                         continue;
                     }
-                    d += v_int16::nlanes;
+                    d += VTraits<v_int16>::vlanes();
                 }
                 for( ; d < ndisp; d++ )
                 {
@@ -699,11 +699,11 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
     int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
     const uchar * tab = bufX.tab;

-#if CV_SIMD
-    int v_seq[v_int32::nlanes];
-    for (int i = 0; i < v_int32::nlanes; ++i)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int v_seq[VTraits<v_int32>::max_nlanes];
+    for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
         v_seq[i] = i;
-    v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
+    v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(VTraits<v_int32>::vlanes());
 #endif

     int *sad = bufX.sad[bufNum] + 1;
@@ -725,17 +725,17 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
                 v_uint8 lv = vx_setall_u8((uchar)lval);

-                for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+                for( ; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
                 {
                     v_uint8 rv = vx_load(rptr + d);
                     v_int32 hsad_0 = vx_load(hsad + d);
-                    v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
-                    v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
-                    v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
+                    v_int32 hsad_1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
+                    v_int32 hsad_2 = vx_load(hsad + d + 2*VTraits<v_int32>::vlanes());
+                    v_int32 hsad_3 = vx_load(hsad + d + 3*VTraits<v_int32>::vlanes());
                     v_uint8 diff = v_absdiff(lv, rv);
                     v_store(cbuf + d, diff);
@@ -745,15 +745,15 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
                     v_expand(diff0, diff00, diff01);
                     v_expand(diff1, diff10, diff11);

-                    hsad_0 += v_reinterpret_as_s32(diff00);
-                    hsad_1 += v_reinterpret_as_s32(diff01);
-                    hsad_2 += v_reinterpret_as_s32(diff10);
-                    hsad_3 += v_reinterpret_as_s32(diff11);
+                    hsad_0 = v_add(hsad_0, v_reinterpret_as_s32(diff00));
+                    hsad_1 = v_add(hsad_1, v_reinterpret_as_s32(diff01));
+                    hsad_2 = v_add(hsad_2, v_reinterpret_as_s32(diff10));
+                    hsad_3 = v_add(hsad_3, v_reinterpret_as_s32(diff11));

                     v_store(hsad + d, hsad_0);
-                    v_store(hsad + d + v_int32::nlanes, hsad_1);
-                    v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
-                    v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
+                    v_store(hsad + d + VTraits<v_int32>::vlanes(), hsad_1);
+                    v_store(hsad + d + 2*VTraits<v_int32>::vlanes(), hsad_2);
+                    v_store(hsad + d + 3*VTraits<v_int32>::vlanes(), hsad_3);
                 }
             }
 #endif
@@ -793,16 +793,16 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         {
             int lval = lptr[0];
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
                 v_uint8 lv = vx_setall_u8((uchar)lval);
-                for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
+                for( ; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
                 {
                     v_uint8 rv = vx_load(rptr + d);
                     v_int32 hsad_0 = vx_load(hsad + d);
-                    v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
-                    v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
-                    v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
+                    v_int32 hsad_1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
+                    v_int32 hsad_2 = vx_load(hsad + d + 2*VTraits<v_int32>::vlanes());
+                    v_int32 hsad_3 = vx_load(hsad + d + 3*VTraits<v_int32>::vlanes());
                     v_uint8 cbs = vx_load(cbuf_sub + d);
                     v_uint8 diff = v_absdiff(lv, rv);
                     v_store(cbuf + d, diff);
@@ -816,19 +816,19 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
                     v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01);
                     v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11);

-                    v_int32 diff_0 = diff00 - cbs00;
-                    v_int32 diff_1 = diff01 - cbs01;
-                    v_int32 diff_2 = diff10 - cbs10;
-                    v_int32 diff_3 = diff11 - cbs11;
-                    hsad_0 += diff_0;
-                    hsad_1 += diff_1;
-                    hsad_2 += diff_2;
-                    hsad_3 += diff_3;
+                    v_int32 diff_0 = v_sub(diff00, cbs00);
+                    v_int32 diff_1 = v_sub(diff01, cbs01);
+                    v_int32 diff_2 = v_sub(diff10, cbs10);
+                    v_int32 diff_3 = v_sub(diff11, cbs11);
+                    hsad_0 = v_add(hsad_0, diff_0);
+                    hsad_1 = v_add(hsad_1, diff_1);
+                    hsad_2 = v_add(hsad_2, diff_2);
+                    hsad_3 = v_add(hsad_3, diff_3);

                     v_store(hsad + d, hsad_0);
-                    v_store(hsad + d + v_int32::nlanes, hsad_1);
-                    v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
-                    v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
+                    v_store(hsad + d + VTraits<v_int32>::vlanes(), hsad_1);
+                    v_store(hsad + d + 2*VTraits<v_int32>::vlanes(), hsad_2);
+                    v_store(hsad + d + 3*VTraits<v_int32>::vlanes(), hsad_3);
                 }
             }
 #endif
@@ -855,18 +855,18 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
         {
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
-                for( d = 0; d <= ndisp-2*v_int32::nlanes; d += 2*v_int32::nlanes )
+                for( d = 0; d <= ndisp-2*VTraits<v_int32>::vlanes(); d += 2*VTraits<v_int32>::vlanes() )
                 {
                     v_int32 s0 = vx_load(sad + d);
-                    v_int32 s1 = vx_load(sad + d + v_int32::nlanes);
+                    v_int32 s1 = vx_load(sad + d + VTraits<v_int32>::vlanes());
                     v_int32 t0 = vx_load(hsad + d);
-                    v_int32 t1 = vx_load(hsad + d + v_int32::nlanes);
-                    s0 += t0;
-                    s1 += t1;
+                    v_int32 t1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
+                    s0 = v_add(s0, t0);
+                    s1 = v_add(s1, t1);
                     v_store(sad + d, s0);
-                    v_store(sad + d + v_int32::nlanes, s1);
+                    v_store(sad + d + VTraits<v_int32>::vlanes(), s1);
                 }
             }
 #endif
@@ -884,30 +884,30 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
             hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
             hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
             d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             {
                 v_int32 minsad4 = vx_setall_s32(INT_MAX);
                 v_int32 mind4 = vx_setall_s32(0), d4 = d0_4;

-                for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
+                for( ; d <= ndisp - 2*VTraits<v_int32>::vlanes(); d += 2*VTraits<v_int32>::vlanes() )
                 {
-                    v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d);
+                    v_int32 sad4 = v_sub(v_add(vx_load(sad + d), vx_load(hsad + d)), vx_load(hsad_sub + d));
                     v_store(sad + d, sad4);
-                    mind4 = v_select(minsad4 > sad4, d4, mind4);
+                    mind4 = v_select(v_gt(minsad4, sad4), d4, mind4);
                     minsad4 = v_min(minsad4, sad4);
-                    d4 += dd_4;
+                    d4 = v_add(d4, dd_4);

-                    sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes);
-                    v_store(sad + d + v_int32::nlanes, sad4);
-                    mind4 = v_select(minsad4 > sad4, d4, mind4);
+                    sad4 = v_sub(v_add(vx_load(sad + d + VTraits<v_int32>::vlanes()), vx_load(hsad + d + VTraits<v_int32>::vlanes())), vx_load(hsad_sub + d + VTraits<v_int32>::vlanes()));
+                    v_store(sad + d + VTraits<v_int32>::vlanes(), sad4);
+                    mind4 = v_select(v_gt(minsad4, sad4), d4, mind4);
                     minsad4 = v_min(minsad4, sad4);
-                    d4 += dd_4;
+                    d4 = v_add(d4, dd_4);
                 }

-                int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes];
+                int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[VTraits<v_int32>::max_nlanes], mind_buf[VTraits<v_int32>::max_nlanes];
                 v_store(minsad_buf, minsad4);
                 v_store(mind_buf, mind4);
-                for (int i = 0; i < v_int32::nlanes; ++i)
+                for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
                     if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; }
             }
 #endif
@@ -1102,7 +1102,7 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
         Mat disp_i = disp->rowRange(row0, row1);
         Mat cost_i = state.disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         if (state.useShorts())
         {
             if( disp_i.type() == CV_16S)
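Reviewer note on the pattern applied throughout this patch: CV_SIMD_SCALABLE targets (RISC-V RVV) use sizeless vector types, so the compile-time constant v_int16::nlanes becomes the runtime call VTraits<v_int16>::vlanes(), stack arrays are sized with the constexpr upper bound VTraits<...>::max_nlanes, and overloaded operators (+, -, &, >) become named intrinsics (v_add, v_sub, v_and, v_gt). Below is a minimal standalone sketch of the same rewrite on a SAD-style accumulation loop; it assumes only the public universal-intrinsics API from opencv2/core/hal/intrin.hpp, and my_sad_row is an illustrative name, not code from this patch.

    // Minimal sketch (not part of the patch): SAD accumulation written
    // against the scalable universal-intrinsics API.
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    static void my_sad_row(const uchar* a, const uchar* b, ushort* acc, int n)
    {
        int i = 0;
        // vlanes() is a runtime value under CV_SIMD_SCALABLE (e.g. RVV), so
        // it is queried instead of the old compile-time v_uint8::nlanes.
        for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
        {
            v_uint8 diff = v_absdiff(vx_load(a + i), vx_load(b + i));
            // Named intrinsics replace the old overloaded operators:
            // acc += diff (widened to 16 bit), formerly written with operator+.
            v_store(acc + i, v_add(vx_load(acc + i), v_expand_low(diff)));
            v_store(acc + i + VTraits<v_uint16>::vlanes(),
                    v_add(vx_load(acc + i + VTraits<v_uint16>::vlanes()),
                          v_expand_high(diff)));
        }
        for (; i < n; i++) // scalar tail
            acc[i] = (ushort)(acc[i] + (a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]));
    }
    #endif

The same source compiles unchanged for fixed-width backends (SSE/AVX/NEON), where vlanes() folds to a constant, which is why the patch can share one code path for both.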
diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp
index bc19612b5a..75f6f32564 100644
--- a/modules/calib3d/src/stereosgbm.cpp
+++ b/modules/calib3d/src/stereosgbm.cpp
@@ -123,7 +123,7 @@ struct StereoSGBMParams
     int mode;
 };

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 #if CV_SIMD_WIDTH == 16
 static inline v_int16 vx_setseq_s16()
 { return v_int16(0, 1, 2, 3, 4, 5, 6, 7); }
@@ -136,10 +136,10 @@ static inline v_int16 vx_setseq_s16()
 #else
 struct vseq_s16
 {
-    short data[v_int16::nlanes];
+    short data[VTraits<v_int16>::max_nlanes];
     vseq_s16()
     {
-        for (int i = 0; i < v_int16::nlanes; i++)
+        for (int i = 0; i < VTraits<v_int16>::vlanes(); i++)
             data[i] = i;
     }
 };
@@ -153,8 +153,8 @@ static inline v_int16 vx_setseq_s16()
 static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_val, short &min_pos)
 {
     min_val = v_reduce_min(val);
-    v_int16 v_mask = (vx_setall_s16(min_val) == val);
-    min_pos = v_reduce_min(((pos+vx_setseq_s16()) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
+    v_int16 v_mask = (v_eq(vx_setall_s16(min_val), val));
+    min_pos = v_reduce_min(v_or(v_and(v_add(pos, vx_setseq_s16()), v_mask), v_and(vx_setall_s16(SHRT_MAX), v_not(v_mask))));
 }
 #endif
@@ -270,26 +270,26 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
             int u1 = std::max(ul, ur); u1 = std::max(u1, u);

             int d = minD;
-    #if CV_SIMD
+    #if (CV_SIMD || CV_SIMD_SCALABLE)
             v_uint8 _u  = vx_setall_u8((uchar)u), _u0 = vx_setall_u8((uchar)u0);
             v_uint8 _u1 = vx_setall_u8((uchar)u1);

-            for( ; d <= maxD - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
+            for( ; d <= maxD - 2*VTraits<v_int16>::vlanes(); d += 2*VTraits<v_int16>::vlanes() )
             {
                 v_uint8 _v  = vx_load(prow2  + width-x-1 + d);
                 v_uint8 _v0 = vx_load(buffer + width-x-1 + d);
                 v_uint8 _v1 = vx_load(buffer + width-x-1 + d + width2);
-                v_uint8 c0 = v_max(_u - _v1, _v0 - _u);
-                v_uint8 c1 = v_max(_v - _u1, _u0 - _v);
+                v_uint8 c0 = v_max(v_sub(_u, _v1), v_sub(_v0, _u));
+                v_uint8 c1 = v_max(v_sub(_v, _u1), v_sub(_u0, _v));
                 v_uint8 diff = v_min(c0, c1);

                 v_int16 _c0 = vx_load_aligned(cost + x*D + d);
-                v_int16 _c1 = vx_load_aligned(cost + x*D + d + v_int16::nlanes);
+                v_int16 _c1 = vx_load_aligned(cost + x*D + d + VTraits<v_int16>::vlanes());

                 v_uint16 diff1,diff2;
                 v_expand(diff,diff1,diff2);
-                v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
-                v_store_aligned(cost + x*D + d + v_int16::nlanes, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
+                v_store_aligned(cost + x*D + d, v_add(_c0, v_reinterpret_as_s16(v_shr(diff1, diff_scale))));
+                v_store_aligned(cost + x*D + d + VTraits<v_int16>::vlanes(), v_add(_c1, v_reinterpret_as_s16(v_shr(diff2, diff_scale))));
             }
     #endif
             for( ; d < maxD; d++ )
@@ -555,13 +555,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() );

                 memset(hsumAdd, 0, Da*sizeof(CostType));
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 h_scale = vx_setall_s16((short)SW2 + 1);
-                for( d = 0; d < Da; d += v_int16::nlanes )
+                for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
                 {
-                    v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale;
+                    v_int16 v_hsumAdd = v_mul(vx_load_aligned(mem.pixDiff + d), h_scale);
                     for( x = Da; x <= SW2*Da; x += Da )
-                        v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d);
+                        v_hsumAdd = v_add(v_hsumAdd, vx_load_aligned(mem.pixDiff + x + d));
                     v_store_aligned(hsumAdd + d, v_hsumAdd);
                 }
 #else
@@ -578,9 +578,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                     const CostType* Cprev = mem.getCBuf(y - 1);

-#if CV_SIMD
-                    for (d = 0; d < Da; d += v_int16::nlanes)
-                        v_store_aligned(C + d, vx_load_aligned(Cprev + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                    for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                        v_store_aligned(C + d, v_sub(v_add(vx_load_aligned(Cprev + d), vx_load_aligned(hsumAdd + d)), vx_load_aligned(hsumSub + d)));
 #else
                     for (d = 0; d < D; d++)
                         C[d] = (CostType)(Cprev[d] + hsumAdd[d] - hsumSub[d]);
@@ -590,12 +590,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     {
                         const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                         const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
-                        for( d = 0; d < Da; d += v_int16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                        for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
                         {
-                            v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
+                            v_int16 hv = v_add(v_sub(vx_load_aligned(hsumAdd + x - Da + d), vx_load_aligned(pixSub + d)), vx_load_aligned(pixAdd + d));
                             v_store_aligned(hsumAdd + x + d, hv);
-                            v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
+                            v_store_aligned(C + x + d, v_add(v_sub(vx_load_aligned(Cprev + x + d), vx_load_aligned(hsumSub + x + d)), hv));
                         }
 #else
                         for( d = 0; d < D; d++ )
@@ -608,10 +608,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 }
                 else
                 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
-                    for (d = 0; d < Da; d += v_int16::nlanes)
-                        v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
+                    for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                        v_store_aligned(C + d, v_add(vx_load_aligned(C + d), v_mul(vx_load_aligned(hsumAdd + d), v_scale)));
 #else
                     int scale = k == 0 ? SH2 + 1 : 1;
                     for (d = 0; d < D; d++)
@@ -622,12 +622,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     {
                         const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                         const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);

-#if CV_SIMD
-                        for (d = 0; d < Da; d += v_int16::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                        for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
                         {
-                            v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
+                            v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                             v_store_aligned(hsumAdd + x + d, hv);
-                            v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
+                            v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
                         }
 #else
                         for( d = 0; d < D; d++ )
@@ -646,9 +646,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
             {
                 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                 const CostType* Cprev = mem.getCBuf(y - 1);
-#if CV_SIMD
-                for (x = 0; x < width1*Da; x += v_int16::nlanes)
-                    v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+                    v_store_aligned(C + x, v_add(v_sub(vx_load_aligned(Cprev + x), vx_load_aligned(hsumSub + x)), vx_load_aligned(hsumAdd + x)));
 #else
                 for (x = 0; x < width1*Da; x++)
                     C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
@@ -656,9 +656,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
             }
             else
             {
-#if CV_SIMD
-                for (x = 0; x < width1*Da; x += v_int16::nlanes)
-                    v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+                    v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
 #else
                 for (x = 0; x < width1*Da; x++)
                     C[x] = (CostType)(C[x] + hsumAdd[x]);
@@ -714,7 +714,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 CostType* minL = mem.getMinLr(lrID, x);
                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 _P1 = vx_setall_s16((short)P1);

                 v_int16 _delta0 = vx_setall_s16((short)delta0);
@@ -726,31 +726,31 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 v_int16 _minL2 = vx_setall_s16((short)MAX_COST);
                 v_int16 _minL3 = vx_setall_s16((short)MAX_COST);

-                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
                     v_int16 Spd = vx_load_aligned(Sp + d);
                     v_int16 L;

-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), v_add(vx_load(Lr_p0 + d - 1), _P1)), v_add(vx_load(Lr_p0 + d + 1), _P1)), _delta0), _delta0), Cpd);
                     v_store_aligned(Lr_p + d, L);
                     _minL0 = v_min(_minL0, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);

-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), vx_load(Lr_p1 + d - 1) + _P1), vx_load(Lr_p1 + d + 1) + _P1), _delta1) - _delta1 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), v_add(vx_load(Lr_p1 + d - 1), _P1)), v_add(vx_load(Lr_p1 + d + 1), _P1)), _delta1), _delta1), Cpd);
                     v_store_aligned(Lr_p + d + Dlra, L);
                     _minL1 = v_min(_minL1, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);

-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), vx_load(Lr_p2 + d - 1) + _P1), vx_load(Lr_p2 + d + 1) + _P1), _delta2) - _delta2 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), v_add(vx_load(Lr_p2 + d - 1), _P1)), v_add(vx_load(Lr_p2 + d + 1), _P1)), _delta2), _delta2), Cpd);
                     v_store_aligned(Lr_p + d + Dlra*2, L);
                     _minL2 = v_min(_minL2, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);

-                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), vx_load(Lr_p3 + d - 1) + _P1), vx_load(Lr_p3 + d + 1) + _P1), _delta3) - _delta3 + Cpd;
+                    L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), v_add(vx_load(Lr_p3 + d - 1), _P1)), v_add(vx_load(Lr_p3 + d + 1), _P1)), _delta3), _delta3), Cpd);
                     v_store_aligned(Lr_p + d + Dlra*3, L);
                     _minL3 = v_min(_minL3, L);
-                    Spd += L;
+                    Spd = v_add(Spd, L);

                     v_store_aligned(Sp + d, Spd);
                 }
@@ -769,7 +769,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 t0 = v_min(t0, t1);
                 t0 = v_min(t0, v_rotate_right<4>(t0));
 #if CV_SIMD_WIDTH == 32
-                CostType buf[v_int16::nlanes];
+                CostType buf[VTraits<v_int16>::max_nlanes];
                 v_store_low(buf, v_min(t0, v_rotate_right<8>(t0)));
                 minL[0] = buf[0];
                 minL[1] = buf[1];
@@ -817,10 +817,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
             if( pass == npasses )
             {
                 x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
                 v_int16 v_max_cost = vx_setall_s16(MAX_COST);

-                for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
+                for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes() )
                 {
                     v_store(disp1ptr + x, v_inv_dist);
                     v_store(mem.disp2ptr + x, v_inv_dist);
@@ -850,23 +850,23 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     d = 0;
                     int delta0 = P2 + *mem.getMinLr(lrID, x + 1);
                     int minL0 = MAX_COST;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_int16 _P1 = vx_setall_s16((short)P1);
                     v_int16 _delta0 = vx_setall_s16((short)delta0);

                     v_int16 _minL0 = vx_setall_s16((short)MAX_COST);
                     v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);

-                    for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                    for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                     {
                         v_int16 Cpd = vx_load_aligned(Cp + d);
-                        v_int16 L0 = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
+                        v_int16 L0 = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), v_add(vx_load(Lr_p0 + d - 1), _P1)), v_add(vx_load(Lr_p0 + d + 1), _P1)), _delta0), _delta0), Cpd);

                         v_store_aligned(Lr_p + d, L0);
                         _minL0 = v_min(_minL0, L0);

-                        L0 += vx_load_aligned(Sp + d);
+                        L0 = v_add(L0, vx_load_aligned(Sp + d));
                         v_store_aligned(Sp + d, L0);

-                        _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
+                        _bestDisp = v_select(v_gt(_minS, L0), vx_setall_s16((short)d), _bestDisp);
                         _minS = v_min(_minS, L0);
                     }
                     minL0 = (CostType)v_reduce_min(_minL0);
@@ -891,12 +891,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 else
                 {
                     d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);

-                    for( ; d <= D - v_int16::nlanes; d+= v_int16::nlanes )
+                    for( ; d <= D - VTraits<v_int16>::vlanes(); d+= VTraits<v_int16>::vlanes() )
                     {
                         v_int16 L0 = vx_load_aligned(Sp + d);
-                        _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
+                        _bestDisp = v_select(v_gt(_minS, L0), vx_setall_s16((short)d), _bestDisp);
                         _minS = v_min( L0, _minS );
                     }
                     min_pos(_minS, _bestDisp, minS, bestDisp);
@@ -1039,9 +1039,9 @@ struct CalcVerticalSums: public ParallelLoopBody
                 for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da )
                 {
                     int xbord = x <= 0 ? 0 : (x > (width1 - 1)*Da ? (width1 - 1)*Da : x);
-#if CV_SIMD
-                    for( d = 0; d < Da; d += v_int16::nlanes )
-                        v_store_aligned(hsumAdd + x1*Da + d, vx_load_aligned(hsumAdd + x1*Da + d) + vx_load_aligned(pixDiff + xbord + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                    for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
+                        v_store_aligned(hsumAdd + x1*Da + d, v_add(vx_load_aligned(hsumAdd + x1 * this->Da + d), vx_load_aligned(pixDiff + xbord + d)));
 #else
                     for( d = 0; d < D; d++ )
                         hsumAdd[x1*Da + d] = (CostType)(hsumAdd[x1*Da + d] + pixDiff[xbord + d]);
@@ -1052,9 +1052,9 @@ struct CalcVerticalSums: public ParallelLoopBody
                 {
                     const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                     const CostType* Cprev = mem.getCBuf(y - 1);
-#if CV_SIMD
-                    for( d = 0; d < Da; d += v_int16::nlanes )
-                        v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                    for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
+                        v_store_aligned(C + x1*Da + d, v_sub(v_add(vx_load_aligned(Cprev + x1 * this->Da + d), vx_load_aligned(hsumAdd + x1 * this->Da + d)), vx_load_aligned(hsumSub + x1 * this->Da + d)));
 #else
                     for( d = 0; d < D; d++ )
                         C[x1*Da + d] = (CostType)(Cprev[x1*Da + d] + hsumAdd[x1*Da + d] - hsumSub[x1*Da + d]);
@@ -1064,12 +1064,12 @@ struct CalcVerticalSums: public ParallelLoopBody
                         const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                         const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);

-#if CV_SIMD
-                        for( d = 0; d < Da; d += v_int16::nlanes )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                        for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
                         {
-                            v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
+                            v_int16 hv = v_add(v_sub(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixSub + d)), vx_load_aligned(pixAdd + d));
                             v_store_aligned(hsumAdd + x + d, hv);
-                            v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
+                            v_store_aligned(C + x + d, v_add(v_sub(vx_load_aligned(Cprev + x + d), vx_load_aligned(hsumSub + x + d)), hv));
                         }
 #else
                         for( d = 0; d < D; d++ )
@@ -1082,10 +1082,10 @@ struct CalcVerticalSums: public ParallelLoopBody
                 }
                 else
                 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
-                    for (d = 0; d < Da; d += v_int16::nlanes)
-                        v_store_aligned(C + x1*Da + d, vx_load_aligned(C + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) * v_scale);
+                    for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                        v_store_aligned(C + x1*Da + d, v_add(vx_load_aligned(C + x1 * this->Da + d), v_mul(vx_load_aligned(hsumAdd + x1 * this->Da + d), v_scale)));
 #else
                     int scale = k == 0 ? SH2 + 1 : 1;
                     for (d = 0; d < D; d++)
@@ -1095,12 +1095,12 @@ struct CalcVerticalSums: public ParallelLoopBody
                     {
                         const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                         const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
-                        for (d = 0; d < Da; d += v_int16::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                        for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
                         {
-                            v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
+                            v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                             v_store_aligned(hsumAdd + x + d, hv);
-                            v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
+                            v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
                         }
 #else
                         for( d = 0; d < D; d++ )
@@ -1120,9 +1120,9 @@ struct CalcVerticalSums: public ParallelLoopBody
                 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                 const CostType* Cprev = mem.getCBuf(y - 1);

-#if CV_SIMD
-                for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
-                    v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( x = x1*Da; x < x2*Da; x += VTraits<v_int16>::vlanes() )
+                    v_store_aligned(C + x, v_add(v_sub(vx_load_aligned(Cprev + x), vx_load_aligned(hsumSub + x)), vx_load_aligned(hsumAdd + x)));
 #else
                 for( x = x1*Da; x < x2*Da; x++ )
                     C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
@@ -1131,9 +1131,9 @@ struct CalcVerticalSums: public ParallelLoopBody
             else*/
             if(y == 0)
             {
-#if CV_SIMD
-                for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
-                    v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for( x = x1*Da; x < x2*Da; x += VTraits<v_int16>::vlanes() )
+                    v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
 #else
                 for( x = x1*Da; x < x2*Da; x++ )
                     C[x] = (CostType)(C[x] + hsumAdd[x]);
@@ -1167,19 +1167,19 @@ struct CalcVerticalSums: public ParallelLoopBody
                 CostType& minL = *(mem.getMinLr(lrID, x));
                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 _P1 = vx_setall_s16((short)P1);
                 v_int16 _delta = vx_setall_s16((short)delta);

                 v_int16 _minL = vx_setall_s16((short)MAX_COST);

-                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
+                    v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);

                     v_store_aligned(Lr_p + d, L);
                     _minL = v_min(_minL, L);
-                    v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
+                    v_store_aligned(Sp + d, v_add(vx_load_aligned(Sp + d), L));
                 }
                 minL = v_reduce_min(_minL);
 #else
@@ -1264,10 +1264,10 @@ struct CalcHorizontalSums: public ParallelLoopBody
             CostType* S = mem.getSBuf(y);

             x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
             v_int16 v_max_cost = vx_setall_s16(MAX_COST);

-            for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
+            for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
             {
                 v_store(disp1ptr + x, v_inv_dist);
                 v_store(disp2ptr + x, v_inv_dist);
@@ -1304,19 +1304,19 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 CostType* Sp = S + x*Da;

                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 _P1 = vx_setall_s16((short)P1);
                 v_int16 _delta = vx_setall_s16((short)delta);

                 v_int16 _minL = vx_setall_s16((short)MAX_COST);

-                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes)
+                for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes())
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
+                    v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);

                     v_store(Lr_p + d, L);
                     _minL = v_min(_minL, L);
-                    v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
+                    v_store_aligned(Sp + d, v_add(vx_load_aligned(Sp + d), L));
                 }
                 minLr = v_reduce_min(_minL);
 #else
@@ -1349,22 +1349,22 @@ struct CalcHorizontalSums: public ParallelLoopBody
                 minLr = MAX_COST;

                 d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 _P1 = vx_setall_s16((short)P1);
                 v_int16 _delta = vx_setall_s16((short)delta);

                 v_int16 _minL = vx_setall_s16((short)MAX_COST);
                 v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);

-                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
+                for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
                 {
                     v_int16 Cpd = vx_load_aligned(Cp + d);
-                    v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
+                    v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);

                     v_store(Lr_p + d, L);
                     _minL = v_min(_minL, L);
-                    L += vx_load_aligned(Sp + d);
+                    L = v_add(L, vx_load_aligned(Sp + d));
                     v_store_aligned(Sp + d, L);

-                    _bestDisp = v_select(_minS > L, vx_setall_s16((short)d), _bestDisp);
+                    _bestDisp = v_select(v_gt(_minS, L), vx_setall_s16((short)d), _bestDisp);
                     _minS = v_min( L, _minS );
                 }
                 minLr = v_reduce_min(_minL);
@@ -1581,8 +1581,8 @@ struct SGBM3WayMainLoop : public ParallelLoopBody
     utils::BufferArea aux_area;
     PixType* clipTab;

-#if CV_SIMD
-    short idx_row[v_int16::nlanes];
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    short idx_row[VTraits<v_int16>::max_nlanes];
 #endif
     SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap);
     void operator () (const Range& range) const CV_OVERRIDE;
@@ -1637,8 +1637,8 @@ SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
     uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
     disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;

-#if CV_SIMD
-    for(short i = 0; i < v_int16::nlanes; ++i)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for(short i = 0; i < VTraits<v_int16>::vlanes(); ++i)
         idx_row[i] = i;
 #endif
 }
@@ -1659,13 +1659,13 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
     {
         calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab + TAB_OFS );

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1);
-        for (d = 0; d < Da; d += v_int16::nlanes)
+        for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
         {
-            v_int16 hsA = vx_load_aligned(pixDiff + d) * sw2_1;
+            v_int16 hsA = v_mul(vx_load_aligned(pixDiff + d), sw2_1);
             for (x = Da; x <= SW2 * Da; x += Da)
-                hsA += vx_load_aligned(pixDiff + x + d);
+                hsA = v_add(hsA, vx_load_aligned(pixDiff + x + d));
             v_store_aligned(hsumAdd + d, hsA);
         }
 #else
@@ -1681,9 +1681,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
         {
             const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));

-#if CV_SIMD
-            for (d = 0; d < Da; d += v_int16::nlanes)
-                v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+            for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                v_store_aligned(C + d, v_sub(v_add(vx_load_aligned(C + d), vx_load_aligned(hsumAdd + d)), vx_load_aligned(hsumSub + d)));
 #else
             for (d = 0; d < D; d++)
                 C[d] = (CostType)(C[d] + hsumAdd[d] - hsumSub[d]);
@@ -1693,13 +1693,13 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
             {
                 const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                 v_int16 hv_reg;
-                for( d = 0; d < Da; d+=v_int16::nlanes )
+                for( d = 0; d < Da; d+=VTraits<v_int16>::vlanes() )
                 {
-                    hv_reg = vx_load_aligned(hsumAdd+x-Da+d) + vx_load_aligned(pixAdd+d) - vx_load_aligned(pixSub+d);
+                    hv_reg = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                     v_store_aligned(hsumAdd+x+d,hv_reg);
-                    v_store_aligned(C+x+d,vx_load_aligned(C+x+d)+hv_reg-vx_load_aligned(hsumSub+x+d));
+                    v_store_aligned(C+x+d,v_sub(v_add(vx_load_aligned(C + x + d), hv_reg), vx_load_aligned(hsumSub + x + d)));
                 }
 #else
                 for( d = 0; d < D; d++ )
@@ -1712,10 +1712,10 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
         }
         else
         {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
             v_int16 v_scale = vx_setall_s16(k == src_start_idx ? (short)SH2 + 1 : 1);
-            for (d = 0; d < Da; d += v_int16::nlanes)
-                v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
+            for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
+                v_store_aligned(C + d, v_add(vx_load_aligned(C + d), v_mul(vx_load_aligned(hsumAdd + d), v_scale)));
 #else
             int scale = k == src_start_idx ? SH2 + 1 : 1;
             for (d = 0; d < D; d++)
@@ -1725,12 +1725,12 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
             {
                 const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
-#if CV_SIMD
-                for (d = 0; d < Da; d += v_int16::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
                 {
-                    v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
+                    v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
                     v_store_aligned(hsumAdd + x + d, hv);
-                    v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
+                    v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
                 }
 #else
                 for (d = 0; d < D; d++)
@@ -1748,9 +1748,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
     if( y > src_start_idx )
     {
        const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
-#if CV_SIMD
-        for( x = 0; x < width1*Da; x += v_int16::nlanes)
-            v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for( x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+            v_store_aligned(C + x, v_sub(v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)), vx_load_aligned(hsumSub + x)));
 #else
         for( x = 0; x < width1*Da; x++ )
             C[x] = (CostType)(C[x] + hsumAdd[x] - hsumSub[x]);
@@ -1758,9 +1758,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
     }
     else
     {
-#if CV_SIMD
-        for( x = 0; x < width1*Da; x += v_int16::nlanes)
-            v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        for( x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
+            v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
 #else
         for( x = 0; x < width1*Da; x++ )
             C[x] = (CostType)(C[x] + hsumAdd[x]);
@@ -1781,7 +1781,7 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
     CostType *costs = mem.curCostVolumeLine - Da + x;
     CostType& topMinCost = mem.vertPassMin[x/Da];
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));

     v_int16 leftMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
@@ -1798,18 +1798,18 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
     v_int16 src_shifted_left,src_shifted_right;
     v_int16 res;

-    for(;i<Da-v_int16::nlanes;i+= v_int16::nlanes)
+    for(;i<Da-VTraits<v_int16>::vlanes();i+= VTraits<v_int16>::vlanes())
     {
         //process leftBuf:
         //lookahead load:
-        src2 = vx_load_aligned(leftBuf_prev+i+v_int16::nlanes);
+        src2 = vx_load_aligned(leftBuf_prev+i+VTraits<v_int16>::vlanes());

         //get shifted versions of the current block and add P1:
         src_shifted_left  = v_rotate_left<1>  (src1_leftBuf,src0_leftBuf);
         src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2        );

         // process and save current block:
-        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_leftBuf, leftMinCostP2_reg)), leftMinCostP2_reg));
         leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
         v_store_aligned(leftBuf+i, res);
@@ -1819,14 +1819,14 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,

         //process topBuf:
         //lookahead load:
-        src2 = vx_load_aligned(topBuf+i+v_int16::nlanes);
+        src2 = vx_load_aligned(topBuf+i+VTraits<v_int16>::vlanes());

         //get shifted versions of the current block and add P1:
         src_shifted_left  = v_rotate_left<1>  (src1_topBuf,src0_topBuf);
         src_shifted_right = v_rotate_right<1> (src1_topBuf,src2       );

         // process and save current block:
-        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_topBuf, topMinCostP2_reg)), topMinCostP2_reg));
         topMinCost_new_reg = v_min(topMinCost_new_reg,res);
         v_store_aligned(topBuf+i, res);
@@ -1843,17 +1843,17 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
     src_shifted_left  = v_rotate_left<1>  (src1_leftBuf,src0_leftBuf);
     src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2        );

-    res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
+    res = v_add(vx_load_aligned(costs + this->Da - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_leftBuf, leftMinCostP2_reg)), leftMinCostP2_reg));
     leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res));
-    v_store_aligned(leftBuf+Da-v_int16::nlanes, res);
+    v_store_aligned(leftBuf+Da-VTraits<v_int16>::vlanes(), res);

     //process topBuf:
     src_shifted_left  = v_rotate_left<1>  (src1_topBuf,src0_topBuf);
     src_shifted_right = v_rotate_right<1> (src1_topBuf,src2       );

-    res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
+    res = v_add(vx_load_aligned(costs + this->Da - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_topBuf, topMinCostP2_reg)), topMinCostP2_reg));
     topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res));
-    v_store_aligned(topBuf+Da-v_int16::nlanes, res);
+    v_store_aligned(topBuf+Da-VTraits<v_int16>::vlanes(), res);
 }
 else
 {
@@ -1904,7 +1904,7 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
     CostType* leftBuf = mem.horPassCostVolume + x;

     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));

     v_int16 rightMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
@@ -1919,27 +1919,27 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
     v_int16 min_sum_cost_reg = vx_setall_s16(SHRT_MAX);
     v_int16 min_sum_pos_reg = vx_setall_s16(0);

-    for(;i<D-v_int16::nlanes;i+=v_int16::nlanes)
+    for(;i<D-VTraits<v_int16>::vlanes();i+=VTraits<v_int16>::vlanes())
     {
         //lookahead load:
-        src2 = vx_load_aligned(rightBuf+i+v_int16::nlanes);
+        src2 = vx_load_aligned(rightBuf+i+VTraits<v_int16>::vlanes());

         //get shifted versions of the current block and add P1:
         src_shifted_left  = v_rotate_left<1>  (src1_rightBuf,src0_rightBuf);
         src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2         );

         // process and save current block:
-        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
+        res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_rightBuf, rightMinCostP2_reg)), rightMinCostP2_reg));
         rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
         v_store_aligned(rightBuf+i, res);

         // compute and save total cost:
-        res = res + vx_load_aligned(leftBuf+i) + vx_load_aligned(topBuf+i);
+        res = v_add(v_add(res, vx_load_aligned(leftBuf + i)), vx_load_aligned(topBuf + i));
         v_store_aligned(leftBuf+i, res);

         // track disparity value with the minimum cost:
         min_sum_cost_reg = v_min(min_sum_cost_reg,res);
-        min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)i) - min_sum_pos_reg));
+        min_sum_pos_reg = v_add(min_sum_pos_reg, v_and(v_eq(min_sum_cost_reg, res), v_sub(vx_setall_s16((short)i), min_sum_pos_reg)));

         //update src:
         src0_rightBuf = src1_rightBuf;
@@ -1953,15 +1953,15 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
     src_shifted_left  = v_rotate_left<1>  (src1_rightBuf,src0_rightBuf);
     src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2         );

-    res = vx_load_aligned(costs+D-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
+    res = v_add(vx_load_aligned(costs + this->D - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_rightBuf, rightMinCostP2_reg)), rightMinCostP2_reg));
     rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res));
-    v_store_aligned(rightBuf+D-v_int16::nlanes, res);
+    v_store_aligned(rightBuf+D-VTraits<v_int16>::vlanes(), res);

-    res = res + vx_load_aligned(leftBuf+D-v_int16::nlanes) + vx_load_aligned(topBuf+D-v_int16::nlanes);
-    v_store_aligned(leftBuf+D-v_int16::nlanes, res);
+    res = v_add(v_add(res, vx_load_aligned(leftBuf + this->D - VTraits<v_int16>::vlanes())), vx_load_aligned(topBuf + this->D - VTraits<v_int16>::vlanes()));
+    v_store_aligned(leftBuf+D-VTraits<v_int16>::vlanes(), res);

     min_sum_cost_reg = v_min(min_sum_cost_reg,res);
-    min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)(D-v_int16::nlanes)) - min_sum_pos_reg));
+    min_sum_pos_reg = v_add(min_sum_pos_reg, v_and(v_eq(min_sum_cost_reg, res), v_sub(vx_setall_s16((short)(this->D - VTraits<v_int16>::vlanes())), min_sum_pos_reg)));
     min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp);
 }
 else
@@ -2070,40 +2070,40 @@ void SGBM3WayMainLoop::impl(const Range& range) const
                 if(uniquenessRatio>0)
                 {
                     d = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
                     horPassCostVolume+=x;
                     int thresh = (100*min_cost)/(100-uniquenessRatio);
                     v_int16 thresh_reg = vx_setall_s16((short)(thresh+1));
                     v_int16 d1 = vx_setall_s16((short)(best_d-1));
                     v_int16 d2 = vx_setall_s16((short)(best_d+1));
-                    v_int16 eight_reg = vx_setall_s16((short)v_int16::nlanes);
+                    v_int16 eight_reg = vx_setall_s16((short)VTraits<v_int16>::vlanes());
                     v_int16 cur_d = vx_load(idx_row);
                     v_int16 mask;

-                    for( ; d <= D - 2*v_int16::nlanes; d+=2*v_int16::nlanes )
+                    for( ; d <= D - 2*VTraits<v_int16>::vlanes(); d+=2*VTraits<v_int16>::vlanes() )
                     {
-                        mask = (vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
-                        cur_d = cur_d+eight_reg;
+                        mask = v_and(v_lt(vx_load_aligned(horPassCostVolume + d), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)));
+                        cur_d = v_add(cur_d, eight_reg);
                         if( v_check_any(mask) )
                             break;
-                        mask = (vx_load_aligned(horPassCostVolume + d + v_int16::nlanes) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
-                        cur_d = cur_d+eight_reg;
+                        mask = v_and(v_lt(vx_load_aligned(horPassCostVolume + d + VTraits<v_int16>::vlanes()), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)));
+                        cur_d = v_add(cur_d, eight_reg);
                         if( v_check_any(mask) )
                             break;
                     }
-                    if( d <= D - 2*v_int16::nlanes )
+                    if( d <= D - 2*VTraits<v_int16>::vlanes() )
                     {
                         horPassCostVolume-=x;
                         continue;
                     }
-                    if( d <= D - v_int16::nlanes )
+                    if( d <= D - VTraits<v_int16>::vlanes() )
                     {
-                        if( v_check_any((vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ((cur_d < d1) | (cur_d > d2))) )
+                        if( v_check_any(v_and(v_lt(vx_load_aligned(horPassCostVolume + d), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)))) )
                         {
                             horPassCostVolume-=x;
                             continue;
                         }
-                        d+=v_int16::nlanes;
+                        d+=VTraits<v_int16>::vlanes();
                     }
                     horPassCostVolume-=x;
 #endif
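Reviewer note: min_pos() and the uniqueness checks above lean on the masked compare/select idiom, where an all-ones comparison mask gates candidate indices. A standalone sketch of that argmin idiom follows, under the same assumptions as before (public universal-intrinsics API only, n a multiple of the lane count); argmin_s16 is an illustrative name, not a function in this patch.

    // Hypothetical sketch of the masked horizontal-argmin idiom.
    #include <climits>
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    static int argmin_s16(const short* data, int n)
    {
        const int step = VTraits<v_int16>::vlanes();
        short seq[VTraits<v_int16>::max_nlanes];   // 0,1,2,... lane ids
        for (short j = 0; j < (short)step; ++j)
            seq[j] = j;

        v_int16 minv = vx_setall_s16(SHRT_MAX);
        v_int16 mini = vx_setall_s16(0);
        for (int i = 0; i < n; i += step)
        {
            v_int16 v = vx_load(data + i);
            // v_gt yields an all-ones lane mask; v_and keeps the candidate
            // block index only where a new minimum appears
            // (was: (minv > v) & vx_setall_s16((short)i)).
            mini = v_max(mini, v_and(v_gt(minv, v), vx_setall_s16((short)i)));
            minv = v_min(minv, v);
        }
        short best = v_reduce_min(minv);
        v_int16 mask = v_eq(vx_setall_s16(best), minv);
        // Keep block+lane offset where the minimum lives, SHRT_MAX elsewhere,
        // then reduce (was: ((mini+seq) & mask) | (SHRT_MAX & ~mask)).
        return v_reduce_min(v_or(v_and(v_add(mini, vx_load(seq)), mask),
                                 v_and(vx_setall_s16(SHRT_MAX), v_not(mask))));
    }
    #endif

The operator forms on the right of each comment are what the pre-patch code used; they cannot be overloaded for sizeless RVV vector types, which is the whole motivation for the v_eq/v_and/v_or/v_not spellings.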
diff --git a/modules/features2d/src/sift.simd.hpp b/modules/features2d/src/sift.simd.hpp
index 8589a0225c..2c5cf9f997 100644
--- a/modules/features2d/src/sift.simd.hpp
+++ b/modules/features2d/src/sift.simd.hpp
@@ -210,24 +210,24 @@ float calcOrientationHist(
     cv::hal::magnitude32f(X, Y, Mag, len);

     k = 0;
-#if CV_SIMD
-    const int vecsize = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int vecsize = VTraits<v_float32>::vlanes();
     v_float32 nd360 = vx_setall_f32(n/360.f);
     v_int32 __n = vx_setall_s32(n);
-    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize];
-    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[vecsize];
+    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[VTraits<v_float32>::max_nlanes];
+    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[VTraits<v_float32>::max_nlanes];

     for( ; k <= len - vecsize; k += vecsize )
     {
         v_float32 w = vx_load_aligned( W + k );
         v_float32 mag = vx_load_aligned( Mag + k );
         v_float32 ori = vx_load_aligned( Ori + k );
-        v_int32 bin = v_round( nd360 * ori );
+        v_int32 bin = v_round( v_mul(nd360, ori) );

-        bin = v_select(bin >= __n, bin - __n, bin);
-        bin = v_select(bin < vx_setzero_s32(), bin + __n, bin);
+        bin = v_select(v_ge(bin, __n), v_sub(bin, __n), bin);
+        bin = v_select(v_lt(bin, vx_setzero_s32()), v_add(bin, __n), bin);

-        w = w * mag;
+        w = v_mul(w, mag);
         v_store_aligned(bin_buf, bin);
         v_store_aligned(w_mul_mag_buf, w);
         for(int vi = 0; vi < vecsize; vi++)
@@ -253,19 +253,19 @@ float calcOrientationHist(
     temphist[n+1] = temphist[1];

     i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 d_1_16 = vx_setall_f32(1.f/16.f);
     v_float32 d_4_16 = vx_setall_f32(4.f/16.f);
     v_float32 d_6_16 = vx_setall_f32(6.f/16.f);
-    for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes )
+    for( ; i <= n - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
     {
         v_float32 tn2 = vx_load_aligned(temphist + i-2);
         v_float32 tn1 = vx_load(temphist + i-1);
         v_float32 t0 = vx_load(temphist + i);
         v_float32 t1 = vx_load(temphist + i+1);
         v_float32 t2 = vx_load(temphist + i+2);
-        v_float32 _hist = v_fma(tn2 + t2, d_1_16,
-            v_fma(tn1 + t1, d_4_16, t0 * d_6_16));
+        v_float32 _hist = v_fma(v_add(tn2, t2), d_1_16,
+            v_fma(v_add(tn1, t1), d_4_16, v_mul(t0, d_6_16)));
         v_store(hist + i, _hist);
     }
 #endif
@@ -452,8 +452,8 @@ public:
         const sift_wt* nextptr = next.ptr<sift_wt>(r);
         int c = SIFT_IMG_BORDER;

-#if CV_SIMD && !(DoG_TYPE_SHORT)
-        const int vecsize = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE) && !(DoG_TYPE_SHORT)
+        const int vecsize = VTraits<v_float32>::vlanes();
         for( ; c <= cols-SIFT_IMG_BORDER - vecsize; c += vecsize)
         {
             v_float32 val = vx_load(&currptr[c]);
@@ -464,7 +464,7 @@ public:

             v_float32 vmin,vmax;

-            v_float32 cond = v_abs(val) > vx_setall_f32((float)threshold);
+            v_float32 cond = v_gt(v_abs(val), vx_setall_f32((float)this->threshold));
             if (!v_check_any(cond))
             {
                 continue;
@@ -477,10 +477,10 @@ public:
             vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
             vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));

-            v_float32 condp = cond & (val > vx_setall_f32(0)) & (val >= vmax);
-            v_float32 condm = cond & (val < vx_setall_f32(0)) & (val <= vmin);
+            v_float32 condp = v_and(v_and(cond, v_gt(val, vx_setall_f32(0))), v_ge(val, vmax));
+            v_float32 condm = v_and(v_and(cond, v_lt(val, vx_setall_f32(0))), v_le(val, vmin));

-            cond = condp | condm;
+            cond = v_or(condp, condm);
             if (!v_check_any(cond))
             {
                 continue;
@@ -493,10 +493,10 @@ public:
             vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
             vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));

-            condp &= (val >= vmax);
-            condm &= (val <= vmin);
+            condp = v_and(condp, v_ge(val, vmax));
+            condm = v_and(condm, v_le(val, vmin));

-            cond = condp | condm;
+            cond = v_or(condp, condm);
             if (!v_check_any(cond))
             {
                 continue;
@@ -515,10 +515,10 @@ public:
             vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
             vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));

-            condp &= (val >= v_max(vmax,max_middle));
-            condm &= (val <= v_min(vmin,min_middle));
+            condp = v_and(condp, v_ge(val, v_max(vmax, max_middle)));
+            condm = v_and(condm, v_le(val, v_min(vmin, min_middle)));

-            cond = condp | condm;
+            cond = v_or(condp, condm);
             if (!v_check_any(cond))
             {
                 continue;
@@ -777,11 +777,11 @@ void calcSIFTDescriptor(
     cv::hal::exp32f(W, W, len);

     k = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
-        const int vecsize = v_float32::nlanes;
-        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[vecsize];
-        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*vecsize];
+        const int vecsize = VTraits<v_float32>::vlanes();
+        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[VTraits<v_float32>::max_nlanes];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*VTraits<v_float32>::max_nlanes];
         const v_float32 __ori  = vx_setall_f32(ori);
         const v_float32 __bins_per_rad = vx_setall_f32(bins_per_rad);
         const v_int32 __n = vx_setall_s32(n);
@@ -792,28 +792,28 @@ void calcSIFTDescriptor(
         {
             v_float32 rbin = vx_load_aligned(RBin + k);
             v_float32 cbin = vx_load_aligned(CBin + k);
-            v_float32 obin = (vx_load_aligned(Ori + k) - __ori) * __bins_per_rad;
-            v_float32 mag = vx_load_aligned(Mag + k) * vx_load_aligned(W + k);
+            v_float32 obin = v_mul(v_sub(vx_load_aligned(Ori + k), __ori), __bins_per_rad);
+            v_float32 mag = v_mul(vx_load_aligned(Mag + k), vx_load_aligned(W + k));

             v_int32 r0 = v_floor(rbin);
             v_int32 c0 = v_floor(cbin);
             v_int32 o0 = v_floor(obin);
-            rbin -= v_cvt_f32(r0);
-            cbin -= v_cvt_f32(c0);
-            obin -= v_cvt_f32(o0);
-
-            o0 = v_select(o0 < vx_setzero_s32(), o0 + __n, o0);
-            o0 = v_select(o0 >= __n, o0 - __n, o0);
-
-            v_float32 v_r1 = mag*rbin, v_r0 = mag - v_r1;
-            v_float32 v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
-            v_float32 v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
-            v_float32 v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
-            v_float32 v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
-            v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
-            v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
-
-            v_int32 idx = v_muladd(v_muladd(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
+            rbin = v_sub(rbin, v_cvt_f32(r0));
+            cbin = v_sub(cbin, v_cvt_f32(c0));
+            obin = v_sub(obin, v_cvt_f32(o0));
+
+            o0 = v_select(v_lt(o0, vx_setzero_s32()), v_add(o0, __n), o0);
+            o0 = v_select(v_ge(o0, __n), v_sub(o0, __n), o0);
+
+            v_float32 v_r1 = v_mul(mag, rbin), v_r0 = v_sub(mag, v_r1);
+            v_float32 v_rc11 = v_mul(v_r1, cbin), v_rc10 = v_sub(v_r1, v_rc11);
+            v_float32 v_rc01 = v_mul(v_r0, cbin), v_rc00 = v_sub(v_r0, v_rc01);
+            v_float32 v_rco111 = v_mul(v_rc11, obin), v_rco110 = v_sub(v_rc11, v_rco111);
+            v_float32 v_rco101 = v_mul(v_rc10, obin), v_rco100 = v_sub(v_rc10, v_rco101);
+            v_float32 v_rco011 = v_mul(v_rc01, obin), v_rco010 = v_sub(v_rc01, v_rco011);
+            v_float32 v_rco001 = v_mul(v_rc00, obin), v_rco000 = v_sub(v_rc00, v_rco001);
+
+            v_int32 idx = v_muladd(v_muladd(v_add(r0, __1), __d_plus_2, v_add(c0, __1)), __n_plus_2, o0);
             v_store_aligned(idx_buf, idx);

             v_store_aligned(rco_buf, v_rco000);
@@ -894,11 +894,11 @@ void calcSIFTDescriptor(
     float nrm2 = 0;
     len = d*d*n;
     k = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
         v_float32 __nrm2 = vx_setzero_f32();
         v_float32 __rawDst;
-        for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
+        for( ; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
         {
             __rawDst = vx_load_aligned(rawDst + k);
             __nrm2 = v_fma(__rawDst, __rawDst, __nrm2);
@@ -949,15 +949,15 @@ void calcSIFTDescriptor(
 if( dstMat.type() == CV_32F )
 {
     float* dst = dstMat.ptr<float>(row);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 __dst;
     v_float32 __min = vx_setzero_f32();
     v_float32 __max = vx_setall_f32(255.0f); // max of uchar
     v_float32 __nrm2 = vx_setall_f32(nrm2);
-    for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
+    for( k = 0; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
     {
         __dst = vx_load_aligned(rawDst + k);
-        __dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
+        __dst = v_min(v_max(v_cvt_f32(v_round(v_mul(__dst, __nrm2))), __min), __max);
         v_store(dst + k, __dst);
     }
 #endif
@@ -976,16 +976,16 @@ if( dstMat.type() == CV_32F )
 else // CV_8U
 {
     uint8_t* dst = dstMat.ptr<uint8_t>(row);
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 __dst0, __dst1;
     v_uint16 __pack01;
     v_float32 __nrm2 = vx_setall_f32(nrm2);
-    for( k = 0; k <= len - v_float32::nlanes * 2; k += v_float32::nlanes * 2 )
+    for( k = 0; k <= len - VTraits<v_float32>::vlanes() * 2; k += VTraits<v_float32>::vlanes() * 2 )
     {
         __dst0 = vx_load_aligned(rawDst + k);
-        __dst1 = vx_load_aligned(rawDst + k + v_float32::nlanes);
+        __dst1 = vx_load_aligned(rawDst + k + VTraits<v_float32>::vlanes());

-        __pack01 = v_pack_u(v_round(__dst0 * __nrm2), v_round(__dst1 * __nrm2));
+        __pack01 = v_pack_u(v_round(v_mul(__dst0, __nrm2)), v_round(v_mul(__dst1, __nrm2)));
         v_pack_store(dst + k, __pack01);
     }
 #endif
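Reviewer note: the sift.simd.hpp hunks follow the same two rules as the calib3d ones: float arithmetic goes through named intrinsics (v_mul, v_add, v_fma), and stack scratch buffers are declared with the compile-time VTraits<...>::max_nlanes while loops step by the runtime vlanes(). A minimal sketch of that scratch-buffer pattern, assuming only the public universal-intrinsics API; weighted_hist and its parameters are illustrative, not code from this patch.

    // Hypothetical sketch of the scratch-buffer pattern from calcOrientationHist().
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    static void weighted_hist(const float* w, const float* mag, const int* bin,
                              float* hist, int len)
    {
        const int vecsize = VTraits<v_float32>::vlanes();   // runtime lane count
        // max_nlanes is a constexpr upper bound, so the array is legal C++
        // even when vlanes() is only known at run time (e.g. RVV).
        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_float32>::max_nlanes];

        int k = 0;
        for (; k <= len - vecsize; k += vecsize)
        {
            v_float32 vw = v_mul(vx_load(w + k), vx_load(mag + k)); // was: w * mag
            v_store_aligned(buf, vw);
            for (int vi = 0; vi < vecsize; vi++)   // scalar scatter into bins
                hist[bin[k + vi]] += buf[vi];
        }
        for (; k < len; k++)                       // scalar tail
            hist[bin[k]] += w[k] * mag[k];
    }
    #endif

Sizing such buffers with vecsize, as the pre-patch code did, is a variable-length array once vlanes() stops being a compile-time constant, which is why every declaration in the patch switches to max_nlanes.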