Merge pull request #24301 from hanliutong:rewrite-stereo-sift

Rewrite Universal Intrinsic code: features2d and calib3d module. #24301

The goal of this series of PRs is to modify the SIMD code blocks guarded by the CV_SIMD macro, rewriting them with the new Universal Intrinsic API.
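
For illustration, here is a minimal sketch (not taken from the patch; the function and buffer names are hypothetical) of the kind of mechanical transformation applied throughout the diff below: the `#if CV_SIMD` guard is widened to also cover scalable backends, the compile-time lane count `v_int16::nlanes` becomes the `VTraits<v_int16>::vlanes()` query, and operator overloads are replaced by named intrinsics such as `v_add`. It assumes the Universal Intrinsics header `opencv2/core/hal/intrin.hpp`.
```cpp
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Before: fixed-width style. nlanes is a compile-time constant and the
// arithmetic relies on operator overloads, so this only builds with CV_SIMD.
#if CV_SIMD
static void add_s16_old(const short* a, const short* b, short* dst, int n)
{
    int i = 0;
    for (; i <= n - v_int16::nlanes; i += v_int16::nlanes)
        v_store(dst + i, vx_load(a + i) + vx_load(b + i));
    for (; i < n; i++)                      // scalar tail
        dst[i] = (short)(a[i] + b[i]);
}
#endif

// After: the lane count is queried via VTraits and the sum is written with
// the named intrinsic v_add, so the same loop also builds for scalable
// backends (CV_SIMD_SCALABLE), e.g. RVV.
#if (CV_SIMD || CV_SIMD_SCALABLE)
static void add_s16_new(const short* a, const short* b, short* dst, int n)
{
    int i = 0;
    for (; i <= n - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
        v_store(dst + i, v_add(vx_load(a + i), vx_load(b + i)));
    for (; i < n; i++)                      // scalar tail
        dst[i] = (short)(a[i] + b[i]);
}
#endif
```
The diff below applies exactly this pattern to the stereo matching and SIFT loops.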

This PR covers the features2d and calib3d modules.

Tested with Clang 16 and QEMU v7.0.0. `AP3P.ctheta1p_nan_23607` failed because of a small calculation error, but this patch does not touch the relevant code, and the failure always reproduces on QEMU regardless of whether the patch is applied. I think we can ignore it.
```
[ RUN      ] AP3P.ctheta1p_nan_23607
/home/hanliutong/project/opencv/modules/calib3d/test/test_solvepnp_ransac.cpp:2319: Failure
Expected: (cvtest::norm(res.colRange(0, 2), expected, NORM_INF)) <= (3e-16), actual: 3.33067e-16 vs 3e-16
[  FAILED  ] AP3P.ctheta1p_nan_23607 (26 ms)

...

[==========] 148 tests from 64 test cases ran. (1147114 ms total)
[  PASSED  ] 147 tests.
[  FAILED  ] 1 test, listed below:
[  FAILED  ] AP3P.ctheta1p_nan_23607
```

Note: 2 test cases fail with GCC 13.2.1 even without this patch; it seems something is wrong with the RVV part on GCC.
```
[----------] Global test environment tear-down
[==========] 148 tests from 64 test cases ran. (1511399 ms total)
[  PASSED  ] 146 tests.
[  FAILED  ] 2 tests, listed below:
[  FAILED  ] Calib3d_StereoSGBM.regression
[  FAILED  ] Calib3d_StereoSGBM_HH4.regression
```

The patch is partially auto-generated using the [rewriter](https://github.com/hanliutong/rewriter).

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
Changed files:
1. modules/calib3d/src/stereobm.cpp (218 changes)
2. modules/calib3d/src/stereosgbm.cpp (310 changes)
3. modules/features2d/src/sift.simd.hpp (110 changes)

@ -231,13 +231,13 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
x = 1;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_int16 ftz = vx_setall_s16((short) ftzero);
v_int16 ftz2 = vx_setall_s16((short)(ftzero*2));
v_int16 z = vx_setzero_s16();
for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes)
for(; x <= (size.width - 1) - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
{
v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1));
v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1));
@ -248,13 +248,13 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1));
v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1));
v_int16 d0 = s00 - s01;
v_int16 d1 = s10 - s11;
v_int16 d2 = s20 - s21;
v_int16 d3 = s30 - s31;
v_int16 d0 = v_sub(s00, s01);
v_int16 d1 = v_sub(s10, s11);
v_int16 d2 = v_sub(s20, s21);
v_int16 d3 = v_sub(s30, s31);
v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(v_add(v_add(v_add(v_add(d0, d1), d1), d2), ftz), ftz2), z));
v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(v_add(v_add(v_add(v_add(d1, d2), d2), d3), ftz), ftz2), z));
v_pack_store(dptr0 + x, v0);
v_pack_store(dptr1 + x, v1);
@ -277,10 +277,10 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
{
uchar* dptr = dst.ptr<uchar>(y);
x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 val0_16 = vx_setall_u8(val0);
for(; x <= size.width-v_uint8::nlanes; x+=v_uint8::nlanes)
for(; x <= size.width-VTraits<v_uint8>::vlanes(); x+=VTraits<v_uint8>::vlanes())
v_store(dptr + x, val0_16);
}
#endif
@ -356,7 +356,7 @@ public:
for (size_t i = 0; i < nstripes; ++i)
{
// 1D: [1][ ndisp ][1]
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (params.useShorts())
area.allocate(sad_short[i], ndisp + 2);
else
@ -364,7 +364,7 @@ public:
area.allocate(sad[i], ndisp + 2);
// 2D: [ wsz/2 + 1 ][ height ][ wsz/2 + 1 ] * [ ndisp ]
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (params.useShorts())
area.allocate(hsad_short[i], (height + wsz + 2) * ndisp);
else
@ -390,7 +390,7 @@ public:
}
};
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template <typename dType>
static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
Mat& disp, Mat& cost, const StereoBMParams& state,
@ -422,8 +422,8 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
short costbuf = 0;
int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
const uchar * tab = bufX.tab;
short v_seq[v_int16::nlanes];
for (short i = 0; i < v_int16::nlanes; ++i)
short v_seq[VTraits<v_int16>::max_nlanes];
for (short i = 0; i < VTraits<v_int16>::vlanes(); ++i)
v_seq[i] = i;
ushort *sad = bufX.sad_short[bufNum] + 1;
@ -446,19 +446,19 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
{
int lval = lptr[0];
v_uint8 lv = vx_setall_u8((uchar)lval);
for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
for( d = 0; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
{
v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
v_store(cbuf + d, diff);
v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff));
v_store(hsad + d, v_add(vx_load(hsad + d), v_expand_low(diff)));
v_store(hsad + d + VTraits<v_uint16>::vlanes(), v_add(vx_load(hsad + d + VTraits<v_uint16>::vlanes()), v_expand_high(diff)));
}
if( d <= ndisp - v_uint16::nlanes )
if( d <= ndisp - VTraits<v_uint16>::vlanes() )
{
v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
v_store_low(cbuf + d, diff);
v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
d += v_uint16::nlanes;
v_store(hsad + d, v_add(vx_load(hsad + d), v_expand_low(diff)));
d += VTraits<v_uint16>::vlanes();
}
for( ; d < ndisp; d++ )
{
@ -496,20 +496,20 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
{
int lval = lptr[0];
v_uint8 lv = vx_setall_u8((uchar)lval);
for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
for( d = 0; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
{
v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d));
v_store(cbuf + d, diff);
v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs)));
v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs)));
v_store(hsad + d, v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d), v_expand_low(diff))), v_expand_low(cbs))));
v_store(hsad + d + VTraits<v_uint16>::vlanes(), v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d + VTraits<v_uint16>::vlanes()), v_expand_high(diff))), v_expand_high(cbs))));
}
if( d <= ndisp - v_uint16::nlanes)
if( d <= ndisp - VTraits<v_uint16>::vlanes())
{
v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
v_store_low(cbuf + d, diff);
v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d)));
d += v_uint16::nlanes;
v_store(hsad + d, v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(v_add(vx_load(hsad + d), v_expand_low(diff))), vx_load_expand((schar *)cbuf_sub + d))));
d += VTraits<v_uint16>::vlanes();
}
for( ; d < ndisp; d++ )
{
@ -533,20 +533,20 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
hsad = hsad0 + (1 - dy0)*ndisp;
for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
{
for( d = 0; d <= ndisp-2*v_uint16::nlanes; d += 2*v_uint16::nlanes )
for( d = 0; d <= ndisp-2*VTraits<v_uint16>::vlanes(); d += 2*VTraits<v_uint16>::vlanes() )
{
v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes));
v_store(sad + d, v_add(vx_load(sad + d), vx_load(hsad + d)));
v_store(sad + d + VTraits<v_uint16>::vlanes(), v_add(vx_load(sad + d + VTraits<v_uint16>::vlanes()), vx_load(hsad + d + VTraits<v_uint16>::vlanes())));
}
if( d <= ndisp-v_uint16::nlanes )
if( d <= ndisp-VTraits<v_uint16>::vlanes() )
{
v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
d += v_uint16::nlanes;
v_store(sad + d, v_add(vx_load(sad + d), vx_load(hsad + d)));
d += VTraits<v_uint16>::vlanes();
}
if( d <= ndisp-v_uint16::nlanes/2 )
if( d <= ndisp-VTraits<v_uint16>::vlanes()/2 )
{
v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d));
d += v_uint16::nlanes/2;
v_store_low(sad + d, v_add(vx_load_low(sad + d), vx_load_low(hsad + d)));
d += VTraits<v_uint16>::vlanes()/2;
}
for( ; d < ndisp; d++ )
sad[d] = sad[d] + hsad[d];
@ -564,29 +564,29 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
v_int16 minsad8 = vx_setall_s16(SHRT_MAX);
v_int16 mind8 = vx_setall_s16(0);
for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
for( d = 0; d <= ndisp - 2*VTraits<v_int16>::vlanes(); d += 2*VTraits<v_int16>::vlanes() )
{
v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
v_int16 sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d)), v_reinterpret_as_s16(vx_load(hsad_sub + d))), v_reinterpret_as_s16(vx_load(sad + d)));
v_store(sad + d, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)d)));
minsad8 = v_min(minsad8, sad8);
sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes));
v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)(d+v_int16::nlanes)));
sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d + VTraits<v_int16>::vlanes())), v_reinterpret_as_s16(vx_load(hsad_sub + d + VTraits<v_int16>::vlanes()))), v_reinterpret_as_s16(vx_load(sad + d + VTraits<v_int16>::vlanes())));
v_store(sad + d + VTraits<v_int16>::vlanes(), v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)(d + VTraits<v_int16>::vlanes()))));
minsad8 = v_min(minsad8, sad8);
}
if( d <= ndisp - v_int16::nlanes )
if( d <= ndisp - VTraits<v_int16>::vlanes() )
{
v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
v_int16 sad8 = v_add(v_sub(v_reinterpret_as_s16(vx_load(hsad + d)), v_reinterpret_as_s16(vx_load(hsad_sub + d))), v_reinterpret_as_s16(vx_load(sad + d)));
v_store(sad + d, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
mind8 = v_max(mind8, v_and(v_gt(minsad8, sad8), vx_setall_s16((short)d)));
minsad8 = v_min(minsad8, sad8);
d += v_int16::nlanes;
d += VTraits<v_int16>::vlanes();
}
minsad = v_reduce_min(minsad8);
v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8);
mind = v_reduce_min(((mind8+vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
v_int16 v_mask = (v_eq(vx_setall_s16((short)minsad), minsad8));
mind = v_reduce_min(v_or(v_and(v_add(mind8, vx_load(v_seq)), v_mask), v_and(vx_setall_s16(32767), v_not(v_mask))));
for( ; d < ndisp; d++ )
{
int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d];
@ -610,34 +610,34 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
int thresh = minsad + (minsad * uniquenessRatio/100);
v_int32 thresh4 = vx_setall_s32(thresh + 1);
v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1);
v_int32 dd_4 = vx_setall_s32(v_int32::nlanes);
v_int32 dd_4 = vx_setall_s32(VTraits<v_int32>::vlanes());
v_int32 d4 = vx_load_expand(v_seq);
for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes )
for( d = 0; d <= ndisp - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
{
v_int32 sad4_l, sad4_h;
v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h);
if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
if( v_check_any(v_and(v_gt(thresh4, sad4_l), v_or(v_gt(d1, d4), v_gt(d4, d2)))) )
break;
d4 += dd_4;
if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) )
d4 = v_add(d4, dd_4);
if( v_check_any(v_and(v_gt(thresh4, sad4_h), v_or(v_gt(d1, d4), v_gt(d4, d2)))) )
break;
d4 += dd_4;
d4 = v_add(d4, dd_4);
}
if( d <= ndisp - v_int16::nlanes )
if( d <= ndisp - VTraits<v_int16>::vlanes() )
{
dptr[y*dstep] = FILTERED;
continue;
}
if( d <= ndisp - v_int32::nlanes )
if( d <= ndisp - VTraits<v_int32>::vlanes() )
{
v_int32 sad4_l = vx_load_expand((short*)sad + d);
if (v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))))
if (v_check_any(v_and(v_gt(thresh4, sad4_l), v_or(v_gt(d1, d4), v_gt(d4, d2)))))
{
dptr[y*dstep] = FILTERED;
continue;
}
d += v_int16::nlanes;
d += VTraits<v_int16>::vlanes();
}
for( ; d < ndisp; d++ )
{
@ -699,11 +699,11 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
const uchar * tab = bufX.tab;
#if CV_SIMD
int v_seq[v_int32::nlanes];
for (int i = 0; i < v_int32::nlanes; ++i)
#if (CV_SIMD || CV_SIMD_SCALABLE)
int v_seq[VTraits<v_int32>::max_nlanes];
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
v_seq[i] = i;
v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(VTraits<v_int32>::vlanes());
#endif
int *sad = bufX.sad[bufNum] + 1;
@ -725,17 +725,17 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
{
int lval = lptr[0];
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 lv = vx_setall_u8((uchar)lval);
for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
for( ; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
{
v_uint8 rv = vx_load(rptr + d);
v_int32 hsad_0 = vx_load(hsad + d);
v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
v_int32 hsad_1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
v_int32 hsad_2 = vx_load(hsad + d + 2*VTraits<v_int32>::vlanes());
v_int32 hsad_3 = vx_load(hsad + d + 3*VTraits<v_int32>::vlanes());
v_uint8 diff = v_absdiff(lv, rv);
v_store(cbuf + d, diff);
@ -745,15 +745,15 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
v_expand(diff0, diff00, diff01);
v_expand(diff1, diff10, diff11);
hsad_0 += v_reinterpret_as_s32(diff00);
hsad_1 += v_reinterpret_as_s32(diff01);
hsad_2 += v_reinterpret_as_s32(diff10);
hsad_3 += v_reinterpret_as_s32(diff11);
hsad_0 = v_add(hsad_0, v_reinterpret_as_s32(diff00));
hsad_1 = v_add(hsad_1, v_reinterpret_as_s32(diff01));
hsad_2 = v_add(hsad_2, v_reinterpret_as_s32(diff10));
hsad_3 = v_add(hsad_3, v_reinterpret_as_s32(diff11));
v_store(hsad + d, hsad_0);
v_store(hsad + d + v_int32::nlanes, hsad_1);
v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
v_store(hsad + d + VTraits<v_int32>::vlanes(), hsad_1);
v_store(hsad + d + 2*VTraits<v_int32>::vlanes(), hsad_2);
v_store(hsad + d + 3*VTraits<v_int32>::vlanes(), hsad_3);
}
}
#endif
@ -793,16 +793,16 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
{
int lval = lptr[0];
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 lv = vx_setall_u8((uchar)lval);
for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
for( ; d <= ndisp - VTraits<v_uint8>::vlanes(); d += VTraits<v_uint8>::vlanes() )
{
v_uint8 rv = vx_load(rptr + d);
v_int32 hsad_0 = vx_load(hsad + d);
v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
v_int32 hsad_1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
v_int32 hsad_2 = vx_load(hsad + d + 2*VTraits<v_int32>::vlanes());
v_int32 hsad_3 = vx_load(hsad + d + 3*VTraits<v_int32>::vlanes());
v_uint8 cbs = vx_load(cbuf_sub + d);
v_uint8 diff = v_absdiff(lv, rv);
v_store(cbuf + d, diff);
@ -816,19 +816,19 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01);
v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11);
v_int32 diff_0 = diff00 - cbs00;
v_int32 diff_1 = diff01 - cbs01;
v_int32 diff_2 = diff10 - cbs10;
v_int32 diff_3 = diff11 - cbs11;
hsad_0 += diff_0;
hsad_1 += diff_1;
hsad_2 += diff_2;
hsad_3 += diff_3;
v_int32 diff_0 = v_sub(diff00, cbs00);
v_int32 diff_1 = v_sub(diff01, cbs01);
v_int32 diff_2 = v_sub(diff10, cbs10);
v_int32 diff_3 = v_sub(diff11, cbs11);
hsad_0 = v_add(hsad_0, diff_0);
hsad_1 = v_add(hsad_1, diff_1);
hsad_2 = v_add(hsad_2, diff_2);
hsad_3 = v_add(hsad_3, diff_3);
v_store(hsad + d, hsad_0);
v_store(hsad + d + v_int32::nlanes, hsad_1);
v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
v_store(hsad + d + VTraits<v_int32>::vlanes(), hsad_1);
v_store(hsad + d + 2*VTraits<v_int32>::vlanes(), hsad_2);
v_store(hsad + d + 3*VTraits<v_int32>::vlanes(), hsad_3);
}
}
#endif
@ -855,18 +855,18 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
{
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
for( d = 0; d <= ndisp-2*v_int32::nlanes; d += 2*v_int32::nlanes )
for( d = 0; d <= ndisp-2*VTraits<v_int32>::vlanes(); d += 2*VTraits<v_int32>::vlanes() )
{
v_int32 s0 = vx_load(sad + d);
v_int32 s1 = vx_load(sad + d + v_int32::nlanes);
v_int32 s1 = vx_load(sad + d + VTraits<v_int32>::vlanes());
v_int32 t0 = vx_load(hsad + d);
v_int32 t1 = vx_load(hsad + d + v_int32::nlanes);
s0 += t0;
s1 += t1;
v_int32 t1 = vx_load(hsad + d + VTraits<v_int32>::vlanes());
s0 = v_add(s0, t0);
s1 = v_add(s1, t1);
v_store(sad + d, s0);
v_store(sad + d + v_int32::nlanes, s1);
v_store(sad + d + VTraits<v_int32>::vlanes(), s1);
}
}
#endif
@ -884,30 +884,30 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_int32 minsad4 = vx_setall_s32(INT_MAX);
v_int32 mind4 = vx_setall_s32(0), d4 = d0_4;
for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
for( ; d <= ndisp - 2*VTraits<v_int32>::vlanes(); d += 2*VTraits<v_int32>::vlanes() )
{
v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d);
v_int32 sad4 = v_sub(v_add(vx_load(sad + d), vx_load(hsad + d)), vx_load(hsad_sub + d));
v_store(sad + d, sad4);
mind4 = v_select(minsad4 > sad4, d4, mind4);
mind4 = v_select(v_gt(minsad4, sad4), d4, mind4);
minsad4 = v_min(minsad4, sad4);
d4 += dd_4;
d4 = v_add(d4, dd_4);
sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes);
v_store(sad + d + v_int32::nlanes, sad4);
mind4 = v_select(minsad4 > sad4, d4, mind4);
sad4 = v_sub(v_add(vx_load(sad + d + VTraits<v_int32>::vlanes()), vx_load(hsad + d + VTraits<v_int32>::vlanes())), vx_load(hsad_sub + d + VTraits<v_int32>::vlanes()));
v_store(sad + d + VTraits<v_int32>::vlanes(), sad4);
mind4 = v_select(v_gt(minsad4, sad4), d4, mind4);
minsad4 = v_min(minsad4, sad4);
d4 += dd_4;
d4 = v_add(d4, dd_4);
}
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[VTraits<v_int32>::max_nlanes], mind_buf[VTraits<v_int32>::max_nlanes];
v_store(minsad_buf, minsad4);
v_store(mind_buf, mind4);
for (int i = 0; i < v_int32::nlanes; ++i)
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; }
}
#endif
@ -1102,7 +1102,7 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
Mat disp_i = disp->rowRange(row0, row1);
Mat cost_i = state.disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if (state.useShorts())
{
if( disp_i.type() == CV_16S)

@ -123,7 +123,7 @@ struct StereoSGBMParams
int mode;
};
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
#if CV_SIMD_WIDTH == 16
static inline v_int16 vx_setseq_s16()
{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7); }
@ -136,10 +136,10 @@ static inline v_int16 vx_setseq_s16()
#else
struct vseq_s16
{
short data[v_int16::nlanes];
short data[VTraits<v_int16>::max_nlanes];
vseq_s16()
{
for (int i = 0; i < v_int16::nlanes; i++)
for (int i = 0; i < VTraits<v_int16>::vlanes(); i++)
data[i] = i;
}
};
@ -153,8 +153,8 @@ static inline v_int16 vx_setseq_s16()
static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_val, short &min_pos)
{
min_val = v_reduce_min(val);
v_int16 v_mask = (vx_setall_s16(min_val) == val);
min_pos = v_reduce_min(((pos+vx_setseq_s16()) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
v_int16 v_mask = (v_eq(vx_setall_s16(min_val), val));
min_pos = v_reduce_min(v_or(v_and(v_add(pos, vx_setseq_s16()), v_mask), v_and(vx_setall_s16(SHRT_MAX), v_not(v_mask))));
}
#endif
@ -270,26 +270,26 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
int u1 = std::max(ul, ur); u1 = std::max(u1, u);
int d = minD;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_uint8 _u = vx_setall_u8((uchar)u), _u0 = vx_setall_u8((uchar)u0);
v_uint8 _u1 = vx_setall_u8((uchar)u1);
for( ; d <= maxD - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
for( ; d <= maxD - 2*VTraits<v_int16>::vlanes(); d += 2*VTraits<v_int16>::vlanes() )
{
v_uint8 _v = vx_load(prow2 + width-x-1 + d);
v_uint8 _v0 = vx_load(buffer + width-x-1 + d);
v_uint8 _v1 = vx_load(buffer + width-x-1 + d + width2);
v_uint8 c0 = v_max(_u - _v1, _v0 - _u);
v_uint8 c1 = v_max(_v - _u1, _u0 - _v);
v_uint8 c0 = v_max(v_sub(_u, _v1), v_sub(_v0, _u));
v_uint8 c1 = v_max(v_sub(_v, _u1), v_sub(_u0, _v));
v_uint8 diff = v_min(c0, c1);
v_int16 _c0 = vx_load_aligned(cost + x*D + d);
v_int16 _c1 = vx_load_aligned(cost + x*D + d + v_int16::nlanes);
v_int16 _c1 = vx_load_aligned(cost + x*D + d + VTraits<v_int16>::vlanes());
v_uint16 diff1,diff2;
v_expand(diff,diff1,diff2);
v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
v_store_aligned(cost + x*D + d + v_int16::nlanes, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
v_store_aligned(cost + x*D + d, v_add(_c0, v_reinterpret_as_s16(v_shr(diff1, diff_scale))));
v_store_aligned(cost + x*D + d + VTraits<v_int16>::vlanes(), v_add(_c1, v_reinterpret_as_s16(v_shr(diff2, diff_scale))));
}
#endif
for( ; d < maxD; d++ )
@ -555,13 +555,13 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() );
memset(hsumAdd, 0, Da*sizeof(CostType));
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 h_scale = vx_setall_s16((short)SW2 + 1);
for( d = 0; d < Da; d += v_int16::nlanes )
for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
{
v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale;
v_int16 v_hsumAdd = v_mul(vx_load_aligned(mem.pixDiff + d), h_scale);
for( x = Da; x <= SW2*Da; x += Da )
v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d);
v_hsumAdd = v_add(v_hsumAdd, vx_load_aligned(mem.pixDiff + x + d));
v_store_aligned(hsumAdd + d, v_hsumAdd);
}
#else
@ -578,9 +578,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
v_store_aligned(C + d, vx_load_aligned(Cprev + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
v_store_aligned(C + d, v_sub(v_add(vx_load_aligned(Cprev + d), vx_load_aligned(hsumAdd + d)), vx_load_aligned(hsumSub + d)));
#else
for (d = 0; d < D; d++)
C[d] = (CostType)(Cprev[d] + hsumAdd[d] - hsumSub[d]);
@ -590,12 +590,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
{
const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes )
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
{
v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
v_int16 hv = v_add(v_sub(vx_load_aligned(hsumAdd + x - Da + d), vx_load_aligned(pixSub + d)), vx_load_aligned(pixAdd + d));
v_store_aligned(hsumAdd + x + d, hv);
v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
v_store_aligned(C + x + d, v_add(v_sub(vx_load_aligned(Cprev + x + d), vx_load_aligned(hsumSub + x + d)), hv));
}
#else
for( d = 0; d < D; d++ )
@ -608,10 +608,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
}
else
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
for (d = 0; d < Da; d += v_int16::nlanes)
v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
v_store_aligned(C + d, v_add(vx_load_aligned(C + d), v_mul(vx_load_aligned(hsumAdd + d), v_scale)));
#else
int scale = k == 0 ? SH2 + 1 : 1;
for (d = 0; d < D; d++)
@ -622,12 +622,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
{
v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
v_store_aligned(hsumAdd + x + d, hv);
v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
}
#else
for( d = 0; d < D; d++ )
@ -646,9 +646,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
{
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for (x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
v_store_aligned(C + x, v_add(v_sub(vx_load_aligned(Cprev + x), vx_load_aligned(hsumSub + x)), vx_load_aligned(hsumAdd + x)));
#else
for (x = 0; x < width1*Da; x++)
C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
@ -656,9 +656,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
}
else
{
#if CV_SIMD
for (x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
#else
for (x = 0; x < width1*Da; x++)
C[x] = (CostType)(C[x] + hsumAdd[x]);
@ -714,7 +714,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
CostType* minL = mem.getMinLr(lrID, x);
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _P1 = vx_setall_s16((short)P1);
v_int16 _delta0 = vx_setall_s16((short)delta0);
@ -726,31 +726,31 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
v_int16 _minL2 = vx_setall_s16((short)MAX_COST);
v_int16 _minL3 = vx_setall_s16((short)MAX_COST);
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 Spd = vx_load_aligned(Sp + d);
v_int16 L;
L = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), v_add(vx_load(Lr_p0 + d - 1), _P1)), v_add(vx_load(Lr_p0 + d + 1), _P1)), _delta0), _delta0), Cpd);
v_store_aligned(Lr_p + d, L);
_minL0 = v_min(_minL0, L);
Spd += L;
Spd = v_add(Spd, L);
L = v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), vx_load(Lr_p1 + d - 1) + _P1), vx_load(Lr_p1 + d + 1) + _P1), _delta1) - _delta1 + Cpd;
L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), v_add(vx_load(Lr_p1 + d - 1), _P1)), v_add(vx_load(Lr_p1 + d + 1), _P1)), _delta1), _delta1), Cpd);
v_store_aligned(Lr_p + d + Dlra, L);
_minL1 = v_min(_minL1, L);
Spd += L;
Spd = v_add(Spd, L);
L = v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), vx_load(Lr_p2 + d - 1) + _P1), vx_load(Lr_p2 + d + 1) + _P1), _delta2) - _delta2 + Cpd;
L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), v_add(vx_load(Lr_p2 + d - 1), _P1)), v_add(vx_load(Lr_p2 + d + 1), _P1)), _delta2), _delta2), Cpd);
v_store_aligned(Lr_p + d + Dlra*2, L);
_minL2 = v_min(_minL2, L);
Spd += L;
Spd = v_add(Spd, L);
L = v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), vx_load(Lr_p3 + d - 1) + _P1), vx_load(Lr_p3 + d + 1) + _P1), _delta3) - _delta3 + Cpd;
L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), v_add(vx_load(Lr_p3 + d - 1), _P1)), v_add(vx_load(Lr_p3 + d + 1), _P1)), _delta3), _delta3), Cpd);
v_store_aligned(Lr_p + d + Dlra*3, L);
_minL3 = v_min(_minL3, L);
Spd += L;
Spd = v_add(Spd, L);
v_store_aligned(Sp + d, Spd);
}
@ -769,7 +769,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
t0 = v_min(t0, t1);
t0 = v_min(t0, v_rotate_right<4>(t0));
#if CV_SIMD_WIDTH == 32
CostType buf[v_int16::nlanes];
CostType buf[VTraits<v_int16>::max_nlanes];
v_store_low(buf, v_min(t0, v_rotate_right<8>(t0)));
minL[0] = buf[0];
minL[1] = buf[1];
@ -817,10 +817,10 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
if( pass == npasses )
{
x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
v_int16 v_max_cost = vx_setall_s16(MAX_COST);
for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
for( ; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes() )
{
v_store(disp1ptr + x, v_inv_dist);
v_store(mem.disp2ptr + x, v_inv_dist);
@ -850,23 +850,23 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
d = 0;
int delta0 = P2 + *mem.getMinLr(lrID, x + 1);
int minL0 = MAX_COST;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _P1 = vx_setall_s16((short)P1);
v_int16 _delta0 = vx_setall_s16((short)delta0);
v_int16 _minL0 = vx_setall_s16((short)MAX_COST);
v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L0 = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
v_int16 L0 = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), v_add(vx_load(Lr_p0 + d - 1), _P1)), v_add(vx_load(Lr_p0 + d + 1), _P1)), _delta0), _delta0), Cpd);
v_store_aligned(Lr_p + d, L0);
_minL0 = v_min(_minL0, L0);
L0 += vx_load_aligned(Sp + d);
L0 = v_add(L0, vx_load_aligned(Sp + d));
v_store_aligned(Sp + d, L0);
_bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
_bestDisp = v_select(v_gt(_minS, L0), vx_setall_s16((short)d), _bestDisp);
_minS = v_min(_minS, L0);
}
minL0 = (CostType)v_reduce_min(_minL0);
@ -891,12 +891,12 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
else
{
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
for( ; d <= D - v_int16::nlanes; d+= v_int16::nlanes )
for( ; d <= D - VTraits<v_int16>::vlanes(); d+= VTraits<v_int16>::vlanes() )
{
v_int16 L0 = vx_load_aligned(Sp + d);
_bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
_bestDisp = v_select(v_gt(_minS, L0), vx_setall_s16((short)d), _bestDisp);
_minS = v_min( L0, _minS );
}
min_pos(_minS, _bestDisp, minS, bestDisp);
@ -1039,9 +1039,9 @@ struct CalcVerticalSums: public ParallelLoopBody
for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da )
{
int xbord = x <= 0 ? 0 : (x > (width1 - 1)*Da ? (width1 - 1)*Da : x);
#if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes )
v_store_aligned(hsumAdd + x1*Da + d, vx_load_aligned(hsumAdd + x1*Da + d) + vx_load_aligned(pixDiff + xbord + d));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
v_store_aligned(hsumAdd + x1*Da + d, v_add(vx_load_aligned(hsumAdd + x1 * this->Da + d), vx_load_aligned(pixDiff + xbord + d)));
#else
for( d = 0; d < D; d++ )
hsumAdd[x1*Da + d] = (CostType)(hsumAdd[x1*Da + d] + pixDiff[xbord + d]);
@ -1052,9 +1052,9 @@ struct CalcVerticalSums: public ParallelLoopBody
{
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes )
v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
v_store_aligned(C + x1*Da + d, v_sub(v_add(vx_load_aligned(Cprev + x1 * this->Da + d), vx_load_aligned(hsumAdd + x1 * this->Da + d)), vx_load_aligned(hsumSub + x1 * this->Da + d)));
#else
for( d = 0; d < D; d++ )
C[x1*Da + d] = (CostType)(Cprev[x1*Da + d] + hsumAdd[x1*Da + d] - hsumSub[x1*Da + d]);
@ -1064,12 +1064,12 @@ struct CalcVerticalSums: public ParallelLoopBody
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for( d = 0; d < Da; d += v_int16::nlanes )
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( d = 0; d < Da; d += VTraits<v_int16>::vlanes() )
{
v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
v_int16 hv = v_add(v_sub(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixSub + d)), vx_load_aligned(pixAdd + d));
v_store_aligned(hsumAdd + x + d, hv);
v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
v_store_aligned(C + x + d, v_add(v_sub(vx_load_aligned(Cprev + x + d), vx_load_aligned(hsumSub + x + d)), hv));
}
#else
for( d = 0; d < D; d++ )
@ -1082,10 +1082,10 @@ struct CalcVerticalSums: public ParallelLoopBody
}
else
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
for (d = 0; d < Da; d += v_int16::nlanes)
v_store_aligned(C + x1*Da + d, vx_load_aligned(C + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) * v_scale);
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
v_store_aligned(C + x1*Da + d, v_add(vx_load_aligned(C + x1 * this->Da + d), v_mul(vx_load_aligned(hsumAdd + x1 * this->Da + d), v_scale)));
#else
int scale = k == 0 ? SH2 + 1 : 1;
for (d = 0; d < D; d++)
@ -1095,12 +1095,12 @@ struct CalcVerticalSums: public ParallelLoopBody
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
{
v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
v_store_aligned(hsumAdd + x + d, hv);
v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
}
#else
for( d = 0; d < D; d++ )
@ -1120,9 +1120,9 @@ struct CalcVerticalSums: public ParallelLoopBody
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( x = x1*Da; x < x2*Da; x += VTraits<v_int16>::vlanes() )
v_store_aligned(C + x, v_add(v_sub(vx_load_aligned(Cprev + x), vx_load_aligned(hsumSub + x)), vx_load_aligned(hsumAdd + x)));
#else
for( x = x1*Da; x < x2*Da; x++ )
C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
@ -1131,9 +1131,9 @@ struct CalcVerticalSums: public ParallelLoopBody
else*/
if(y == 0)
{
#if CV_SIMD
for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( x = x1*Da; x < x2*Da; x += VTraits<v_int16>::vlanes() )
v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
#else
for( x = x1*Da; x < x2*Da; x++ )
C[x] = (CostType)(C[x] + hsumAdd[x]);
@ -1167,19 +1167,19 @@ struct CalcVerticalSums: public ParallelLoopBody
CostType& minL = *(mem.getMinLr(lrID, x));
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _P1 = vx_setall_s16((short)P1);
v_int16 _delta = vx_setall_s16((short)delta);
v_int16 _minL = vx_setall_s16((short)MAX_COST);
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);
v_store_aligned(Lr_p + d, L);
_minL = v_min(_minL, L);
v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
v_store_aligned(Sp + d, v_add(vx_load_aligned(Sp + d), L));
}
minL = v_reduce_min(_minL);
#else
@ -1264,10 +1264,10 @@ struct CalcHorizontalSums: public ParallelLoopBody
CostType* S = mem.getSBuf(y);
x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
v_int16 v_max_cost = vx_setall_s16(MAX_COST);
for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
for (; x <= width - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
{
v_store(disp1ptr + x, v_inv_dist);
v_store(disp2ptr + x, v_inv_dist);
@ -1304,19 +1304,19 @@ struct CalcHorizontalSums: public ParallelLoopBody
CostType* Sp = S + x*Da;
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _P1 = vx_setall_s16((short)P1);
v_int16 _delta = vx_setall_s16((short)delta);
v_int16 _minL = vx_setall_s16((short)MAX_COST);
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes)
for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes())
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);
v_store(Lr_p + d, L);
_minL = v_min(_minL, L);
v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
v_store_aligned(Sp + d, v_add(vx_load_aligned(Sp + d), L));
}
minLr = v_reduce_min(_minL);
#else
@ -1349,22 +1349,22 @@ struct CalcHorizontalSums: public ParallelLoopBody
minLr = MAX_COST;
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 _P1 = vx_setall_s16((short)P1);
v_int16 _delta = vx_setall_s16((short)delta);
v_int16 _minL = vx_setall_s16((short)MAX_COST);
v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
for( ; d <= D - VTraits<v_int16>::vlanes(); d += VTraits<v_int16>::vlanes() )
{
v_int16 Cpd = vx_load_aligned(Cp + d);
v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
v_int16 L = v_add(v_sub(v_min(v_min(v_min(vx_load(Lr_ppr + d), v_add(vx_load(Lr_ppr + d - 1), _P1)), v_add(vx_load(Lr_ppr + d + 1), _P1)), _delta), _delta), Cpd);
v_store(Lr_p + d, L);
_minL = v_min(_minL, L);
L += vx_load_aligned(Sp + d);
L = v_add(L, vx_load_aligned(Sp + d));
v_store_aligned(Sp + d, L);
_bestDisp = v_select(_minS > L, vx_setall_s16((short)d), _bestDisp);
_bestDisp = v_select(v_gt(_minS, L), vx_setall_s16((short)d), _bestDisp);
_minS = v_min( L, _minS );
}
minLr = v_reduce_min(_minL);
@ -1581,8 +1581,8 @@ struct SGBM3WayMainLoop : public ParallelLoopBody
utils::BufferArea aux_area;
PixType* clipTab;
#if CV_SIMD
short idx_row[v_int16::nlanes];
#if (CV_SIMD || CV_SIMD_SCALABLE)
short idx_row[VTraits<v_int16>::max_nlanes];
#endif
SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap);
void operator () (const Range& range) const CV_OVERRIDE;
@ -1637,8 +1637,8 @@ SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
#if CV_SIMD
for(short i = 0; i < v_int16::nlanes; ++i)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for(short i = 0; i < VTraits<v_int16>::vlanes(); ++i)
idx_row[i] = i;
#endif
}
@ -1659,13 +1659,13 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
{
calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab + TAB_OFS );
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1);
for (d = 0; d < Da; d += v_int16::nlanes)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
{
v_int16 hsA = vx_load_aligned(pixDiff + d) * sw2_1;
v_int16 hsA = v_mul(vx_load_aligned(pixDiff + d), sw2_1);
for (x = Da; x <= SW2 * Da; x += Da)
hsA += vx_load_aligned(pixDiff + x + d);
hsA = v_add(hsA, vx_load_aligned(pixDiff + x + d));
v_store_aligned(hsumAdd + d, hsA);
}
#else
@ -1681,9 +1681,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
{
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
v_store_aligned(C + d, v_sub(v_add(vx_load_aligned(C + d), vx_load_aligned(hsumAdd + d)), vx_load_aligned(hsumSub + d)));
#else
for (d = 0; d < D; d++)
C[d] = (CostType)(C[d] + hsumAdd[d] - hsumSub[d]);
@ -1693,13 +1693,13 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 hv_reg;
for( d = 0; d < Da; d+=v_int16::nlanes )
for( d = 0; d < Da; d+=VTraits<v_int16>::vlanes() )
{
hv_reg = vx_load_aligned(hsumAdd+x-Da+d) + vx_load_aligned(pixAdd+d) - vx_load_aligned(pixSub+d);
hv_reg = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
v_store_aligned(hsumAdd+x+d,hv_reg);
v_store_aligned(C+x+d,vx_load_aligned(C+x+d)+hv_reg-vx_load_aligned(hsumSub+x+d));
v_store_aligned(C+x+d,v_sub(v_add(vx_load_aligned(C + x + d), hv_reg), vx_load_aligned(hsumSub + x + d)));
}
#else
for( d = 0; d < D; d++ )
@ -1712,10 +1712,10 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
}
else
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 v_scale = vx_setall_s16(k == src_start_idx ? (short)SH2 + 1 : 1);
for (d = 0; d < Da; d += v_int16::nlanes)
v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
v_store_aligned(C + d, v_add(vx_load_aligned(C + d), v_mul(vx_load_aligned(hsumAdd + d), v_scale)));
#else
int scale = k == src_start_idx ? SH2 + 1 : 1;
for (d = 0; d < D; d++)
@ -1725,12 +1725,12 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
{
const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
for (d = 0; d < Da; d += v_int16::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (d = 0; d < Da; d += VTraits<v_int16>::vlanes())
{
v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
v_int16 hv = v_sub(v_add(vx_load_aligned(hsumAdd + x - this->Da + d), vx_load_aligned(pixAdd + d)), vx_load_aligned(pixSub + d));
v_store_aligned(hsumAdd + x + d, hv);
v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
v_store_aligned(C + x + d, v_add(vx_load_aligned(C + x + d), v_mul(hv, v_scale)));
}
#else
for (d = 0; d < D; d++)
@ -1748,9 +1748,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
if( y > src_start_idx )
{
const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
#if CV_SIMD
for( x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
v_store_aligned(C + x, v_sub(v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)), vx_load_aligned(hsumSub + x)));
#else
for( x = 0; x < width1*Da; x++ )
C[x] = (CostType)(C[x] + hsumAdd[x] - hsumSub[x]);
@ -1758,9 +1758,9 @@ void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int
}
else
{
#if CV_SIMD
for( x = 0; x < width1*Da; x += v_int16::nlanes)
v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( x = 0; x < width1*Da; x += VTraits<v_int16>::vlanes())
v_store_aligned(C + x, v_add(vx_load_aligned(C + x), vx_load_aligned(hsumAdd + x)));
#else
for( x = 0; x < width1*Da; x++ )
C[x] = (CostType)(C[x] + hsumAdd[x]);
@ -1781,7 +1781,7 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
CostType *costs = mem.curCostVolumeLine - Da + x;
CostType& topMinCost = mem.vertPassMin[x/Da];
int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
v_int16 leftMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
@ -1798,18 +1798,18 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
v_int16 src_shifted_left,src_shifted_right;
v_int16 res;
for(;i<Da-v_int16::nlanes;i+= v_int16::nlanes)
for(;i<Da-VTraits<v_int16>::vlanes();i+= VTraits<v_int16>::vlanes())
{
//process leftBuf:
//lookahead load:
src2 = vx_load_aligned(leftBuf_prev+i+v_int16::nlanes);
src2 = vx_load_aligned(leftBuf_prev+i+VTraits<v_int16>::vlanes());
//get shifted versions of the current block and add P1:
src_shifted_left = v_rotate_left<1> (src1_leftBuf,src0_leftBuf);
src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2 );
// process and save current block:
res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_leftBuf, leftMinCostP2_reg)), leftMinCostP2_reg));
leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
v_store_aligned(leftBuf+i, res);
@ -1819,14 +1819,14 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
//process topBuf:
//lookahead load:
src2 = vx_load_aligned(topBuf+i+v_int16::nlanes);
src2 = vx_load_aligned(topBuf+i+VTraits<v_int16>::vlanes());
//get shifted versions of the current block and add P1:
src_shifted_left = v_rotate_left<1> (src1_topBuf,src0_topBuf);
src_shifted_right = v_rotate_right<1> (src1_topBuf,src2 );
// process and save current block:
res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_topBuf, topMinCostP2_reg)), topMinCostP2_reg));
topMinCost_new_reg = v_min(topMinCost_new_reg,res);
v_store_aligned(topBuf+i, res);
@ -1843,17 +1843,17 @@ void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x,
src_shifted_left = v_rotate_left<1> (src1_leftBuf,src0_leftBuf);
src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2 );
res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
res = v_add(vx_load_aligned(costs + this->Da - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_leftBuf, leftMinCostP2_reg)), leftMinCostP2_reg));
leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res));
v_store_aligned(leftBuf+Da-v_int16::nlanes, res);
v_store_aligned(leftBuf+Da-VTraits<v_int16>::vlanes(), res);
//process topBuf:
src_shifted_left = v_rotate_left<1> (src1_topBuf,src0_topBuf);
src_shifted_right = v_rotate_right<1> (src1_topBuf,src2 );
res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
res = v_add(vx_load_aligned(costs + this->Da - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_topBuf, topMinCostP2_reg)), topMinCostP2_reg));
topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res));
v_store_aligned(topBuf+Da-v_int16::nlanes, res);
v_store_aligned(topBuf+Da-VTraits<v_int16>::vlanes(), res);
}
else
{
@ -1904,7 +1904,7 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
CostType* leftBuf = mem.horPassCostVolume + x;
int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
v_int16 rightMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
@ -1919,27 +1919,27 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
v_int16 min_sum_cost_reg = vx_setall_s16(SHRT_MAX);
v_int16 min_sum_pos_reg = vx_setall_s16(0);
for(;i<Da-v_int16::nlanes;i+=v_int16::nlanes)
for(;i<Da-VTraits<v_int16>::vlanes();i+=VTraits<v_int16>::vlanes())
{
//lookahead load:
src2 = vx_load_aligned(rightBuf+i+v_int16::nlanes);
src2 = vx_load_aligned(rightBuf+i+VTraits<v_int16>::vlanes());
//get shifted versions of the current block and add P1:
src_shifted_left = v_rotate_left<1> (src1_rightBuf,src0_rightBuf);
src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2 );
// process and save current block:
res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
res = v_add(vx_load_aligned(costs + i), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_rightBuf, rightMinCostP2_reg)), rightMinCostP2_reg));
rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
v_store_aligned(rightBuf+i, res);
// compute and save total cost:
res = res + vx_load_aligned(leftBuf+i) + vx_load_aligned(topBuf+i);
res = v_add(v_add(res, vx_load_aligned(leftBuf + i)), vx_load_aligned(topBuf + i));
v_store_aligned(leftBuf+i, res);
// track disparity value with the minimum cost:
min_sum_cost_reg = v_min(min_sum_cost_reg,res);
min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)i) - min_sum_pos_reg));
min_sum_pos_reg = v_add(min_sum_pos_reg, v_and(v_eq(min_sum_cost_reg, res), v_sub(vx_setall_s16((short)i), min_sum_pos_reg)));
//update src:
src0_rightBuf = src1_rightBuf;
@ -1953,15 +1953,15 @@ void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
src_shifted_left = v_rotate_left<1> (src1_rightBuf,src0_rightBuf);
src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2 );
res = vx_load_aligned(costs+D-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
res = v_add(vx_load_aligned(costs + this->D - VTraits<v_int16>::vlanes()), v_sub(v_min(v_add(v_min(src_shifted_left, src_shifted_right), P1_reg), v_min(src1_rightBuf, rightMinCostP2_reg)), rightMinCostP2_reg));
rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res));
v_store_aligned(rightBuf+D-v_int16::nlanes, res);
v_store_aligned(rightBuf+D-VTraits<v_int16>::vlanes(), res);
res = res + vx_load_aligned(leftBuf+D-v_int16::nlanes) + vx_load_aligned(topBuf+D-v_int16::nlanes);
v_store_aligned(leftBuf+D-v_int16::nlanes, res);
res = v_add(v_add(res, vx_load_aligned(leftBuf + this->D - VTraits<v_int16>::vlanes())), vx_load_aligned(topBuf + this->D - VTraits<v_int16>::vlanes()));
v_store_aligned(leftBuf+D-VTraits<v_int16>::vlanes(), res);
min_sum_cost_reg = v_min(min_sum_cost_reg,res);
min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)(D-v_int16::nlanes)) - min_sum_pos_reg));
min_sum_pos_reg = v_add(min_sum_pos_reg, v_and(v_eq(min_sum_cost_reg, res), v_sub(vx_setall_s16((short)(this->D - VTraits<v_int16>::vlanes())), min_sum_pos_reg)));
min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp);
}
else
@ -2070,40 +2070,40 @@ void SGBM3WayMainLoop::impl(const Range& range) const
if(uniquenessRatio>0)
{
d = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
horPassCostVolume+=x;
int thresh = (100*min_cost)/(100-uniquenessRatio);
v_int16 thresh_reg = vx_setall_s16((short)(thresh+1));
v_int16 d1 = vx_setall_s16((short)(best_d-1));
v_int16 d2 = vx_setall_s16((short)(best_d+1));
v_int16 eight_reg = vx_setall_s16((short)v_int16::nlanes);
v_int16 eight_reg = vx_setall_s16((short)VTraits<v_int16>::vlanes());
v_int16 cur_d = vx_load(idx_row);
v_int16 mask;
for( ; d <= D - 2*v_int16::nlanes; d+=2*v_int16::nlanes )
for( ; d <= D - 2*VTraits<v_int16>::vlanes(); d+=2*VTraits<v_int16>::vlanes() )
{
mask = (vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
cur_d = cur_d+eight_reg;
mask = v_and(v_lt(vx_load_aligned(horPassCostVolume + d), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)));
cur_d = v_add(cur_d, eight_reg);
if( v_check_any(mask) )
break;
mask = (vx_load_aligned(horPassCostVolume + d + v_int16::nlanes) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
cur_d = cur_d+eight_reg;
mask = v_and(v_lt(vx_load_aligned(horPassCostVolume + d + VTraits<v_int16>::vlanes()), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)));
cur_d = v_add(cur_d, eight_reg);
if( v_check_any(mask) )
break;
}
if( d <= D - 2*v_int16::nlanes )
if( d <= D - 2*VTraits<v_int16>::vlanes() )
{
horPassCostVolume-=x;
continue;
}
if( d <= D - v_int16::nlanes )
if( d <= D - VTraits<v_int16>::vlanes() )
{
if( v_check_any((vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ((cur_d < d1) | (cur_d > d2))) )
if( v_check_any(v_and(v_lt(vx_load_aligned(horPassCostVolume + d), thresh_reg), v_or(v_lt(cur_d, d1), v_gt(cur_d, d2)))) )
{
horPassCostVolume-=x;
continue;
}
d+=v_int16::nlanes;
d+=VTraits<v_int16>::vlanes();
}
horPassCostVolume-=x;
#endif

@ -210,24 +210,24 @@ float calcOrientationHist(
cv::hal::magnitude32f(X, Y, Mag, len);
k = 0;
#if CV_SIMD
const int vecsize = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int vecsize = VTraits<v_float32>::vlanes();
v_float32 nd360 = vx_setall_f32(n/360.f);
v_int32 __n = vx_setall_s32(n);
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[vecsize];
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[VTraits<v_float32>::max_nlanes];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[VTraits<v_float32>::max_nlanes];
for( ; k <= len - vecsize; k += vecsize )
{
v_float32 w = vx_load_aligned( W + k );
v_float32 mag = vx_load_aligned( Mag + k );
v_float32 ori = vx_load_aligned( Ori + k );
v_int32 bin = v_round( nd360 * ori );
v_int32 bin = v_round( v_mul(nd360, ori) );
bin = v_select(bin >= __n, bin - __n, bin);
bin = v_select(bin < vx_setzero_s32(), bin + __n, bin);
bin = v_select(v_ge(bin, __n), v_sub(bin, __n), bin);
bin = v_select(v_lt(bin, vx_setzero_s32()), v_add(bin, __n), bin);
w = w * mag;
w = v_mul(w, mag);
v_store_aligned(bin_buf, bin);
v_store_aligned(w_mul_mag_buf, w);
for(int vi = 0; vi < vecsize; vi++)
@ -253,19 +253,19 @@ float calcOrientationHist(
temphist[n+1] = temphist[1];
i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 d_1_16 = vx_setall_f32(1.f/16.f);
v_float32 d_4_16 = vx_setall_f32(4.f/16.f);
v_float32 d_6_16 = vx_setall_f32(6.f/16.f);
for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes )
for( ; i <= n - VTraits<v_float32>::vlanes(); i += VTraits<v_float32>::vlanes() )
{
v_float32 tn2 = vx_load_aligned(temphist + i-2);
v_float32 tn1 = vx_load(temphist + i-1);
v_float32 t0 = vx_load(temphist + i);
v_float32 t1 = vx_load(temphist + i+1);
v_float32 t2 = vx_load(temphist + i+2);
v_float32 _hist = v_fma(tn2 + t2, d_1_16,
v_fma(tn1 + t1, d_4_16, t0 * d_6_16));
v_float32 _hist = v_fma(v_add(tn2, t2), d_1_16,
v_fma(v_add(tn1, t1), d_4_16, v_mul(t0, d_6_16)));
v_store(hist + i, _hist);
}
#endif
@ -452,8 +452,8 @@ public:
const sift_wt* nextptr = next.ptr<sift_wt>(r);
int c = SIFT_IMG_BORDER;
#if CV_SIMD && !(DoG_TYPE_SHORT)
const int vecsize = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE) && !(DoG_TYPE_SHORT)
const int vecsize = VTraits<v_float32>::vlanes();
for( ; c <= cols-SIFT_IMG_BORDER - vecsize; c += vecsize)
{
v_float32 val = vx_load(&currptr[c]);
@ -464,7 +464,7 @@ public:
v_float32 vmin,vmax;
v_float32 cond = v_abs(val) > vx_setall_f32((float)threshold);
v_float32 cond = v_gt(v_abs(val), vx_setall_f32((float)this->threshold));
if (!v_check_any(cond))
{
continue;
@ -477,10 +477,10 @@ public:
vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
v_float32 condp = cond & (val > vx_setall_f32(0)) & (val >= vmax);
v_float32 condm = cond & (val < vx_setall_f32(0)) & (val <= vmin);
v_float32 condp = v_and(v_and(cond, v_gt(val, vx_setall_f32(0))), v_ge(val, vmax));
v_float32 condm = v_and(v_and(cond, v_lt(val, vx_setall_f32(0))), v_le(val, vmin));
cond = condp | condm;
cond = v_or(condp, condm);
if (!v_check_any(cond))
{
continue;
@ -493,10 +493,10 @@ public:
vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
condp &= (val >= vmax);
condm &= (val <= vmin);
condp = v_and(condp, v_ge(val, vmax));
condm = v_and(condm, v_le(val, vmin));
cond = condp | condm;
cond = v_or(condp, condm);
if (!v_check_any(cond))
{
continue;
@ -515,10 +515,10 @@ public:
vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
condp &= (val >= v_max(vmax,max_middle));
condm &= (val <= v_min(vmin,min_middle));
condp = v_and(condp, v_ge(val, v_max(vmax, max_middle)));
condm = v_and(condm, v_le(val, v_min(vmin, min_middle)));
cond = condp | condm;
cond = v_or(condp, condm);
if (!v_check_any(cond))
{
continue;
@ -777,11 +777,11 @@ void calcSIFTDescriptor(
cv::hal::exp32f(W, W, len);
k = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
const int vecsize = v_float32::nlanes;
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[vecsize];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*vecsize];
const int vecsize = VTraits<v_float32>::vlanes();
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[VTraits<v_float32>::max_nlanes];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*VTraits<v_float32>::max_nlanes];
const v_float32 __ori = vx_setall_f32(ori);
const v_float32 __bins_per_rad = vx_setall_f32(bins_per_rad);
const v_int32 __n = vx_setall_s32(n);
@ -792,28 +792,28 @@ void calcSIFTDescriptor(
{
v_float32 rbin = vx_load_aligned(RBin + k);
v_float32 cbin = vx_load_aligned(CBin + k);
v_float32 obin = (vx_load_aligned(Ori + k) - __ori) * __bins_per_rad;
v_float32 mag = vx_load_aligned(Mag + k) * vx_load_aligned(W + k);
v_float32 obin = v_mul(v_sub(vx_load_aligned(Ori + k), __ori), __bins_per_rad);
v_float32 mag = v_mul(vx_load_aligned(Mag + k), vx_load_aligned(W + k));
v_int32 r0 = v_floor(rbin);
v_int32 c0 = v_floor(cbin);
v_int32 o0 = v_floor(obin);
rbin -= v_cvt_f32(r0);
cbin -= v_cvt_f32(c0);
obin -= v_cvt_f32(o0);
o0 = v_select(o0 < vx_setzero_s32(), o0 + __n, o0);
o0 = v_select(o0 >= __n, o0 - __n, o0);
v_float32 v_r1 = mag*rbin, v_r0 = mag - v_r1;
v_float32 v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
v_float32 v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
v_float32 v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
v_float32 v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
v_int32 idx = v_muladd(v_muladd(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
rbin = v_sub(rbin, v_cvt_f32(r0));
cbin = v_sub(cbin, v_cvt_f32(c0));
obin = v_sub(obin, v_cvt_f32(o0));
o0 = v_select(v_lt(o0, vx_setzero_s32()), v_add(o0, __n), o0);
o0 = v_select(v_ge(o0, __n), v_sub(o0, __n), o0);
v_float32 v_r1 = v_mul(mag, rbin), v_r0 = v_sub(mag, v_r1);
v_float32 v_rc11 = v_mul(v_r1, cbin), v_rc10 = v_sub(v_r1, v_rc11);
v_float32 v_rc01 = v_mul(v_r0, cbin), v_rc00 = v_sub(v_r0, v_rc01);
v_float32 v_rco111 = v_mul(v_rc11, obin), v_rco110 = v_sub(v_rc11, v_rco111);
v_float32 v_rco101 = v_mul(v_rc10, obin), v_rco100 = v_sub(v_rc10, v_rco101);
v_float32 v_rco011 = v_mul(v_rc01, obin), v_rco010 = v_sub(v_rc01, v_rco011);
v_float32 v_rco001 = v_mul(v_rc00, obin), v_rco000 = v_sub(v_rc00, v_rco001);
v_int32 idx = v_muladd(v_muladd(v_add(r0, __1), __d_plus_2, v_add(c0, __1)), __n_plus_2, o0);
v_store_aligned(idx_buf, idx);
v_store_aligned(rco_buf, v_rco000);
@ -894,11 +894,11 @@ void calcSIFTDescriptor(
float nrm2 = 0;
len = d*d*n;
k = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_float32 __nrm2 = vx_setzero_f32();
v_float32 __rawDst;
for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
for( ; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
{
__rawDst = vx_load_aligned(rawDst + k);
__nrm2 = v_fma(__rawDst, __rawDst, __nrm2);
@ -949,15 +949,15 @@ void calcSIFTDescriptor(
if( dstMat.type() == CV_32F )
{
float* dst = dstMat.ptr<float>(row);
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 __dst;
v_float32 __min = vx_setzero_f32();
v_float32 __max = vx_setall_f32(255.0f); // max of uchar
v_float32 __nrm2 = vx_setall_f32(nrm2);
for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
for( k = 0; k <= len - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
{
__dst = vx_load_aligned(rawDst + k);
__dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
__dst = v_min(v_max(v_cvt_f32(v_round(v_mul(__dst, __nrm2))), __min), __max);
v_store(dst + k, __dst);
}
#endif
@ -976,16 +976,16 @@ if( dstMat.type() == CV_32F )
else // CV_8U
{
uint8_t* dst = dstMat.ptr<uint8_t>(row);
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 __dst0, __dst1;
v_uint16 __pack01;
v_float32 __nrm2 = vx_setall_f32(nrm2);
for( k = 0; k <= len - v_float32::nlanes * 2; k += v_float32::nlanes * 2 )
for( k = 0; k <= len - VTraits<v_float32>::vlanes() * 2; k += VTraits<v_float32>::vlanes() * 2 )
{
__dst0 = vx_load_aligned(rawDst + k);
__dst1 = vx_load_aligned(rawDst + k + v_float32::nlanes);
__dst1 = vx_load_aligned(rawDst + k + VTraits<v_float32>::vlanes());
__pack01 = v_pack_u(v_round(__dst0 * __nrm2), v_round(__dst1 * __nrm2));
__pack01 = v_pack_u(v_round(v_mul(__dst0, __nrm2)), v_round(v_mul(__dst1, __nrm2)));
v_pack_store(dst + k, __pack01);
}
#endif
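
The sift.simd.hpp hunks also change how aligned scratch buffers are sized: since `VTraits<T>::vlanes()` is no longer a compile-time constant under scalable backends, array dimensions use the compile-time upper bound `VTraits<T>::max_nlanes` instead. A short sketch of that pattern (a hypothetical `roundToInt` helper, not taken from the patch, assuming only `<opencv2/core/hal/intrin.hpp>`):

```cpp
// Illustrative sketch of the max_nlanes scratch-buffer pattern; not part of the patch.
#include <opencv2/core/hal/intrin.hpp>
#include <cmath>

using namespace cv;

static void roundToInt(const float* src, int* dst, int n)
{
    int i = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int step = VTraits<v_float32>::vlanes();                            // run-time value
    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_int32>::max_nlanes];     // compile-time bound
    for (; i <= n - step; i += step)
    {
        v_store_aligned(buf, v_round(vx_load(src + i)));   // spill lanes to scalar scratch
        for (int j = 0; j < step; j++)
            dst[i + j] = buf[j];
    }
#endif
    for (; i < n; i++)
        dst[i] = (int)std::lround(src[i]);
}
```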
