StereoBM algorithm updated to use wide universal intrinsics

pull/15180/head
Vitaly Tuzov 5 years ago
parent c2096771cb
commit 0a1b957331
  1. 413
      modules/calib3d/src/stereobm.cpp

@ -216,30 +216,30 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0; dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
x = 1; x = 1;
#if CV_SIMD128 #if CV_SIMD
{ {
v_int16x8 ftz = v_setall_s16((short) ftzero); v_int16 ftz = vx_setall_s16((short) ftzero);
v_int16x8 ftz2 = v_setall_s16((short)(ftzero*2)); v_int16 ftz2 = vx_setall_s16((short)(ftzero*2));
v_int16x8 z = v_setzero_s16(); v_int16 z = vx_setzero_s16();
for(; x <= (size.width - 1) - 8; x += 8 ) for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes)
{ {
v_int16x8 s00 = v_reinterpret_as_s16(v_load_expand(srow0 + x + 1)); v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1));
v_int16x8 s01 = v_reinterpret_as_s16(v_load_expand(srow0 + x - 1)); v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1));
v_int16x8 s10 = v_reinterpret_as_s16(v_load_expand(srow1 + x + 1)); v_int16 s10 = v_reinterpret_as_s16(vx_load_expand(srow1 + x + 1));
v_int16x8 s11 = v_reinterpret_as_s16(v_load_expand(srow1 + x - 1)); v_int16 s11 = v_reinterpret_as_s16(vx_load_expand(srow1 + x - 1));
v_int16x8 s20 = v_reinterpret_as_s16(v_load_expand(srow2 + x + 1)); v_int16 s20 = v_reinterpret_as_s16(vx_load_expand(srow2 + x + 1));
v_int16x8 s21 = v_reinterpret_as_s16(v_load_expand(srow2 + x - 1)); v_int16 s21 = v_reinterpret_as_s16(vx_load_expand(srow2 + x - 1));
v_int16x8 s30 = v_reinterpret_as_s16(v_load_expand(srow3 + x + 1)); v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1));
v_int16x8 s31 = v_reinterpret_as_s16(v_load_expand(srow3 + x - 1)); v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1));
v_int16x8 d0 = s00 - s01; v_int16 d0 = s00 - s01;
v_int16x8 d1 = s10 - s11; v_int16 d1 = s10 - s11;
v_int16x8 d2 = s20 - s21; v_int16 d2 = s20 - s21;
v_int16x8 d3 = s30 - s31; v_int16 d3 = s30 - s31;
v_uint16x8 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z)); v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
v_uint16x8 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z)); v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
v_pack_store(dptr0 + x, v0); v_pack_store(dptr0 + x, v0);
v_pack_store(dptr1 + x, v1); v_pack_store(dptr1 + x, v1);
@ -262,10 +262,10 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
{ {
uchar* dptr = dst.ptr<uchar>(y); uchar* dptr = dst.ptr<uchar>(y);
x = 0; x = 0;
#if CV_SIMD128 #if CV_SIMD
{ {
v_uint8x16 val0_16 = v_setall_u8(val0); v_uint8 val0_16 = vx_setall_u8(val0);
for(; x <= size.width-16; x+=16 ) for(; x <= size.width-v_uint8::nlanes; x+=v_uint8::nlanes)
v_store(dptr + x, val0_16); v_store(dptr + x, val0_16);
} }
#endif #endif
@ -309,13 +309,13 @@ inline int dispDescale(int v1, int v2, int d)
return (int)(v1*256 + (d != 0 ? v2*256/d : 0)); // no need to add 127, this will be converted to float return (int)(v1*256 + (d != 0 ? v2*256/d : 0)); // no need to add 127, this will be converted to float
} }
#if CV_SIMD128 #if CV_SIMD
template <typename dType> template <typename dType>
static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right, static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
Mat& disp, Mat& cost, StereoBMParams& state, Mat& disp, Mat& cost, StereoBMParams& state,
uchar* buf, int _dy0, int _dy1 ) uchar* buf, int _dy0, int _dy1 )
{ {
const int ALIGN = 16; const int ALIGN = CV_SIMD_WIDTH;
int x, y, d; int x, y, d;
int wsz = state.SADWindowSize, wsz2 = wsz/2; int wsz = state.SADWindowSize, wsz2 = wsz/2;
int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1); int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
@ -345,7 +345,9 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0; int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
const int TABSZ = 256; const int TABSZ = 256;
uchar tab[TABSZ]; uchar tab[TABSZ];
const v_int16x8 d0_8 = v_int16x8(0,1,2,3,4,5,6,7), dd_8 = v_setall_s16(8); short v_seq[v_int16::nlanes];
for (short i = 0; i < v_int16::nlanes; ++i)
v_seq[i] = i;
sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN); sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN); hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
@ -368,20 +370,26 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep ) for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
{ {
int lval = lptr[0]; int lval = lptr[0];
v_uint8x16 lv = v_setall_u8((uchar)lval); v_uint8 lv = vx_setall_u8((uchar)lval);
for( d = 0; d < ndisp; d += 16 ) for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
{ {
v_uint8x16 rv = v_load(rptr + d); v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
v_uint16x8 hsad_l = v_load(hsad + d);
v_uint16x8 hsad_h = v_load(hsad + d + 8);
v_uint8x16 diff = v_absdiff(lv, rv);
v_store(cbuf + d, diff); v_store(cbuf + d, diff);
v_uint16x8 diff0, diff1; v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
v_expand(diff, diff0, diff1); v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff));
hsad_l += diff0; }
hsad_h += diff1; if( d <= ndisp - v_uint16::nlanes )
v_store(hsad + d, hsad_l); {
v_store(hsad + d + 8, hsad_h); v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
v_store_low(cbuf + d, diff);
v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
d += v_uint16::nlanes;
}
for( ; d < ndisp; d++ )
{
int diff = abs(lval - rptr[d]);
cbuf[d] = (uchar)diff;
hsad[d] += (ushort)diff;
} }
htext[y] += tab[lval]; htext[y] += tab[lval];
} }
@ -412,24 +420,27 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep ) hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
{ {
int lval = lptr[0]; int lval = lptr[0];
v_uint8x16 lv = v_setall_u8((uchar)lval); v_uint8 lv = vx_setall_u8((uchar)lval);
for( d = 0; d < ndisp; d += 16 ) for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
{ {
v_uint8x16 rv = v_load(rptr + d); v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
v_uint16x8 hsad_l = v_load(hsad + d); v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d));
v_uint16x8 hsad_h = v_load(hsad + d + 8);
v_uint8x16 cbs = v_load(cbuf_sub + d);
v_uint8x16 diff = v_absdiff(lv, rv);
v_int16x8 diff_l, diff_h, cbs_l, cbs_h;
v_store(cbuf + d, diff); v_store(cbuf + d, diff);
v_expand(v_reinterpret_as_s8(diff), diff_l, diff_h); v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs)));
v_expand(v_reinterpret_as_s8(cbs), cbs_l, cbs_h); v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs)));
diff_l -= cbs_l; }
diff_h -= cbs_h; if( d <= ndisp - v_uint16::nlanes)
hsad_h = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_h) + diff_h); {
hsad_l = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_l) + diff_l); v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
v_store(hsad + d, hsad_l); v_store_low(cbuf + d, diff);
v_store(hsad + d + 8, hsad_h); v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d)));
d += v_uint16::nlanes;
}
for( ; d < ndisp; d++ )
{
int diff = abs(lval - rptr[d]);
cbuf[d] = (uchar)diff;
hsad[d] = hsad[d] + (ushort)diff - cbuf_sub[d];
} }
htext[y] += tab[lval] - tab[lptr_sub[0]]; htext[y] += tab[lval] - tab[lptr_sub[0]];
} }
@ -446,16 +457,24 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
hsad = hsad0 + (1 - dy0)*ndisp; hsad = hsad0 + (1 - dy0)*ndisp;
for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp ) for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
for( d = 0; d <= ndisp-16; d += 16 ) {
{ for( d = 0; d <= ndisp-2*v_uint16::nlanes; d += 2*v_uint16::nlanes )
v_uint16x8 s0 = v_load(sad + d); {
v_uint16x8 s1 = v_load(sad + d + 8); v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
v_uint16x8 t0 = v_load(hsad + d); v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes));
v_uint16x8 t1 = v_load(hsad + d + 8); }
s0 = s0 + t0; if( d <= ndisp-v_uint16::nlanes )
s1 = s1 + t1; {
v_store(sad + d, s0); v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
v_store(sad + d + 8, s1); d += v_uint16::nlanes;
}
if( d <= ndisp-v_uint16::nlanes/2 )
{
v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d));
d += v_uint16::nlanes/2;
}
for( ; d < ndisp; d++ )
sad[d] = sad[d] + hsad[d];
} }
int tsum = 0; int tsum = 0;
for( y = -wsz2-1; y < wsz2; y++ ) for( y = -wsz2-1; y < wsz2; y++ )
@ -467,38 +486,41 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
int minsad = INT_MAX, mind = -1; int minsad = INT_MAX, mind = -1;
hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp; hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp; hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
v_int16x8 minsad8 = v_setall_s16(SHRT_MAX); v_int16 minsad8 = vx_setall_s16(SHRT_MAX);
v_int16x8 mind8 = v_setall_s16(0), d8 = d0_8; v_int16 mind8 = vx_setall_s16(0);
for( d = 0; d < ndisp; d += 16 ) for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
{ {
v_int16x8 u0 = v_reinterpret_as_s16(v_load(hsad_sub + d)); v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
v_int16x8 u1 = v_reinterpret_as_s16(v_load(hsad + d)); v_store(sad + d, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
v_int16x8 v0 = v_reinterpret_as_s16(v_load(hsad_sub + d + 8)); minsad8 = v_min(minsad8, sad8);
v_int16x8 v1 = v_reinterpret_as_s16(v_load(hsad + d + 8));
v_int16x8 usad8 = v_reinterpret_as_s16(v_load(sad + d)); sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes));
v_int16x8 vsad8 = v_reinterpret_as_s16(v_load(sad + d + 8)); v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8));
mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d+v_int16::nlanes));
u1 -= u0; minsad8 = v_min(minsad8, sad8);
v1 -= v0; }
usad8 += u1; if( d <= ndisp - v_int16::nlanes )
vsad8 += v1; {
v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
v_int16x8 mask = minsad8 > usad8; v_store(sad + d, v_reinterpret_as_u16(sad8));
minsad8 = v_min(minsad8, usad8); mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
mind8 = v_max(mind8, (mask& d8)); minsad8 = v_min(minsad8, sad8);
d += v_int16::nlanes;
v_store(sad + d, v_reinterpret_as_u16(usad8)); }
v_store(sad + d + 8, v_reinterpret_as_u16(vsad8)); minsad = v_reduce_min(minsad8);
v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8);
mask = minsad8 > vsad8; mind = v_reduce_min(((mind8+vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
minsad8 = v_min(minsad8, vsad8); for( ; d < ndisp; d++ )
{
d8 = d8 + dd_8; int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d];
mind8 = v_max(mind8, (mask & d8)); sad[d] = (ushort)sad8;
d8 = d8 + dd_8; if(minsad > sad8)
{
mind = d;
minsad = sad8;
}
} }
tsum += htext[y + wsz2] - htext[y - wsz2 - 1]; tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
@ -508,41 +530,42 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
continue; continue;
} }
ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8];
v_store(minsad_buf, v_reinterpret_as_u16(minsad8));
v_store(mind_buf, v_reinterpret_as_u16(mind8));
for( d = 0; d < 8; d++ )
if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d]))
{
minsad = minsad_buf[d];
mind = mind_buf[d];
}
if( uniquenessRatio > 0 ) if( uniquenessRatio > 0 )
{ {
int thresh = minsad + (minsad * uniquenessRatio/100); int thresh = minsad + (minsad * uniquenessRatio/100);
v_int32x4 thresh4 = v_setall_s32(thresh + 1); v_int32 thresh4 = vx_setall_s32(thresh + 1);
v_int32x4 d1 = v_setall_s32(mind-1), d2 = v_setall_s32(mind+1); v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1);
v_int32x4 dd_4 = v_setall_s32(4); v_int32 dd_4 = vx_setall_s32(v_int32::nlanes);
v_int32x4 d4 = v_int32x4(0,1,2,3); v_int32 d4 = vx_load_expand(v_seq);
v_int32x4 mask4;
for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes )
for( d = 0; d < ndisp; d += 8 ) {
{ v_int32 sad4_l, sad4_h;
v_int16x8 sad8 = v_reinterpret_as_s16(v_load(sad + d)); v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h);
v_int32x4 sad4_l, sad4_h; if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
v_expand(sad8, sad4_l, sad4_h);
mask4 = thresh4 > sad4_l;
mask4 = mask4 & ((d1 > d4) | (d4 > d2));
if( v_check_any(mask4) )
break; break;
d4 += dd_4; d4 += dd_4;
mask4 = thresh4 > sad4_h; if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) )
mask4 = mask4 & ((d1 > d4) | (d4 > d2));
if( v_check_any(mask4) )
break; break;
d4 += dd_4; d4 += dd_4;
} }
if( d <= ndisp - v_int16::nlanes )
{
dptr[y*dstep] = FILTERED;
continue;
}
if( d <= ndisp - v_int32::nlanes )
{
v_int32 sad4_l = vx_load_expand((short*)sad + d);
if (v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))))
continue;
d += v_int16::nlanes;
}
for( ; d < ndisp; d++ )
{
if( (thresh + 1) > sad[d] && ((mind - 1) > d || d > (mind + 1)) )
break;
}
if( d < ndisp ) if( d < ndisp )
{ {
dptr[y*dstep] = FILTERED; dptr[y*dstep] = FILTERED;
@ -571,7 +594,7 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
uchar* buf, int _dy0, int _dy1 ) uchar* buf, int _dy0, int _dy1 )
{ {
const int ALIGN = 16; const int ALIGN = CV_SIMD_WIDTH;
int x, y, d; int x, y, d;
int wsz = state.SADWindowSize, wsz2 = wsz/2; int wsz = state.SADWindowSize, wsz2 = wsz/2;
int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1); int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
@ -587,12 +610,6 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
const int disp_shift = dispShiftTemplate<mType>::value; const int disp_shift = dispShiftTemplate<mType>::value;
mType FILTERED = (mType)((mindisp - 1) << disp_shift); mType FILTERED = (mType)((mindisp - 1) << disp_shift);
#if CV_SIMD128
{
CV_Assert (ndisp % 8 == 0);
}
#endif
int *sad, *hsad0, *hsad, *hsad_sub, *htext; int *sad, *hsad0, *hsad, *hsad_sub, *htext;
uchar *cbuf0, *cbuf; uchar *cbuf0, *cbuf;
const uchar* lptr0 = left.ptr() + lofs; const uchar* lptr0 = left.ptr() + lofs;
@ -607,6 +624,13 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
const int TABSZ = 256; const int TABSZ = 256;
uchar tab[TABSZ]; uchar tab[TABSZ];
#if CV_SIMD
int v_seq[v_int32::nlanes];
for (int i = 0; i < v_int32::nlanes; ++i)
v_seq[i] = i;
v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
#endif
sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN); sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN); hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN); htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
@ -628,22 +652,22 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
{ {
int lval = lptr[0]; int lval = lptr[0];
d = 0; d = 0;
#if CV_SIMD128 #if CV_SIMD
{ {
v_uint8x16 lv = v_setall_u8((uchar)lval); v_uint8 lv = vx_setall_u8((uchar)lval);
for( ; d <= ndisp - 16; d += 16 ) for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
{ {
v_uint8x16 rv = v_load(rptr + d); v_uint8 rv = vx_load(rptr + d);
v_int32x4 hsad_0 = v_load(hsad + d); v_int32 hsad_0 = vx_load(hsad + d);
v_int32x4 hsad_1 = v_load(hsad + d + 4); v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
v_int32x4 hsad_2 = v_load(hsad + d + 8); v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
v_int32x4 hsad_3 = v_load(hsad + d + 12); v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
v_uint8x16 diff = v_absdiff(lv, rv); v_uint8 diff = v_absdiff(lv, rv);
v_store(cbuf + d, diff); v_store(cbuf + d, diff);
v_uint16x8 diff0, diff1; v_uint16 diff0, diff1;
v_uint32x4 diff00, diff01, diff10, diff11; v_uint32 diff00, diff01, diff10, diff11;
v_expand(diff, diff0, diff1); v_expand(diff, diff0, diff1);
v_expand(diff0, diff00, diff01); v_expand(diff0, diff00, diff01);
v_expand(diff1, diff10, diff11); v_expand(diff1, diff10, diff11);
@ -654,9 +678,9 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
hsad_3 += v_reinterpret_as_s32(diff11); hsad_3 += v_reinterpret_as_s32(diff11);
v_store(hsad + d, hsad_0); v_store(hsad + d, hsad_0);
v_store(hsad + d + 4, hsad_1); v_store(hsad + d + v_int32::nlanes, hsad_1);
v_store(hsad + d + 8, hsad_2); v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
v_store(hsad + d + 12, hsad_3); v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
} }
} }
#endif #endif
@ -696,22 +720,22 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
{ {
int lval = lptr[0]; int lval = lptr[0];
d = 0; d = 0;
#if CV_SIMD128 #if CV_SIMD
{ {
v_uint8x16 lv = v_setall_u8((uchar)lval); v_uint8 lv = vx_setall_u8((uchar)lval);
for( ; d <= ndisp - 16; d += 16 ) for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
{ {
v_uint8x16 rv = v_load(rptr + d); v_uint8 rv = vx_load(rptr + d);
v_int32x4 hsad_0 = v_load(hsad + d); v_int32 hsad_0 = vx_load(hsad + d);
v_int32x4 hsad_1 = v_load(hsad + d + 4); v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
v_int32x4 hsad_2 = v_load(hsad + d + 8); v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
v_int32x4 hsad_3 = v_load(hsad + d + 12); v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
v_uint8x16 cbs = v_load(cbuf_sub + d); v_uint8 cbs = vx_load(cbuf_sub + d);
v_uint8x16 diff = v_absdiff(lv, rv); v_uint8 diff = v_absdiff(lv, rv);
v_store(cbuf + d, diff); v_store(cbuf + d, diff);
v_uint16x8 diff0, diff1, cbs0, cbs1; v_uint16 diff0, diff1, cbs0, cbs1;
v_int32x4 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11; v_int32 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11;
v_expand(diff, diff0, diff1); v_expand(diff, diff0, diff1);
v_expand(cbs, cbs0, cbs1); v_expand(cbs, cbs0, cbs1);
v_expand(v_reinterpret_as_s16(diff0), diff00, diff01); v_expand(v_reinterpret_as_s16(diff0), diff00, diff01);
@ -719,19 +743,19 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01); v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01);
v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11); v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11);
v_int32x4 diff_0 = diff00 - cbs00; v_int32 diff_0 = diff00 - cbs00;
v_int32x4 diff_1 = diff01 - cbs01; v_int32 diff_1 = diff01 - cbs01;
v_int32x4 diff_2 = diff10 - cbs10; v_int32 diff_2 = diff10 - cbs10;
v_int32x4 diff_3 = diff11 - cbs11; v_int32 diff_3 = diff11 - cbs11;
hsad_0 += diff_0; hsad_0 += diff_0;
hsad_1 += diff_1; hsad_1 += diff_1;
hsad_2 += diff_2; hsad_2 += diff_2;
hsad_3 += diff_3; hsad_3 += diff_3;
v_store(hsad + d, hsad_0); v_store(hsad + d, hsad_0);
v_store(hsad + d + 4, hsad_1); v_store(hsad + d + v_int32::nlanes, hsad_1);
v_store(hsad + d + 8, hsad_2); v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
v_store(hsad + d + 12, hsad_3); v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
} }
} }
#endif #endif
@ -758,18 +782,18 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp ) for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
{ {
d = 0; d = 0;
#if CV_SIMD128 #if CV_SIMD
{ {
for( d = 0; d <= ndisp-8; d += 8 ) for( d = 0; d <= ndisp-2*v_int32::nlanes; d += 2*v_int32::nlanes )
{ {
v_int32x4 s0 = v_load(sad + d); v_int32 s0 = vx_load(sad + d);
v_int32x4 s1 = v_load(sad + d + 4); v_int32 s1 = vx_load(sad + d + v_int32::nlanes);
v_int32x4 t0 = v_load(hsad + d); v_int32 t0 = vx_load(hsad + d);
v_int32x4 t1 = v_load(hsad + d + 4); v_int32 t1 = vx_load(hsad + d + v_int32::nlanes);
s0 += t0; s0 += t0;
s1 += t1; s1 += t1;
v_store(sad + d, s0); v_store(sad + d, s0);
v_store(sad + d + 4, s1); v_store(sad + d + v_int32::nlanes, s1);
} }
} }
#endif #endif
@ -787,50 +811,31 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right,
hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp; hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp; hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
d = 0; d = 0;
#if CV_SIMD128 #if CV_SIMD
{ {
v_int32x4 d0_4 = v_int32x4(0, 1, 2, 3); v_int32 minsad4 = vx_setall_s32(INT_MAX);
v_int32x4 dd_4 = v_setall_s32(4); v_int32 mind4 = vx_setall_s32(0), d4 = d0_4;
v_int32x4 minsad4 = v_setall_s32(INT_MAX);
v_int32x4 mind4 = v_setall_s32(0), d4 = d0_4;
for( ; d <= ndisp - 8; d += 8 ) for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
{ {
v_int32x4 u0 = v_load(hsad_sub + d); v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d);
v_int32x4 u1 = v_load(hsad + d); v_store(sad + d, sad4);
mind4 = v_select(minsad4 > sad4, d4, mind4);
v_int32x4 v0 = v_load(hsad_sub + d + 4); minsad4 = v_min(minsad4, sad4);
v_int32x4 v1 = v_load(hsad + d + 4);
v_int32x4 usad4 = v_load(sad + d);
v_int32x4 vsad4 = v_load(sad + d + 4);
u1 -= u0;
v1 -= v0;
usad4 += u1;
vsad4 += v1;
v_store(sad + d, usad4);
v_store(sad + d + 4, vsad4);
v_int32x4 mask = minsad4 > usad4;
minsad4 = v_min(minsad4, usad4);
mind4 = v_select(mask, d4, mind4);
d4 += dd_4; d4 += dd_4;
mask = minsad4 > vsad4; sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes);
minsad4 = v_min(minsad4, vsad4); v_store(sad + d + v_int32::nlanes, sad4);
mind4 = v_select(mask, d4, mind4); mind4 = v_select(minsad4 > sad4, d4, mind4);
minsad4 = v_min(minsad4, sad4);
d4 += dd_4; d4 += dd_4;
} }
int CV_DECL_ALIGNED(16) minsad_buf[4], mind_buf[4]; int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes];
v_store(minsad_buf, minsad4); v_store(minsad_buf, minsad4);
v_store(mind_buf, mind4); v_store(mind_buf, mind4);
if(minsad_buf[0] < minsad || (minsad == minsad_buf[0] && mind_buf[0] < mind)) { minsad = minsad_buf[0]; mind = mind_buf[0]; } for (int i = 0; i < v_int32::nlanes; ++i)
if(minsad_buf[1] < minsad || (minsad == minsad_buf[1] && mind_buf[1] < mind)) { minsad = minsad_buf[1]; mind = mind_buf[1]; } if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; }
if(minsad_buf[2] < minsad || (minsad == minsad_buf[2] && mind_buf[2] < mind)) { minsad = minsad_buf[2]; mind = mind_buf[2]; }
if(minsad_buf[3] < minsad || (minsad == minsad_buf[3] && mind_buf[3] < mind)) { minsad = minsad_buf[3]; mind = mind_buf[3]; }
} }
#endif #endif
for( ; d < ndisp; d++ ) for( ; d < ndisp; d++ )
@ -1027,7 +1032,7 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody
Mat disp_i = disp->rowRange(row0, row1); Mat disp_i = disp->rowRange(row0, row1);
Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat(); Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
#if CV_SIMD128 #if CV_SIMD
if (useShorts) if (useShorts)
{ {
if( disp_i.type() == CV_16S) if( disp_i.type() == CV_16S)

Loading…
Cancel
Save