Use new Universal Intrinsic API.

Ref: pull/3572/head
Author: Liutong HAN, 2 years ago
Parent: 3af65e6cb1
Commit: b9a460b857
Changed files:
  1. modules/optflow/src/rlof/berlof_invoker.hpp (224 changed lines)
  2. modules/optflow/src/rlof/plk_invoker.hpp (74 changed lines)
  3. modules/optflow/src/rlof/rlof_invoker.hpp (132 changed lines)
  4. modules/optflow/src/rlof/rlof_invokerbase.hpp (76 changed lines)
  5. modules/optflow/src/rlof/rlof_localflow.cpp (8 changed lines)
  6. modules/rgbd/src/colored_tsdf.cpp (120 changed lines)
  7. modules/rgbd/src/fast_icp.cpp (48 changed lines)
  8. modules/rgbd/src/hash_tsdf.cpp (32 changed lines)
  9. modules/rgbd/src/tsdf.cpp (90 changed lines)
  10. modules/rgbd/src/tsdf_functions.cpp (83 changed lines)
  11. modules/rgbd/src/tsdf_functions.hpp (2 changed lines)
  12. modules/rgbd/src/utils.hpp (2 changed lines)
  13. modules/ximgproc/src/anisodiff.cpp (20 changed lines)
  14. modules/ximgproc/src/fgs_filter.cpp (58 changed lines)
  15. modules/xphoto/src/grayworld_white_balance.cpp (56 changed lines)
  16. modules/xphoto/src/learning_based_color_balance.cpp (82 changed lines)
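
The hunks below all apply the same mechanical rewrite: expressions built from the universal-intrinsic operator overloads (+, -, *, /, <<, >>, &, <, >, >=) are replaced by the equivalent named intrinsics (v_add, v_sub, v_mul, v_div, v_shl, v_shr, v_and, v_lt, v_gt, v_ge, ...) that the new Universal Intrinsic API exposes for every backend. A minimal sketch of the pattern, assuming OpenCV's opencv2/core/hal/intrin.hpp header and a CV_SIMD128-capable build; scaleAndMask() is an illustrative helper, not a function from the patched modules:

    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD128
    // Old style (operator overloads):  ((a << 5) - b) & mask
    // New style (named intrinsics), computing the same value:
    static inline cv::v_int16x8 scaleAndMask(const cv::v_int16x8& a,
                                             const cv::v_int16x8& b,
                                             const cv::v_int16x8& mask)
    {
        cv::v_int16x8 shifted = cv::v_shl<5>(a);        // was: a << 5
        cv::v_int16x8 diff    = cv::v_sub(shifted, b);  // was: shifted - b
        return cv::v_and(diff, mask);                   // was: diff & mask
    }
    #endif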

@@ -296,7 +296,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1; v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11; v_int16x8 t00, t01, t10, t11;
@@ -304,35 +304,35 @@ public:
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
//subpixel interpolation //subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = t0 >> (W_BITS - 5); t0 = v_shr(t0, W_BITS - 5);
t1 = t1 >> (W_BITS - 5); t1 = v_shr(t1, W_BITS - 5);
diff0 = v_pack(t0, t1); diff0 = v_pack(t0, t1);
// I*gain.x + gain.x // I*gain.x + gain.x
v_int16x8 diff[4] = v_int16x8 diff[4] =
{ {
((v11 << 5) - vI) & vmask, v_and(v_sub(v_shl<5>(v11), vI), vmask),
((v01 << 5) - vI) & vmask, v_and(v_sub(v_shl<5>(v01), vI), vmask),
((v10 << 5) - vI) & vmask, v_and(v_sub(v_shl<5>(v10), vI), vmask),
((v00 << 5) - vI) & vmask v_and(v_sub(v_shl<5>(v00), vI), vmask)
}; };
diff0 = diff0 - vI; diff0 = v_sub(diff0, vI);
diff0 = diff0 & vmask; diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale; v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack // since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1; v_int16x8 vset2, vset1;
// |It| < sigma1 ? // |It| < sigma1 ?
vset2 = vabs_diff < vparam1; vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ? // It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero; v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ? // sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0); vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1 // val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
@@ -342,7 +342,7 @@ public:
for (unsigned int mmi = 0; mmi < 4; mmi++) for (unsigned int mmi = 0; mmi < 4; mmi++)
{ {
// It == 0 ? |It| > sigma13 // It == 0 ? |It| > sigma13
diff0 = vset2 & diff[mmi]; diff0 = v_and(vset2, diff[mmi]);
// It == val ? sigma0 < |It| < sigma1 // It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0); diff0 = v_select(vset1, vtmp_param1, diff0);
@@ -350,16 +350,16 @@ public:
// diff = diff * sigma2 // diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1; v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift);
v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift);
diff0 = v_pack(diff0_0, diff0_1); diff0 = v_pack(diff0_0, diff0_1);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01); v_zip(diff2, diff1, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
} }
if (j == 0) if (j == 0)
{ {
@@ -387,8 +387,8 @@ public:
v_float32x4 fx = v_cvt_f32(t1); v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22 // A11 - A22
v_float32x4 fxtale = fx * vtale_0; v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = fy * vtale_0; v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy); vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy); vAxy = v_muladd(fx, fytale, vAxy);
@@ -402,8 +402,8 @@ public:
fx = v_cvt_f32(t1); fx = v_cvt_f32(t1);
// A11 - A22 // A11 - A22
fxtale = fx * vtale_1; fxtale = v_mul(fx, vtale_1);
fytale = fy * vtale_1; fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy); vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy); vAxy = v_muladd(fx, fytale, vAxy);
@@ -544,7 +544,7 @@ public:
float CV_DECL_ALIGNED(16) bbuf[4]; float CV_DECL_ALIGNED(16) bbuf[4];
for (int mmi = 0; mmi < 4; mmi++) for (int mmi = 0; mmi < 4; mmi++)
{ {
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b0[mmi] = bbuf[0] + bbuf[2]; _b0[mmi] = bbuf[0] + bbuf[2];
_b1[mmi] = bbuf[1] + bbuf[3]; _b1[mmi] = bbuf[1] + bbuf[3];
} }
@@ -960,7 +960,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1; v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11; v_int16x8 t00, t01, t10, t11;
@@ -968,38 +968,38 @@ public:
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
//subpixel interpolation //subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = t0 >> (W_BITS - 5); t0 = v_shr(t0, W_BITS - 5);
t1 = t1 >> (W_BITS - 5); t1 = v_shr(t1, W_BITS - 5);
diff0 = v_pack(t0, t1); diff0 = v_pack(t0, t1);
// I*gain.x + gain.x // I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1); v_mul_expand(vI, vgain_value, t0, t1);
v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI);
v_int16x8 diff[4] = v_int16x8 diff[4] =
{ {
((v11 << 5) + diff_value) & vmask, v_and(v_add(v_shl<5>(v11), diff_value), vmask),
((v01 << 5) + diff_value) & vmask, v_and(v_add(v_shl<5>(v01), diff_value), vmask),
((v10 << 5) + diff_value) & vmask, v_and(v_add(v_shl<5>(v10), diff_value), vmask),
((v00 << 5) + diff_value) & vmask v_and(v_add(v_shl<5>(v00), diff_value), vmask)
}; };
diff0 = diff0 + diff_value; diff0 = v_add(diff0, diff_value);
diff0 = diff0 & vmask; diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale; v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack // since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1; v_int16x8 vset2, vset1;
// |It| < sigma1 ? // |It| < sigma1 ?
vset2 = vabs_diff < vparam1; vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ? // It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero; v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ? // sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0); vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1 // val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
@@ -1009,7 +1009,7 @@ public:
for (unsigned int mmi = 0; mmi < 4; mmi++) for (unsigned int mmi = 0; mmi < 4; mmi++)
{ {
// It == 0 ? |It| > sigma13 // It == 0 ? |It| > sigma13
diff0 = vset2 & diff[mmi]; diff0 = v_and(vset2, diff[mmi]);
// It == val ? sigma0 < |It| < sigma1 // It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0); diff0 = v_select(vset1, vtmp_param1, diff0);
@@ -1017,22 +1017,22 @@ public:
// diff = diff * sigma2 // diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1; v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift);
v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift);
diff0 = v_pack(diff0_0, diff0_1); diff0 = v_pack(diff0_0, diff0_1);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01); v_zip(diff2, diff1, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3[mmi] += v_cvt_f32(diff0_0); vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0));
vqb3[mmi] += v_cvt_f32(diff0_1); vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1));
} }
if (j == 0) if (j == 0)
{ {
@@ -1060,29 +1060,29 @@ public:
v_float32x4 fx = v_cvt_f32(t1); v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22 // A11 - A22
v_float32x4 fxtale = fx * vtale_0; v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = fy * vtale_0; v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy); vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy); vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx); vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy // sumIx und sumIy
vsumIx += fxtale; vsumIx = v_add(vsumIx, fxtale);
vsumIy += fytale; vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale; vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 += vI_ps * fytale; vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI // sumI
v_float32x4 vI_tale = vI_ps * vtale_0; v_float32x4 vI_tale = v_mul(vI_ps, vtale_0);
vsumI += vI_tale; vsumI = v_add(vsumI, vI_tale);
// sumW // sumW
vsumW += vtale_0; vsumW = v_add(vsumW, vtale_0);
// sumDI // sumDI
vsumDI += vI_ps * vI_tale; vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0); v_expand(v01, t1, t0);
@@ -1092,29 +1092,29 @@ public:
fx = v_cvt_f32(t1); fx = v_cvt_f32(t1);
// A11 - A22 // A11 - A22
fxtale = fx * vtale_1; fxtale = v_mul(fx, vtale_1);
fytale = fy * vtale_1; fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy); vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy); vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx); vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy // sumIx und sumIy
vsumIx += fxtale; vsumIx = v_add(vsumIx, fxtale);
vsumIy += fytale; vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale; vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 += vI_ps * fytale; vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI // sumI
vI_tale = vI_ps * vtale_1; vI_tale = v_mul(vI_ps, vtale_1);
vsumI += vI_tale; vsumI = v_add(vsumI, vI_tale);
// sumW // sumW
vsumW += vtale_1; vsumW = v_add(vsumW, vtale_1);
// sumDI // sumDI
vsumDI += vI_ps * vI_tale; vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
} }
} }
@@ -1304,7 +1304,7 @@ public:
float CV_DECL_ALIGNED(16) bbuf[4]; float CV_DECL_ALIGNED(16) bbuf[4];
for(int mmi = 0; mmi < 4; mmi++) for(int mmi = 0; mmi < 4; mmi++)
{ {
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b0[mmi] = bbuf[0] + bbuf[2]; _b0[mmi] = bbuf[0] + bbuf[2];
_b1[mmi] = bbuf[1] + bbuf[3]; _b1[mmi] = bbuf[1] + bbuf[3];
_b2[mmi] = v_reduce_sum(vqb2[mmi]); _b2[mmi] = v_reduce_sum(vqb2[mmi]);
@@ -1655,14 +1655,14 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int16x8 diff[4] = v_int16x8 diff[4] =
{ {
((v00 << 5) - vI) & vmask, v_and(v_sub(v_shl<5>(v00), vI), vmask),
((v01 << 5) - vI) & vmask, v_and(v_sub(v_shl<5>(v01), vI), vmask),
((v10 << 5) - vI) & vmask, v_and(v_sub(v_shl<5>(v10), vI), vmask),
((v11 << 5) - vI) & vmask, v_and(v_sub(v_shl<5>(v11), vI), vmask),
}; };
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
@@ -1672,8 +1672,8 @@ public:
v_zip(diff[mmi], diff[mmi], diff1, diff0); v_zip(diff[mmi], diff[mmi], diff1, diff0);
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff1, diff0, v00, v01); v_zip(diff1, diff0, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
} }
} }
#else #else
@@ -1704,7 +1704,7 @@ public:
float CV_DECL_ALIGNED(16) bbuf[4]; float CV_DECL_ALIGNED(16) bbuf[4];
for (int mmi = 0; mmi < 4; mmi++) for (int mmi = 0; mmi < 4; mmi++)
{ {
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b1[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[0] + bbuf[2];
_b2[mmi] = bbuf[1] + bbuf[3]; _b2[mmi] = bbuf[1] + bbuf[3];
} }
@@ -2071,7 +2071,7 @@ namespace radial {
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1; v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11; v_int16x8 t00, t01, t10, t11;
@@ -2079,21 +2079,21 @@ namespace radial {
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
//subpixel interpolation //subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = t0 >> (W_BITS - 5); t0 = v_shr(t0, W_BITS - 5);
t1 = t1 >> (W_BITS - 5); t1 = v_shr(t1, W_BITS - 5);
diff0 = v_pack(t0, t1); diff0 = v_pack(t0, t1);
// I*gain.x + gain.x // I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1); v_mul_expand(vI, vgain_value, t0, t1);
v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI);
v_int16x8 diff[4] = v_int16x8 diff[4] =
{ {
((v11 << 5) + diff_value) & vmask, v_and(v_add(v_shl<5>(v11), diff_value), vmask),
((v01 << 5) + diff_value) & vmask, v_and(v_add(v_shl<5>(v01), diff_value), vmask),
((v10 << 5) + diff_value) & vmask, v_and(v_add(v_shl<5>(v10), diff_value), vmask),
((v00 << 5) + diff_value) & vmask v_and(v_add(v_shl<5>(v00), diff_value), vmask)
}; };
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
@@ -2109,14 +2109,14 @@ namespace radial {
v_zip(diff[mmi], diff[mmi], diff2, diff1); v_zip(diff[mmi], diff[mmi], diff2, diff1);
v_zip(diff2, diff1, v00, v01); v_zip(diff2, diff1, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3[mmi] += v_cvt_f32(diff0_0); vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0));
vqb3[mmi] += v_cvt_f32(diff0_1); vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1));
} }
if (j == 0) if (j == 0)
{ {
@@ -2133,17 +2133,17 @@ namespace radial {
vAxx = v_muladd(fx, fx, vAxx); vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy // sumIx und sumIy
vsumIx += fx; vsumIx = v_add(vsumIx, fx);
vsumIy += fy; vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx; vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 += vI_ps * fy; vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI // sumI
vsumI += vI_ps; vsumI = v_add(vsumI, vI_ps);
// sumDI // sumDI
vsumDI += vI_ps * vI_ps; vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0); v_expand(v01, t1, t0);
@@ -2158,17 +2158,17 @@ namespace radial {
vAxx = v_muladd(fx, fx, vAxx); vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy // sumIx und sumIy
vsumIx += fx; vsumIx = v_add(vsumIx, fx);
vsumIy += fy; vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx; vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 += vI_ps * fy; vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI // sumI
vsumI += vI_ps; vsumI = v_add(vsumI, vI_ps);
// sumDI // sumDI
vsumDI += vI_ps * vI_ps; vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
} }
} }
@@ -2299,7 +2299,7 @@ namespace radial {
float CV_DECL_ALIGNED(16) bbuf[4]; float CV_DECL_ALIGNED(16) bbuf[4];
for (int mmi = 0; mmi < 4; mmi++) for (int mmi = 0; mmi < 4; mmi++)
{ {
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b0[mmi] = bbuf[0] + bbuf[2]; _b0[mmi] = bbuf[0] + bbuf[2];
_b1[mmi] = bbuf[1] + bbuf[3]; _b1[mmi] = bbuf[1] + bbuf[3];
_b2[mmi] = v_reduce_sum(vqb2[mmi]); _b2[mmi] = v_reduce_sum(vqb2[mmi]);

@@ -229,7 +229,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1; v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11; v_int16x8 t00, t01, t10, t11;
@@ -237,17 +237,17 @@ public:
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
//subpixel interpolation //subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = t0 >> (W_BITS - 5); t0 = v_shr(t0, W_BITS - 5);
t1 = t1 >> (W_BITS - 5); t1 = v_shr(t1, W_BITS - 5);
// diff = J - I // diff = J - I
diff0 = v_pack(t0, t1) - vI; diff0 = v_sub(v_pack(t0, t1), vI);
// I*gain.x + gain.x // I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1); v_mul_expand(vI, vgain_value, t0, t1);
diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value);
diff0 = diff0 & vmask; diff0 = v_and(diff0, vmask);
v_zip(diff0, diff0, diff2, diff1); v_zip(diff0, diff0, diff2, diff1);
v_int32x4 diff0_0; v_int32x4 diff0_0;
@@ -259,16 +259,16 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01); v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10)); vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 += v_cvt_f32(v_dotprod(v01, v11)); vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
v_int32x4 vI0, vI1; v_int32x4 vI0, vI1;
v_expand(vI, vI0, vI1); v_expand(vI, vI0, vI1);
vqb2 += v_cvt_f32(diff0_0 * vI0); vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2 += v_cvt_f32(diff0_1 * vI1); vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3 += v_cvt_f32(diff0_0); vqb3 = v_add(vqb3, v_cvt_f32(diff0_0));
vqb3 += v_cvt_f32(diff0_1); vqb3 = v_add(vqb3, v_cvt_f32(diff0_1));
if (j == 0) if (j == 0)
{ {
@@ -285,17 +285,17 @@ public:
vAxx = v_muladd(fx, fx, vAxx); vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy // sumIx und sumIy
vsumIx += fx; vsumIx = v_add(vsumIx, fx);
vsumIy += fy; vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx; vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 += vI_ps * fy; vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI // sumI
vsumI += vI_ps; vsumI = v_add(vsumI, vI_ps);
// sumDI // sumDI
vsumDI += vI_ps * vI_ps; vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0); v_expand(v01, t1, t0);
@@ -309,17 +309,17 @@ public:
vAxx = v_muladd(fx, fx, vAxx); vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy // sumIx und sumIy
vsumIx += fx; vsumIx = v_add(vsumIx, fx);
vsumIy += fy; vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx; vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 += vI_ps * fy; vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI // sumI
vsumI += vI_ps; vsumI = v_add(vsumI, vI_ps);
// sumDI // sumDI
vsumDI += vI_ps * vI_ps; vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
} }
} }
#else #else
@@ -388,7 +388,7 @@ public:
#if CV_SIMD128 #if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4]; float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1); v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 = bbuf[0] + bbuf[2]; b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3]; b2 = bbuf[1] + bbuf[3];
b3 = v_reduce_sum(vqb2); b3 = v_reduce_sum(vqb2);
@@ -696,19 +696,19 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1; v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11; v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01); v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = t0 >> (W_BITS - 5); t0 = v_shr(t0, W_BITS - 5);
t1 = t1 >> (W_BITS - 5); t1 = v_shr(t1, W_BITS - 5);
diff0 = v_pack(t0, t1) - diff0; diff0 = v_sub(v_pack(t0, t1), diff0);
diff0 = diff0 & vmask; diff0 = v_and(diff0, vmask);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
@@ -717,8 +717,8 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01); v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10)); vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 += v_cvt_f32(v_dotprod(v01, v11)); vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
} }
#else #else
for( ; x < winSize.width*cn; x++, dIptr += 2 ) for( ; x < winSize.width*cn; x++, dIptr += 2 )
@@ -737,7 +737,7 @@ public:
#if CV_SIMD128 #if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4]; float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1); v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 = bbuf[0] + bbuf[2]; b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3]; b2 = bbuf[1] + bbuf[3];
#endif #endif

@@ -246,35 +246,35 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1; v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11; v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01); v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = t0 >> (W_BITS - 5); t0 = v_shr(t0, W_BITS - 5);
t1 = t1 >> (W_BITS - 5); t1 = v_shr(t1, W_BITS - 5);
diff0 = v_pack(t0, t1) - diff0; diff0 = v_sub(v_pack(t0, t1), diff0);
diff0 = diff0 & vmask; diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale; v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack // since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1; v_int16x8 vset2, vset1;
// |It| < sigma1 ? // |It| < sigma1 ?
vset2 = vabs_diff < vparam1; vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ? // It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero; v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ? // sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0); vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1 // val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
// It == 0 ? |It| > sigma13 // It == 0 ? |It| > sigma13
diff0 = vset2 & diff0; diff0 = v_and(vset2, diff0);
// It == val ? sigma0 < |It| < sigma1 // It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0); diff0 = v_select(vset1, vtmp_param1, diff0);
@@ -282,7 +282,7 @@ public:
// diff = diff * sigma2 // diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1; v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
diff0 = v_pack(diff_int_0 >> s2bitShift, diff_int_1 >> s2bitShift); diff0 = v_pack(v_shr(diff_int_0, s2bitShift), v_shr(diff_int_1, s2bitShift));
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
@@ -290,8 +290,8 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01); v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10)); vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 += v_cvt_f32(v_dotprod(v01, v11)); vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
if (j == 0) if (j == 0)
{ {
v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1; v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1;
@@ -316,8 +316,8 @@ public:
v_float32x4 fx = v_cvt_f32(t1); v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22 // A11 - A22
v_float32x4 fxtale = fx * vtale_0; v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = fy * vtale_0; v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy); vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy); vAxy = v_muladd(fx, fytale, vAxy);
@@ -330,8 +330,8 @@ public:
fx = v_cvt_f32(t1); fx = v_cvt_f32(t1);
// A11 - A22 // A11 - A22
fxtale = fx * vtale_1; fxtale = v_mul(fx, vtale_1);
fytale = fy * vtale_1; fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy); vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy); vAxy = v_muladd(fx, fytale, vAxy);
@@ -431,7 +431,7 @@ public:
#if CV_SIMD128 #if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4]; float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1); v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 += bbuf[0] + bbuf[2]; b1 += bbuf[0] + bbuf[2];
b2 += bbuf[1] + bbuf[3]; b2 += bbuf[1] + bbuf[3];
#endif #endif
@@ -769,7 +769,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1; v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11; v_int16x8 t00, t01, t10, t11;
@@ -777,33 +777,33 @@ public:
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
//subpixel interpolation //subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = t0 >> (W_BITS - 5); t0 = v_shr(t0, W_BITS - 5);
t1 = t1 >> (W_BITS - 5); t1 = v_shr(t1, W_BITS - 5);
// diff = J - I // diff = J - I
diff0 = v_pack(t0, t1) - vI; diff0 = v_sub(v_pack(t0, t1), vI);
// I*gain.x + gain.x // I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1); v_mul_expand(vI, vgain_value, t0, t1);
diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value);
diff0 = diff0 & vmask; diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale; v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack // since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1; v_int16x8 vset2, vset1;
// |It| < sigma1 ? // |It| < sigma1 ?
vset2 = vabs_diff < vparam1; vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ? // It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero; v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ? // sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0); vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1 // val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
// It == 0 ? |It| > sigma13 // It == 0 ? |It| > sigma13
diff0 = vset2 & diff0; diff0 = v_and(vset2, diff0);
// It == val ? sigma0 < |It| < sigma1 // It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0); diff0 = v_select(vset1, vtmp_param1, diff0);
@@ -811,8 +811,8 @@ public:
// diff = diff * sigma2 // diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1; v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift);
v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift);
diff0 = v_pack(diff0_0, diff0_1); diff0 = v_pack(diff0_0, diff0_1);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
@@ -821,16 +821,16 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01); v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10)); vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 += v_cvt_f32(v_dotprod(v01, v11)); vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
v_int32x4 vI0, vI1; v_int32x4 vI0, vI1;
v_expand(vI, vI0, vI1); v_expand(vI, vI0, vI1);
vqb2 += v_cvt_f32(diff0_0 * vI0); vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2 += v_cvt_f32(diff0_1 * vI1); vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3 += v_cvt_f32(diff0_0); vqb3 = v_add(vqb3, v_cvt_f32(diff0_0));
vqb3 += v_cvt_f32(diff0_1); vqb3 = v_add(vqb3, v_cvt_f32(diff0_1));
if (j == 0) if (j == 0)
{ {
@@ -858,29 +858,29 @@ public:
v_float32x4 fx = v_cvt_f32(t1); v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22 // A11 - A22
v_float32x4 fxtale = fx * vtale_0; v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = fy * vtale_0; v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy); vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy); vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx); vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy // sumIx und sumIy
vsumIx += fxtale; vsumIx = v_add(vsumIx, fxtale);
vsumIy += fytale; vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale; vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 += vI_ps * fytale; vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI // sumI
v_float32x4 vI_tale = vI_ps * vtale_0; v_float32x4 vI_tale = v_mul(vI_ps, vtale_0);
vsumI += vI_tale; vsumI = v_add(vsumI, vI_tale);
// sumW // sumW
vsumW += vtale_0; vsumW = v_add(vsumW, vtale_0);
// sumDI // sumDI
vsumDI += vI_ps * vI_tale; vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0); v_expand(v01, t1, t0);
@@ -890,29 +890,29 @@ public:
fx = v_cvt_f32(t1); fx = v_cvt_f32(t1);
// A11 - A22 // A11 - A22
fxtale = fx * vtale_1; fxtale = v_mul(fx, vtale_1);
fytale = fy * vtale_1; fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy); vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy); vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx); vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy // sumIx und sumIy
vsumIx += fxtale; vsumIx = v_add(vsumIx, fxtale);
vsumIy += fytale; vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale; vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 += vI_ps * fytale; vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI // sumI
vI_tale = vI_ps * vtale_1; vI_tale = v_mul(vI_ps, vtale_1);
vsumI += vI_tale; vsumI = v_add(vsumI, vI_tale);
// sumW // sumW
vsumW += vtale_1; vsumW = v_add(vsumW, vtale_1);
// sumDI // sumDI
vsumDI += vI_ps * vI_tale; vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
} }
} }
#else #else
@@ -1017,7 +1017,7 @@ public:
} }
#if CV_SIMD128 #if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4]; float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1); v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 = bbuf[0] + bbuf[2]; b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3]; b2 = bbuf[1] + bbuf[3];
b3 = v_reduce_sum(vqb2); b3 = v_reduce_sum(vqb2);
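
The rlof_invoker.hpp hunks above also convert the comparison and masking operators: <, > and & on vectors become v_lt, v_gt and v_and, while v_select keeps its existing function form. A compressed sketch of that pattern, again assuming opencv2/core/hal/intrin.hpp and CV_SIMD128; robustResidual() is a made-up name and its clamping rule is simplified, not the module's exact formula:

    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD128
    static inline cv::v_int16x8 robustResidual(const cv::v_int16x8& it,
                                               const cv::v_int16x8& sigma0,
                                               const cv::v_int16x8& sigma1)
    {
        using namespace cv;
        v_int16x8 vabs  = v_reinterpret_as_s16(v_abs(it));    // v_abs yields unsigned lanes
        v_int16x8 inner = v_lt(vabs, sigma1);                  // was: vabs < sigma1
        v_int16x8 mid   = v_and(inner, v_gt(vabs, sigma0));    // was: inner & (vabs > sigma0)
        // keep the residual where |it| < sigma1, clamp it to sigma1 in the middle band
        return v_select(mid, sigma1, v_and(inner, it));
    }
    #endif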

@@ -71,15 +71,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
for (; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) for (; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2)
{ {
v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32; v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32);
v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32);
if (x + 4 > winSize.width) if (x + 4 > winSize.width)
{ {
vmask0 = vmask0 & vmask_border_0; vmask0 = v_and(vmask0, vmask_border_0);
} }
if (x + 8 > winSize.width) if (x + 8 > winSize.width)
{ {
vmask1 = vmask1 & vmask_border_1; vmask1 = v_and(vmask1, vmask_border_1);
} }
v_int32x4 t0, t1; v_int32x4 t0, t1;
@@ -91,10 +91,10 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01); v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = t0 >> (W_BITS - 5) & vmask0; t0 = v_and(v_shr(t0, W_BITS - 5), vmask0);
t1 = t1 >> (W_BITS - 5) & vmask1; t1 = v_and(v_shr(t1, W_BITS - 5), vmask1);
v_store(Iptr + x, v_pack(t0, t1)); v_store(Iptr + x, v_pack(t0, t1));
v00 = v_reinterpret_as_s16(v_load(dsrc)); v00 = v_reinterpret_as_s16(v_load(dsrc));
@@ -105,12 +105,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01); v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = t0 >> W_BITS; t0 = v_shr(t0, W_BITS);
t1 = t1 >> W_BITS; t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask0); v00 = v_and(v00, v_reinterpret_as_s16(vmask0));
v_store(dIptr, v00); v_store(dIptr, v00);
v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2)); v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2));
@@ -121,12 +121,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01); v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = t0 >> W_BITS; t0 = v_shr(t0, W_BITS);
t1 = t1 >> W_BITS; t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask1); v00 = v_and(v00, v_reinterpret_as_s16(vmask1));
v_store(dIptr + 4 * 2, v00); v_store(dIptr + 4 * 2, v00);
} }
#else #else
@@ -187,15 +187,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
#if CV_SIMD128 #if CV_SIMD128
for (int x = 0; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) for (int x = 0; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2)
{ {
v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32; v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32);
v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32);
if (x + 4 > winSize.width) if (x + 4 > winSize.width)
{ {
vmask0 = vmask0 & vmask_border0; vmask0 = v_and(vmask0, vmask_border0);
} }
if (x + 8 > winSize.width) if (x + 8 > winSize.width)
{ {
vmask1 = vmask1 & vmask_border1; vmask1 = v_and(vmask1, vmask_border1);
} }
v_int32x4 t0, t1; v_int32x4 t0, t1;
@@ -207,12 +207,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01); v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = t0 >> (W_BITS - 5); t0 = v_shr(t0, W_BITS - 5);
t1 = t1 >> (W_BITS - 5); t1 = v_shr(t1, W_BITS - 5);
t0 = t0 & vmask0; t0 = v_and(t0, vmask0);
t1 = t1 & vmask1; t1 = v_and(t1, vmask1);
v_store(Iptr + x, v_pack(t0, t1)); v_store(Iptr + x, v_pack(t0, t1));
v00 = v_reinterpret_as_s16(v_load(dsrc)); v00 = v_reinterpret_as_s16(v_load(dsrc));
@@ -223,12 +223,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01); v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = t0 >> W_BITS; t0 = v_shr(t0, W_BITS);
t1 = t1 >> W_BITS; t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask0); v00 = v_and(v00, v_reinterpret_as_s16(vmask0));
v_store(dIptr, v00); v_store(dIptr, v00);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
@@ -249,12 +249,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01); v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11); v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = t0 >> W_BITS; t0 = v_shr(t0, W_BITS);
t1 = t1 >> W_BITS; t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask1); v00 = v_and(v00, v_reinterpret_as_s16(vmask1));
v_store(dIptr + 4 * 2, v00); v_store(dIptr + 4 * 2, v00);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));

@@ -52,8 +52,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
v_int16x8 t1 = s2 - s0; v_int16x8 t1 = v_sub(s2, s0);
v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10));
v_store(trow0 + x, t0); v_store(trow0 + x, t0);
v_store(trow1 + x, t1); v_store(trow1 + x, t1);
@@ -90,8 +90,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
v_int16x8 s3 = v_load(trow1 + x); v_int16x8 s3 = v_load(trow1 + x);
v_int16x8 s4 = v_load(trow1 + x + cn); v_int16x8 s4 = v_load(trow1 + x + cn);
v_int16x8 t0 = s1 - s0; v_int16x8 t0 = v_sub(s1, s0);
v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10));
v_store_interleave((drow + x * 2), t0, t1); v_store_interleave((drow + x * 2), t0, t1);
} }
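
The colored_tsdf.cpp hunks that follow apply the same operator-to-function rewrite to float vectors (v_add, v_sub, v_mul, v_div) and make one further change: lane 0 is now read with the free function v_get0(vec) instead of the vec.get0() member used before. A minimal sketch, assuming opencv2/core/hal/intrin.hpp and CV_SIMD128; lerp0() is an illustrative helper, not code from the volume classes:

    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD128
    // Linear interpolation between two vectors, returning lane 0 of the result.
    static inline float lerp0(const cv::v_float32x4& a, const cv::v_float32x4& b, float t)
    {
        using namespace cv;
        // was: v_float32x4 r = a + v_setall_f32(t) * (b - a);  return r.get0();
        v_float32x4 r = v_add(a, v_mul(v_setall_f32(t), v_sub(b, a)));
        return v_get0(r);
    }
    #endif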

@@ -194,21 +194,21 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
{ {
// tx, ty, tz = floor(p) // tx, ty, tz = floor(p)
v_int32x4 ip = v_floor(p); v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip); v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = t.get0(); float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0(); float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0(); float tz = v_get0(t);
int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>(); const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
int ix = ip.get0(); int ix = v_get0(ip);
ip = v_rotate_right<1>(ip); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); int iy = v_get0(ip);
ip = v_rotate_right<1>(ip); ip = v_rotate_right<1>(ip);
int iz = ip.get0(); int iz = v_get0(ip);
int coordBase = ix * xdim + iy * ydim + iz * zdim; int coordBase = ix * xdim + iy * ydim + iz * zdim;
@@ -218,15 +218,15 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6])); v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6]));
v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7])); v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7]));
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx; v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v0_1.get0(); float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0(); float v1 = v_get0(v0_1);
return v0 + tx * (v1 - v0); return v0 + tx * (v1 - v0);
} }
@@ -276,27 +276,27 @@ inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const
inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
{ {
if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) ||
v_check_any(p >= v_float32x4((float)(volResolution.x - 2), v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2),
(float)(volResolution.y - 2), (float)(volResolution.y - 2),
(float)(volResolution.z - 2), 1.f)) (float)(volResolution.z - 2), 1.f)))
) )
return nanv; return nanv;
v_int32x4 ip = v_floor(p); v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip); v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = t.get0(); float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0(); float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0(); float tz = v_get0(t);
const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>(); const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
int ix = ip.get0(); ip = v_rotate_right<1>(ip); int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip); int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = ip.get0(); int iz = v_get0(ip);
int coordBase = ix * xdim + iy * ydim + iz * zdim; int coordBase = ix * xdim + iy * ydim + iz * zdim;
@@ -314,23 +314,23 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) co
v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]); v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]);
v_float32x4 v1357(vx[1], vx[3], vx[5], vx[7]); v_float32x4 v1357(vx[1], vx[3], vx[5], vx[7]);
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx; v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v0_1.get0(); float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0(); float v1 = v_get0(v0_1);
nv = v0 + tx * (v1 - v0); nv = v0 + tx * (v1 - v0);
} }
v_float32x4 n = v_load_aligned(an); v_float32x4 n = v_load_aligned(an);
v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n * n))); v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n))));
return Norm.get0() < 0.0001f ? nanv : n / Norm; return v_get0(Norm) < 0.0001f ? nanv : v_div(n, Norm);
} }
#else #else
inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& p) const inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& p) const
@@ -388,15 +388,15 @@ inline float ColoredTSDFVolumeCPU::interpolateColor(float tx, float ty, float tz
v_float32x4 v0246, v1357; v_float32x4 v0246, v1357;
v_load_deinterleave(vx, v0246, v1357); v_load_deinterleave(vx, v0246, v1357);
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx; v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v0_1.get0(); float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0(); float v1 = v_get0(v0_1);
return v0 + tx * (v1 - v0); return v0 + tx * (v1 - v0);
} }
@@ -427,10 +427,10 @@ inline Point3f ColoredTSDFVolumeCPU::getColorVoxel(const Point3f& _p) const
} }
inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) const inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) const
{ {
if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) ||
v_check_any(p >= v_float32x4((float)(volResolution.x - 2), v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2),
(float)(volResolution.y - 2), (float)(volResolution.y - 2),
(float)(volResolution.z - 2), 1.f)) (float)(volResolution.z - 2), 1.f)))
) )
return nanv; return nanv;
@@ -439,9 +439,9 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con
const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>(); const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
int ix = ip.get0(); ip = v_rotate_right<1>(ip); int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip); int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = ip.get0(); int iz = v_get0(ip);
int coordBase = ix * xdim + iy * ydim + iz * zdim; int coordBase = ix * xdim + iy * ydim + iz * zdim;
float CV_DECL_ALIGNED(16) rgb[4]; float CV_DECL_ALIGNED(16) rgb[4];
@@ -456,12 +456,12 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con
} }
v_float32x4 vsi(voxelSizeInv, voxelSizeInv, voxelSizeInv, voxelSizeInv); v_float32x4 vsi(voxelSizeInv, voxelSizeInv, voxelSizeInv, voxelSizeInv);
v_float32x4 ptVox = p * vsi; v_float32x4 ptVox = v_mul(p, vsi);
v_int32x4 iptVox = v_floor(ptVox); v_int32x4 iptVox = v_floor(ptVox);
v_float32x4 t = ptVox - v_cvt_f32(iptVox); v_float32x4 t = v_sub(ptVox, v_cvt_f32(iptVox));
float tx = t.get0(); t = v_rotate_right<1>(t); float tx = v_get0(t); t = v_rotate_right<1>(t);
float ty = t.get0(); t = v_rotate_right<1>(t); float ty = v_get0(t); t = v_rotate_right<1>(t);
float tz = t.get0(); float tz = v_get0(t);
rgb[0] = interpolateColor(tx, ty, tz, r); rgb[0] = interpolateColor(tx, ty, tz, r);
rgb[1] = interpolateColor(tx, ty, tz, g); rgb[1] = interpolateColor(tx, ty, tz, g);
rgb[2] = interpolateColor(tx, ty, tz, b); rgb[2] = interpolateColor(tx, ty, tz, b);
@@ -583,21 +583,21 @@ struct ColorRaycastInvoker : ParallelLoopBody
// get direction through pixel in volume space: // get direction through pixel in volume space:
// 1. reproject (x, y) on projecting plane where z = 1.f // 1. reproject (x, y) on projecting plane where z = 1.f
v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy) * vfxy; v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy);
planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f)); planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f));
// 2. rotate to volume space // 2. rotate to volume space
planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32()); planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32());
// 3. normalize // 3. normalize
v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed * planed))); v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed))));
v_float32x4 dir = planed * invNorm; v_float32x4 dir = v_mul(planed, invNorm);
// compute intersection of ray with all six bbox planes // compute intersection of ray with all six bbox planes
v_float32x4 rayinv = v_setall_f32(1.f) / dir; v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir);
// div by zero should be eliminated by these products // div by zero should be eliminated by these products
v_float32x4 tbottom = rayinv * (boxDown - orig); v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig));
v_float32x4 ttop = rayinv * (boxUp - orig); v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig));
// re-order intersections to find smallest and largest on each axis // re-order intersections to find smallest and largest on each axis
v_float32x4 minAx = v_min(ttop, tbottom); v_float32x4 minAx = v_min(ttop, tbottom);
@ -618,14 +618,14 @@ struct ColorRaycastInvoker : ParallelLoopBody
if (tmin < tmax) if (tmin < tmax)
{ {
// interpolation optimized a little // interpolation optimized a little
orig *= invVoxelSize; orig = v_mul(orig, invVoxelSize);
dir *= invVoxelSize; dir = v_mul(dir, invVoxelSize);
int xdim = volume.volDims[0]; int xdim = volume.volDims[0];
int ydim = volume.volDims[1]; int ydim = volume.volDims[1];
int zdim = volume.volDims[2]; int zdim = volume.volDims[2];
v_float32x4 rayStep = dir * v_setall_f32(tstep); v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep));
v_float32x4 next = (orig + dir * v_setall_f32(tmin)); v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin))));
float f = volume.interpolateVoxel(next), fnext = f; float f = volume.interpolateVoxel(next), fnext = f;
//raymarch //raymarch
@ -633,11 +633,11 @@ struct ColorRaycastInvoker : ParallelLoopBody
int nSteps = cvFloor((tmax - tmin) / tstep); int nSteps = cvFloor((tmax - tmin) / tstep);
for (; steps < nSteps; steps++) for (; steps < nSteps; steps++)
{ {
next += rayStep; next = v_add(next, rayStep);
v_int32x4 ip = v_round(next); v_int32x4 ip = v_round(next);
int ix = ip.get0(); ip = v_rotate_right<1>(ip); int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip); int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = ip.get0(); int iz = v_get0(ip);
int coord = ix * xdim + iy * ydim + iz * zdim; int coord = ix * xdim + iy * ydim + iz * zdim;
fnext = tsdfToFloat(volume.volume.at<RGBTsdfVoxel>(coord).tsdf); fnext = tsdfToFloat(volume.volume.at<RGBTsdfVoxel>(coord).tsdf);
@ -657,7 +657,7 @@ struct ColorRaycastInvoker : ParallelLoopBody
// linearly interpolate t between two f values // linearly interpolate t between two f values
if (f > 0.f && fnext < 0.f) if (f > 0.f && fnext < 0.f)
{ {
v_float32x4 tp = next - rayStep; v_float32x4 tp = v_sub(next, rayStep);
float ft = volume.interpolateVoxel(tp); float ft = volume.interpolateVoxel(tp);
float ftdt = volume.interpolateVoxel(next); float ftdt = volume.interpolateVoxel(next);
float ts = tmin + tstep * (steps - ft / (ftdt - ft)); float ts = tmin + tstep * (steps - ft / (ftdt - ft));
@ -665,7 +665,7 @@ struct ColorRaycastInvoker : ParallelLoopBody
// avoid division by zero // avoid division by zero
if (!cvIsNaN(ts) && !cvIsInf(ts)) if (!cvIsNaN(ts) && !cvIsInf(ts))
{ {
v_float32x4 pv = (orig + dir * v_setall_f32(ts)); v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts))));
v_float32x4 nv = volume.getNormalVoxel(pv); v_float32x4 nv = volume.getNormalVoxel(pv);
v_float32x4 cv = volume.getColorVoxel(pv); v_float32x4 cv = volume.getColorVoxel(pv);
@ -675,9 +675,7 @@ struct ColorRaycastInvoker : ParallelLoopBody
//convert pv and nv to camera space //convert pv and nv to camera space
normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32());
// interpolation optimized a little // interpolation optimized a little
point = v_matmuladd(pv * v_float32x4(volume.voxelSize, point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)),
volume.voxelSize,
volume.voxelSize, 1.f),
volRot0, volRot1, volRot2, volTrans); volRot0, volRot1, volRot2, volTrans);
} }
} }

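The colored-TSDF changes above follow one mechanical pattern: arithmetic operators and .get0() on vector registers become named calls (v_add, v_mul, v_sub, v_div, v_get0). Below is a minimal, self-contained sketch of the trilinear-interpolation idiom in its new form; it assumes only <opencv2/core/hal/intrin.hpp> and example corner values, and is not the library code itself.

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

using namespace cv;

// vx holds the 8 corner values of a voxel cell; interpolation order matches the diff.
static float interpolateTrilinear(float tx, float ty, float tz, const float vx[8])
{
    v_float32x4 v0246, v1357;
    v_load_deinterleave(vx, v0246, v1357);
    // old: v0246 + v_setall_f32(tz) * (v1357 - v0246)
    v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
    v_float32x4 v00_10 = vxx;
    v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
    v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
    float v0 = v_get0(v0_1);   // old: v0_1.get0()
    v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
    float v1 = v_get0(v0_1);
    return v0 + tx * (v1 - v0);
}

int main()
{
    const float corners[8] = { 0.f, 1.f, 0.f, 1.f, 0.f, 1.f, 0.f, 1.f };
    std::printf("%f\n", interpolateTrilinear(0.5f, 0.5f, 0.5f, corners));
    return 0;
}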
@ -138,7 +138,7 @@ bool ICPImpl::estimateTransformT(cv::Affine3f& transform,
#if USE_INTRINSICS #if USE_INTRINSICS
static inline bool fastCheck(const v_float32x4& p0, const v_float32x4& p1) static inline bool fastCheck(const v_float32x4& p0, const v_float32x4& p1)
{ {
float check = (p0.get0() + p1.get0()); float check = (v_get0(p0) + v_get0(p1));
return !cvIsNaN(check); return !cvIsNaN(check);
} }
@ -160,7 +160,7 @@ static inline v_float32x4 crossProduct(const v_float32x4& a, const v_float32x4&
v_float32x4 ayzx, azxy, byzx, bzxy; v_float32x4 ayzx, azxy, byzx, bzxy;
getCrossPerm(a, ayzx, azxy); getCrossPerm(a, ayzx, azxy);
getCrossPerm(b, byzx, bzxy); getCrossPerm(b, byzx, bzxy);
return ayzx*bzxy - azxy*byzx; return v_sub(v_mul(ayzx, bzxy), v_mul(azxy, byzx));
} }
#else #else
static inline bool fastCheck(const Point3f& p) static inline bool fastCheck(const Point3f& p)
@ -235,11 +235,11 @@ struct GetAbInvoker : ParallelLoopBody
//find correspondence by projecting the point //find correspondence by projecting the point
v_float32x4 oldCoords; v_float32x4 oldCoords;
float pz = (v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP))).get0()); float pz = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP))));
// x, y, 0, 0 // x, y, 0, 0
oldCoords = v_muladd(newP/v_setall_f32(pz), vfxy, vcxy); oldCoords = v_muladd(v_div(newP, v_setall_f32(pz)), vfxy, vcxy);
if(!v_check_all((oldCoords >= v_setzero_f32()) & (oldCoords < vframe))) if(!v_check_all(v_and(v_ge(oldCoords, v_setzero_f32()), v_lt(oldCoords, vframe))))
continue; continue;
// bilinearly interpolate oldPts and oldNrm under oldCoords point // bilinearly interpolate oldPts and oldNrm under oldCoords point
@ -247,12 +247,12 @@ struct GetAbInvoker : ParallelLoopBody
v_float32x4 oldN; v_float32x4 oldN;
{ {
v_int32x4 ixy = v_floor(oldCoords); v_int32x4 ixy = v_floor(oldCoords);
v_float32x4 txy = oldCoords - v_cvt_f32(ixy); v_float32x4 txy = v_sub(oldCoords, v_cvt_f32(ixy));
int xi = ixy.get0(); int xi = v_get0(ixy);
int yi = v_rotate_right<1>(ixy).get0(); int yi = v_get0(v_rotate_right<1>(ixy));
v_float32x4 tx = v_setall_f32(txy.get0()); v_float32x4 tx = v_setall_f32(v_get0(txy));
txy = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(txy))); txy = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(txy)));
v_float32x4 ty = v_setall_f32(txy.get0()); v_float32x4 ty = v_setall_f32(v_get0(txy));
const float* prow0 = (const float*)oldPts[yi+0]; const float* prow0 = (const float*)oldPts[yi+0];
const float* prow1 = (const float*)oldPts[yi+1]; const float* prow1 = (const float*)oldPts[yi+1];
@ -275,23 +275,23 @@ struct GetAbInvoker : ParallelLoopBody
// NaN check is done later // NaN check is done later
v_float32x4 p0 = p00 + tx*(p01 - p00); v_float32x4 p0 = v_add(p00, v_mul(tx, v_sub(p01, p00)));
v_float32x4 p1 = p10 + tx*(p11 - p10); v_float32x4 p1 = v_add(p10, v_mul(tx, v_sub(p11, p10)));
oldP = p0 + ty*(p1 - p0); oldP = v_add(p0, v_mul(ty, v_sub(p1, p0)));
v_float32x4 n0 = n00 + tx*(n01 - n00); v_float32x4 n0 = v_add(n00, v_mul(tx, v_sub(n01, n00)));
v_float32x4 n1 = n10 + tx*(n11 - n10); v_float32x4 n1 = v_add(n10, v_mul(tx, v_sub(n11, n10)));
oldN = n0 + ty*(n1 - n0); oldN = v_add(n0, v_mul(ty, v_sub(n1, n0)));
} }
bool oldPNcheck = fastCheck(oldP, oldN); bool oldPNcheck = fastCheck(oldP, oldN);
//filter by distance //filter by distance
v_float32x4 diff = newP - oldP; v_float32x4 diff = v_sub(newP, oldP);
bool distCheck = !(v_reduce_sum(diff*diff) > sqThresh); bool distCheck = !(v_reduce_sum(v_mul(diff, diff)) > sqThresh);
//filter by angle //filter by angle
bool angleCheck = !(abs(v_reduce_sum(newN*oldN)) < cosThresh); bool angleCheck = !(abs(v_reduce_sum(v_mul(newN, oldN))) < cosThresh);
if(!(oldPNcheck && distCheck && angleCheck)) if(!(oldPNcheck && distCheck && angleCheck))
continue; continue;
@ -299,17 +299,17 @@ struct GetAbInvoker : ParallelLoopBody
// build point-wise vector ab = [ A | b ] // build point-wise vector ab = [ A | b ]
v_float32x4 VxNv = crossProduct(newP, oldN); v_float32x4 VxNv = crossProduct(newP, oldN);
Point3f VxN; Point3f VxN;
VxN.x = VxNv.get0(); VxN.x = v_get0(VxNv);
VxN.y = v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0(); VxN.y = v_get0(v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32())));
VxN.z = v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0(); VxN.z = v_get0(v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32())));
float dotp = -v_reduce_sum(oldN*diff); float dotp = -v_reduce_sum(v_mul(oldN, diff));
// build point-wise upper-triangle matrix [ab^T * ab] w/o last row // build point-wise upper-triangle matrix [ab^T * ab] w/o last row
// which is [A^T*A | A^T*b] // which is [A^T*A | A^T*b]
// and gather sum // and gather sum
v_float32x4 vd = VxNv | v_float32x4(0, 0, 0, dotp); v_float32x4 vd = v_or(VxNv, v_float32x4(0, 0, 0, dotp));
v_float32x4 n = oldN; v_float32x4 n = oldN;
v_float32x4 nyzx; v_float32x4 nyzx;
{ {

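The same substitution applies to the ICP correspondence filter above. The sketch below is a hedged example rather than the fast_icp.cpp code: the point/normal values and thresholds are made up, while the v_sub/v_mul/v_reduce_sum calls mirror the diff.

#include <opencv2/core/hal/intrin.hpp>
#include <cmath>
#include <cstdio>

using namespace cv;

// Accept a correspondence if it passes the squared-distance and normal-angle tests.
static bool acceptCorrespondence(const v_float32x4& newP, const v_float32x4& oldP,
                                 const v_float32x4& newN, const v_float32x4& oldN,
                                 float sqDistThresh, float cosThresh)
{
    v_float32x4 diff = v_sub(newP, oldP);                        // old: newP - oldP
    bool distCheck  = !(v_reduce_sum(v_mul(diff, diff)) > sqDistThresh);
    bool angleCheck = !(std::abs(v_reduce_sum(v_mul(newN, oldN))) < cosThresh);
    return distCheck && angleCheck;
}

int main()
{
    v_float32x4 p0(0.f, 0.f, 1.f, 0.f), p1(0.f, 0.f, 1.01f, 0.f);
    v_float32x4 n0(0.f, 0.f, 1.f, 0.f), n1(0.f, 0.f, 1.f, 0.f);
    std::printf("%d\n", (int)acceptCorrespondence(p0, p1, n0, n1, 0.01f, 0.9f));
    return 0;
}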
@ -425,15 +425,15 @@ inline float interpolate(float tx, float ty, float tz, float vx[8])
v_float32x4 v0246, v1357; v_float32x4 v0246, v1357;
v_load_deinterleave(vx, v0246, v1357); v_load_deinterleave(vx, v0246, v1357);
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx; v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v0_1.get0(); float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0(); float v1 = v_get0(v0_1);
return v0 + tx * (v1 - v0); return v0 + tx * (v1 - v0);
} }
@ -598,9 +598,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const
v_float32x8 czp = v_lut(vals, idxzp); v_float32x8 czp = v_lut(vals, idxzp);
v_float32x8 czn = v_lut(vals, idxzn); v_float32x8 czn = v_lut(vals, idxzn);
v_float32x8 vcxv = cxn - cxp; v_float32x8 vcxv = v_sub(cxn, cxp);
v_float32x8 vcyv = cyn - cyp; v_float32x8 vcyv = v_sub(cyn, cyp);
v_float32x8 vczv = czn - czp; v_float32x8 vczv = v_sub(czn, czp);
v_store(cxv, vcxv); v_store(cxv, vcxv);
v_store(cyv, vcyv); v_store(cyv, vcyv);
@ -615,9 +615,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const
v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4); v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4);
v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4); v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4);
v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1; v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1);
v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1; v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1);
v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1; v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1);
v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1); v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1);
v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1); v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1);
@ -1523,9 +1523,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const
v_float32x8 czp = v_lut(vals, idxzp); v_float32x8 czp = v_lut(vals, idxzp);
v_float32x8 czn = v_lut(vals, idxzn); v_float32x8 czn = v_lut(vals, idxzn);
v_float32x8 vcxv = cxn - cxp; v_float32x8 vcxv = v_sub(cxn, cxp);
v_float32x8 vcyv = cyn - cyp; v_float32x8 vcyv = v_sub(cyn, cyp);
v_float32x8 vczv = czn - czp; v_float32x8 vczv = v_sub(czn, czp);
v_store(cxv, vcxv); v_store(cxv, vcxv);
v_store(cyv, vcyv); v_store(cyv, vcyv);
@ -1540,9 +1540,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const
v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4); v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4);
v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4); v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4);
v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1; v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1);
v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1; v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1);
v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1; v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1);
v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1); v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1);
v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1); v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1);

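The hash_tsdf.cpp hunks replace the gradient's vector subtractions with v_sub on gathered values. The sketch below shows the 128-bit variant of that pattern with made-up lookup data; only the v_lut/v_sub/v_store calls are taken from the diff.

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

using namespace cv;

int main()
{
    // sampled values and gather indices for the +x / -x neighbours (example data)
    float vals[16];
    for (int i = 0; i < 16; i++) vals[i] = 0.1f * i;
    int idxxp[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    int idxxn[8] = { 8, 9, 10, 11, 12, 13, 14, 15 };

    v_float32x4 cxp0 = v_lut(vals, idxxp + 0), cxp1 = v_lut(vals, idxxp + 4);
    v_float32x4 cxn0 = v_lut(vals, idxxn + 0), cxn1 = v_lut(vals, idxxn + 4);

    // old: cxv = cxn - cxp;   new: v_sub
    v_float32x4 cxv0 = v_sub(cxn0, cxp0), cxv1 = v_sub(cxn1, cxp1);

    float cxv[8];
    v_store(cxv + 0, cxv0);
    v_store(cxv + 4, cxv1);
    for (int i = 0; i < 8; i++) std::printf("%f ", cxv[i]);
    std::printf("\n");
    return 0;
}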
@ -146,21 +146,21 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
{ {
// tx, ty, tz = floor(p) // tx, ty, tz = floor(p)
v_int32x4 ip = v_floor(p); v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip); v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = t.get0(); float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0(); float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0(); float tz = v_get0(t);
int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const TsdfVoxel* volData = volume.ptr<TsdfVoxel>(); const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
int ix = ip.get0(); int ix = v_get0(ip);
ip = v_rotate_right<1>(ip); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); int iy = v_get0(ip);
ip = v_rotate_right<1>(ip); ip = v_rotate_right<1>(ip);
int iz = ip.get0(); int iz = v_get0(ip);
int coordBase = ix*xdim + iy*ydim + iz*zdim; int coordBase = ix*xdim + iy*ydim + iz*zdim;
@ -170,15 +170,15 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6])); v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6]));
v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7])); v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7]));
v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx; v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v0_1.get0(); float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0(); float v1 = v_get0(v0_1);
return v0 + tx*(v1 - v0); return v0 + tx*(v1 - v0);
} }
@ -228,27 +228,27 @@ inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const
inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
{ {
if(v_check_any (p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || if(v_check_any (v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) ||
v_check_any (p >= v_float32x4((float)(volResolution.x-2), v_check_any (v_ge(p, v_float32x4((float)(volResolution.x-2),
(float)(volResolution.y-2), (float)(volResolution.y-2),
(float)(volResolution.z-2), 1.f)) (float)(volResolution.z-2), 1.f)))
) )
return nanv; return nanv;
v_int32x4 ip = v_floor(p); v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip); v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = t.get0(); float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0(); float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0(); float tz = v_get0(t);
const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const TsdfVoxel* volData = volume.ptr<TsdfVoxel>(); const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
int ix = ip.get0(); ip = v_rotate_right<1>(ip); int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip); int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = ip.get0(); int iz = v_get0(ip);
int coordBase = ix*xdim + iy*ydim + iz*zdim; int coordBase = ix*xdim + iy*ydim + iz*zdim;
@ -266,23 +266,23 @@ inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
v_float32x4 v0246 (vx[0], vx[2], vx[4], vx[6]); v_float32x4 v0246 (vx[0], vx[2], vx[4], vx[6]);
v_float32x4 v1357 (vx[1], vx[3], vx[5], vx[7]); v_float32x4 v1357 (vx[1], vx[3], vx[5], vx[7]);
v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx; v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v0_1.get0(); float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0(); float v1 = v_get0(v0_1);
nv = v0 + tx*(v1 - v0); nv = v0 + tx*(v1 - v0);
} }
v_float32x4 n = v_load_aligned(an); v_float32x4 n = v_load_aligned(an);
v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n*n))); v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n))));
return Norm.get0() < 0.0001f ? nanv : n/Norm; return v_get0(Norm) < 0.0001f ? nanv : v_div(n, Norm);
} }
#else #else
inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& p) const inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& p) const
@ -394,21 +394,21 @@ struct RaycastInvoker : ParallelLoopBody
// get direction through pixel in volume space: // get direction through pixel in volume space:
// 1. reproject (x, y) on projecting plane where z = 1.f // 1. reproject (x, y) on projecting plane where z = 1.f
v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy)*vfxy; v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy);
planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f)); planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f));
// 2. rotate to volume space // 2. rotate to volume space
planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32()); planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32());
// 3. normalize // 3. normalize
v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed*planed))); v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed))));
v_float32x4 dir = planed*invNorm; v_float32x4 dir = v_mul(planed, invNorm);
// compute intersection of ray with all six bbox planes // compute intersection of ray with all six bbox planes
v_float32x4 rayinv = v_setall_f32(1.f)/dir; v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir);
// div by zero should be eliminated by these products // div by zero should be eliminated by these products
v_float32x4 tbottom = rayinv*(boxDown - orig); v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig));
v_float32x4 ttop = rayinv*(boxUp - orig); v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig));
// re-order intersections to find smallest and largest on each axis // re-order intersections to find smallest and largest on each axis
v_float32x4 minAx = v_min(ttop, tbottom); v_float32x4 minAx = v_min(ttop, tbottom);
@ -429,14 +429,14 @@ struct RaycastInvoker : ParallelLoopBody
if(tmin < tmax) if(tmin < tmax)
{ {
// interpolation optimized a little // interpolation optimized a little
orig *= invVoxelSize; orig = v_mul(orig, invVoxelSize);
dir *= invVoxelSize; dir = v_mul(dir, invVoxelSize);
int xdim = volume.volDims[0]; int xdim = volume.volDims[0];
int ydim = volume.volDims[1]; int ydim = volume.volDims[1];
int zdim = volume.volDims[2]; int zdim = volume.volDims[2];
v_float32x4 rayStep = dir * v_setall_f32(tstep); v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep));
v_float32x4 next = (orig + dir * v_setall_f32(tmin)); v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin))));
float f = volume.interpolateVoxel(next), fnext = f; float f = volume.interpolateVoxel(next), fnext = f;
//raymarch //raymarch
@ -444,11 +444,11 @@ struct RaycastInvoker : ParallelLoopBody
int nSteps = cvFloor((tmax - tmin)/tstep); int nSteps = cvFloor((tmax - tmin)/tstep);
for(; steps < nSteps; steps++) for(; steps < nSteps; steps++)
{ {
next += rayStep; next = v_add(next, rayStep);
v_int32x4 ip = v_round(next); v_int32x4 ip = v_round(next);
int ix = ip.get0(); ip = v_rotate_right<1>(ip); int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip); int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = ip.get0(); int iz = v_get0(ip);
int coord = ix*xdim + iy*ydim + iz*zdim; int coord = ix*xdim + iy*ydim + iz*zdim;
fnext = tsdfToFloat(volume.volume.at<TsdfVoxel>(coord).tsdf); fnext = tsdfToFloat(volume.volume.at<TsdfVoxel>(coord).tsdf);
@ -468,7 +468,7 @@ struct RaycastInvoker : ParallelLoopBody
// linearly interpolate t between two f values // linearly interpolate t between two f values
if(f > 0.f && fnext < 0.f) if(f > 0.f && fnext < 0.f)
{ {
v_float32x4 tp = next - rayStep; v_float32x4 tp = v_sub(next, rayStep);
float ft = volume.interpolateVoxel(tp); float ft = volume.interpolateVoxel(tp);
float ftdt = volume.interpolateVoxel(next); float ftdt = volume.interpolateVoxel(next);
float ts = tmin + tstep*(steps - ft/(ftdt - ft)); float ts = tmin + tstep*(steps - ft/(ftdt - ft));
@ -476,7 +476,7 @@ struct RaycastInvoker : ParallelLoopBody
// avoid division by zero // avoid division by zero
if(!cvIsNaN(ts) && !cvIsInf(ts)) if(!cvIsNaN(ts) && !cvIsInf(ts))
{ {
v_float32x4 pv = (orig + dir*v_setall_f32(ts)); v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts))));
v_float32x4 nv = volume.getNormalVoxel(pv); v_float32x4 nv = volume.getNormalVoxel(pv);
if(!isNaN(nv)) if(!isNaN(nv))
@ -484,9 +484,7 @@ struct RaycastInvoker : ParallelLoopBody
//convert pv and nv to camera space //convert pv and nv to camera space
normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32());
// interpolation optimized a little // interpolation optimized a little
point = v_matmuladd(pv*v_float32x4(volume.voxelSize, point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)),
volume.voxelSize,
volume.voxelSize, 1.f),
volRot0, volRot1, volRot2, volTrans); volRot0, volRot1, volRot2, volTrans);
} }
} }

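In the raycaster, the ray set-up keeps its structure and only the operators change. Below is a self-contained sketch of that set-up using example camera intrinsics and an identity rotation (these values are assumptions, not taken from the module); the v_sub/v_mul/v_matmuladd/v_invsqrt calls follow the diff.

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

using namespace cv;

int main()
{
    // example pinhole intrinsics and pixel
    float fx = 525.f, fy = 525.f, cx = 319.5f, cy = 239.5f;
    int x = 100, y = 80;

    v_float32x4 vfxy(1.f / fx, 1.f / fy, 0.f, 0.f), vcxy(cx, cy, 0.f, 0.f);
    // identity camera rotation, one row per vector
    v_float32x4 camRot0(1.f, 0.f, 0.f, 0.f);
    v_float32x4 camRot1(0.f, 1.f, 0.f, 0.f);
    v_float32x4 camRot2(0.f, 0.f, 1.f, 0.f);

    // 1. reproject (x, y) onto the z = 1 plane:  old ((p - c) * f) -> v_mul/v_sub
    v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.f, 0.f), vcxy), vfxy);
    planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f));
    // 2. rotate to volume space
    planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32());
    // 3. normalize:  old planed*planed -> v_mul
    v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed))));
    v_float32x4 dir = v_mul(planed, invNorm);

    float d[4];
    v_store(d, dir);
    std::printf("dir = (%f, %f, %f)\n", d[0], d[1], d[2]);
    return 0;
}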
@ -235,35 +235,33 @@ void integrateVolumeUnit(
// optimization of the following: // optimization of the following:
//Point3f volPt = Point3f(x, y, z)*voxelSize; //Point3f volPt = Point3f(x, y, z)*voxelSize;
//Point3f camSpacePt = vol2cam * volPt; //Point3f camSpacePt = vol2cam * volPt;
camSpacePt += zStep; camSpacePt = v_add(camSpacePt, zStep);
float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0(); float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))));
if (zCamSpace <= 0.f) if (zCamSpace <= 0.f)
continue; continue;
v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace); v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace));
v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy); v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy);
// leave only first 2 lanes // leave only first 2 lanes
projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0)));
v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
depthType v; depthType v;
// bilinearly interpolate depth at projected // bilinearly interpolate depth at projected
{ {
const v_float32x4& pt = projected; const v_float32x4& pt = projected;
// check coords >= 0 and < imgSize // check coords >= 0 and < imgSize
v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) | v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits)));
v_reinterpret_as_u32(pt >= upLimits); limits = v_or(limits, v_rotate_right<1>(limits));
limits = limits | v_rotate_right<1>(limits); if (v_get0(limits))
if (limits.get0())
continue; continue;
// xi, yi = floor(pt) // xi, yi = floor(pt)
v_int32x4 ip = v_floor(pt); v_int32x4 ip = v_floor(pt);
v_int32x4 ipshift = ip; v_int32x4 ipshift = ip;
int xi = ipshift.get0(); int xi = v_get0(ipshift);
ipshift = v_rotate_right<1>(ipshift); ipshift = v_rotate_right<1>(ipshift);
int yi = ipshift.get0(); int yi = v_get0(ipshift);
const depthType* row0 = depth[yi + 0]; const depthType* row0 = depth[yi + 0];
const depthType* row1 = depth[yi + 1]; const depthType* row1 = depth[yi + 1];
@ -277,17 +275,17 @@ void integrateVolumeUnit(
// assume correct depth is positive // assume correct depth is positive
// don't fix missing data // don't fix missing data
if (v_check_all(vall > v_setzero_f32())) if (v_check_all(v_gt(vall, v_setzero_f32())))
{ {
v_float32x4 t = pt - v_cvt_f32(ip); v_float32x4 t = v_sub(pt, v_cvt_f32(ip));
float tx = t.get0(); float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
v_float32x4 ty = v_setall_f32(t.get0()); v_float32x4 ty = v_setall_f32(v_get0(t));
// vx is y-interpolated between rows 0 and 1 // vx is y-interpolated between rows 0 and 1
v_float32x4 vx = v001 + ty * (v101 - v001); v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001)));
float v0 = vx.get0(); float v0 = v_get0(vx);
vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx))); vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx)));
float v1 = vx.get0(); float v1 = v_get0(vx);
v = v0 + tx * (v1 - v0); v = v0 + tx * (v1 - v0);
} }
else else
@ -295,8 +293,8 @@ void integrateVolumeUnit(
} }
// norm(camPixVec) produces double which is too slow // norm(camPixVec) produces double which is too slow
int _u = (int)projected.get0(); int _u = (int)v_get0(projected);
int _v = (int)v_rotate_right<1>(projected).get0(); int _v = (int)v_get0(v_rotate_right<1>(projected));
if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows)) if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows))
continue; continue;
float pixNorm = pixNorms.at<float>(_v, _u); float pixNorm = pixNorms.at<float>(_v, _u);
@ -500,35 +498,33 @@ void integrateRGBVolumeUnit(
// optimization of the following: // optimization of the following:
//Point3f volPt = Point3f(x, y, z)*voxelSize; //Point3f volPt = Point3f(x, y, z)*voxelSize;
//Point3f camSpacePt = vol2cam * volPt; //Point3f camSpacePt = vol2cam * volPt;
camSpacePt += zStep; camSpacePt = v_add(camSpacePt, zStep);
float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0(); float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))));
if (zCamSpace <= 0.f) if (zCamSpace <= 0.f)
continue; continue;
v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace); v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace));
v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy); v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy);
// leave only first 2 lanes // leave only first 2 lanes
projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0)));
v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
depthType v; depthType v;
// bilinearly interpolate depth at projected // bilinearly interpolate depth at projected
{ {
const v_float32x4& pt = projected; const v_float32x4& pt = projected;
// check coords >= 0 and < imgSize // check coords >= 0 and < imgSize
v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) | v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits)));
v_reinterpret_as_u32(pt >= upLimits); limits = v_or(limits,v_rotate_right<1>(limits));
limits = limits | v_rotate_right<1>(limits); if (v_get0(limits))
if (limits.get0())
continue; continue;
// xi, yi = floor(pt) // xi, yi = floor(pt)
v_int32x4 ip = v_floor(pt); v_int32x4 ip = v_floor(pt);
v_int32x4 ipshift = ip; v_int32x4 ipshift = ip;
int xi = ipshift.get0(); int xi = v_get0(ipshift);
ipshift = v_rotate_right<1>(ipshift); ipshift = v_rotate_right<1>(ipshift);
int yi = ipshift.get0(); int yi = v_get0(ipshift);
const depthType* row0 = depth[yi + 0]; const depthType* row0 = depth[yi + 0];
const depthType* row1 = depth[yi + 1]; const depthType* row1 = depth[yi + 1];
@ -542,17 +538,17 @@ void integrateRGBVolumeUnit(
// assume correct depth is positive // assume correct depth is positive
// don't fix missing data // don't fix missing data
if (v_check_all(vall > v_setzero_f32())) if (v_check_all(v_gt(vall, v_setzero_f32())))
{ {
v_float32x4 t = pt - v_cvt_f32(ip); v_float32x4 t = v_sub(pt, v_cvt_f32(ip));
float tx = t.get0(); float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
v_float32x4 ty = v_setall_f32(t.get0()); v_float32x4 ty = v_setall_f32(v_get0(t));
// vx is y-interpolated between rows 0 and 1 // vx is y-interpolated between rows 0 and 1
v_float32x4 vx = v001 + ty * (v101 - v001); v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001)));
float v0 = vx.get0(); float v0 = v_get0(vx);
vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx))); vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx)));
float v1 = vx.get0(); float v1 = v_get0(vx);
v = v0 + tx * (v1 - v0); v = v0 + tx * (v1 - v0);
} }
else else
@ -561,14 +557,13 @@ void integrateRGBVolumeUnit(
v_float32x4 projectedRGB = v_muladd(camPixVec, rgb_vfxy, rgb_vcxy); v_float32x4 projectedRGB = v_muladd(camPixVec, rgb_vfxy, rgb_vcxy);
// leave only first 2 lanes // leave only first 2 lanes
projectedRGB = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & projectedRGB = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)));
v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
// norm(camPixVec) produces double which is too slow // norm(camPixVec) produces double which is too slow
int _u = (int)projected.get0(); int _u = (int)v_get0(projected);
int _v = (int)v_rotate_right<1>(projected).get0(); int _v = (int)v_get0(v_rotate_right<1>(projected));
int rgb_u = (int)projectedRGB.get0(); int rgb_u = (int)v_get0(projectedRGB);
int rgb_v = (int)v_rotate_right<1>(projectedRGB).get0(); int rgb_v = (int)v_get0(v_rotate_right<1>(projectedRGB));
if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows && if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows &&
rgb_v >= 0 && rgb_v < color.rows && rgb_u >= 0 && rgb_u < color.cols)) rgb_v >= 0 && rgb_v < color.rows && rgb_u >= 0 && rgb_u < color.cols))

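The integrator's projection and bounds check combine reinterpret casts, comparisons, and bit masks, which is where most operators were removed. A hedged sketch follows, with example intrinsics and an example point; the lane-masking constant and the v_div/v_lt/v_ge/v_or/v_and/v_get0 usage mirror the diff.

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

using namespace cv;

int main()
{
    // example pinhole intrinsics and image limits (cols-1, rows-1)
    v_float32x4 vfxy(525.f, 525.f, 0.f, 0.f), vcxy(319.5f, 239.5f, 0.f, 0.f);
    v_float32x4 upLimits(639.f, 479.f, 0.f, 0.f);

    v_float32x4 camSpacePt(0.1f, -0.05f, 0.8f, 0.f);
    float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))));
    if (zCamSpace <= 0.f) return 0;

    // old: camSpacePt / v_setall_f32(z) -> v_div
    v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace));
    v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy);
    // keep only the first two lanes:  old '&' -> v_and
    projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected),
                                           v_uint32x4(4294967295U, 4294967295U, 0, 0)));

    // coords must satisfy 0 <= (u, v) < upLimits:  old '<', '>=', '|' -> v_lt, v_ge, v_or
    v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(projected, v_setzero_f32())),
                             v_reinterpret_as_u32(v_ge(projected, upLimits)));
    limits = v_or(limits, v_rotate_right<1>(limits));
    bool outOfFrame = v_get0(limits) != 0;

    std::printf("u=%f v=%f out=%d\n", v_get0(projected),
                v_get0(v_rotate_right<1>(projected)), (int)outOfFrame);
    return 0;
}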
@ -20,7 +20,7 @@ namespace kinfu
inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num) inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num)
{ {
v_float32x4 num128 = v_setall_f32(-1.f / 128.f); v_float32x4 num128 = v_setall_f32(-1.f / 128.f);
return v_cvt_f32(num) * num128; return v_mul(v_cvt_f32(num), num128);
} }
#endif #endif

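The tsdfToFloat_INTR change is a one-line operator swap. A runnable sketch with example raw values:

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

using namespace cv;

// Raw fixed-point TSDF values in [-128, 127] map to floats in roughly [-1, 1].
static inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num)
{
    v_float32x4 num128 = v_setall_f32(-1.f / 128.f);
    return v_mul(v_cvt_f32(num), num128);      // old: v_cvt_f32(num) * num128
}

int main()
{
    v_int32x4 raw(-128, -64, 0, 127);
    float out[4];
    v_store(out, tsdfToFloat_INTR(raw));
    std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
    return 0;
}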
@ -68,7 +68,7 @@ inline bool isNaN(cv::Point3f p)
#if USE_INTRINSICS #if USE_INTRINSICS
static inline bool isNaN(const cv::v_float32x4& p) static inline bool isNaN(const cv::v_float32x4& p)
{ {
return cv::v_check_any(p != p); return cv::v_check_any(v_ne(p, p));
} }
#endif #endif

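Likewise for the vector NaN test in utils.hpp. A NaN never compares equal to itself, so v_ne(p, p) flags NaN lanes and v_check_any reports whether any lane is set; the example inputs below are mine.

#include <opencv2/core/hal/intrin.hpp>
#include <cmath>
#include <cstdio>

using namespace cv;

static inline bool isNaN(const v_float32x4& p)
{
    return v_check_any(v_ne(p, p));            // old: v_check_any(p != p)
}

int main()
{
    v_float32x4 ok(1.f, 2.f, 3.f, 4.f);
    v_float32x4 bad(1.f, std::nanf(""), 3.f, 4.f);
    std::printf("%d %d\n", (int)isNaN(ok), (int)isNaN(bad));
    return 0;
}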
@ -74,8 +74,8 @@ inline v_uint8x16 v_finalize_pix_ch(const v_int16x8& c0, const v_int16x8& c1,
v_expand_f32(c0, f0, f1); v_expand_f32(c0, f0, f1);
v_expand_f32(c1, f2, f3); v_expand_f32(c1, f2, f3);
v_int16x8 d0 = v_pack(v_round(s0*alpha + f0), v_round(s1*alpha + f1)); v_int16x8 d0 = v_pack(v_round(v_add(v_mul(s0, alpha), f0)), v_round(v_add(v_mul(s1, alpha), f1)));
v_int16x8 d1 = v_pack(v_round(s2*alpha + f2), v_round(s3*alpha + f3)); v_int16x8 d1 = v_pack(v_round(v_add(v_mul(s2, alpha), f2)), v_round(v_add(v_mul(s3, alpha), f3)));
return v_pack_u(d0, d1); return v_pack_u(d0, d1);
} }
@ -135,12 +135,12 @@ public:
v_expand_s(p1, p10, p11); v_expand_s(p1, p10, p11);
v_expand_s(p2, p20, p21); v_expand_s(p2, p20, p21);
v_int16x8 d00 = p00 - c00, d01 = p01 - c01; v_int16x8 d00 = v_sub(p00, c00), d01 = v_sub(p01, c01);
v_int16x8 d10 = p10 - c10, d11 = p11 - c11; v_int16x8 d10 = v_sub(p10, c10), d11 = v_sub(p11, c11);
v_int16x8 d20 = p20 - c20, d21 = p21 - c21; v_int16x8 d20 = v_sub(p20, c20), d21 = v_sub(p21, c21);
v_uint16x8 n0 = v_abs(d00) + v_abs(d10) + v_abs(d20); v_uint16x8 n0 = v_add(v_add(v_abs(d00), v_abs(d10)), v_abs(d20));
v_uint16x8 n1 = v_abs(d01) + v_abs(d11) + v_abs(d21); v_uint16x8 n1 = v_add(v_add(v_abs(d01), v_abs(d11)), v_abs(d21));
ushort CV_DECL_ALIGNED(16) nbuf[16]; ushort CV_DECL_ALIGNED(16) nbuf[16];
v_store(nbuf, n0); v_store(nbuf, n0);
@ -153,13 +153,13 @@ public:
v_expand_f32(d00, fd0, fd1); v_expand_f32(d00, fd0, fd1);
v_expand_f32(d01, fd2, fd3); v_expand_f32(d01, fd2, fd3);
s00 += fd0*w0; s01 += fd1*w1; s02 += fd2*w2; s03 += fd3*w3; s00 = v_add(s00, v_mul(fd0, w0)); s01 = v_add(s01, v_mul(fd1, w1)); s02 = v_add(s02, v_mul(fd2, w2)); s03 = v_add(s03, v_mul(fd3, w3));
v_expand_f32(d10, fd0, fd1); v_expand_f32(d10, fd0, fd1);
v_expand_f32(d11, fd2, fd3); v_expand_f32(d11, fd2, fd3);
s10 += fd0*w0; s11 += fd1*w1; s12 += fd2*w2; s13 += fd3*w3; s10 = v_add(s10, v_mul(fd0, w0)); s11 = v_add(s11, v_mul(fd1, w1)); s12 = v_add(s12, v_mul(fd2, w2)); s13 = v_add(s13, v_mul(fd3, w3));
v_expand_f32(d20, fd0, fd1); v_expand_f32(d20, fd0, fd1);
v_expand_f32(d21, fd2, fd3); v_expand_f32(d21, fd2, fd3);
s20 += fd0*w0; s21 += fd1*w1; s22 += fd2*w2; s23 += fd3*w3; s20 = v_add(s20, v_mul(fd0, w0)); s21 = v_add(s21, v_mul(fd1, w1)); s22 = v_add(s22, v_mul(fd2, w2)); s23 = v_add(s23, v_mul(fd3, w3));
} }
c0 = v_finalize_pix_ch(c00, c01, s00, s01, s02, s03, v_alpha); c0 = v_finalize_pix_ch(c00, c01, s00, s01, s02, s03, v_alpha);

@ -303,15 +303,15 @@ void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i)
v_float32x4 aux0,aux1,aux2,aux3; v_float32x4 aux0,aux1,aux2,aux3;
#define PROC4(Chor_in,cur_in,coef_prev_in,interD_prev_in,cur_prev_in,interD_out,cur_out,coef_cur_out)\ #define PROC4(Chor_in,cur_in,coef_prev_in,interD_prev_in,cur_prev_in,interD_out,cur_out,coef_cur_out)\
coef_cur_out = lambda_reg*Chor_in;\ coef_cur_out = v_mul(lambda_reg, Chor_in);\
aux0 = interD_prev_in*coef_prev_in;\ aux0 = v_mul(interD_prev_in, coef_prev_in);\
aux1 = coef_cur_out+coef_prev_in;\ aux1 = v_add(coef_cur_out, coef_prev_in);\
aux1 = one_reg-aux1;\ aux1 = v_sub(one_reg, aux1);\
aux0 = aux1-aux0;\ aux0 = v_sub(aux1, aux0);\
interD_out = coef_cur_out/aux0;\ interD_out = v_div(coef_cur_out, aux0);\
aux1 = cur_prev_in*coef_prev_in;\ aux1 = v_mul(cur_prev_in, coef_prev_in);\
aux1 = cur_in - aux1;\ aux1 = v_sub(cur_in, aux1);\
cur_out = aux1/aux0; cur_out = v_div(aux1, aux0);
for(;j<w-3;j+=4) for(;j<w-3;j+=4)
{ {
@ -406,14 +406,14 @@ void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i)
aux3 = v_load(cur_row_next3+j); aux3 = v_load(cur_row_next3+j);
v_transpose4x4(aux0,aux1,aux2,aux3,b0,b1,b2,b3); v_transpose4x4(aux0,aux1,aux2,aux3,b0,b1,b2,b3);
aux0 = a3*cur_next_reg; aux0 = v_mul(a3, cur_next_reg);
b3 = b3-aux0; b3 = v_sub(b3, aux0);
aux0 = a2*b3; aux0 = v_mul(a2, b3);
b2 = b2-aux0; b2 = v_sub(b2, aux0);
aux0 = a1*b2; aux0 = v_mul(a1, b2);
b1 = b1-aux0; b1 = v_sub(b1, aux0);
aux0 = a0*b1; aux0 = v_mul(a0, b1);
b0 = b0-aux0; b0 = v_sub(b0, aux0);
cur_next_reg = b0; cur_next_reg = b0;
@ -522,24 +522,24 @@ void FastGlobalSmootherFilterImpl::VerticalPass_ParBody::operator()(const Range&
{ {
a = v_load(Cvert_row_prev+j); a = v_load(Cvert_row_prev+j);
b = v_load(Cvert_row+j); b = v_load(Cvert_row+j);
coef_prev_reg = lambda_reg*a; coef_prev_reg = v_mul(lambda_reg, a);
coef_cur_reg = lambda_reg*b; coef_cur_reg = v_mul(lambda_reg, b);
a = v_load(interD_row_prev+j); a = v_load(interD_row_prev+j);
a = a*coef_prev_reg; a = v_mul(a, coef_prev_reg);
b = coef_prev_reg+coef_cur_reg; b = v_add(coef_prev_reg, coef_cur_reg);
b = b+a; b = v_add(b, a);
a = one_reg-b; //computed denom a = v_sub(one_reg, b); //computed denom
b = coef_cur_reg/a; //computed interD_row b = v_div(coef_cur_reg, a); //computed interD_row
c = v_load(cur_row_prev+j); c = v_load(cur_row_prev+j);
c = c*coef_prev_reg; c = v_mul(c, coef_prev_reg);
d = v_load(cur_row+j); d = v_load(cur_row+j);
d = d-c; d = v_sub(d, c);
d = d/a; //computed cur_row d = v_div(d, a); //computed cur_row
v_store(interD_row+j,b); v_store(interD_row+j,b);
v_store(cur_row+j,d); v_store(cur_row+j,d);
@ -570,10 +570,10 @@ void FastGlobalSmootherFilterImpl::VerticalPass_ParBody::operator()(const Range&
{ {
a = v_load(interD_row+j); a = v_load(interD_row+j);
b = v_load(cur_row_next+j); b = v_load(cur_row_next+j);
b = a*b; b = v_mul(a, b);
a = v_load(cur_row+j); a = v_load(cur_row+j);
b = a-b; b = v_sub(a, b);
v_store(cur_row+j,b); v_store(cur_row+j,b);
} }
#endif #endif

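The fgs_filter.cpp hunks rewrite the tridiagonal forward-elimination step. The sketch below only exercises that arithmetic with made-up weights and data; it does not reproduce the filter's sign or normalization conventions, and only the v_mul/v_add/v_sub/v_div sequence follows the diff.

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

using namespace cv;

int main()
{
    // example smoothness weights and image rows (4 pixels wide)
    float Cvert_prev[4]  = { 0.5f, 0.6f, 0.7f, 0.8f };
    float Cvert_cur[4]   = { 0.4f, 0.5f, 0.6f, 0.7f };
    float interD_prev[4] = { 0.f, 0.f, 0.f, 0.f };
    float cur_prev[4]    = { 10.f, 20.f, 30.f, 40.f };
    float cur_row[4]     = { 12.f, 18.f, 33.f, 41.f };
    float interD_row[4];

    v_float32x4 lambda_reg = v_setall_f32(0.1f);   // example value
    v_float32x4 one_reg    = v_setall_f32(1.f);

    v_float32x4 a = v_load(Cvert_prev);
    v_float32x4 b = v_load(Cvert_cur);
    v_float32x4 coef_prev_reg = v_mul(lambda_reg, a);     // old: lambda_reg*a
    v_float32x4 coef_cur_reg  = v_mul(lambda_reg, b);

    a = v_mul(v_load(interD_prev), coef_prev_reg);
    b = v_add(coef_prev_reg, coef_cur_reg);
    b = v_add(b, a);
    a = v_sub(one_reg, b);                                // denominator
    b = v_div(coef_cur_reg, a);                           // interD_row
    v_float32x4 c = v_mul(v_load(cur_prev), coef_prev_reg);
    v_float32x4 d = v_div(v_sub(v_load(cur_row), c), a);  // updated cur_row

    v_store(interD_row, b);
    v_store(cur_row, d);
    std::printf("%f %f %f %f\n", cur_row[0], cur_row[1], cur_row[2], cur_row[3]);
    return 0;
}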
@ -130,21 +130,21 @@ void calculateChannelSums(uint &sumB, uint &sumG, uint &sumR, uchar *src_data, i
v_expand(v_max_val, v_max1, v_max2); v_expand(v_max_val, v_max1, v_max2);
// Calculate masks // Calculate masks
v_m1 = ~(v_mul_wrap(v_max1 - v_min1, v_255) > v_mul_wrap(v_thresh, v_max1)); v_m1 = v_not(v_gt(v_mul_wrap(v_sub(v_max1, v_min1), v_255), v_mul_wrap(v_thresh, v_max1)));
v_m2 = ~(v_mul_wrap(v_max2 - v_min2, v_255) > v_mul_wrap(v_thresh, v_max2)); v_m2 = v_not(v_gt(v_mul_wrap(v_sub(v_max2, v_min2), v_255), v_mul_wrap(v_thresh, v_max2)));
// Apply masks // Apply masks
v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2));
v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2));
v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2));
// Split and add to the sums: // Split and add to the sums:
v_expand(v_iB1, v_uint1, v_uint2); v_expand(v_iB1, v_uint1, v_uint2);
v_SB += v_uint1 + v_uint2; v_SB = v_add(v_SB, v_add(v_uint1, v_uint2));
v_expand(v_iG1, v_uint1, v_uint2); v_expand(v_iG1, v_uint1, v_uint2);
v_SG += v_uint1 + v_uint2; v_SG = v_add(v_SG, v_add(v_uint1, v_uint2));
v_expand(v_iR1, v_uint1, v_uint2); v_expand(v_iR1, v_uint1, v_uint2);
v_SR += v_uint1 + v_uint2; v_SR = v_add(v_SR, v_add(v_uint1, v_uint2));
} }
sumB = v_reduce_sum(v_SB); sumB = v_reduce_sum(v_SB);
@ -197,21 +197,21 @@ void calculateChannelSums(uint64 &sumB, uint64 &sumG, uint64 &sumR, ushort *src_
v_expand(v_max_val, v_max1, v_max2); v_expand(v_max_val, v_max1, v_max2);
// Calculate masks // Calculate masks
v_m1 = ~((v_max1 - v_min1) * v_65535 > v_thresh * v_max1); v_m1 = v_not(v_gt(v_mul(v_sub(v_max1, v_min1), v_65535), v_mul(v_thresh, v_max1)));
v_m2 = ~((v_max2 - v_min2) * v_65535 > v_thresh * v_max2); v_m2 = v_not(v_gt(v_mul(v_sub(v_max2, v_min2), v_65535), v_mul(v_thresh, v_max2)));
// Apply masks // Apply masks
v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2));
v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2));
v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2));
// Split and add to the sums: // Split and add to the sums:
v_expand(v_iB1, v_u64_1, v_u64_2); v_expand(v_iB1, v_u64_1, v_u64_2);
v_SB += v_u64_1 + v_u64_2; v_SB = v_add(v_SB, v_add(v_u64_1, v_u64_2));
v_expand(v_iG1, v_u64_1, v_u64_2); v_expand(v_iG1, v_u64_1, v_u64_2);
v_SG += v_u64_1 + v_u64_2; v_SG = v_add(v_SG, v_add(v_u64_1, v_u64_2));
v_expand(v_iR1, v_u64_1, v_u64_2); v_expand(v_iR1, v_u64_1, v_u64_2);
v_SR += v_u64_1 + v_u64_2; v_SR = v_add(v_SR, v_add(v_u64_1, v_u64_2));
} }
// Perform final reduction // Perform final reduction
@ -282,12 +282,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai
v_expand(v_inR, v_sR1, v_sR2); v_expand(v_inR, v_sR1, v_sR2);
// Multiply by gains // Multiply by gains
v_sB1 = v_mul_wrap(v_sB1, v_gainB) >> 8; v_sB1 = v_shr(v_mul_wrap(v_sB1, v_gainB), 8);
v_sB2 = v_mul_wrap(v_sB2, v_gainB) >> 8; v_sB2 = v_shr(v_mul_wrap(v_sB2, v_gainB), 8);
v_sG1 = v_mul_wrap(v_sG1, v_gainG) >> 8; v_sG1 = v_shr(v_mul_wrap(v_sG1, v_gainG), 8);
v_sG2 = v_mul_wrap(v_sG2, v_gainG) >> 8; v_sG2 = v_shr(v_mul_wrap(v_sG2, v_gainG), 8);
v_sR1 = v_mul_wrap(v_sR1, v_gainR) >> 8; v_sR1 = v_shr(v_mul_wrap(v_sR1, v_gainR), 8);
v_sR2 = v_mul_wrap(v_sR2, v_gainR) >> 8; v_sR2 = v_shr(v_mul_wrap(v_sR2, v_gainR), 8);
// Pack into vectors of v_uint8x16 // Pack into vectors of v_uint8x16
v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2));
@ -325,12 +325,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai
v_expand(v_inR, v_sR1, v_sR2); v_expand(v_inR, v_sR1, v_sR2);
// Multiply by scaling factors // Multiply by scaling factors
v_sB1 = (v_sB1 * v_gainB) >> 16; v_sB1 = v_shr(v_mul(v_sB1, v_gainB), 16);
v_sB2 = (v_sB2 * v_gainB) >> 16; v_sB2 = v_shr(v_mul(v_sB2, v_gainB), 16);
v_sG1 = (v_sG1 * v_gainG) >> 16; v_sG1 = v_shr(v_mul(v_sG1, v_gainG), 16);
v_sG2 = (v_sG2 * v_gainG) >> 16; v_sG2 = v_shr(v_mul(v_sG2, v_gainG), 16);
v_sR1 = (v_sR1 * v_gainR) >> 16; v_sR1 = v_shr(v_mul(v_sR1, v_gainR), 16);
v_sR2 = (v_sR2 * v_gainR) >> 16; v_sR2 = v_shr(v_mul(v_sR2, v_gainR), 16);
// Pack into vectors of v_uint16x8 // Pack into vectors of v_uint16x8
v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2));

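For the gray-world gains, the 8-bit path multiplies by fixed-point gains and shifts; the old ">> 8" becomes v_shr. The sketch below uses an example gain and my own 8.8 fixed-point conversion; only the v_expand/v_mul_wrap/v_shr/v_pack calls mirror the diff.

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

using namespace cv;

int main()
{
    float gainB = 1.2f;
    v_uint16x8 v_gainB = v_setall_u16((ushort)(gainB * 256.f)); // 8.8 fixed point
    uchar src[16], dst[16];
    for (int i = 0; i < 16; i++) src[i] = (uchar)(10 * i);

    v_uint8x16 v_in = v_load(src);
    v_uint16x8 v_s1, v_s2;
    v_expand(v_in, v_s1, v_s2);
    // old: v_mul_wrap(v_s1, v_gainB) >> 8
    v_s1 = v_shr(v_mul_wrap(v_s1, v_gainB), 8);
    v_s2 = v_shr(v_mul_wrap(v_s2, v_gainB), 8);
    v_store(dst, v_pack(v_s1, v_s2));

    for (int i = 0; i < 16; i++) std::printf("%d ", dst[i]);
    std::printf("\n");
    return 0;
}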
@ -192,7 +192,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src)
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_local_max = v_max(v_inB, v_max(v_inG, v_inR));
v_global_max = v_max(v_local_max, v_global_max); v_global_max = v_max(v_local_max, v_global_max);
v_mask = (v_local_max < v_thresh); v_mask = (v_lt(v_local_max, v_thresh));
v_store(mask_ptr + i, v_mask); v_store(mask_ptr + i, v_mask);
} }
uchar global_max[16]; uchar global_max[16];
@ -225,7 +225,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src)
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_local_max = v_max(v_inB, v_max(v_inG, v_inR));
v_global_max = v_max(v_local_max, v_global_max); v_global_max = v_max(v_local_max, v_global_max);
v_mask = (v_local_max < v_thresh); v_mask = (v_lt(v_local_max, v_thresh));
v_pack_store(mask_ptr + i, v_mask); v_pack_store(mask_ptr + i, v_mask);
} }
ushort global_max[8]; ushort global_max[8];
@ -270,9 +270,9 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_uint8x16 v_mask = v_load(mask_ptr + i); v_uint8x16 v_mask = v_load(mask_ptr + i);
v_inB &= v_mask; v_inB = v_and(v_inB, v_mask);
v_inG &= v_mask; v_inG = v_and(v_inG, v_mask);
v_inR &= v_mask; v_inR = v_and(v_inR, v_mask);
v_uint16x8 v_sR1, v_sR2, v_sG1, v_sG2, v_sB1, v_sB2; v_uint16x8 v_sR1, v_sR2, v_sG1, v_sG2, v_sB1, v_sB2;
v_expand(v_inB, v_sB1, v_sB2); v_expand(v_inB, v_sB1, v_sB2);
@ -280,33 +280,33 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_expand(v_inR, v_sR1, v_sR2); v_expand(v_inR, v_sR1, v_sR2);
// update the brightest (R,G,B) tuple (process left half): // update the brightest (R,G,B) tuple (process left half):
v_uint16x8 v_sum = v_sB1 + v_sG1 + v_sR1; v_uint16x8 v_sum = v_add(v_add(v_sB1, v_sG1), v_sR1);
v_uint16x8 v_max_mask = (v_sum > v_max_sum); v_uint16x8 v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum); v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_sB1 & v_max_mask) + (v_brightestB & (~v_max_mask)); v_brightestB = v_add(v_and(v_sB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = (v_sG1 & v_max_mask) + (v_brightestG & (~v_max_mask)); v_brightestG = v_add(v_and(v_sG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = (v_sR1 & v_max_mask) + (v_brightestR & (~v_max_mask)); v_brightestR = v_add(v_and(v_sR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update the brightest (R,G,B) tuple (process right half): // update the brightest (R,G,B) tuple (process right half):
v_sum = v_sB2 + v_sG2 + v_sR2; v_sum = v_add(v_add(v_sB2, v_sG2), v_sR2);
v_max_mask = (v_sum > v_max_sum); v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum); v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_sB2 & v_max_mask) + (v_brightestB & (~v_max_mask)); v_brightestB = v_add(v_and(v_sB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = (v_sG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); v_brightestG = v_add(v_and(v_sG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = (v_sR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); v_brightestR = v_add(v_and(v_sR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update sums: // update sums:
v_sB1 = v_sB1 + v_sB2; v_sB1 = v_add(v_sB1, v_sB2);
v_sG1 = v_sG1 + v_sG2; v_sG1 = v_add(v_sG1, v_sG2);
v_sR1 = v_sR1 + v_sR2; v_sR1 = v_add(v_sR1, v_sR2);
v_uint32x4 v_uint1, v_uint2; v_uint32x4 v_uint1, v_uint2;
v_expand(v_sB1, v_uint1, v_uint2); v_expand(v_sB1, v_uint1, v_uint2);
v_SB += v_uint1 + v_uint2; v_SB = v_add(v_SB, v_add(v_uint1, v_uint2));
v_expand(v_sG1, v_uint1, v_uint2); v_expand(v_sG1, v_uint1, v_uint2);
v_SG += v_uint1 + v_uint2; v_SG = v_add(v_SG, v_add(v_uint1, v_uint2));
v_expand(v_sR1, v_uint1, v_uint2); v_expand(v_sR1, v_uint1, v_uint2);
v_SR += v_uint1 + v_uint2; v_SR = v_add(v_SR, v_add(v_uint1, v_uint2));
} }
sumB = v_reduce_sum(v_SB); sumB = v_reduce_sum(v_SB);
sumG = v_reduce_sum(v_SG); sumG = v_reduce_sum(v_SG);
@ -361,11 +361,11 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_uint16x8 v_inB, v_inG, v_inR; v_uint16x8 v_inB, v_inG, v_inR;
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_uint16x8 v_mask = v_load_expand(mask_ptr + i); v_uint16x8 v_mask = v_load_expand(mask_ptr + i);
v_mask = v_mask | ((v_mask & v_mask_lower) << 8); v_mask = v_or(v_mask, v_shl<8>(v_and(v_mask, v_mask_lower)));
v_inB &= v_mask; v_inB = v_and(v_inB, v_mask);
v_inG &= v_mask; v_inG = v_and(v_inG, v_mask);
v_inR &= v_mask; v_inR = v_and(v_inR, v_mask);
v_uint32x4 v_iR1, v_iR2, v_iG1, v_iG2, v_iB1, v_iB2; v_uint32x4 v_iR1, v_iR2, v_iG1, v_iG2, v_iB1, v_iB2;
v_expand(v_inB, v_iB1, v_iB2); v_expand(v_inB, v_iB1, v_iB2);
@ -373,32 +373,32 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_expand(v_inR, v_iR1, v_iR2); v_expand(v_inR, v_iR1, v_iR2);
// update the brightest (R,G,B) tuple (process left half): // update the brightest (R,G,B) tuple (process left half):
v_uint32x4 v_sum = v_iB1 + v_iG1 + v_iR1; v_uint32x4 v_sum = v_add(v_add(v_iB1, v_iG1), v_iR1);
v_uint32x4 v_max_mask = (v_sum > v_max_sum); v_uint32x4 v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum); v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_iB1 & v_max_mask) + (v_brightestB & (~v_max_mask)); v_brightestB = v_add(v_and(v_iB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = (v_iG1 & v_max_mask) + (v_brightestG & (~v_max_mask)); v_brightestG = v_add(v_and(v_iG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = (v_iR1 & v_max_mask) + (v_brightestR & (~v_max_mask)); v_brightestR = v_add(v_and(v_iR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update the brightest (R,G,B) tuple (process right half): // update the brightest (R,G,B) tuple (process right half):
v_sum = v_iB2 + v_iG2 + v_iR2; v_sum = v_add(v_add(v_iB2, v_iG2), v_iR2);
v_max_mask = (v_sum > v_max_sum); v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum); v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_iB2 & v_max_mask) + (v_brightestB & (~v_max_mask)); v_brightestB = v_add(v_and(v_iB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = (v_iG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); v_brightestG = v_add(v_and(v_iG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = (v_iR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); v_brightestR = v_add(v_and(v_iR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update sums: // update sums:
v_iB1 = v_iB1 + v_iB2; v_iB1 = v_add(v_iB1, v_iB2);
v_iG1 = v_iG1 + v_iG2; v_iG1 = v_add(v_iG1, v_iG2);
v_iR1 = v_iR1 + v_iR2; v_iR1 = v_add(v_iR1, v_iR2);
v_uint64x2 v_uint64_1, v_uint64_2; v_uint64x2 v_uint64_1, v_uint64_2;
v_expand(v_iB1, v_uint64_1, v_uint64_2); v_expand(v_iB1, v_uint64_1, v_uint64_2);
v_SB += v_uint64_1 + v_uint64_2; v_SB = v_add(v_SB, v_add(v_uint64_1, v_uint64_2));
v_expand(v_iG1, v_uint64_1, v_uint64_2); v_expand(v_iG1, v_uint64_1, v_uint64_2);
v_SG += v_uint64_1 + v_uint64_2; v_SG = v_add(v_SG, v_add(v_uint64_1, v_uint64_2));
v_expand(v_iR1, v_uint64_1, v_uint64_2); v_expand(v_iR1, v_uint64_1, v_uint64_2);
v_SR += v_uint64_1 + v_uint64_2; v_SR = v_add(v_SR, v_add(v_uint64_1, v_uint64_2));
} }
uint64 sum_arr[2]; uint64 sum_arr[2];
v_store(sum_arr, v_SB); v_store(sum_arr, v_SB);

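The learning-based WB hunks rely on a per-lane select written as (value & mask) + (old & ~mask); with the new API that becomes v_and/v_not/v_add around a v_gt mask. A small sketch with made-up sums and channel values:

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

using namespace cv;

int main()
{
    v_uint16x8 v_max_sum    = v_setzero_u16();
    v_uint16x8 v_brightestB = v_setzero_u16();

    // two batches of (sum, blue) pairs, example data only
    v_uint16x8 sums[2]  = { v_uint16x8(10, 40, 5, 70, 20, 15, 90, 3),
                            v_uint16x8(50, 20, 60, 10, 25, 80, 30, 7) };
    v_uint16x8 blues[2] = { v_uint16x8(1, 2, 3, 4, 5, 6, 7, 8),
                            v_uint16x8(9, 10, 11, 12, 13, 14, 15, 16) };

    for (int k = 0; k < 2; k++)
    {
        v_uint16x8 v_max_mask = v_gt(sums[k], v_max_sum);   // old: (sum > max_sum)
        v_max_sum = v_max(sums[k], v_max_sum);
        // old: (b & mask) + (brightest & ~mask)
        v_brightestB = v_add(v_and(blues[k], v_max_mask),
                             v_and(v_brightestB, v_not(v_max_mask)));
    }

    ushort out[8];
    v_store(out, v_brightestB);
    for (int i = 0; i < 8; i++) std::printf("%d ", out[i]);
    std::printf("\n");
    return 0;
}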