Use new Universal Intrinsic API.

pull/3572/head
Liutong HAN 2 years ago
parent 3af65e6cb1
commit b9a460b857
16 changed files, with per-file change counts in parentheses (a sketch of the migration pattern follows this list):
  1. modules/optflow/src/rlof/berlof_invoker.hpp (224)
  2. modules/optflow/src/rlof/plk_invoker.hpp (74)
  3. modules/optflow/src/rlof/rlof_invoker.hpp (132)
  4. modules/optflow/src/rlof/rlof_invokerbase.hpp (76)
  5. modules/optflow/src/rlof/rlof_localflow.cpp (8)
  6. modules/rgbd/src/colored_tsdf.cpp (120)
  7. modules/rgbd/src/fast_icp.cpp (48)
  8. modules/rgbd/src/hash_tsdf.cpp (32)
  9. modules/rgbd/src/tsdf.cpp (90)
  10. modules/rgbd/src/tsdf_functions.cpp (83)
  11. modules/rgbd/src/tsdf_functions.hpp (2)
  12. modules/rgbd/src/utils.hpp (2)
  13. modules/ximgproc/src/anisodiff.cpp (20)
  14. modules/ximgproc/src/fgs_filter.cpp (58)
  15. modules/xphoto/src/grayworld_white_balance.cpp (56)
  16. modules/xphoto/src/learning_based_color_balance.cpp (82)
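
Every hunk in this commit follows the same mechanical rewrite: the arithmetic, bitwise, shift and comparison operators that the old universal-intrinsic vector types overloaded are replaced by the named functions of the new Universal Intrinsic API (v_add, v_sub, v_mul, v_and, v_shl/v_shr, v_gt/v_lt and so on). In each hunk below the removed operator-based lines appear first, immediately followed by the replacement lines that call the named intrinsics. A minimal sketch of the pattern, assuming OpenCV is built with CV_SIMD128; the helper name is hypothetical, only the intrinsic calls mirror the diff:

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Hypothetical helper illustrating the operator -> named-intrinsic rewrite.
static inline cv::v_int16x8 masked_scaled_diff(const cv::v_int16x8& v11,
                                               const cv::v_int16x8& vI,
                                               const cv::v_int16x8& vmask)
{
    // Old API (operator overloads), removed by this commit:
    //     ((v11 << 5) - vI) & vmask
    // New API (named universal intrinsics), added by this commit:
    return cv::v_and(cv::v_sub(cv::v_shl<5>(v11), vI), vmask);
}
#endif

The rewrite is purely syntactic: each hunk below reads as a one-for-one substitution of an operator expression with its named-intrinsic equivalent.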

@ -296,7 +296,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
@ -304,35 +304,35 @@ public:
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
diff0 = v_pack(t0, t1);
// I*gain.x + gain.x
v_int16x8 diff[4] =
{
((v11 << 5) - vI) & vmask,
((v01 << 5) - vI) & vmask,
((v10 << 5) - vI) & vmask,
((v00 << 5) - vI) & vmask
v_and(v_sub(v_shl<5>(v11), vI), vmask),
v_and(v_sub(v_shl<5>(v01), vI), vmask),
v_and(v_sub(v_shl<5>(v10), vI), vmask),
v_and(v_sub(v_shl<5>(v00), vI), vmask)
};
diff0 = diff0 - vI;
diff0 = diff0 & vmask;
diff0 = v_sub(diff0, vI);
diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
vset2 = vabs_diff < vparam1;
vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero;
v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0);
vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
@ -342,7 +342,7 @@ public:
for (unsigned int mmi = 0; mmi < 4; mmi++)
{
// It == 0 ? |It| > sigma13
diff0 = vset2 & diff[mmi];
diff0 = v_and(vset2, diff[mmi]);
// It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0);
@ -350,16 +350,16 @@ public:
// diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
v_int32x4 diff0_0 = diff_int_0 >> s2bitShift;
v_int32x4 diff0_1 = diff_int_1 >> s2bitShift;
v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift);
v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift);
diff0 = v_pack(diff0_0, diff0_1);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11));
vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
}
if (j == 0)
{
@ -387,8 +387,8 @@ public:
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
@ -402,8 +402,8 @@ public:
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
fxtale = v_mul(fx, vtale_1);
fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
@ -544,7 +544,7 @@ public:
float CV_DECL_ALIGNED(16) bbuf[4];
for (int mmi = 0; mmi < 4; mmi++)
{
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]);
v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b0[mmi] = bbuf[0] + bbuf[2];
_b1[mmi] = bbuf[1] + bbuf[3];
}
@ -960,7 +960,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
@ -968,38 +968,38 @@ public:
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
diff0 = v_pack(t0, t1);
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI;
v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI);
v_int16x8 diff[4] =
{
((v11 << 5) + diff_value) & vmask,
((v01 << 5) + diff_value) & vmask,
((v10 << 5) + diff_value) & vmask,
((v00 << 5) + diff_value) & vmask
v_and(v_add(v_shl<5>(v11), diff_value), vmask),
v_and(v_add(v_shl<5>(v01), diff_value), vmask),
v_and(v_add(v_shl<5>(v10), diff_value), vmask),
v_and(v_add(v_shl<5>(v00), diff_value), vmask)
};
diff0 = diff0 + diff_value;
diff0 = diff0 & vmask;
diff0 = v_add(diff0, diff_value);
diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
vset2 = vabs_diff < vparam1;
vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero;
v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0);
vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
@ -1009,7 +1009,7 @@ public:
for (unsigned int mmi = 0; mmi < 4; mmi++)
{
// It == 0 ? |It| > sigma13
diff0 = vset2 & diff[mmi];
diff0 = v_and(vset2, diff[mmi]);
// It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0);
@ -1017,22 +1017,22 @@ public:
// diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
v_int32x4 diff0_0 = diff_int_0 >> s2bitShift;
v_int32x4 diff0_1 = diff_int_1 >> s2bitShift;
v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift);
v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift);
diff0 = v_pack(diff0_0, diff0_1);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11));
vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
vqb2[mmi] += v_cvt_f32(diff0_0 * vI0);
vqb2[mmi] += v_cvt_f32(diff0_1 * vI1);
vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3[mmi] += v_cvt_f32(diff0_0);
vqb3[mmi] += v_cvt_f32(diff0_1);
vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0));
vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1));
}
if (j == 0)
{
@ -1060,29 +1060,29 @@ public:
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy
vsumIx += fxtale;
vsumIy += fytale;
vsumIx = v_add(vsumIx, fxtale);
vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI
v_float32x4 vI_tale = vI_ps * vtale_0;
vsumI += vI_tale;
v_float32x4 vI_tale = v_mul(vI_ps, vtale_0);
vsumI = v_add(vsumI, vI_tale);
// sumW
vsumW += vtale_0;
vsumW = v_add(vsumW, vtale_0);
// sumDI
vsumDI += vI_ps * vI_tale;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
@ -1092,29 +1092,29 @@ public:
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
fxtale = v_mul(fx, vtale_1);
fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy
vsumIx += fxtale;
vsumIy += fytale;
vsumIx = v_add(vsumIx, fxtale);
vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI
vI_tale = vI_ps * vtale_1;
vsumI += vI_tale;
vI_tale = v_mul(vI_ps, vtale_1);
vsumI = v_add(vsumI, vI_tale);
// sumW
vsumW += vtale_1;
vsumW = v_add(vsumW, vtale_1);
// sumDI
vsumDI += vI_ps * vI_tale;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
}
}
@ -1304,7 +1304,7 @@ public:
float CV_DECL_ALIGNED(16) bbuf[4];
for(int mmi = 0; mmi < 4; mmi++)
{
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]);
v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b0[mmi] = bbuf[0] + bbuf[2];
_b1[mmi] = bbuf[1] + bbuf[3];
_b2[mmi] = v_reduce_sum(vqb2[mmi]);
@ -1655,14 +1655,14 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int16x8 diff[4] =
{
((v00 << 5) - vI) & vmask,
((v01 << 5) - vI) & vmask,
((v10 << 5) - vI) & vmask,
((v11 << 5) - vI) & vmask,
v_and(v_sub(v_shl<5>(v00), vI), vmask),
v_and(v_sub(v_shl<5>(v01), vI), vmask),
v_and(v_sub(v_shl<5>(v10), vI), vmask),
v_and(v_sub(v_shl<5>(v11), vI), vmask),
};
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
@ -1672,8 +1672,8 @@ public:
v_zip(diff[mmi], diff[mmi], diff1, diff0);
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff1, diff0, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11));
vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
}
}
#else
@ -1704,7 +1704,7 @@ public:
float CV_DECL_ALIGNED(16) bbuf[4];
for (int mmi = 0; mmi < 4; mmi++)
{
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]);
v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b1[mmi] = bbuf[0] + bbuf[2];
_b2[mmi] = bbuf[1] + bbuf[3];
}
@ -2071,7 +2071,7 @@ namespace radial {
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
@ -2079,21 +2079,21 @@ namespace radial {
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
diff0 = v_pack(t0, t1);
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI;
v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI);
v_int16x8 diff[4] =
{
((v11 << 5) + diff_value) & vmask,
((v01 << 5) + diff_value) & vmask,
((v10 << 5) + diff_value) & vmask,
((v00 << 5) + diff_value) & vmask
v_and(v_add(v_shl<5>(v11), diff_value), vmask),
v_and(v_add(v_shl<5>(v01), diff_value), vmask),
v_and(v_add(v_shl<5>(v10), diff_value), vmask),
v_and(v_add(v_shl<5>(v00), diff_value), vmask)
};
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
@ -2109,14 +2109,14 @@ namespace radial {
v_zip(diff[mmi], diff[mmi], diff2, diff1);
v_zip(diff2, diff1, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11));
vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
vqb2[mmi] += v_cvt_f32(diff0_0 * vI0);
vqb2[mmi] += v_cvt_f32(diff0_1 * vI1);
vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3[mmi] += v_cvt_f32(diff0_0);
vqb3[mmi] += v_cvt_f32(diff0_1);
vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0));
vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1));
}
if (j == 0)
{
@ -2133,17 +2133,17 @@ namespace radial {
vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy
vsumIx += fx;
vsumIy += fy;
vsumIx = v_add(vsumIx, fx);
vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI
vsumI += vI_ps;
vsumI = v_add(vsumI, vI_ps);
// sumDI
vsumDI += vI_ps * vI_ps;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
@ -2158,17 +2158,17 @@ namespace radial {
vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy
vsumIx += fx;
vsumIy += fy;
vsumIx = v_add(vsumIx, fx);
vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI
vsumI += vI_ps;
vsumI = v_add(vsumI, vI_ps);
// sumDI
vsumDI += vI_ps * vI_ps;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
}
}
@ -2299,7 +2299,7 @@ namespace radial {
float CV_DECL_ALIGNED(16) bbuf[4];
for (int mmi = 0; mmi < 4; mmi++)
{
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]);
v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b0[mmi] = bbuf[0] + bbuf[2];
_b1[mmi] = bbuf[1] + bbuf[3];
_b2[mmi] = v_reduce_sum(vqb2[mmi]);
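
The berlof_invoker.hpp hunks above also show the two remaining flavours of the rewrite: compound assignments (+=) become explicit v_add assignments, and comparisons combined with bitwise masking become v_lt/v_gt plus v_and. A short sketch under the same CV_SIMD128 assumption; the helper names are hypothetical:

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Accumulation: old "acc += v_cvt_f32(v_dotprod(a, b));" has no operator form in the new API.
static inline void accumulate_dotprod(cv::v_float32x4& acc,
                                      const cv::v_int16x8& a,
                                      const cv::v_int16x8& b)
{
    acc = cv::v_add(acc, cv::v_cvt_f32(cv::v_dotprod(a, b)));
}

// Thresholding: old "(vabs_diff < vparam1) & (vabs_diff > vparam0)".
static inline cv::v_int16x8 band_mask(const cv::v_int16x8& vabs_diff,
                                      const cv::v_int16x8& vparam0,
                                      const cv::v_int16x8& vparam1)
{
    return cv::v_and(cv::v_lt(vabs_diff, vparam1), cv::v_gt(vabs_diff, vparam0));
}
#endif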

@ -229,7 +229,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
@ -237,17 +237,17 @@ public:
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
// diff = J - I
diff0 = v_pack(t0, t1) - vI;
diff0 = v_sub(v_pack(t0, t1), vI);
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value;
diff0 = diff0 & vmask;
diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value);
diff0 = v_and(diff0, vmask);
v_zip(diff0, diff0, diff2, diff1);
v_int32x4 diff0_0;
@ -259,16 +259,16 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
v_int32x4 vI0, vI1;
v_expand(vI, vI0, vI1);
vqb2 += v_cvt_f32(diff0_0 * vI0);
vqb2 += v_cvt_f32(diff0_1 * vI1);
vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3 += v_cvt_f32(diff0_0);
vqb3 += v_cvt_f32(diff0_1);
vqb3 = v_add(vqb3, v_cvt_f32(diff0_0));
vqb3 = v_add(vqb3, v_cvt_f32(diff0_1));
if (j == 0)
{
@ -285,17 +285,17 @@ public:
vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy
vsumIx += fx;
vsumIy += fy;
vsumIx = v_add(vsumIx, fx);
vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI
vsumI += vI_ps;
vsumI = v_add(vsumI, vI_ps);
// sumDI
vsumDI += vI_ps * vI_ps;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
@ -309,17 +309,17 @@ public:
vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy
vsumIx += fx;
vsumIy += fy;
vsumIx = v_add(vsumIx, fx);
vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI
vsumI += vI_ps;
vsumI = v_add(vsumI, vI_ps);
// sumDI
vsumDI += vI_ps * vI_ps;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
}
}
#else
@ -388,7 +388,7 @@ public:
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1);
v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
b3 = v_reduce_sum(vqb2);
@ -696,19 +696,19 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
diff0 = v_pack(t0, t1) - diff0;
diff0 = diff0 & vmask;
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
diff0 = v_sub(v_pack(t0, t1), diff0);
diff0 = v_and(diff0, vmask);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
@ -717,8 +717,8 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
}
#else
for( ; x < winSize.width*cn; x++, dIptr += 2 )
@ -737,7 +737,7 @@ public:
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1);
v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
#endif

@ -246,35 +246,35 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
diff0 = v_pack(t0, t1) - diff0;
diff0 = diff0 & vmask;
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
diff0 = v_sub(v_pack(t0, t1), diff0);
diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
vset2 = vabs_diff < vparam1;
vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero;
v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0);
vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
// It == 0 ? |It| > sigma13
diff0 = vset2 & diff0;
diff0 = v_and(vset2, diff0);
// It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0);
@ -282,7 +282,7 @@ public:
// diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
diff0 = v_pack(diff_int_0 >> s2bitShift, diff_int_1 >> s2bitShift);
diff0 = v_pack(v_shr(diff_int_0, s2bitShift), v_shr(diff_int_1, s2bitShift));
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
@ -290,8 +290,8 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
if (j == 0)
{
v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1;
@ -316,8 +316,8 @@ public:
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
@ -330,8 +330,8 @@ public:
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
fxtale = v_mul(fx, vtale_1);
fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
@ -431,7 +431,7 @@ public:
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1);
v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 += bbuf[0] + bbuf[2];
b2 += bbuf[1] + bbuf[3];
#endif
@ -769,7 +769,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
@ -777,33 +777,33 @@ public:
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
// diff = J - I
diff0 = v_pack(t0, t1) - vI;
diff0 = v_sub(v_pack(t0, t1), vI);
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value;
diff0 = diff0 & vmask;
diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value);
diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
vset2 = vabs_diff < vparam1;
vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero;
v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0);
vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
// It == 0 ? |It| > sigma13
diff0 = vset2 & diff0;
diff0 = v_and(vset2, diff0);
// It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0);
@ -811,8 +811,8 @@ public:
// diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
v_int32x4 diff0_0 = diff_int_0 >> s2bitShift;
v_int32x4 diff0_1 = diff_int_1 >> s2bitShift;
v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift);
v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift);
diff0 = v_pack(diff0_0, diff0_1);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
@ -821,16 +821,16 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
v_int32x4 vI0, vI1;
v_expand(vI, vI0, vI1);
vqb2 += v_cvt_f32(diff0_0 * vI0);
vqb2 += v_cvt_f32(diff0_1 * vI1);
vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3 += v_cvt_f32(diff0_0);
vqb3 += v_cvt_f32(diff0_1);
vqb3 = v_add(vqb3, v_cvt_f32(diff0_0));
vqb3 = v_add(vqb3, v_cvt_f32(diff0_1));
if (j == 0)
{
@ -858,29 +858,29 @@ public:
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy
vsumIx += fxtale;
vsumIy += fytale;
vsumIx = v_add(vsumIx, fxtale);
vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI
v_float32x4 vI_tale = vI_ps * vtale_0;
vsumI += vI_tale;
v_float32x4 vI_tale = v_mul(vI_ps, vtale_0);
vsumI = v_add(vsumI, vI_tale);
// sumW
vsumW += vtale_0;
vsumW = v_add(vsumW, vtale_0);
// sumDI
vsumDI += vI_ps * vI_tale;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
@ -890,29 +890,29 @@ public:
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
fxtale = v_mul(fx, vtale_1);
fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy
vsumIx += fxtale;
vsumIy += fytale;
vsumIx = v_add(vsumIx, fxtale);
vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI
vI_tale = vI_ps * vtale_1;
vsumI += vI_tale;
vI_tale = v_mul(vI_ps, vtale_1);
vsumI = v_add(vsumI, vI_tale);
// sumW
vsumW += vtale_1;
vsumW = v_add(vsumW, vtale_1);
// sumDI
vsumDI += vI_ps * vI_tale;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
}
}
#else
@ -1017,7 +1017,7 @@ public:
}
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1);
v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
b3 = v_reduce_sum(vqb2);

@ -71,15 +71,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
for (; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2)
{
v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32;
v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32;
v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32);
v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32);
if (x + 4 > winSize.width)
{
vmask0 = vmask0 & vmask_border_0;
vmask0 = v_and(vmask0, vmask_border_0);
}
if (x + 8 > winSize.width)
{
vmask1 = vmask1 & vmask_border_1;
vmask1 = v_and(vmask1, vmask_border_1);
}
v_int32x4 t0, t1;
@ -91,10 +91,10 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5) & vmask0;
t1 = t1 >> (W_BITS - 5) & vmask1;
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_and(v_shr(t0, W_BITS - 5), vmask0);
t1 = v_and(v_shr(t1, W_BITS - 5), vmask1);
v_store(Iptr + x, v_pack(t0, t1));
v00 = v_reinterpret_as_s16(v_load(dsrc));
@ -105,12 +105,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS);
t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask0);
v00 = v_and(v00, v_reinterpret_as_s16(vmask0));
v_store(dIptr, v00);
v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2));
@ -121,12 +121,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS);
t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask1);
v00 = v_and(v00, v_reinterpret_as_s16(vmask1));
v_store(dIptr + 4 * 2, v00);
}
#else
@ -187,15 +187,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
#if CV_SIMD128
for (int x = 0; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2)
{
v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32;
v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32;
v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32);
v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32);
if (x + 4 > winSize.width)
{
vmask0 = vmask0 & vmask_border0;
vmask0 = v_and(vmask0, vmask_border0);
}
if (x + 8 > winSize.width)
{
vmask1 = vmask1 & vmask_border1;
vmask1 = v_and(vmask1, vmask_border1);
}
v_int32x4 t0, t1;
@ -207,12 +207,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = t0 & vmask0;
t1 = t1 & vmask1;
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
t0 = v_and(t0, vmask0);
t1 = v_and(t1, vmask1);
v_store(Iptr + x, v_pack(t0, t1));
v00 = v_reinterpret_as_s16(v_load(dsrc));
@ -223,12 +223,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS);
t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask0);
v00 = v_and(v00, v_reinterpret_as_s16(vmask0));
v_store(dIptr, v00);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
@ -249,12 +249,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS);
t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask1);
v00 = v_and(v00, v_reinterpret_as_s16(vmask1));
v_store(dIptr + 4 * 2, v00);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));

@ -52,8 +52,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
v_int16x8 t1 = s2 - s0;
v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10);
v_int16x8 t1 = v_sub(s2, s0);
v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10));
v_store(trow0 + x, t0);
v_store(trow1 + x, t1);
@ -90,8 +90,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
v_int16x8 s3 = v_load(trow1 + x);
v_int16x8 s4 = v_load(trow1 + x + cn);
v_int16x8 t0 = s1 - s0;
v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10);
v_int16x8 t0 = v_sub(s1, s0);
v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10));
v_store_interleave((drow + x * 2), t0, t1);
}
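
The rlof_localflow.cpp hunk above applies the same substitution to the Scharr derivative rows, where v_mul_wrap was already a named call and only the surrounding additions change. A sketch under the same CV_SIMD128 assumption; the helper name and the constant setup are hypothetical:

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Old API:  v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10)
static inline cv::v_int16x8 scharr_row(const cv::v_int16x8& s0,
                                       const cv::v_int16x8& s1,
                                       const cv::v_int16x8& s2)
{
    const cv::v_int16x8 c3 = cv::v_setall_s16(3), c10 = cv::v_setall_s16(10);
    return cv::v_add(cv::v_mul_wrap(cv::v_add(s0, s2), c3),
                     cv::v_mul_wrap(s1, c10));
}
#endif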

@ -194,21 +194,21 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
{
// tx, ty, tz = floor(p)
v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0();
float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0();
float tz = v_get0(t);
int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
int ix = ip.get0();
int ix = v_get0(ip);
ip = v_rotate_right<1>(ip);
int iy = ip.get0();
int iy = v_get0(ip);
ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int iz = v_get0(ip);
int coordBase = ix * xdim + iy * ydim + iz * zdim;
@ -218,15 +218,15 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6]));
v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7]));
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
return v0 + tx * (v1 - v0);
}
@ -276,27 +276,27 @@ inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const
inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
{
if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) ||
v_check_any(p >= v_float32x4((float)(volResolution.x - 2),
if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) ||
v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2),
(float)(volResolution.y - 2),
(float)(volResolution.z - 2), 1.f))
(float)(volResolution.z - 2), 1.f)))
)
return nanv;
v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0();
float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0();
float tz = v_get0(t);
const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
int ix = ip.get0(); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = v_get0(ip);
int coordBase = ix * xdim + iy * ydim + iz * zdim;
@ -314,23 +314,23 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) co
v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]);
v_float32x4 v1357(vx[1], vx[3], vx[5], vx[7]);
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
nv = v0 + tx * (v1 - v0);
}
v_float32x4 n = v_load_aligned(an);
v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n * n)));
v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n))));
return Norm.get0() < 0.0001f ? nanv : n / Norm;
return v_get0(Norm) < 0.0001f ? nanv : v_div(n, Norm);
}
#else
inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& p) const
@ -388,15 +388,15 @@ inline float ColoredTSDFVolumeCPU::interpolateColor(float tx, float ty, float tz
v_float32x4 v0246, v1357;
v_load_deinterleave(vx, v0246, v1357);
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
return v0 + tx * (v1 - v0);
}
@ -427,10 +427,10 @@ inline Point3f ColoredTSDFVolumeCPU::getColorVoxel(const Point3f& _p) const
}
inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) const
{
if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) ||
v_check_any(p >= v_float32x4((float)(volResolution.x - 2),
if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) ||
v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2),
(float)(volResolution.y - 2),
(float)(volResolution.z - 2), 1.f))
(float)(volResolution.z - 2), 1.f)))
)
return nanv;
@ -439,9 +439,9 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con
const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
int ix = ip.get0(); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = v_get0(ip);
int coordBase = ix * xdim + iy * ydim + iz * zdim;
float CV_DECL_ALIGNED(16) rgb[4];
@ -456,12 +456,12 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con
}
v_float32x4 vsi(voxelSizeInv, voxelSizeInv, voxelSizeInv, voxelSizeInv);
v_float32x4 ptVox = p * vsi;
v_float32x4 ptVox = v_mul(p, vsi);
v_int32x4 iptVox = v_floor(ptVox);
v_float32x4 t = ptVox - v_cvt_f32(iptVox);
float tx = t.get0(); t = v_rotate_right<1>(t);
float ty = t.get0(); t = v_rotate_right<1>(t);
float tz = t.get0();
v_float32x4 t = v_sub(ptVox, v_cvt_f32(iptVox));
float tx = v_get0(t); t = v_rotate_right<1>(t);
float ty = v_get0(t); t = v_rotate_right<1>(t);
float tz = v_get0(t);
rgb[0] = interpolateColor(tx, ty, tz, r);
rgb[1] = interpolateColor(tx, ty, tz, g);
rgb[2] = interpolateColor(tx, ty, tz, b);
@ -583,21 +583,21 @@ struct ColorRaycastInvoker : ParallelLoopBody
// get direction through pixel in volume space:
// 1. reproject (x, y) on projecting plane where z = 1.f
v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy) * vfxy;
v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy);
planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f));
// 2. rotate to volume space
planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32());
// 3. normalize
v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed * planed)));
v_float32x4 dir = planed * invNorm;
v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed))));
v_float32x4 dir = v_mul(planed, invNorm);
// compute intersection of ray with all six bbox planes
v_float32x4 rayinv = v_setall_f32(1.f) / dir;
v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir);
// div by zero should be eliminated by these products
v_float32x4 tbottom = rayinv * (boxDown - orig);
v_float32x4 ttop = rayinv * (boxUp - orig);
v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig));
v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig));
// re-order intersections to find smallest and largest on each axis
v_float32x4 minAx = v_min(ttop, tbottom);
@ -618,14 +618,14 @@ struct ColorRaycastInvoker : ParallelLoopBody
if (tmin < tmax)
{
// interpolation optimized a little
orig *= invVoxelSize;
dir *= invVoxelSize;
orig = v_mul(orig, invVoxelSize);
dir = v_mul(dir, invVoxelSize);
int xdim = volume.volDims[0];
int ydim = volume.volDims[1];
int zdim = volume.volDims[2];
v_float32x4 rayStep = dir * v_setall_f32(tstep);
v_float32x4 next = (orig + dir * v_setall_f32(tmin));
v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep));
v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin))));
float f = volume.interpolateVoxel(next), fnext = f;
//raymarch
@ -633,11 +633,11 @@ struct ColorRaycastInvoker : ParallelLoopBody
int nSteps = cvFloor((tmax - tmin) / tstep);
for (; steps < nSteps; steps++)
{
next += rayStep;
next = v_add(next, rayStep);
v_int32x4 ip = v_round(next);
int ix = ip.get0(); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = v_get0(ip);
int coord = ix * xdim + iy * ydim + iz * zdim;
fnext = tsdfToFloat(volume.volume.at<RGBTsdfVoxel>(coord).tsdf);
@ -657,7 +657,7 @@ struct ColorRaycastInvoker : ParallelLoopBody
// linearly interpolate t between two f values
if (f > 0.f && fnext < 0.f)
{
v_float32x4 tp = next - rayStep;
v_float32x4 tp = v_sub(next, rayStep);
float ft = volume.interpolateVoxel(tp);
float ftdt = volume.interpolateVoxel(next);
float ts = tmin + tstep * (steps - ft / (ftdt - ft));
@ -665,7 +665,7 @@ struct ColorRaycastInvoker : ParallelLoopBody
// avoid division by zero
if (!cvIsNaN(ts) && !cvIsInf(ts))
{
v_float32x4 pv = (orig + dir * v_setall_f32(ts));
v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts))));
v_float32x4 nv = volume.getNormalVoxel(pv);
v_float32x4 cv = volume.getColorVoxel(pv);
@ -675,9 +675,7 @@ struct ColorRaycastInvoker : ParallelLoopBody
//convert pv and nv to camera space
normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32());
// interpolation optimized a little
point = v_matmuladd(pv * v_float32x4(volume.voxelSize,
volume.voxelSize,
volume.voxelSize, 1.f),
point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)),
volRot0, volRot1, volRot2, volTrans);
}
}
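
Beyond the operator substitutions, the rgbd hunks above replace the member accessor .get0() with the free function v_get0() and vector division with v_div(). A sketch of the normalisation idiom used in getNormalVoxel, under the same CV_SIMD128 assumption; the helper name is hypothetical:

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Old API:  Norm = v_sqrt(v_setall_f32(v_reduce_sum(n * n)));
//           return Norm.get0() < 0.0001f ? nanv : n / Norm;
static inline cv::v_float32x4 normalize_or_nan(const cv::v_float32x4& n,
                                               const cv::v_float32x4& nanv)
{
    cv::v_float32x4 Norm = cv::v_sqrt(cv::v_setall_f32(cv::v_reduce_sum(cv::v_mul(n, n))));
    return cv::v_get0(Norm) < 0.0001f ? nanv : cv::v_div(n, Norm);
}
#endif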

@ -138,7 +138,7 @@ bool ICPImpl::estimateTransformT(cv::Affine3f& transform,
#if USE_INTRINSICS
static inline bool fastCheck(const v_float32x4& p0, const v_float32x4& p1)
{
float check = (p0.get0() + p1.get0());
float check = (v_get0(p0) + v_get0(p1));
return !cvIsNaN(check);
}
@ -160,7 +160,7 @@ static inline v_float32x4 crossProduct(const v_float32x4& a, const v_float32x4&
v_float32x4 ayzx, azxy, byzx, bzxy;
getCrossPerm(a, ayzx, azxy);
getCrossPerm(b, byzx, bzxy);
return ayzx*bzxy - azxy*byzx;
return v_sub(v_mul(ayzx, bzxy), v_mul(azxy, byzx));
}
#else
static inline bool fastCheck(const Point3f& p)
@ -235,11 +235,11 @@ struct GetAbInvoker : ParallelLoopBody
//find correspondence by projecting the point
v_float32x4 oldCoords;
float pz = (v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP))).get0());
float pz = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP))));
// x, y, 0, 0
oldCoords = v_muladd(newP/v_setall_f32(pz), vfxy, vcxy);
oldCoords = v_muladd(v_div(newP, v_setall_f32(pz)), vfxy, vcxy);
if(!v_check_all((oldCoords >= v_setzero_f32()) & (oldCoords < vframe)))
if(!v_check_all(v_and(v_ge(oldCoords, v_setzero_f32()), v_lt(oldCoords, vframe))))
continue;
// bilinearly interpolate oldPts and oldNrm under oldCoords point
@ -247,12 +247,12 @@ struct GetAbInvoker : ParallelLoopBody
v_float32x4 oldN;
{
v_int32x4 ixy = v_floor(oldCoords);
v_float32x4 txy = oldCoords - v_cvt_f32(ixy);
int xi = ixy.get0();
int yi = v_rotate_right<1>(ixy).get0();
v_float32x4 tx = v_setall_f32(txy.get0());
v_float32x4 txy = v_sub(oldCoords, v_cvt_f32(ixy));
int xi = v_get0(ixy);
int yi = v_get0(v_rotate_right<1>(ixy));
v_float32x4 tx = v_setall_f32(v_get0(txy));
txy = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(txy)));
v_float32x4 ty = v_setall_f32(txy.get0());
v_float32x4 ty = v_setall_f32(v_get0(txy));
const float* prow0 = (const float*)oldPts[yi+0];
const float* prow1 = (const float*)oldPts[yi+1];
@ -275,23 +275,23 @@ struct GetAbInvoker : ParallelLoopBody
// NaN check is done later
v_float32x4 p0 = p00 + tx*(p01 - p00);
v_float32x4 p1 = p10 + tx*(p11 - p10);
oldP = p0 + ty*(p1 - p0);
v_float32x4 p0 = v_add(p00, v_mul(tx, v_sub(p01, p00)));
v_float32x4 p1 = v_add(p10, v_mul(tx, v_sub(p11, p10)));
oldP = v_add(p0, v_mul(ty, v_sub(p1, p0)));
v_float32x4 n0 = n00 + tx*(n01 - n00);
v_float32x4 n1 = n10 + tx*(n11 - n10);
oldN = n0 + ty*(n1 - n0);
v_float32x4 n0 = v_add(n00, v_mul(tx, v_sub(n01, n00)));
v_float32x4 n1 = v_add(n10, v_mul(tx, v_sub(n11, n10)));
oldN = v_add(n0, v_mul(ty, v_sub(n1, n0)));
}
bool oldPNcheck = fastCheck(oldP, oldN);
//filter by distance
v_float32x4 diff = newP - oldP;
bool distCheck = !(v_reduce_sum(diff*diff) > sqThresh);
v_float32x4 diff = v_sub(newP, oldP);
bool distCheck = !(v_reduce_sum(v_mul(diff, diff)) > sqThresh);
//filter by angle
bool angleCheck = !(abs(v_reduce_sum(newN*oldN)) < cosThresh);
bool angleCheck = !(abs(v_reduce_sum(v_mul(newN, oldN))) < cosThresh);
if(!(oldPNcheck && distCheck && angleCheck))
continue;
@ -299,17 +299,17 @@ struct GetAbInvoker : ParallelLoopBody
// build point-wise vector ab = [ A | b ]
v_float32x4 VxNv = crossProduct(newP, oldN);
Point3f VxN;
VxN.x = VxNv.get0();
VxN.y = v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0();
VxN.z = v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0();
VxN.x = v_get0(VxNv);
VxN.y = v_get0(v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32())));
VxN.z = v_get0(v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32())));
float dotp = -v_reduce_sum(oldN*diff);
float dotp = -v_reduce_sum(v_mul(oldN, diff));
// build point-wise upper-triangle matrix [ab^T * ab] w/o last row
// which is [A^T*A | A^T*b]
// and gather sum
v_float32x4 vd = VxNv | v_float32x4(0, 0, 0, dotp);
v_float32x4 vd = v_or(VxNv, v_float32x4(0, 0, 0, dotp));
v_float32x4 n = oldN;
v_float32x4 nyzx;
{

@ -425,15 +425,15 @@ inline float interpolate(float tx, float ty, float tz, float vx[8])
v_float32x4 v0246, v1357;
v_load_deinterleave(vx, v0246, v1357);
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
return v0 + tx * (v1 - v0);
}
@ -598,9 +598,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const
v_float32x8 czp = v_lut(vals, idxzp);
v_float32x8 czn = v_lut(vals, idxzn);
v_float32x8 vcxv = cxn - cxp;
v_float32x8 vcyv = cyn - cyp;
v_float32x8 vczv = czn - czp;
v_float32x8 vcxv = v_sub(cxn, cxp);
v_float32x8 vcyv = v_sub(cyn, cyp);
v_float32x8 vczv = v_sub(czn, czp);
v_store(cxv, vcxv);
v_store(cyv, vcyv);
@ -615,9 +615,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const
v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4);
v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4);
v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1;
v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1;
v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1;
v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1);
v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1);
v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1);
v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1);
v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1);
@ -1523,9 +1523,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const
v_float32x8 czp = v_lut(vals, idxzp);
v_float32x8 czn = v_lut(vals, idxzn);
v_float32x8 vcxv = cxn - cxp;
v_float32x8 vcyv = cyn - cyp;
v_float32x8 vczv = czn - czp;
v_float32x8 vcxv = v_sub(cxn, cxp);
v_float32x8 vcyv = v_sub(cyn, cyp);
v_float32x8 vczv = v_sub(czn, czp);
v_store(cxv, vcxv);
v_store(cyv, vcyv);
@ -1540,9 +1540,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const
v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4);
v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4);
v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1;
v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1;
v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1;
v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1);
v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1);
v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1);
v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1);
v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1);

@ -146,21 +146,21 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
{
// tx, ty, tz = floor(p)
v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0();
float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0();
float tz = v_get0(t);
int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
int ix = ip.get0();
int ix = v_get0(ip);
ip = v_rotate_right<1>(ip);
int iy = ip.get0();
int iy = v_get0(ip);
ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int iz = v_get0(ip);
int coordBase = ix*xdim + iy*ydim + iz*zdim;
@ -170,15 +170,15 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6]));
v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7]));
v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
return v0 + tx*(v1 - v0);
}
@ -228,27 +228,27 @@ inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const
inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
{
if(v_check_any (p < v_float32x4(1.f, 1.f, 1.f, 0.f)) ||
v_check_any (p >= v_float32x4((float)(volResolution.x-2),
if(v_check_any (v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) ||
v_check_any (v_ge(p, v_float32x4((float)(volResolution.x-2),
(float)(volResolution.y-2),
(float)(volResolution.z-2), 1.f))
(float)(volResolution.z-2), 1.f)))
)
return nanv;
v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0();
float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0();
float tz = v_get0(t);
const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
int ix = ip.get0(); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = v_get0(ip);
int coordBase = ix*xdim + iy*ydim + iz*zdim;
@ -266,23 +266,23 @@ inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
v_float32x4 v0246 (vx[0], vx[2], vx[4], vx[6]);
v_float32x4 v1357 (vx[1], vx[3], vx[5], vx[7]);
v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
nv = v0 + tx*(v1 - v0);
}
v_float32x4 n = v_load_aligned(an);
v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n*n)));
v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n))));
return Norm.get0() < 0.0001f ? nanv : n/Norm;
return v_get0(Norm) < 0.0001f ? nanv : v_div(n, Norm);
}
#else
inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& p) const
@ -394,21 +394,21 @@ struct RaycastInvoker : ParallelLoopBody
// get direction through pixel in volume space:
// 1. reproject (x, y) on projecting plane where z = 1.f
v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy)*vfxy;
v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy);
planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f));
// 2. rotate to volume space
planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32());
// 3. normalize
v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed*planed)));
v_float32x4 dir = planed*invNorm;
v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed))));
v_float32x4 dir = v_mul(planed, invNorm);
// compute intersection of ray with all six bbox planes
v_float32x4 rayinv = v_setall_f32(1.f)/dir;
v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir);
// div by zero should be eliminated by these products
v_float32x4 tbottom = rayinv*(boxDown - orig);
v_float32x4 ttop = rayinv*(boxUp - orig);
v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig));
v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig));
// re-order intersections to find smallest and largest on each axis
v_float32x4 minAx = v_min(ttop, tbottom);
@ -429,14 +429,14 @@ struct RaycastInvoker : ParallelLoopBody
if(tmin < tmax)
{
// interpolation optimized a little
orig *= invVoxelSize;
dir *= invVoxelSize;
orig = v_mul(orig, invVoxelSize);
dir = v_mul(dir, invVoxelSize);
int xdim = volume.volDims[0];
int ydim = volume.volDims[1];
int zdim = volume.volDims[2];
v_float32x4 rayStep = dir * v_setall_f32(tstep);
v_float32x4 next = (orig + dir * v_setall_f32(tmin));
v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep));
v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin))));
float f = volume.interpolateVoxel(next), fnext = f;
//raymarch
@ -444,11 +444,11 @@ struct RaycastInvoker : ParallelLoopBody
int nSteps = cvFloor((tmax - tmin)/tstep);
for(; steps < nSteps; steps++)
{
next += rayStep;
next = v_add(next, rayStep);
v_int32x4 ip = v_round(next);
int ix = ip.get0(); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = v_get0(ip);
int coord = ix*xdim + iy*ydim + iz*zdim;
fnext = tsdfToFloat(volume.volume.at<TsdfVoxel>(coord).tsdf);
@ -468,7 +468,7 @@ struct RaycastInvoker : ParallelLoopBody
// linearly interpolate t between two f values
if(f > 0.f && fnext < 0.f)
{
v_float32x4 tp = next - rayStep;
v_float32x4 tp = v_sub(next, rayStep);
float ft = volume.interpolateVoxel(tp);
float ftdt = volume.interpolateVoxel(next);
float ts = tmin + tstep*(steps - ft/(ftdt - ft));
@ -476,7 +476,7 @@ struct RaycastInvoker : ParallelLoopBody
// avoid division by zero
if(!cvIsNaN(ts) && !cvIsInf(ts))
{
v_float32x4 pv = (orig + dir*v_setall_f32(ts));
v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts))));
v_float32x4 nv = volume.getNormalVoxel(pv);
if(!isNaN(nv))
@ -484,9 +484,7 @@ struct RaycastInvoker : ParallelLoopBody
//convert pv and nv to camera space
normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32());
// interpolation optimized a little
point = v_matmuladd(pv*v_float32x4(volume.voxelSize,
volume.voxelSize,
volume.voxelSize, 1.f),
point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)),
volRot0, volRot1, volRot2, volTrans);
}
}

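The interpolateVoxel/getNormalVoxel hunks above also switch lane access from the .get0() member to the free function v_get0(); the next lane is reached by rotating the vector. A condensed, self-contained sketch of that idiom, assuming CV_SIMD128; splitXYZ is an illustrative name only:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Extract the first three lanes of a 4-float vector, as the TSDF code does for
// the fractional coordinates tx/ty/tz and the integer indices ix/iy/iz.
static void splitXYZ(const v_float32x4& p, float& x, float& y, float& z)
{
    v_float32x4 t = p;
    x = v_get0(t);
    t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
    y = v_get0(t);
    t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
    z = v_get0(t);
}
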
@ -235,35 +235,33 @@ void integrateVolumeUnit(
// optimization of the following:
//Point3f volPt = Point3f(x, y, z)*voxelSize;
//Point3f camSpacePt = vol2cam * volPt;
camSpacePt += zStep;
camSpacePt = v_add(camSpacePt, zStep);
float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0();
float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))));
if (zCamSpace <= 0.f)
continue;
v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace);
v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace));
v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy);
// leave only first 2 lanes
projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) &
v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0)));
depthType v;
// bilinearly interpolate depth at projected
{
const v_float32x4& pt = projected;
// check coords >= 0 and < imgSize
v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) |
v_reinterpret_as_u32(pt >= upLimits);
limits = limits | v_rotate_right<1>(limits);
if (limits.get0())
v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits)));
limits = v_or(limits, v_rotate_right<1>(limits));
if (v_get0(limits))
continue;
// xi, yi = floor(pt)
v_int32x4 ip = v_floor(pt);
v_int32x4 ipshift = ip;
int xi = ipshift.get0();
int xi = v_get0(ipshift);
ipshift = v_rotate_right<1>(ipshift);
int yi = ipshift.get0();
int yi = v_get0(ipshift);
const depthType* row0 = depth[yi + 0];
const depthType* row1 = depth[yi + 1];
@ -277,17 +275,17 @@ void integrateVolumeUnit(
// assume correct depth is positive
// don't fix missing data
if (v_check_all(vall > v_setzero_f32()))
if (v_check_all(v_gt(vall, v_setzero_f32())))
{
v_float32x4 t = pt - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(pt, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
v_float32x4 ty = v_setall_f32(t.get0());
v_float32x4 ty = v_setall_f32(v_get0(t));
// vx is y-interpolated between rows 0 and 1
v_float32x4 vx = v001 + ty * (v101 - v001);
float v0 = vx.get0();
v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001)));
float v0 = v_get0(vx);
vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx)));
float v1 = vx.get0();
float v1 = v_get0(vx);
v = v0 + tx * (v1 - v0);
}
else
@ -295,8 +293,8 @@ void integrateVolumeUnit(
}
// norm(camPixVec) produces double which is too slow
int _u = (int)projected.get0();
int _v = (int)v_rotate_right<1>(projected).get0();
int _u = (int)v_get0(projected);
int _v = (int)v_get0(v_rotate_right<1>(projected));
if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows))
continue;
float pixNorm = pixNorms.at<float>(_v, _u);
@ -500,35 +498,33 @@ void integrateRGBVolumeUnit(
// optimization of the following:
//Point3f volPt = Point3f(x, y, z)*voxelSize;
//Point3f camSpacePt = vol2cam * volPt;
camSpacePt += zStep;
camSpacePt = v_add(camSpacePt, zStep);
float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0();
float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))));
if (zCamSpace <= 0.f)
continue;
v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace);
v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace));
v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy);
// leave only first 2 lanes
projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) &
v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0)));
depthType v;
// bilinearly interpolate depth at projected
{
const v_float32x4& pt = projected;
// check coords >= 0 and < imgSize
v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) |
v_reinterpret_as_u32(pt >= upLimits);
limits = limits | v_rotate_right<1>(limits);
if (limits.get0())
v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits)));
limits = v_or(limits,v_rotate_right<1>(limits));
if (v_get0(limits))
continue;
// xi, yi = floor(pt)
v_int32x4 ip = v_floor(pt);
v_int32x4 ipshift = ip;
int xi = ipshift.get0();
int xi = v_get0(ipshift);
ipshift = v_rotate_right<1>(ipshift);
int yi = ipshift.get0();
int yi = v_get0(ipshift);
const depthType* row0 = depth[yi + 0];
const depthType* row1 = depth[yi + 1];
@ -542,17 +538,17 @@ void integrateRGBVolumeUnit(
// assume correct depth is positive
// don't fix missing data
if (v_check_all(vall > v_setzero_f32()))
if (v_check_all(v_gt(vall, v_setzero_f32())))
{
v_float32x4 t = pt - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(pt, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
v_float32x4 ty = v_setall_f32(t.get0());
v_float32x4 ty = v_setall_f32(v_get0(t));
// vx is y-interpolated between rows 0 and 1
v_float32x4 vx = v001 + ty * (v101 - v001);
float v0 = vx.get0();
v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001)));
float v0 = v_get0(vx);
vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx)));
float v1 = vx.get0();
float v1 = v_get0(vx);
v = v0 + tx * (v1 - v0);
}
else
@ -561,14 +557,13 @@ void integrateRGBVolumeUnit(
v_float32x4 projectedRGB = v_muladd(camPixVec, rgb_vfxy, rgb_vcxy);
// leave only first 2 lanes
projectedRGB = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) &
v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
projectedRGB = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)));
// norm(camPixVec) produces double which is too slow
int _u = (int)projected.get0();
int _v = (int)v_rotate_right<1>(projected).get0();
int rgb_u = (int)projectedRGB.get0();
int rgb_v = (int)v_rotate_right<1>(projectedRGB).get0();
int _u = (int)v_get0(projected);
int _v = (int)v_get0(v_rotate_right<1>(projected));
int rgb_u = (int)v_get0(projectedRGB);
int rgb_v = (int)v_get0(v_rotate_right<1>(projectedRGB));
if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows &&
rgb_v >= 0 && rgb_v < color.rows && rgb_u >= 0 && rgb_u < color.cols))

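The bilinear depth fetch in both integrate functions reduces to lane-wise lerps that are now spelled with v_add/v_mul/v_sub. A short sketch of that lerp, assuming CV_SIMD128; lerp4 is an illustrative name, and v_muladd would be an equivalent, slightly shorter spelling:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// a + t * (b - a), lane-wise
static v_float32x4 lerp4(const v_float32x4& a, const v_float32x4& b, float t)
{
    // old API: a + v_setall_f32(t) * (b - a)
    return v_add(a, v_mul(v_setall_f32(t), v_sub(b, a)));
    // equivalently: return v_muladd(v_setall_f32(t), v_sub(b, a), a);
}
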
@ -20,7 +20,7 @@ namespace kinfu
inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num)
{
v_float32x4 num128 = v_setall_f32(-1.f / 128.f);
return v_cvt_f32(num) * num128;
return v_mul(v_cvt_f32(num), num128);
}
#endif

@ -68,7 +68,7 @@ inline bool isNaN(cv::Point3f p)
#if USE_INTRINSICS
static inline bool isNaN(const cv::v_float32x4& p)
{
return cv::v_check_any(p != p);
return cv::v_check_any(v_ne(p, p));
}
#endif

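The isNaN overload above keeps the usual NaN != NaN trick, now written with v_ne and v_check_any. A tiny self-contained check, assuming CV_SIMD128; demoNanCheck is an illustrative name:

#include <opencv2/core/hal/intrin.hpp>
#include <cmath>
using namespace cv;

static bool demoNanCheck()
{
    v_float32x4 ok (0.f, 1.f, 2.f, 3.f);
    v_float32x4 bad(0.f, std::nanf(""), 2.f, 3.f);
    // v_ne yields an all-ones lane wherever the value is NaN
    return !v_check_any(v_ne(ok, ok)) && v_check_any(v_ne(bad, bad)); // true
}
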
@ -74,8 +74,8 @@ inline v_uint8x16 v_finalize_pix_ch(const v_int16x8& c0, const v_int16x8& c1,
v_expand_f32(c0, f0, f1);
v_expand_f32(c1, f2, f3);
v_int16x8 d0 = v_pack(v_round(s0*alpha + f0), v_round(s1*alpha + f1));
v_int16x8 d1 = v_pack(v_round(s2*alpha + f2), v_round(s3*alpha + f3));
v_int16x8 d0 = v_pack(v_round(v_add(v_mul(s0, alpha), f0)), v_round(v_add(v_mul(s1, alpha), f1)));
v_int16x8 d1 = v_pack(v_round(v_add(v_mul(s2, alpha), f2)), v_round(v_add(v_mul(s3, alpha), f3)));
return v_pack_u(d0, d1);
}
@ -135,12 +135,12 @@ public:
v_expand_s(p1, p10, p11);
v_expand_s(p2, p20, p21);
v_int16x8 d00 = p00 - c00, d01 = p01 - c01;
v_int16x8 d10 = p10 - c10, d11 = p11 - c11;
v_int16x8 d20 = p20 - c20, d21 = p21 - c21;
v_int16x8 d00 = v_sub(p00, c00), d01 = v_sub(p01, c01);
v_int16x8 d10 = v_sub(p10, c10), d11 = v_sub(p11, c11);
v_int16x8 d20 = v_sub(p20, c20), d21 = v_sub(p21, c21);
v_uint16x8 n0 = v_abs(d00) + v_abs(d10) + v_abs(d20);
v_uint16x8 n1 = v_abs(d01) + v_abs(d11) + v_abs(d21);
v_uint16x8 n0 = v_add(v_add(v_abs(d00), v_abs(d10)), v_abs(d20));
v_uint16x8 n1 = v_add(v_add(v_abs(d01), v_abs(d11)), v_abs(d21));
ushort CV_DECL_ALIGNED(16) nbuf[16];
v_store(nbuf, n0);
@ -153,13 +153,13 @@ public:
v_expand_f32(d00, fd0, fd1);
v_expand_f32(d01, fd2, fd3);
s00 += fd0*w0; s01 += fd1*w1; s02 += fd2*w2; s03 += fd3*w3;
s00 = v_add(s00, v_mul(fd0, w0)); s01 = v_add(s01, v_mul(fd1, w1)); s02 = v_add(s02, v_mul(fd2, w2)); s03 = v_add(s03, v_mul(fd3, w3));
v_expand_f32(d10, fd0, fd1);
v_expand_f32(d11, fd2, fd3);
s10 += fd0*w0; s11 += fd1*w1; s12 += fd2*w2; s13 += fd3*w3;
s10 = v_add(s10, v_mul(fd0, w0)); s11 = v_add(s11, v_mul(fd1, w1)); s12 = v_add(s12, v_mul(fd2, w2)); s13 = v_add(s13, v_mul(fd3, w3));
v_expand_f32(d20, fd0, fd1);
v_expand_f32(d21, fd2, fd3);
s20 += fd0*w0; s21 += fd1*w1; s22 += fd2*w2; s23 += fd3*w3;
s20 = v_add(s20, v_mul(fd0, w0)); s21 = v_add(s21, v_mul(fd1, w1)); s22 = v_add(s22, v_mul(fd2, w2)); s23 = v_add(s23, v_mul(fd3, w3));
}
c0 = v_finalize_pix_ch(c00, c01, s00, s01, s02, s03, v_alpha);

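In the anisodiff changes above, v_finalize_pix_ch rounds the float accumulators, packs them to int16 and finally saturates to uint8; only the arithmetic spelling changes, the packing chain stays the same. A stripped-down sketch of that chain, assuming CV_SIMD128; packFloatsToU8 is an illustrative name:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static v_uint8x16 packFloatsToU8(const v_float32x4& f0, const v_float32x4& f1,
                                 const v_float32x4& f2, const v_float32x4& f3)
{
    v_int16x8 lo = v_pack(v_round(f0), v_round(f1));  // 8 x int16
    v_int16x8 hi = v_pack(v_round(f2), v_round(f3));  // 8 x int16
    return v_pack_u(lo, hi);                          // 16 x uint8, saturating
}
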
@ -303,15 +303,15 @@ void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i)
v_float32x4 aux0,aux1,aux2,aux3;
#define PROC4(Chor_in,cur_in,coef_prev_in,interD_prev_in,cur_prev_in,interD_out,cur_out,coef_cur_out)\
coef_cur_out = lambda_reg*Chor_in;\
aux0 = interD_prev_in*coef_prev_in;\
aux1 = coef_cur_out+coef_prev_in;\
aux1 = one_reg-aux1;\
aux0 = aux1-aux0;\
interD_out = coef_cur_out/aux0;\
aux1 = cur_prev_in*coef_prev_in;\
aux1 = cur_in - aux1;\
cur_out = aux1/aux0;
coef_cur_out = v_mul(lambda_reg, Chor_in);\
aux0 = v_mul(interD_prev_in, coef_prev_in);\
aux1 = v_add(coef_cur_out, coef_prev_in);\
aux1 = v_sub(one_reg, aux1);\
aux0 = v_sub(aux1, aux0);\
interD_out = v_div(coef_cur_out, aux0);\
aux1 = v_mul(cur_prev_in, coef_prev_in);\
aux1 = v_sub(cur_in, aux1);\
cur_out = v_div(aux1, aux0);
for(;j<w-3;j+=4)
{
@ -406,14 +406,14 @@ void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i)
aux3 = v_load(cur_row_next3+j);
v_transpose4x4(aux0,aux1,aux2,aux3,b0,b1,b2,b3);
aux0 = a3*cur_next_reg;
b3 = b3-aux0;
aux0 = a2*b3;
b2 = b2-aux0;
aux0 = a1*b2;
b1 = b1-aux0;
aux0 = a0*b1;
b0 = b0-aux0;
aux0 = v_mul(a3, cur_next_reg);
b3 = v_sub(b3, aux0);
aux0 = v_mul(a2, b3);
b2 = v_sub(b2, aux0);
aux0 = v_mul(a1, b2);
b1 = v_sub(b1, aux0);
aux0 = v_mul(a0, b1);
b0 = v_sub(b0, aux0);
cur_next_reg = b0;
@ -522,24 +522,24 @@ void FastGlobalSmootherFilterImpl::VerticalPass_ParBody::operator()(const Range&
{
a = v_load(Cvert_row_prev+j);
b = v_load(Cvert_row+j);
coef_prev_reg = lambda_reg*a;
coef_cur_reg = lambda_reg*b;
coef_prev_reg = v_mul(lambda_reg, a);
coef_cur_reg = v_mul(lambda_reg, b);
a = v_load(interD_row_prev+j);
a = a*coef_prev_reg;
a = v_mul(a, coef_prev_reg);
b = coef_prev_reg+coef_cur_reg;
b = b+a;
a = one_reg-b; //computed denom
b = v_add(coef_prev_reg, coef_cur_reg);
b = v_add(b, a);
a = v_sub(one_reg, b); //computed denom
b = coef_cur_reg/a; //computed interD_row
b = v_div(coef_cur_reg, a); //computed interD_row
c = v_load(cur_row_prev+j);
c = c*coef_prev_reg;
c = v_mul(c, coef_prev_reg);
d = v_load(cur_row+j);
d = d-c;
d = d/a; //computed cur_row
d = v_sub(d, c);
d = v_div(d, a); //computed cur_row
v_store(interD_row+j,b);
v_store(cur_row+j,d);
@ -570,10 +570,10 @@ void FastGlobalSmootherFilterImpl::VerticalPass_ParBody::operator()(const Range&
{
a = v_load(interD_row+j);
b = v_load(cur_row_next+j);
b = a*b;
b = v_mul(a, b);
a = v_load(cur_row+j);
b = a-b;
b = v_sub(a, b);
v_store(cur_row+j,b);
}
#endif

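The PROC4 macro and the vertical pass above are essentially forward-elimination steps of the per-row/per-column tridiagonal solve the fast global smoother performs; the patch only changes how the arithmetic is written. The same step as a plain function, assuming CV_SIMD128; thomasStep and its parameter names are illustrative, not taken from the patch:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static void thomasStep(const v_float32x4& lambda, const v_float32x4& Chor,
                       const v_float32x4& coefPrev, const v_float32x4& interDPrev,
                       const v_float32x4& curPrev, const v_float32x4& cur,
                       v_float32x4& coefOut, v_float32x4& interDOut,
                       v_float32x4& curOut)
{
    v_float32x4 one = v_setall_f32(1.f);
    coefOut = v_mul(lambda, Chor);
    // denom = 1 - (coefOut + coefPrev) - interDPrev * coefPrev
    v_float32x4 denom = v_sub(v_sub(one, v_add(coefOut, coefPrev)),
                              v_mul(interDPrev, coefPrev));
    interDOut = v_div(coefOut, denom);
    curOut    = v_div(v_sub(cur, v_mul(curPrev, coefPrev)), denom);
}
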
@ -130,21 +130,21 @@ void calculateChannelSums(uint &sumB, uint &sumG, uint &sumR, uchar *src_data, i
v_expand(v_max_val, v_max1, v_max2);
// Calculate masks
v_m1 = ~(v_mul_wrap(v_max1 - v_min1, v_255) > v_mul_wrap(v_thresh, v_max1));
v_m2 = ~(v_mul_wrap(v_max2 - v_min2, v_255) > v_mul_wrap(v_thresh, v_max2));
v_m1 = v_not(v_gt(v_mul_wrap(v_sub(v_max1, v_min1), v_255), v_mul_wrap(v_thresh, v_max1)));
v_m2 = v_not(v_gt(v_mul_wrap(v_sub(v_max2, v_min2), v_255), v_mul_wrap(v_thresh, v_max2)));
// Apply masks
v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2);
v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2);
v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2);
v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2));
v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2));
v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2));
// Split and add to the sums:
v_expand(v_iB1, v_uint1, v_uint2);
v_SB += v_uint1 + v_uint2;
v_SB = v_add(v_SB, v_add(v_uint1, v_uint2));
v_expand(v_iG1, v_uint1, v_uint2);
v_SG += v_uint1 + v_uint2;
v_SG = v_add(v_SG, v_add(v_uint1, v_uint2));
v_expand(v_iR1, v_uint1, v_uint2);
v_SR += v_uint1 + v_uint2;
v_SR = v_add(v_SR, v_add(v_uint1, v_uint2));
}
sumB = v_reduce_sum(v_SB);
@ -197,21 +197,21 @@ void calculateChannelSums(uint64 &sumB, uint64 &sumG, uint64 &sumR, ushort *src_
v_expand(v_max_val, v_max1, v_max2);
// Calculate masks
v_m1 = ~((v_max1 - v_min1) * v_65535 > v_thresh * v_max1);
v_m2 = ~((v_max2 - v_min2) * v_65535 > v_thresh * v_max2);
v_m1 = v_not(v_gt(v_mul(v_sub(v_max1, v_min1), v_65535), v_mul(v_thresh, v_max1)));
v_m2 = v_not(v_gt(v_mul(v_sub(v_max2, v_min2), v_65535), v_mul(v_thresh, v_max2)));
// Apply masks
v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2);
v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2);
v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2);
v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2));
v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2));
v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2));
// Split and add to the sums:
v_expand(v_iB1, v_u64_1, v_u64_2);
v_SB += v_u64_1 + v_u64_2;
v_SB = v_add(v_SB, v_add(v_u64_1, v_u64_2));
v_expand(v_iG1, v_u64_1, v_u64_2);
v_SG += v_u64_1 + v_u64_2;
v_SG = v_add(v_SG, v_add(v_u64_1, v_u64_2));
v_expand(v_iR1, v_u64_1, v_u64_2);
v_SR += v_u64_1 + v_u64_2;
v_SR = v_add(v_SR, v_add(v_u64_1, v_u64_2));
}
// Perform final reduction
@ -282,12 +282,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai
v_expand(v_inR, v_sR1, v_sR2);
// Multiply by gains
v_sB1 = v_mul_wrap(v_sB1, v_gainB) >> 8;
v_sB2 = v_mul_wrap(v_sB2, v_gainB) >> 8;
v_sG1 = v_mul_wrap(v_sG1, v_gainG) >> 8;
v_sG2 = v_mul_wrap(v_sG2, v_gainG) >> 8;
v_sR1 = v_mul_wrap(v_sR1, v_gainR) >> 8;
v_sR2 = v_mul_wrap(v_sR2, v_gainR) >> 8;
v_sB1 = v_shr(v_mul_wrap(v_sB1, v_gainB), 8);
v_sB2 = v_shr(v_mul_wrap(v_sB2, v_gainB), 8);
v_sG1 = v_shr(v_mul_wrap(v_sG1, v_gainG), 8);
v_sG2 = v_shr(v_mul_wrap(v_sG2, v_gainG), 8);
v_sR1 = v_shr(v_mul_wrap(v_sR1, v_gainR), 8);
v_sR2 = v_shr(v_mul_wrap(v_sR2, v_gainR), 8);
// Pack into vectors of v_uint8x16
v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2));
@ -325,12 +325,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai
v_expand(v_inR, v_sR1, v_sR2);
// Multiply by scaling factors
v_sB1 = (v_sB1 * v_gainB) >> 16;
v_sB2 = (v_sB2 * v_gainB) >> 16;
v_sG1 = (v_sG1 * v_gainG) >> 16;
v_sG2 = (v_sG2 * v_gainG) >> 16;
v_sR1 = (v_sR1 * v_gainR) >> 16;
v_sR2 = (v_sR2 * v_gainR) >> 16;
v_sB1 = v_shr(v_mul(v_sB1, v_gainB), 16);
v_sB2 = v_shr(v_mul(v_sB2, v_gainB), 16);
v_sG1 = v_shr(v_mul(v_sG1, v_gainG), 16);
v_sG2 = v_shr(v_mul(v_sG2, v_gainG), 16);
v_sR1 = v_shr(v_mul(v_sR1, v_gainR), 16);
v_sR2 = v_shr(v_mul(v_sR2, v_gainR), 16);
// Pack into vectors of v_uint16x8
v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2));

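Both overloads of calculateChannelSums above follow one pattern: build a saturation mask with a comparison, zero out rejected pixels with v_and, widen, then accumulate. The pattern for a single channel, assuming CV_SIMD128; maskedAccumulate is an illustrative name:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// keepMask lanes are expected to be 0 or 0xFFFF (the result of a comparison)
static v_uint32x4 maskedAccumulate(const v_uint16x8& px,
                                   const v_uint16x8& keepMask,
                                   v_uint32x4 acc)
{
    v_uint16x8 kept = v_and(px, keepMask);
    v_uint32x4 lo, hi;
    v_expand(kept, lo, hi);          // widen before summing to avoid overflow
    return v_add(acc, v_add(lo, hi));
}
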
@ -192,7 +192,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src)
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_local_max = v_max(v_inB, v_max(v_inG, v_inR));
v_global_max = v_max(v_local_max, v_global_max);
v_mask = (v_local_max < v_thresh);
v_mask = (v_lt(v_local_max, v_thresh));
v_store(mask_ptr + i, v_mask);
}
uchar global_max[16];
@ -225,7 +225,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src)
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_local_max = v_max(v_inB, v_max(v_inG, v_inR));
v_global_max = v_max(v_local_max, v_global_max);
v_mask = (v_local_max < v_thresh);
v_mask = (v_lt(v_local_max, v_thresh));
v_pack_store(mask_ptr + i, v_mask);
}
ushort global_max[8];
@ -270,9 +270,9 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_uint8x16 v_mask = v_load(mask_ptr + i);
v_inB &= v_mask;
v_inG &= v_mask;
v_inR &= v_mask;
v_inB = v_and(v_inB, v_mask);
v_inG = v_and(v_inG, v_mask);
v_inR = v_and(v_inR, v_mask);
v_uint16x8 v_sR1, v_sR2, v_sG1, v_sG2, v_sB1, v_sB2;
v_expand(v_inB, v_sB1, v_sB2);
@ -280,33 +280,33 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_expand(v_inR, v_sR1, v_sR2);
// update the brightest (R,G,B) tuple (process left half):
v_uint16x8 v_sum = v_sB1 + v_sG1 + v_sR1;
v_uint16x8 v_max_mask = (v_sum > v_max_sum);
v_uint16x8 v_sum = v_add(v_add(v_sB1, v_sG1), v_sR1);
v_uint16x8 v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_sB1 & v_max_mask) + (v_brightestB & (~v_max_mask));
v_brightestG = (v_sG1 & v_max_mask) + (v_brightestG & (~v_max_mask));
v_brightestR = (v_sR1 & v_max_mask) + (v_brightestR & (~v_max_mask));
v_brightestB = v_add(v_and(v_sB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = v_add(v_and(v_sG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = v_add(v_and(v_sR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update the brightest (R,G,B) tuple (process right half):
v_sum = v_sB2 + v_sG2 + v_sR2;
v_max_mask = (v_sum > v_max_sum);
v_sum = v_add(v_add(v_sB2, v_sG2), v_sR2);
v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_sB2 & v_max_mask) + (v_brightestB & (~v_max_mask));
v_brightestG = (v_sG2 & v_max_mask) + (v_brightestG & (~v_max_mask));
v_brightestR = (v_sR2 & v_max_mask) + (v_brightestR & (~v_max_mask));
v_brightestB = v_add(v_and(v_sB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = v_add(v_and(v_sG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = v_add(v_and(v_sR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update sums:
v_sB1 = v_sB1 + v_sB2;
v_sG1 = v_sG1 + v_sG2;
v_sR1 = v_sR1 + v_sR2;
v_sB1 = v_add(v_sB1, v_sB2);
v_sG1 = v_add(v_sG1, v_sG2);
v_sR1 = v_add(v_sR1, v_sR2);
v_uint32x4 v_uint1, v_uint2;
v_expand(v_sB1, v_uint1, v_uint2);
v_SB += v_uint1 + v_uint2;
v_SB = v_add(v_SB, v_add(v_uint1, v_uint2));
v_expand(v_sG1, v_uint1, v_uint2);
v_SG += v_uint1 + v_uint2;
v_SG = v_add(v_SG, v_add(v_uint1, v_uint2));
v_expand(v_sR1, v_uint1, v_uint2);
v_SR += v_uint1 + v_uint2;
v_SR = v_add(v_SR, v_add(v_uint1, v_uint2));
}
sumB = v_reduce_sum(v_SB);
sumG = v_reduce_sum(v_SG);
@ -361,11 +361,11 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_uint16x8 v_inB, v_inG, v_inR;
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_uint16x8 v_mask = v_load_expand(mask_ptr + i);
v_mask = v_mask | ((v_mask & v_mask_lower) << 8);
v_mask = v_or(v_mask, v_shl<8>(v_and(v_mask, v_mask_lower)));
v_inB &= v_mask;
v_inG &= v_mask;
v_inR &= v_mask;
v_inB = v_and(v_inB, v_mask);
v_inG = v_and(v_inG, v_mask);
v_inR = v_and(v_inR, v_mask);
v_uint32x4 v_iR1, v_iR2, v_iG1, v_iG2, v_iB1, v_iB2;
v_expand(v_inB, v_iB1, v_iB2);
@ -373,32 +373,32 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_expand(v_inR, v_iR1, v_iR2);
// update the brightest (R,G,B) tuple (process left half):
v_uint32x4 v_sum = v_iB1 + v_iG1 + v_iR1;
v_uint32x4 v_max_mask = (v_sum > v_max_sum);
v_uint32x4 v_sum = v_add(v_add(v_iB1, v_iG1), v_iR1);
v_uint32x4 v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_iB1 & v_max_mask) + (v_brightestB & (~v_max_mask));
v_brightestG = (v_iG1 & v_max_mask) + (v_brightestG & (~v_max_mask));
v_brightestR = (v_iR1 & v_max_mask) + (v_brightestR & (~v_max_mask));
v_brightestB = v_add(v_and(v_iB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = v_add(v_and(v_iG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = v_add(v_and(v_iR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update the brightest (R,G,B) tuple (process right half):
v_sum = v_iB2 + v_iG2 + v_iR2;
v_max_mask = (v_sum > v_max_sum);
v_sum = v_add(v_add(v_iB2, v_iG2), v_iR2);
v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_iB2 & v_max_mask) + (v_brightestB & (~v_max_mask));
v_brightestG = (v_iG2 & v_max_mask) + (v_brightestG & (~v_max_mask));
v_brightestR = (v_iR2 & v_max_mask) + (v_brightestR & (~v_max_mask));
v_brightestB = v_add(v_and(v_iB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = v_add(v_and(v_iG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = v_add(v_and(v_iR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update sums:
v_iB1 = v_iB1 + v_iB2;
v_iG1 = v_iG1 + v_iG2;
v_iR1 = v_iR1 + v_iR2;
v_iB1 = v_add(v_iB1, v_iB2);
v_iG1 = v_add(v_iG1, v_iG2);
v_iR1 = v_add(v_iR1, v_iR2);
v_uint64x2 v_uint64_1, v_uint64_2;
v_expand(v_iB1, v_uint64_1, v_uint64_2);
v_SB += v_uint64_1 + v_uint64_2;
v_SB = v_add(v_SB, v_add(v_uint64_1, v_uint64_2));
v_expand(v_iG1, v_uint64_1, v_uint64_2);
v_SG += v_uint64_1 + v_uint64_2;
v_SG = v_add(v_SG, v_add(v_uint64_1, v_uint64_2));
v_expand(v_iR1, v_uint64_1, v_uint64_2);
v_SR += v_uint64_1 + v_uint64_2;
v_SR = v_add(v_SR, v_add(v_uint64_1, v_uint64_2));
}
uint64 sum_arr[2];
v_store(sum_arr, v_SB);

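The brightest-pixel update in getAverageAndBrightestColorChromaticity keeps a running maximum and its (R,G,B) payload without branches: a v_gt mask picks between the new and the old values. The same idea in isolation, assuming CV_SIMD128; updateBrightest is an illustrative name, and v_select is an equivalent spelling of the and/not combination:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static void updateBrightest(const v_uint16x8& sum, const v_uint16x8& chan,
                            v_uint16x8& maxSum, v_uint16x8& bestChan)
{
    v_uint16x8 m = v_gt(sum, maxSum);  // all-ones lanes where a new maximum appears
    maxSum   = v_max(sum, maxSum);
    bestChan = v_add(v_and(chan, m), v_and(bestChan, v_not(m)));
    // equivalently: bestChan = v_select(m, chan, bestChan);
}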