From b9a460b85733c75a700c61b52ccee1fe653e21db Mon Sep 17 00:00:00 2001 From: Liutong HAN Date: Sun, 8 Oct 2023 16:01:23 +0800 Subject: [PATCH] Use new Universal Intrinsic API. --- modules/optflow/src/rlof/berlof_invoker.hpp | 224 +++++++++--------- modules/optflow/src/rlof/plk_invoker.hpp | 74 +++--- modules/optflow/src/rlof/rlof_invoker.hpp | 132 +++++------ modules/optflow/src/rlof/rlof_invokerbase.hpp | 76 +++--- modules/optflow/src/rlof/rlof_localflow.cpp | 8 +- modules/rgbd/src/colored_tsdf.cpp | 120 +++++----- modules/rgbd/src/fast_icp.cpp | 48 ++-- modules/rgbd/src/hash_tsdf.cpp | 32 +-- modules/rgbd/src/tsdf.cpp | 90 ++++--- modules/rgbd/src/tsdf_functions.cpp | 83 +++---- modules/rgbd/src/tsdf_functions.hpp | 2 +- modules/rgbd/src/utils.hpp | 2 +- modules/ximgproc/src/anisodiff.cpp | 20 +- modules/ximgproc/src/fgs_filter.cpp | 58 ++--- .../xphoto/src/grayworld_white_balance.cpp | 56 ++--- .../src/learning_based_color_balance.cpp | 82 +++---- 16 files changed, 549 insertions(+), 558 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 8fde6e457..e51f8091c 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -296,7 +296,7 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -304,35 +304,35 @@ public: v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_int16x8 diff[4] = { - ((v11 << 5) - vI) & vmask, - ((v01 << 5) - vI) & vmask, - ((v10 << 5) - vI) & vmask, - ((v00 << 5) - vI) & vmask + v_and(v_sub(v_shl<5>(v11), vI), vmask), + v_and(v_sub(v_shl<5>(v01), vI), vmask), + v_and(v_sub(v_shl<5>(v10), vI), vmask), + v_and(v_sub(v_shl<5>(v00), vI), vmask) }; - diff0 = diff0 - vI; - diff0 = diff0 & vmask; + diff0 = v_sub(diff0, vI); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? 
- vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -342,7 +342,7 @@ public: for (unsigned int mmi = 0; mmi < 4; mmi++) { // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff[mmi]; + diff0 = v_and(vset2, diff[mmi]); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -350,16 +350,16 @@ public: // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); } if (j == 0) { @@ -387,8 +387,8 @@ public: v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -402,8 +402,8 @@ public: fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -544,7 +544,7 @@ public: float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; } @@ -960,7 +960,7 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -968,38 +968,38 @@ public: v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; + v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI); v_int16x8 diff[4] = { - ((v11 << 5) + diff_value) & vmask, - ((v01 << 5) + diff_value) & vmask, - ((v10 << 5) + diff_value) & vmask, - ((v00 << 5) + diff_value) & 
vmask + v_and(v_add(v_shl<5>(v11), diff_value), vmask), + v_and(v_add(v_shl<5>(v01), diff_value), vmask), + v_and(v_add(v_shl<5>(v10), diff_value), vmask), + v_and(v_add(v_shl<5>(v00), diff_value), vmask) }; - diff0 = diff0 + diff_value; - diff0 = diff0 & vmask; + diff0 = v_add(diff0, diff_value); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -1009,7 +1009,7 @@ public: for (unsigned int mmi = 0; mmi < 4; mmi++) { // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff[mmi]; + diff0 = v_and(vset2, diff[mmi]); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -1017,22 +1017,22 @@ public: // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... 
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); - vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); - vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3[mmi] += v_cvt_f32(diff0_0); - vqb3[mmi] += v_cvt_f32(diff0_1); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0)); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1)); } if (j == 0) { @@ -1060,29 +1060,29 @@ public: v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - v_float32x4 vI_tale = vI_ps * vtale_0; - vsumI += vI_tale; + v_float32x4 vI_tale = v_mul(vI_ps, vtale_0); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_0; + vsumW = v_add(vsumW, vtale_0); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -1092,29 +1092,29 @@ public: fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - vI_tale = vI_ps * vtale_1; - vsumI += vI_tale; + vI_tale = v_mul(vI_ps, vtale_1); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_1; + vsumW = v_add(vsumW, vtale_1); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); } } @@ -1304,7 +1304,7 @@ public: float CV_DECL_ALIGNED(16) bbuf[4]; for(int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; _b2[mmi] = v_reduce_sum(vqb2[mmi]); @@ -1655,14 +1655,14 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int16x8 diff[4] = { - ((v00 << 5) - vI) & vmask, - ((v01 << 5) - vI) & vmask, - ((v10 << 5) - vI) & vmask, - ((v11 << 5) - vI) & vmask, + v_and(v_sub(v_shl<5>(v00), vI), vmask), + v_and(v_sub(v_shl<5>(v01), vI), vmask), + v_and(v_sub(v_shl<5>(v10), vI), vmask), + 
v_and(v_sub(v_shl<5>(v11), vI), vmask), }; v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... @@ -1672,8 +1672,8 @@ public: v_zip(diff[mmi], diff[mmi], diff1, diff0); v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff1, diff0, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); } } #else @@ -1704,7 +1704,7 @@ public: float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b1[mmi] = bbuf[0] + bbuf[2]; _b2[mmi] = bbuf[1] + bbuf[3]; } @@ -2071,7 +2071,7 @@ namespace radial { v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -2079,21 +2079,21 @@ namespace radial { v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; + v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI); v_int16x8 diff[4] = { - ((v11 << 5) + diff_value) & vmask, - ((v01 << 5) + diff_value) & vmask, - ((v10 << 5) + diff_value) & vmask, - ((v00 << 5) + diff_value) & vmask + v_and(v_add(v_shl<5>(v11), diff_value), vmask), + v_and(v_add(v_shl<5>(v01), diff_value), vmask), + v_and(v_add(v_shl<5>(v10), diff_value), vmask), + v_and(v_add(v_shl<5>(v00), diff_value), vmask) }; v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -2109,14 +2109,14 @@ namespace radial { v_zip(diff[mmi], diff[mmi], diff2, diff1); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); - vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); - vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3[mmi] += v_cvt_f32(diff0_0); - vqb3[mmi] += v_cvt_f32(diff0_1); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0)); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1)); } if (j == 0) { @@ -2133,17 +2133,17 @@ namespace radial { vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -2158,17 +2158,17 @@ namespace radial { vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); } } @@ -2299,7 +2299,7 @@ namespace radial { float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; _b2[mmi] = v_reduce_sum(vqb2[mmi]); diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 5ea85de88..71cf50c82 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -229,7 +229,7 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -237,17 +237,17 @@ public: v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); // diff = J - I - diff0 = v_pack(t0, t1) - vI; + diff0 = v_sub(v_pack(t0, t1), vI); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; - diff0 = diff0 & vmask; + diff0 = v_add(v_add(diff0, 
v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value); + diff0 = v_and(diff0, vmask); v_zip(diff0, diff0, diff2, diff1); v_int32x4 diff0_0; @@ -259,16 +259,16 @@ public: v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); v_int32x4 vI0, vI1; v_expand(vI, vI0, vI1); - vqb2 += v_cvt_f32(diff0_0 * vI0); - vqb2 += v_cvt_f32(diff0_1 * vI1); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3 += v_cvt_f32(diff0_0); - vqb3 += v_cvt_f32(diff0_1); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_0)); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_1)); if (j == 0) { @@ -285,17 +285,17 @@ public: vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -309,17 +309,17 @@ public: vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); } } #else @@ -388,7 +388,7 @@ public: #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; b3 = v_reduce_sum(vqb2); @@ -696,19 +696,19 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - diff0 = v_pack(t0, t1) - diff0; - diff0 = diff0 & vmask; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + diff0 = v_sub(v_pack(t0, t1), diff0); + diff0 = v_and(diff0, vmask); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... 
@@ -717,8 +717,8 @@ public: v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); } #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) @@ -737,7 +737,7 @@ public: #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; #endif diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 9bee35fc6..5597d8824 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -246,35 +246,35 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - diff0 = v_pack(t0, t1) - diff0; - diff0 = diff0 & vmask; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + diff0 = v_sub(v_pack(t0, t1), diff0); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff0; + diff0 = v_and(vset2, diff0); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -282,7 +282,7 @@ public: // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - diff0 = v_pack(diff_int_0 >> s2bitShift, diff_int_1 >> s2bitShift); + diff0 = v_pack(v_shr(diff_int_0, s2bitShift), v_shr(diff_int_1, s2bitShift)); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
@@ -290,8 +290,8 @@ public: v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); if (j == 0) { v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1; @@ -316,8 +316,8 @@ public: v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -330,8 +330,8 @@ public: fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -431,7 +431,7 @@ public: #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 += bbuf[0] + bbuf[2]; b2 += bbuf[1] + bbuf[3]; #endif @@ -769,7 +769,7 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -777,33 +777,33 @@ public: v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); // diff = J - I - diff0 = v_pack(t0, t1) - vI; + diff0 = v_sub(v_pack(t0, t1), vI); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; - diff0 = diff0 & vmask; + diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff0; + diff0 = v_and(vset2, diff0); // It == val ? 
sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -811,8 +811,8 @@ public: // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... @@ -821,16 +821,16 @@ public: v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); v_int32x4 vI0, vI1; v_expand(vI, vI0, vI1); - vqb2 += v_cvt_f32(diff0_0 * vI0); - vqb2 += v_cvt_f32(diff0_1 * vI1); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3 += v_cvt_f32(diff0_0); - vqb3 += v_cvt_f32(diff0_1); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_0)); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_1)); if (j == 0) { @@ -858,29 +858,29 @@ public: v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - v_float32x4 vI_tale = vI_ps * vtale_0; - vsumI += vI_tale; + v_float32x4 vI_tale = v_mul(vI_ps, vtale_0); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_0; + vsumW = v_add(vsumW, vtale_0); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -890,29 +890,29 @@ public: fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - vI_tale = vI_ps * vtale_1; - vsumI += vI_tale; + vI_tale = v_mul(vI_ps, vtale_1); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_1; + vsumW = v_add(vsumW, vtale_1); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); } } #else @@ -1017,7 +1017,7 @@ public: } #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; b3 = v_reduce_sum(vqb2); diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index c6f77f6d6..2db4234ec 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -71,15 +71,15 @@ 
static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, for (; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) { - v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32; - v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; + v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32); + v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32); if (x + 4 > winSize.width) { - vmask0 = vmask0 & vmask_border_0; + vmask0 = v_and(vmask0, vmask_border_0); } if (x + 8 > winSize.width) { - vmask1 = vmask1 & vmask_border_1; + vmask1 = v_and(vmask1, vmask_border_1); } v_int32x4 t0, t1; @@ -91,10 +91,10 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5) & vmask0; - t1 = t1 >> (W_BITS - 5) & vmask1; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_and(v_shr(t0, W_BITS - 5), vmask0); + t1 = v_and(v_shr(t1, W_BITS - 5), vmask1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -105,12 +105,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - v00 = v00 & v_reinterpret_as_s16(vmask0); + v00 = v_and(v00, v_reinterpret_as_s16(vmask0)); v_store(dIptr, v00); v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2)); @@ -121,12 +121,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- v00 = v00 & v_reinterpret_as_s16(vmask1); + v00 = v_and(v00, v_reinterpret_as_s16(vmask1)); v_store(dIptr + 4 * 2, v00); } #else @@ -187,15 +187,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, #if CV_SIMD128 for (int x = 0; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) { - v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32; - v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; + v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32); + v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32); if (x + 4 > winSize.width) { - vmask0 = vmask0 & vmask_border0; + vmask0 = v_and(vmask0, vmask_border0); } if (x + 8 > winSize.width) { - vmask1 = vmask1 & vmask_border1; + vmask1 = v_and(vmask1, vmask_border1); } v_int32x4 t0, t1; @@ -207,12 +207,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - t0 = t0 & vmask0; - t1 = t1 & vmask1; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + t0 = v_and(t0, vmask0); + t1 = v_and(t1, vmask1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -223,12 +223,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - v00 = v00 & v_reinterpret_as_s16(vmask0); + v00 = v_and(v00, v_reinterpret_as_s16(vmask0)); v_store(dIptr, v00); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); @@ -249,12 +249,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- v00 = v00 & v_reinterpret_as_s16(vmask1); + v00 = v_and(v00, v_reinterpret_as_s16(vmask1)); v_store(dIptr + 4 * 2, v00); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 8f3c72820..3bc264f3e 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -52,8 +52,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); - v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + v_int16x8 t1 = v_sub(s2, s0); + v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10)); v_store(trow0 + x, t0); v_store(trow1 + x, t1); @@ -90,8 +90,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s3 = v_load(trow1 + x); v_int16x8 s4 = v_load(trow1 + x + cn); - v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + v_int16x8 t0 = v_sub(s1, s0); + v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10)); v_store_interleave((drow + x * 2), t0, t1); } diff --git a/modules/rgbd/src/colored_tsdf.cpp b/modules/rgbd/src/colored_tsdf.cpp index 0247f66d0..7ce2c7428 100644 --- a/modules/rgbd/src/colored_tsdf.cpp +++ b/modules/rgbd/src/colored_tsdf.cpp @@ -194,21 +194,21 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const { // tx, ty, tz = floor(p) v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; @@ -218,15 +218,15 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6])); v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7])); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -276,27 +276,27 @@ inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const { - if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any(p >= v_float32x4((float)(volResolution.x - 2), + if 
(v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2), (float)(volResolution.y - 2), - (float)(volResolution.z - 2), 1.f)) + (float)(volResolution.z - 2), 1.f))) ) return nanv; v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; @@ -314,23 +314,23 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) co v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]); v_float32x4 v1357(vx[1], vx[3], vx[5], vx[7]); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); nv = v0 + tx * (v1 - v0); } v_float32x4 n = v_load_aligned(an); - v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n * n))); + v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n)))); - return Norm.get0() < 0.0001f ? nanv : n / Norm; + return v_get0(Norm) < 0.0001f ? 
nanv : v_div(n, Norm); } #else inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& p) const @@ -388,15 +388,15 @@ inline float ColoredTSDFVolumeCPU::interpolateColor(float tx, float ty, float tz v_float32x4 v0246, v1357; v_load_deinterleave(vx, v0246, v1357); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -427,10 +427,10 @@ inline Point3f ColoredTSDFVolumeCPU::getColorVoxel(const Point3f& _p) const } inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) const { - if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any(p >= v_float32x4((float)(volResolution.x - 2), + if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2), (float)(volResolution.y - 2), - (float)(volResolution.z - 2), 1.f)) + (float)(volResolution.z - 2), 1.f))) ) return nanv; @@ -439,9 +439,9 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; float CV_DECL_ALIGNED(16) rgb[4]; @@ -456,12 +456,12 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con } v_float32x4 vsi(voxelSizeInv, voxelSizeInv, voxelSizeInv, voxelSizeInv); - v_float32x4 ptVox = p * vsi; + v_float32x4 ptVox = v_mul(p, vsi); v_int32x4 iptVox = v_floor(ptVox); - v_float32x4 t = ptVox - v_cvt_f32(iptVox); - float tx = t.get0(); t = v_rotate_right<1>(t); - float ty = t.get0(); t = v_rotate_right<1>(t); - float tz = t.get0(); + v_float32x4 t = v_sub(ptVox, v_cvt_f32(iptVox)); + float tx = v_get0(t); t = v_rotate_right<1>(t); + float ty = v_get0(t); t = v_rotate_right<1>(t); + float tz = v_get0(t); rgb[0] = interpolateColor(tx, ty, tz, r); rgb[1] = interpolateColor(tx, ty, tz, g); rgb[2] = interpolateColor(tx, ty, tz, b); @@ -583,21 +583,21 @@ struct ColorRaycastInvoker : ParallelLoopBody // get direction through pixel in volume space: // 1. reproject (x, y) on projecting plane where z = 1.f - v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy) * vfxy; + v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy); planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f)); // 2. rotate to volume space planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32()); // 3. 
normalize - v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed * planed))); - v_float32x4 dir = planed * invNorm; + v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed)))); + v_float32x4 dir = v_mul(planed, invNorm); // compute intersection of ray with all six bbox planes - v_float32x4 rayinv = v_setall_f32(1.f) / dir; + v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir); // div by zero should be eliminated by these products - v_float32x4 tbottom = rayinv * (boxDown - orig); - v_float32x4 ttop = rayinv * (boxUp - orig); + v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig)); + v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig)); // re-order intersections to find smallest and largest on each axis v_float32x4 minAx = v_min(ttop, tbottom); @@ -618,14 +618,14 @@ struct ColorRaycastInvoker : ParallelLoopBody if (tmin < tmax) { // interpolation optimized a little - orig *= invVoxelSize; - dir *= invVoxelSize; + orig = v_mul(orig, invVoxelSize); + dir = v_mul(dir, invVoxelSize); int xdim = volume.volDims[0]; int ydim = volume.volDims[1]; int zdim = volume.volDims[2]; - v_float32x4 rayStep = dir * v_setall_f32(tstep); - v_float32x4 next = (orig + dir * v_setall_f32(tmin)); + v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep)); + v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin)))); float f = volume.interpolateVoxel(next), fnext = f; //raymarch @@ -633,11 +633,11 @@ struct ColorRaycastInvoker : ParallelLoopBody int nSteps = cvFloor((tmax - tmin) / tstep); for (; steps < nSteps; steps++) { - next += rayStep; + next = v_add(next, rayStep); v_int32x4 ip = v_round(next); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coord = ix * xdim + iy * ydim + iz * zdim; fnext = tsdfToFloat(volume.volume.at(coord).tsdf); @@ -657,7 +657,7 @@ struct ColorRaycastInvoker : ParallelLoopBody // linearly interpolate t between two f values if (f > 0.f && fnext < 0.f) { - v_float32x4 tp = next - rayStep; + v_float32x4 tp = v_sub(next, rayStep); float ft = volume.interpolateVoxel(tp); float ftdt = volume.interpolateVoxel(next); float ts = tmin + tstep * (steps - ft / (ftdt - ft)); @@ -665,7 +665,7 @@ struct ColorRaycastInvoker : ParallelLoopBody // avoid division by zero if (!cvIsNaN(ts) && !cvIsInf(ts)) { - v_float32x4 pv = (orig + dir * v_setall_f32(ts)); + v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts)))); v_float32x4 nv = volume.getNormalVoxel(pv); v_float32x4 cv = volume.getColorVoxel(pv); @@ -675,9 +675,7 @@ struct ColorRaycastInvoker : ParallelLoopBody //convert pv and nv to camera space normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); // interpolation optimized a little - point = v_matmuladd(pv * v_float32x4(volume.voxelSize, - volume.voxelSize, - volume.voxelSize, 1.f), + point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)), volRot0, volRot1, volRot2, volTrans); } } diff --git a/modules/rgbd/src/fast_icp.cpp b/modules/rgbd/src/fast_icp.cpp index 07eb84d19..5f3ab3f0a 100644 --- a/modules/rgbd/src/fast_icp.cpp +++ b/modules/rgbd/src/fast_icp.cpp @@ -138,7 +138,7 @@ bool ICPImpl::estimateTransformT(cv::Affine3f& transform, #if USE_INTRINSICS static inline bool fastCheck(const v_float32x4& p0, const v_float32x4& p1) { - float check = (p0.get0() + 
p1.get0()); + float check = (v_get0(p0) + v_get0(p1)); return !cvIsNaN(check); } @@ -160,7 +160,7 @@ static inline v_float32x4 crossProduct(const v_float32x4& a, const v_float32x4& v_float32x4 ayzx, azxy, byzx, bzxy; getCrossPerm(a, ayzx, azxy); getCrossPerm(b, byzx, bzxy); - return ayzx*bzxy - azxy*byzx; + return v_sub(v_mul(ayzx, bzxy), v_mul(azxy, byzx)); } #else static inline bool fastCheck(const Point3f& p) @@ -235,11 +235,11 @@ struct GetAbInvoker : ParallelLoopBody //find correspondence by projecting the point v_float32x4 oldCoords; - float pz = (v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP))).get0()); + float pz = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP)))); // x, y, 0, 0 - oldCoords = v_muladd(newP/v_setall_f32(pz), vfxy, vcxy); + oldCoords = v_muladd(v_div(newP, v_setall_f32(pz)), vfxy, vcxy); - if(!v_check_all((oldCoords >= v_setzero_f32()) & (oldCoords < vframe))) + if(!v_check_all(v_and(v_ge(oldCoords, v_setzero_f32()), v_lt(oldCoords, vframe)))) continue; // bilinearly interpolate oldPts and oldNrm under oldCoords point @@ -247,12 +247,12 @@ struct GetAbInvoker : ParallelLoopBody v_float32x4 oldN; { v_int32x4 ixy = v_floor(oldCoords); - v_float32x4 txy = oldCoords - v_cvt_f32(ixy); - int xi = ixy.get0(); - int yi = v_rotate_right<1>(ixy).get0(); - v_float32x4 tx = v_setall_f32(txy.get0()); + v_float32x4 txy = v_sub(oldCoords, v_cvt_f32(ixy)); + int xi = v_get0(ixy); + int yi = v_get0(v_rotate_right<1>(ixy)); + v_float32x4 tx = v_setall_f32(v_get0(txy)); txy = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(txy))); - v_float32x4 ty = v_setall_f32(txy.get0()); + v_float32x4 ty = v_setall_f32(v_get0(txy)); const float* prow0 = (const float*)oldPts[yi+0]; const float* prow1 = (const float*)oldPts[yi+1]; @@ -275,23 +275,23 @@ struct GetAbInvoker : ParallelLoopBody // NaN check is done later - v_float32x4 p0 = p00 + tx*(p01 - p00); - v_float32x4 p1 = p10 + tx*(p11 - p10); - oldP = p0 + ty*(p1 - p0); + v_float32x4 p0 = v_add(p00, v_mul(tx, v_sub(p01, p00))); + v_float32x4 p1 = v_add(p10, v_mul(tx, v_sub(p11, p10))); + oldP = v_add(p0, v_mul(ty, v_sub(p1, p0))); - v_float32x4 n0 = n00 + tx*(n01 - n00); - v_float32x4 n1 = n10 + tx*(n11 - n10); - oldN = n0 + ty*(n1 - n0); + v_float32x4 n0 = v_add(n00, v_mul(tx, v_sub(n01, n00))); + v_float32x4 n1 = v_add(n10, v_mul(tx, v_sub(n11, n10))); + oldN = v_add(n0, v_mul(ty, v_sub(n1, n0))); } bool oldPNcheck = fastCheck(oldP, oldN); //filter by distance - v_float32x4 diff = newP - oldP; - bool distCheck = !(v_reduce_sum(diff*diff) > sqThresh); + v_float32x4 diff = v_sub(newP, oldP); + bool distCheck = !(v_reduce_sum(v_mul(diff, diff)) > sqThresh); //filter by angle - bool angleCheck = !(abs(v_reduce_sum(newN*oldN)) < cosThresh); + bool angleCheck = !(abs(v_reduce_sum(v_mul(newN, oldN))) < cosThresh); if(!(oldPNcheck && distCheck && angleCheck)) continue; @@ -299,17 +299,17 @@ struct GetAbInvoker : ParallelLoopBody // build point-wise vector ab = [ A | b ] v_float32x4 VxNv = crossProduct(newP, oldN); Point3f VxN; - VxN.x = VxNv.get0(); - VxN.y = v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0(); - VxN.z = v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0(); + VxN.x = v_get0(VxNv); + VxN.y = v_get0(v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32()))); + VxN.z = v_get0(v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32()))); - float dotp = 
-v_reduce_sum(oldN*diff); + float dotp = -v_reduce_sum(v_mul(oldN, diff)); // build point-wise upper-triangle matrix [ab^T * ab] w/o last row // which is [A^T*A | A^T*b] // and gather sum - v_float32x4 vd = VxNv | v_float32x4(0, 0, 0, dotp); + v_float32x4 vd = v_or(VxNv, v_float32x4(0, 0, 0, dotp)); v_float32x4 n = oldN; v_float32x4 nyzx; { diff --git a/modules/rgbd/src/hash_tsdf.cpp b/modules/rgbd/src/hash_tsdf.cpp index 9cf05e556..e47af731d 100644 --- a/modules/rgbd/src/hash_tsdf.cpp +++ b/modules/rgbd/src/hash_tsdf.cpp @@ -425,15 +425,15 @@ inline float interpolate(float tx, float ty, float tz, float vx[8]) v_float32x4 v0246, v1357; v_load_deinterleave(vx, v0246, v1357); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -598,9 +598,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const v_float32x8 czp = v_lut(vals, idxzp); v_float32x8 czn = v_lut(vals, idxzn); - v_float32x8 vcxv = cxn - cxp; - v_float32x8 vcyv = cyn - cyp; - v_float32x8 vczv = czn - czp; + v_float32x8 vcxv = v_sub(cxn, cxp); + v_float32x8 vcyv = v_sub(cyn, cyp); + v_float32x8 vczv = v_sub(czn, czp); v_store(cxv, vcxv); v_store(cyv, vcyv); @@ -615,9 +615,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4); v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4); - v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1; - v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1; - v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1; + v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1); + v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1); + v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1); v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1); v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1); @@ -1523,9 +1523,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const v_float32x8 czp = v_lut(vals, idxzp); v_float32x8 czn = v_lut(vals, idxzn); - v_float32x8 vcxv = cxn - cxp; - v_float32x8 vcyv = cyn - cyp; - v_float32x8 vczv = czn - czp; + v_float32x8 vcxv = v_sub(cxn, cxp); + v_float32x8 vcyv = v_sub(cyn, cyp); + v_float32x8 vczv = v_sub(czn, czp); v_store(cxv, vcxv); v_store(cyv, vcyv); @@ -1540,9 +1540,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4); v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4); - v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1; - v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1; - v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1; + v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1); + v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1); + 
v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1); v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1); v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1); diff --git a/modules/rgbd/src/tsdf.cpp b/modules/rgbd/src/tsdf.cpp index 7b76985eb..73a39a659 100644 --- a/modules/rgbd/src/tsdf.cpp +++ b/modules/rgbd/src/tsdf.cpp @@ -146,21 +146,21 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const { // tx, ty, tz = floor(p) v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const TsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int iz = v_get0(ip); int coordBase = ix*xdim + iy*ydim + iz*zdim; @@ -170,15 +170,15 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6])); v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7])); - v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx*(v1 - v0); } @@ -228,27 +228,27 @@ inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const { - if(v_check_any (p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any (p >= v_float32x4((float)(volResolution.x-2), + if(v_check_any (v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any (v_ge(p, v_float32x4((float)(volResolution.x-2), (float)(volResolution.y-2), - (float)(volResolution.z-2), 1.f)) + (float)(volResolution.z-2), 1.f))) ) return nanv; v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const TsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix*xdim + iy*ydim + iz*zdim; @@ -266,23 +266,23 @@ inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const v_float32x4 v0246 (vx[0], vx[2], vx[4], vx[6]); v_float32x4 v1357 (vx[1], vx[3], 
vx[5], vx[7]); - v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); nv = v0 + tx*(v1 - v0); } v_float32x4 n = v_load_aligned(an); - v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n*n))); + v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n)))); - return Norm.get0() < 0.0001f ? nanv : n/Norm; + return v_get0(Norm) < 0.0001f ? nanv : v_div(n, Norm); } #else inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& p) const @@ -394,21 +394,21 @@ struct RaycastInvoker : ParallelLoopBody // get direction through pixel in volume space: // 1. reproject (x, y) on projecting plane where z = 1.f - v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy)*vfxy; + v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy); planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f)); // 2. rotate to volume space planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32()); // 3. normalize - v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed*planed))); - v_float32x4 dir = planed*invNorm; + v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed)))); + v_float32x4 dir = v_mul(planed, invNorm); // compute intersection of ray with all six bbox planes - v_float32x4 rayinv = v_setall_f32(1.f)/dir; + v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir); // div by zero should be eliminated by these products - v_float32x4 tbottom = rayinv*(boxDown - orig); - v_float32x4 ttop = rayinv*(boxUp - orig); + v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig)); + v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig)); // re-order intersections to find smallest and largest on each axis v_float32x4 minAx = v_min(ttop, tbottom); @@ -429,14 +429,14 @@ struct RaycastInvoker : ParallelLoopBody if(tmin < tmax) { // interpolation optimized a little - orig *= invVoxelSize; - dir *= invVoxelSize; + orig = v_mul(orig, invVoxelSize); + dir = v_mul(dir, invVoxelSize); int xdim = volume.volDims[0]; int ydim = volume.volDims[1]; int zdim = volume.volDims[2]; - v_float32x4 rayStep = dir * v_setall_f32(tstep); - v_float32x4 next = (orig + dir * v_setall_f32(tmin)); + v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep)); + v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin)))); float f = volume.interpolateVoxel(next), fnext = f; //raymarch @@ -444,11 +444,11 @@ struct RaycastInvoker : ParallelLoopBody int nSteps = cvFloor((tmax - tmin)/tstep); for(; steps < nSteps; steps++) { - next += rayStep; + next = v_add(next, rayStep); v_int32x4 ip = v_round(next); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coord = ix*xdim + iy*ydim + iz*zdim; fnext = tsdfToFloat(volume.volume.at(coord).tsdf); @@ -468,7 +468,7 @@ struct RaycastInvoker : ParallelLoopBody // linearly interpolate t 
between two f values if(f > 0.f && fnext < 0.f) { - v_float32x4 tp = next - rayStep; + v_float32x4 tp = v_sub(next, rayStep); float ft = volume.interpolateVoxel(tp); float ftdt = volume.interpolateVoxel(next); float ts = tmin + tstep*(steps - ft/(ftdt - ft)); @@ -476,7 +476,7 @@ struct RaycastInvoker : ParallelLoopBody // avoid division by zero if(!cvIsNaN(ts) && !cvIsInf(ts)) { - v_float32x4 pv = (orig + dir*v_setall_f32(ts)); + v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts)))); v_float32x4 nv = volume.getNormalVoxel(pv); if(!isNaN(nv)) @@ -484,9 +484,7 @@ struct RaycastInvoker : ParallelLoopBody //convert pv and nv to camera space normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); // interpolation optimized a little - point = v_matmuladd(pv*v_float32x4(volume.voxelSize, - volume.voxelSize, - volume.voxelSize, 1.f), + point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)), volRot0, volRot1, volRot2, volTrans); } } diff --git a/modules/rgbd/src/tsdf_functions.cpp b/modules/rgbd/src/tsdf_functions.cpp index 3f8bc26f0..09ca986a1 100644 --- a/modules/rgbd/src/tsdf_functions.cpp +++ b/modules/rgbd/src/tsdf_functions.cpp @@ -235,35 +235,33 @@ void integrateVolumeUnit( // optimization of the following: //Point3f volPt = Point3f(x, y, z)*voxelSize; //Point3f camSpacePt = vol2cam * volPt; - camSpacePt += zStep; + camSpacePt = v_add(camSpacePt, zStep); - float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0(); + float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt)))); if (zCamSpace <= 0.f) continue; - v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace); + v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace)); v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy); // leave only first 2 lanes - projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0))); depthType v; // bilinearly interpolate depth at projected { const v_float32x4& pt = projected; // check coords >= 0 and < imgSize - v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) | - v_reinterpret_as_u32(pt >= upLimits); - limits = limits | v_rotate_right<1>(limits); - if (limits.get0()) + v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits))); + limits = v_or(limits, v_rotate_right<1>(limits)); + if (v_get0(limits)) continue; // xi, yi = floor(pt) v_int32x4 ip = v_floor(pt); v_int32x4 ipshift = ip; - int xi = ipshift.get0(); + int xi = v_get0(ipshift); ipshift = v_rotate_right<1>(ipshift); - int yi = ipshift.get0(); + int yi = v_get0(ipshift); const depthType* row0 = depth[yi + 0]; const depthType* row1 = depth[yi + 1]; @@ -277,17 +275,17 @@ void integrateVolumeUnit( // assume correct depth is positive // don't fix missing data - if (v_check_all(vall > v_setzero_f32())) + if (v_check_all(v_gt(vall, v_setzero_f32()))) { - v_float32x4 t = pt - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(pt, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - v_float32x4 ty = v_setall_f32(t.get0()); + v_float32x4 ty = v_setall_f32(v_get0(t)); // vx is y-interpolated between rows 0 and 1 - v_float32x4 vx = v001 + ty * (v101 - v001); - float v0 = 
vx.get0(); + v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001))); + float v0 = v_get0(vx); vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx))); - float v1 = vx.get0(); + float v1 = v_get0(vx); v = v0 + tx * (v1 - v0); } else @@ -295,8 +293,8 @@ void integrateVolumeUnit( } // norm(camPixVec) produces double which is too slow - int _u = (int)projected.get0(); - int _v = (int)v_rotate_right<1>(projected).get0(); + int _u = (int)v_get0(projected); + int _v = (int)v_get0(v_rotate_right<1>(projected)); if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows)) continue; float pixNorm = pixNorms.at(_v, _u); @@ -500,35 +498,33 @@ void integrateRGBVolumeUnit( // optimization of the following: //Point3f volPt = Point3f(x, y, z)*voxelSize; //Point3f camSpacePt = vol2cam * volPt; - camSpacePt += zStep; + camSpacePt = v_add(camSpacePt, zStep); - float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0(); + float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt)))); if (zCamSpace <= 0.f) continue; - v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace); + v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace)); v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy); // leave only first 2 lanes - projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0))); depthType v; // bilinearly interpolate depth at projected { const v_float32x4& pt = projected; // check coords >= 0 and < imgSize - v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) | - v_reinterpret_as_u32(pt >= upLimits); - limits = limits | v_rotate_right<1>(limits); - if (limits.get0()) + v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits))); + limits = v_or(limits,v_rotate_right<1>(limits)); + if (v_get0(limits)) continue; // xi, yi = floor(pt) v_int32x4 ip = v_floor(pt); v_int32x4 ipshift = ip; - int xi = ipshift.get0(); + int xi = v_get0(ipshift); ipshift = v_rotate_right<1>(ipshift); - int yi = ipshift.get0(); + int yi = v_get0(ipshift); const depthType* row0 = depth[yi + 0]; const depthType* row1 = depth[yi + 1]; @@ -542,17 +538,17 @@ void integrateRGBVolumeUnit( // assume correct depth is positive // don't fix missing data - if (v_check_all(vall > v_setzero_f32())) + if (v_check_all(v_gt(vall, v_setzero_f32()))) { - v_float32x4 t = pt - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(pt, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - v_float32x4 ty = v_setall_f32(t.get0()); + v_float32x4 ty = v_setall_f32(v_get0(t)); // vx is y-interpolated between rows 0 and 1 - v_float32x4 vx = v001 + ty * (v101 - v001); - float v0 = vx.get0(); + v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001))); + float v0 = v_get0(vx); vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx))); - float v1 = vx.get0(); + float v1 = v_get0(vx); v = v0 + tx * (v1 - v0); } else @@ -561,14 +557,13 @@ void integrateRGBVolumeUnit( v_float32x4 projectedRGB = v_muladd(camPixVec, rgb_vfxy, rgb_vcxy); // leave only first 2 lanes - projectedRGB = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projectedRGB = 
v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0))); // norm(camPixVec) produces double which is too slow - int _u = (int)projected.get0(); - int _v = (int)v_rotate_right<1>(projected).get0(); - int rgb_u = (int)projectedRGB.get0(); - int rgb_v = (int)v_rotate_right<1>(projectedRGB).get0(); + int _u = (int)v_get0(projected); + int _v = (int)v_get0(v_rotate_right<1>(projected)); + int rgb_u = (int)v_get0(projectedRGB); + int rgb_v = (int)v_get0(v_rotate_right<1>(projectedRGB)); if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows && rgb_v >= 0 && rgb_v < color.rows && rgb_u >= 0 && rgb_u < color.cols)) diff --git a/modules/rgbd/src/tsdf_functions.hpp b/modules/rgbd/src/tsdf_functions.hpp index c763f9275..6038ab60c 100644 --- a/modules/rgbd/src/tsdf_functions.hpp +++ b/modules/rgbd/src/tsdf_functions.hpp @@ -20,7 +20,7 @@ namespace kinfu inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num) { v_float32x4 num128 = v_setall_f32(-1.f / 128.f); - return v_cvt_f32(num) * num128; + return v_mul(v_cvt_f32(num), num128); } #endif diff --git a/modules/rgbd/src/utils.hpp b/modules/rgbd/src/utils.hpp index 0b9636753..2bb69713d 100644 --- a/modules/rgbd/src/utils.hpp +++ b/modules/rgbd/src/utils.hpp @@ -68,7 +68,7 @@ inline bool isNaN(cv::Point3f p) #if USE_INTRINSICS static inline bool isNaN(const cv::v_float32x4& p) { - return cv::v_check_any(p != p); + return cv::v_check_any(v_ne(p, p)); } #endif diff --git a/modules/ximgproc/src/anisodiff.cpp b/modules/ximgproc/src/anisodiff.cpp index 996b4ac5b..2b230a712 100644 --- a/modules/ximgproc/src/anisodiff.cpp +++ b/modules/ximgproc/src/anisodiff.cpp @@ -74,8 +74,8 @@ inline v_uint8x16 v_finalize_pix_ch(const v_int16x8& c0, const v_int16x8& c1, v_expand_f32(c0, f0, f1); v_expand_f32(c1, f2, f3); - v_int16x8 d0 = v_pack(v_round(s0*alpha + f0), v_round(s1*alpha + f1)); - v_int16x8 d1 = v_pack(v_round(s2*alpha + f2), v_round(s3*alpha + f3)); + v_int16x8 d0 = v_pack(v_round(v_add(v_mul(s0, alpha), f0)), v_round(v_add(v_mul(s1, alpha), f1))); + v_int16x8 d1 = v_pack(v_round(v_add(v_mul(s2, alpha), f2)), v_round(v_add(v_mul(s3, alpha), f3))); return v_pack_u(d0, d1); } @@ -135,12 +135,12 @@ public: v_expand_s(p1, p10, p11); v_expand_s(p2, p20, p21); - v_int16x8 d00 = p00 - c00, d01 = p01 - c01; - v_int16x8 d10 = p10 - c10, d11 = p11 - c11; - v_int16x8 d20 = p20 - c20, d21 = p21 - c21; + v_int16x8 d00 = v_sub(p00, c00), d01 = v_sub(p01, c01); + v_int16x8 d10 = v_sub(p10, c10), d11 = v_sub(p11, c11); + v_int16x8 d20 = v_sub(p20, c20), d21 = v_sub(p21, c21); - v_uint16x8 n0 = v_abs(d00) + v_abs(d10) + v_abs(d20); - v_uint16x8 n1 = v_abs(d01) + v_abs(d11) + v_abs(d21); + v_uint16x8 n0 = v_add(v_add(v_abs(d00), v_abs(d10)), v_abs(d20)); + v_uint16x8 n1 = v_add(v_add(v_abs(d01), v_abs(d11)), v_abs(d21)); ushort CV_DECL_ALIGNED(16) nbuf[16]; v_store(nbuf, n0); @@ -153,13 +153,13 @@ public: v_expand_f32(d00, fd0, fd1); v_expand_f32(d01, fd2, fd3); - s00 += fd0*w0; s01 += fd1*w1; s02 += fd2*w2; s03 += fd3*w3; + s00 = v_add(s00, v_mul(fd0, w0)); s01 = v_add(s01, v_mul(fd1, w1)); s02 = v_add(s02, v_mul(fd2, w2)); s03 = v_add(s03, v_mul(fd3, w3)); v_expand_f32(d10, fd0, fd1); v_expand_f32(d11, fd2, fd3); - s10 += fd0*w0; s11 += fd1*w1; s12 += fd2*w2; s13 += fd3*w3; + s10 = v_add(s10, v_mul(fd0, w0)); s11 = v_add(s11, v_mul(fd1, w1)); s12 = v_add(s12, v_mul(fd2, w2)); s13 = v_add(s13, v_mul(fd3, w3)); v_expand_f32(d20, fd0, fd1); v_expand_f32(d21, fd2, fd3); - s20 += fd0*w0; s21 += fd1*w1; s22 += fd2*w2; s23 += 
fd3*w3; + s20 = v_add(s20, v_mul(fd0, w0)); s21 = v_add(s21, v_mul(fd1, w1)); s22 = v_add(s22, v_mul(fd2, w2)); s23 = v_add(s23, v_mul(fd3, w3)); } c0 = v_finalize_pix_ch(c00, c01, s00, s01, s02, s03, v_alpha); diff --git a/modules/ximgproc/src/fgs_filter.cpp b/modules/ximgproc/src/fgs_filter.cpp index 5e168da5d..804e9f00a 100644 --- a/modules/ximgproc/src/fgs_filter.cpp +++ b/modules/ximgproc/src/fgs_filter.cpp @@ -303,15 +303,15 @@ void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i) v_float32x4 aux0,aux1,aux2,aux3; #define PROC4(Chor_in,cur_in,coef_prev_in,interD_prev_in,cur_prev_in,interD_out,cur_out,coef_cur_out)\ - coef_cur_out = lambda_reg*Chor_in;\ - aux0 = interD_prev_in*coef_prev_in;\ - aux1 = coef_cur_out+coef_prev_in;\ - aux1 = one_reg-aux1;\ - aux0 = aux1-aux0;\ - interD_out = coef_cur_out/aux0;\ - aux1 = cur_prev_in*coef_prev_in;\ - aux1 = cur_in - aux1;\ - cur_out = aux1/aux0; + coef_cur_out = v_mul(lambda_reg, Chor_in);\ + aux0 = v_mul(interD_prev_in, coef_prev_in);\ + aux1 = v_add(coef_cur_out, coef_prev_in);\ + aux1 = v_sub(one_reg, aux1);\ + aux0 = v_sub(aux1, aux0);\ + interD_out = v_div(coef_cur_out, aux0);\ + aux1 = v_mul(cur_prev_in, coef_prev_in);\ + aux1 = v_sub(cur_in, aux1);\ + cur_out = v_div(aux1, aux0); for(;j v_mul_wrap(v_thresh, v_max1)); - v_m2 = ~(v_mul_wrap(v_max2 - v_min2, v_255) > v_mul_wrap(v_thresh, v_max2)); + v_m1 = v_not(v_gt(v_mul_wrap(v_sub(v_max1, v_min1), v_255), v_mul_wrap(v_thresh, v_max1))); + v_m2 = v_not(v_gt(v_mul_wrap(v_sub(v_max2, v_min2), v_255), v_mul_wrap(v_thresh, v_max2))); // Apply masks - v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); - v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); - v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); + v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2)); + v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2)); + v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2)); // Split and add to the sums: v_expand(v_iB1, v_uint1, v_uint2); - v_SB += v_uint1 + v_uint2; + v_SB = v_add(v_SB, v_add(v_uint1, v_uint2)); v_expand(v_iG1, v_uint1, v_uint2); - v_SG += v_uint1 + v_uint2; + v_SG = v_add(v_SG, v_add(v_uint1, v_uint2)); v_expand(v_iR1, v_uint1, v_uint2); - v_SR += v_uint1 + v_uint2; + v_SR = v_add(v_SR, v_add(v_uint1, v_uint2)); } sumB = v_reduce_sum(v_SB); @@ -197,21 +197,21 @@ void calculateChannelSums(uint64 &sumB, uint64 &sumG, uint64 &sumR, ushort *src_ v_expand(v_max_val, v_max1, v_max2); // Calculate masks - v_m1 = ~((v_max1 - v_min1) * v_65535 > v_thresh * v_max1); - v_m2 = ~((v_max2 - v_min2) * v_65535 > v_thresh * v_max2); + v_m1 = v_not(v_gt(v_mul(v_sub(v_max1, v_min1), v_65535), v_mul(v_thresh, v_max1))); + v_m2 = v_not(v_gt(v_mul(v_sub(v_max2, v_min2), v_65535), v_mul(v_thresh, v_max2))); // Apply masks - v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); - v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); - v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); + v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2)); + v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2)); + v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2)); // Split and add to the sums: v_expand(v_iB1, v_u64_1, v_u64_2); - v_SB += v_u64_1 + v_u64_2; + v_SB = v_add(v_SB, v_add(v_u64_1, v_u64_2)); v_expand(v_iG1, v_u64_1, v_u64_2); - v_SG += v_u64_1 + v_u64_2; + v_SG = v_add(v_SG, v_add(v_u64_1, v_u64_2)); v_expand(v_iR1, v_u64_1, v_u64_2); - v_SR += v_u64_1 + v_u64_2; + v_SR = v_add(v_SR, v_add(v_u64_1, v_u64_2)); } // Perform final reduction @@ -282,12 +282,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai 
v_expand(v_inR, v_sR1, v_sR2); // Multiply by gains - v_sB1 = v_mul_wrap(v_sB1, v_gainB) >> 8; - v_sB2 = v_mul_wrap(v_sB2, v_gainB) >> 8; - v_sG1 = v_mul_wrap(v_sG1, v_gainG) >> 8; - v_sG2 = v_mul_wrap(v_sG2, v_gainG) >> 8; - v_sR1 = v_mul_wrap(v_sR1, v_gainR) >> 8; - v_sR2 = v_mul_wrap(v_sR2, v_gainR) >> 8; + v_sB1 = v_shr(v_mul_wrap(v_sB1, v_gainB), 8); + v_sB2 = v_shr(v_mul_wrap(v_sB2, v_gainB), 8); + v_sG1 = v_shr(v_mul_wrap(v_sG1, v_gainG), 8); + v_sG2 = v_shr(v_mul_wrap(v_sG2, v_gainG), 8); + v_sR1 = v_shr(v_mul_wrap(v_sR1, v_gainR), 8); + v_sR2 = v_shr(v_mul_wrap(v_sR2, v_gainR), 8); // Pack into vectors of v_uint8x16 v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); @@ -325,12 +325,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai v_expand(v_inR, v_sR1, v_sR2); // Multiply by scaling factors - v_sB1 = (v_sB1 * v_gainB) >> 16; - v_sB2 = (v_sB2 * v_gainB) >> 16; - v_sG1 = (v_sG1 * v_gainG) >> 16; - v_sG2 = (v_sG2 * v_gainG) >> 16; - v_sR1 = (v_sR1 * v_gainR) >> 16; - v_sR2 = (v_sR2 * v_gainR) >> 16; + v_sB1 = v_shr(v_mul(v_sB1, v_gainB), 16); + v_sB2 = v_shr(v_mul(v_sB2, v_gainB), 16); + v_sG1 = v_shr(v_mul(v_sG1, v_gainG), 16); + v_sG2 = v_shr(v_mul(v_sG2, v_gainG), 16); + v_sR1 = v_shr(v_mul(v_sR1, v_gainR), 16); + v_sR2 = v_shr(v_mul(v_sR2, v_gainR), 16); // Pack into vectors of v_uint16x8 v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); diff --git a/modules/xphoto/src/learning_based_color_balance.cpp b/modules/xphoto/src/learning_based_color_balance.cpp index bd408e6cb..de1958dcc 100644 --- a/modules/xphoto/src/learning_based_color_balance.cpp +++ b/modules/xphoto/src/learning_based_color_balance.cpp @@ -192,7 +192,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src) v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_global_max = v_max(v_local_max, v_global_max); - v_mask = (v_local_max < v_thresh); + v_mask = (v_lt(v_local_max, v_thresh)); v_store(mask_ptr + i, v_mask); } uchar global_max[16]; @@ -225,7 +225,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src) v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_global_max = v_max(v_local_max, v_global_max); - v_mask = (v_local_max < v_thresh); + v_mask = (v_lt(v_local_max, v_thresh)); v_pack_store(mask_ptr + i, v_mask); } ushort global_max[8]; @@ -270,9 +270,9 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_uint8x16 v_mask = v_load(mask_ptr + i); - v_inB &= v_mask; - v_inG &= v_mask; - v_inR &= v_mask; + v_inB = v_and(v_inB, v_mask); + v_inG = v_and(v_inG, v_mask); + v_inR = v_and(v_inR, v_mask); v_uint16x8 v_sR1, v_sR2, v_sG1, v_sG2, v_sB1, v_sB2; v_expand(v_inB, v_sB1, v_sB2); @@ -280,33 +280,33 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_expand(v_inR, v_sR1, v_sR2); // update the brightest (R,G,B) tuple (process left half): - v_uint16x8 v_sum = v_sB1 + v_sG1 + v_sR1; - v_uint16x8 v_max_mask = (v_sum > v_max_sum); + v_uint16x8 v_sum = v_add(v_add(v_sB1, v_sG1), v_sR1); + v_uint16x8 v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_sB1 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_sG1 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_sR1 & v_max_mask) + 
(v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_sB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_sG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_sR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update the brightest (R,G,B) tuple (process right half): - v_sum = v_sB2 + v_sG2 + v_sR2; - v_max_mask = (v_sum > v_max_sum); + v_sum = v_add(v_add(v_sB2, v_sG2), v_sR2); + v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_sB2 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_sG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_sR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_sB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_sG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_sR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update sums: - v_sB1 = v_sB1 + v_sB2; - v_sG1 = v_sG1 + v_sG2; - v_sR1 = v_sR1 + v_sR2; + v_sB1 = v_add(v_sB1, v_sB2); + v_sG1 = v_add(v_sG1, v_sG2); + v_sR1 = v_add(v_sR1, v_sR2); v_uint32x4 v_uint1, v_uint2; v_expand(v_sB1, v_uint1, v_uint2); - v_SB += v_uint1 + v_uint2; + v_SB = v_add(v_SB, v_add(v_uint1, v_uint2)); v_expand(v_sG1, v_uint1, v_uint2); - v_SG += v_uint1 + v_uint2; + v_SG = v_add(v_SG, v_add(v_uint1, v_uint2)); v_expand(v_sR1, v_uint1, v_uint2); - v_SR += v_uint1 + v_uint2; + v_SR = v_add(v_SR, v_add(v_uint1, v_uint2)); } sumB = v_reduce_sum(v_SB); sumG = v_reduce_sum(v_SG); @@ -361,11 +361,11 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_uint16x8 v_inB, v_inG, v_inR; v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_uint16x8 v_mask = v_load_expand(mask_ptr + i); - v_mask = v_mask | ((v_mask & v_mask_lower) << 8); + v_mask = v_or(v_mask, v_shl<8>(v_and(v_mask, v_mask_lower))); - v_inB &= v_mask; - v_inG &= v_mask; - v_inR &= v_mask; + v_inB = v_and(v_inB, v_mask); + v_inG = v_and(v_inG, v_mask); + v_inR = v_and(v_inR, v_mask); v_uint32x4 v_iR1, v_iR2, v_iG1, v_iG2, v_iB1, v_iB2; v_expand(v_inB, v_iB1, v_iB2); @@ -373,32 +373,32 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_expand(v_inR, v_iR1, v_iR2); // update the brightest (R,G,B) tuple (process left half): - v_uint32x4 v_sum = v_iB1 + v_iG1 + v_iR1; - v_uint32x4 v_max_mask = (v_sum > v_max_sum); + v_uint32x4 v_sum = v_add(v_add(v_iB1, v_iG1), v_iR1); + v_uint32x4 v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_iB1 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_iG1 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_iR1 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_iB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_iG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_iR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update the brightest (R,G,B) tuple (process right half): - v_sum = v_iB2 + v_iG2 + v_iR2; - v_max_mask = (v_sum > v_max_sum); + v_sum = v_add(v_add(v_iB2, v_iG2), v_iR2); + v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_iB2 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_iG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); - 
v_brightestR = (v_iR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_iB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_iG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_iR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update sums: - v_iB1 = v_iB1 + v_iB2; - v_iG1 = v_iG1 + v_iG2; - v_iR1 = v_iR1 + v_iR2; + v_iB1 = v_add(v_iB1, v_iB2); + v_iG1 = v_add(v_iG1, v_iG2); + v_iR1 = v_add(v_iR1, v_iR2); v_uint64x2 v_uint64_1, v_uint64_2; v_expand(v_iB1, v_uint64_1, v_uint64_2); - v_SB += v_uint64_1 + v_uint64_2; + v_SB = v_add(v_SB, v_add(v_uint64_1, v_uint64_2)); v_expand(v_iG1, v_uint64_1, v_uint64_2); - v_SG += v_uint64_1 + v_uint64_2; + v_SG = v_add(v_SG, v_add(v_uint64_1, v_uint64_2)); v_expand(v_iR1, v_uint64_1, v_uint64_2); - v_SR += v_uint64_1 + v_uint64_2; + v_SR = v_add(v_SR, v_add(v_uint64_1, v_uint64_2)); } uint64 sum_arr[2]; v_store(sum_arr, v_SB);
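Note on the pattern above: every hunk in this section performs the same mechanical substitution on the universal-intrinsic vector types. Overloaded operators (+, -, *, /, &, |, ~, shifts, comparisons) are replaced by the named wrappers (v_add, v_sub, v_mul, v_div, v_and, v_or, v_not, v_shl/v_shr, v_lt/v_gt and so on), and the .get0() lane accessor is replaced by the free function v_get0(). The minimal sketch below shows the mapping in isolation; it is illustrative only, its function and variable names are not taken from the modules touched above, and it assumes a SIMD128-capable OpenCV build with opencv2/core/hal/intrin.hpp available:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative sketch: the old operator-based universal intrinsics are shown
// in comments, the equivalent named-function calls in the code.
static inline float old_vs_new_demo(const cv::v_float32x4& a, const cv::v_float32x4& b)
{
    using namespace cv;

    // old: v_float32x4 s = a + b * v_setall_f32(2.f);
    v_float32x4 s = v_add(a, v_mul(b, v_setall_f32(2.f)));

    // old: v_float32x4 m = v_select(a > b, a, b);   per-lane maximum via a mask
    v_float32x4 m = v_select(v_gt(a, b), a, b);

    // old: float first = m.get0();                  extract lane 0
    float first = v_get0(m);

    // old: return first + v_reduce_sum(s * s);
    return first + v_reduce_sum(v_mul(s, s));
}

The same renaming applies unchanged to the integer and wider vector types that appear in the hunks above (v_int16x8, v_uint32x4, v_float32x8, and so on).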