From b9a460b85733c75a700c61b52ccee1fe653e21db Mon Sep 17 00:00:00 2001 From: Liutong HAN Date: Sun, 8 Oct 2023 16:01:23 +0800 Subject: [PATCH] Use new Universal Intrinsic API. --- modules/optflow/src/rlof/berlof_invoker.hpp | 224 +++++++++--------- modules/optflow/src/rlof/plk_invoker.hpp | 74 +++--- modules/optflow/src/rlof/rlof_invoker.hpp | 132 +++++------ modules/optflow/src/rlof/rlof_invokerbase.hpp | 76 +++--- modules/optflow/src/rlof/rlof_localflow.cpp | 8 +- modules/rgbd/src/colored_tsdf.cpp | 120 +++++----- modules/rgbd/src/fast_icp.cpp | 48 ++-- modules/rgbd/src/hash_tsdf.cpp | 32 +-- modules/rgbd/src/tsdf.cpp | 90 ++++--- modules/rgbd/src/tsdf_functions.cpp | 83 +++---- modules/rgbd/src/tsdf_functions.hpp | 2 +- modules/rgbd/src/utils.hpp | 2 +- modules/ximgproc/src/anisodiff.cpp | 20 +- modules/ximgproc/src/fgs_filter.cpp | 58 ++--- .../xphoto/src/grayworld_white_balance.cpp | 56 ++--- .../src/learning_based_color_balance.cpp | 82 +++---- 16 files changed, 549 insertions(+), 558 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 8fde6e457..e51f8091c 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -296,7 +296,7 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -304,35 +304,35 @@ public: v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_int16x8 diff[4] = { - ((v11 << 5) - vI) & vmask, - ((v01 << 5) - vI) & vmask, - ((v10 << 5) - vI) & vmask, - ((v00 << 5) - vI) & vmask + v_and(v_sub(v_shl<5>(v11), vI), vmask), + v_and(v_sub(v_shl<5>(v01), vI), vmask), + v_and(v_sub(v_shl<5>(v10), vI), vmask), + v_and(v_sub(v_shl<5>(v00), vI), vmask) }; - diff0 = diff0 - vI; - diff0 = diff0 & vmask; + diff0 = v_sub(diff0, vI); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? 
- vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -342,7 +342,7 @@ public: for (unsigned int mmi = 0; mmi < 4; mmi++) { // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff[mmi]; + diff0 = v_and(vset2, diff[mmi]); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -350,16 +350,16 @@ public: // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); } if (j == 0) { @@ -387,8 +387,8 @@ public: v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -402,8 +402,8 @@ public: fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -544,7 +544,7 @@ public: float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; } @@ -960,7 +960,7 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -968,38 +968,38 @@ public: v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; + v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI); v_int16x8 diff[4] = { - ((v11 << 5) + diff_value) & vmask, - ((v01 << 5) + diff_value) & vmask, - ((v10 << 5) + diff_value) & vmask, - ((v00 << 5) + diff_value) & 
vmask + v_and(v_add(v_shl<5>(v11), diff_value), vmask), + v_and(v_add(v_shl<5>(v01), diff_value), vmask), + v_and(v_add(v_shl<5>(v10), diff_value), vmask), + v_and(v_add(v_shl<5>(v00), diff_value), vmask) }; - diff0 = diff0 + diff_value; - diff0 = diff0 & vmask; + diff0 = v_add(diff0, diff_value); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -1009,7 +1009,7 @@ public: for (unsigned int mmi = 0; mmi < 4; mmi++) { // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff[mmi]; + diff0 = v_and(vset2, diff[mmi]); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -1017,22 +1017,22 @@ public: // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... 
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); - vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); - vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3[mmi] += v_cvt_f32(diff0_0); - vqb3[mmi] += v_cvt_f32(diff0_1); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0)); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1)); } if (j == 0) { @@ -1060,29 +1060,29 @@ public: v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - v_float32x4 vI_tale = vI_ps * vtale_0; - vsumI += vI_tale; + v_float32x4 vI_tale = v_mul(vI_ps, vtale_0); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_0; + vsumW = v_add(vsumW, vtale_0); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -1092,29 +1092,29 @@ public: fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - vI_tale = vI_ps * vtale_1; - vsumI += vI_tale; + vI_tale = v_mul(vI_ps, vtale_1); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_1; + vsumW = v_add(vsumW, vtale_1); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); } } @@ -1304,7 +1304,7 @@ public: float CV_DECL_ALIGNED(16) bbuf[4]; for(int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; _b2[mmi] = v_reduce_sum(vqb2[mmi]); @@ -1655,14 +1655,14 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int16x8 diff[4] = { - ((v00 << 5) - vI) & vmask, - ((v01 << 5) - vI) & vmask, - ((v10 << 5) - vI) & vmask, - ((v11 << 5) - vI) & vmask, + v_and(v_sub(v_shl<5>(v00), vI), vmask), + v_and(v_sub(v_shl<5>(v01), vI), vmask), + v_and(v_sub(v_shl<5>(v10), vI), vmask), + 
v_and(v_sub(v_shl<5>(v11), vI), vmask), }; v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... @@ -1672,8 +1672,8 @@ public: v_zip(diff[mmi], diff[mmi], diff1, diff0); v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff1, diff0, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); } } #else @@ -1704,7 +1704,7 @@ public: float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b1[mmi] = bbuf[0] + bbuf[2]; _b2[mmi] = bbuf[1] + bbuf[3]; } @@ -2071,7 +2071,7 @@ namespace radial { v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -2079,21 +2079,21 @@ namespace radial { v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; + v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI); v_int16x8 diff[4] = { - ((v11 << 5) + diff_value) & vmask, - ((v01 << 5) + diff_value) & vmask, - ((v10 << 5) + diff_value) & vmask, - ((v00 << 5) + diff_value) & vmask + v_and(v_add(v_shl<5>(v11), diff_value), vmask), + v_and(v_add(v_shl<5>(v01), diff_value), vmask), + v_and(v_add(v_shl<5>(v10), diff_value), vmask), + v_and(v_add(v_shl<5>(v00), diff_value), vmask) }; v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -2109,14 +2109,14 @@ namespace radial { v_zip(diff[mmi], diff[mmi], diff2, diff1); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); - vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); - vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3[mmi] += v_cvt_f32(diff0_0); - vqb3[mmi] += v_cvt_f32(diff0_1); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0)); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1)); } if (j == 0) { @@ -2133,17 +2133,17 @@ namespace radial { vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -2158,17 +2158,17 @@ namespace radial { vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); } } @@ -2299,7 +2299,7 @@ namespace radial { float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; _b2[mmi] = v_reduce_sum(vqb2[mmi]); diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 5ea85de88..71cf50c82 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -229,7 +229,7 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -237,17 +237,17 @@ public: v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); // diff = J - I - diff0 = v_pack(t0, t1) - vI; + diff0 = v_sub(v_pack(t0, t1), vI); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; - diff0 = diff0 & vmask; + diff0 = v_add(v_add(diff0, 
v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value); + diff0 = v_and(diff0, vmask); v_zip(diff0, diff0, diff2, diff1); v_int32x4 diff0_0; @@ -259,16 +259,16 @@ public: v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); v_int32x4 vI0, vI1; v_expand(vI, vI0, vI1); - vqb2 += v_cvt_f32(diff0_0 * vI0); - vqb2 += v_cvt_f32(diff0_1 * vI1); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3 += v_cvt_f32(diff0_0); - vqb3 += v_cvt_f32(diff0_1); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_0)); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_1)); if (j == 0) { @@ -285,17 +285,17 @@ public: vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -309,17 +309,17 @@ public: vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); } } #else @@ -388,7 +388,7 @@ public: #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; b3 = v_reduce_sum(vqb2); @@ -696,19 +696,19 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - diff0 = v_pack(t0, t1) - diff0; - diff0 = diff0 & vmask; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + diff0 = v_sub(v_pack(t0, t1), diff0); + diff0 = v_and(diff0, vmask); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... 
@@ -717,8 +717,8 @@ public: v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); } #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) @@ -737,7 +737,7 @@ public: #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; #endif diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 9bee35fc6..5597d8824 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -246,35 +246,35 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - diff0 = v_pack(t0, t1) - diff0; - diff0 = diff0 & vmask; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + diff0 = v_sub(v_pack(t0, t1), diff0); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff0; + diff0 = v_and(vset2, diff0); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -282,7 +282,7 @@ public: // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - diff0 = v_pack(diff_int_0 >> s2bitShift, diff_int_1 >> s2bitShift); + diff0 = v_pack(v_shr(diff_int_0, s2bitShift), v_shr(diff_int_1, s2bitShift)); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
@@ -290,8 +290,8 @@ public: v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); if (j == 0) { v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1; @@ -316,8 +316,8 @@ public: v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -330,8 +330,8 @@ public: fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -431,7 +431,7 @@ public: #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 += bbuf[0] + bbuf[2]; b2 += bbuf[1] + bbuf[3]; #endif @@ -769,7 +769,7 @@ public: v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -777,33 +777,33 @@ public: v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); // diff = J - I - diff0 = v_pack(t0, t1) - vI; + diff0 = v_sub(v_pack(t0, t1), vI); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; - diff0 = diff0 & vmask; + diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff0; + diff0 = v_and(vset2, diff0); // It == val ? 
sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -811,8 +811,8 @@ public: // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... @@ -821,16 +821,16 @@ public: v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); v_int32x4 vI0, vI1; v_expand(vI, vI0, vI1); - vqb2 += v_cvt_f32(diff0_0 * vI0); - vqb2 += v_cvt_f32(diff0_1 * vI1); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3 += v_cvt_f32(diff0_0); - vqb3 += v_cvt_f32(diff0_1); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_0)); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_1)); if (j == 0) { @@ -858,29 +858,29 @@ public: v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - v_float32x4 vI_tale = vI_ps * vtale_0; - vsumI += vI_tale; + v_float32x4 vI_tale = v_mul(vI_ps, vtale_0); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_0; + vsumW = v_add(vsumW, vtale_0); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -890,29 +890,29 @@ public: fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - vI_tale = vI_ps * vtale_1; - vsumI += vI_tale; + vI_tale = v_mul(vI_ps, vtale_1); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_1; + vsumW = v_add(vsumW, vtale_1); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); } } #else @@ -1017,7 +1017,7 @@ public: } #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; b3 = v_reduce_sum(vqb2); diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index c6f77f6d6..2db4234ec 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -71,15 +71,15 @@ 
static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, for (; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) { - v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32; - v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; + v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32); + v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32); if (x + 4 > winSize.width) { - vmask0 = vmask0 & vmask_border_0; + vmask0 = v_and(vmask0, vmask_border_0); } if (x + 8 > winSize.width) { - vmask1 = vmask1 & vmask_border_1; + vmask1 = v_and(vmask1, vmask_border_1); } v_int32x4 t0, t1; @@ -91,10 +91,10 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5) & vmask0; - t1 = t1 >> (W_BITS - 5) & vmask1; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_and(v_shr(t0, W_BITS - 5), vmask0); + t1 = v_and(v_shr(t1, W_BITS - 5), vmask1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -105,12 +105,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - v00 = v00 & v_reinterpret_as_s16(vmask0); + v00 = v_and(v00, v_reinterpret_as_s16(vmask0)); v_store(dIptr, v00); v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2)); @@ -121,12 +121,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- v00 = v00 & v_reinterpret_as_s16(vmask1); + v00 = v_and(v00, v_reinterpret_as_s16(vmask1)); v_store(dIptr + 4 * 2, v00); } #else @@ -187,15 +187,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, #if CV_SIMD128 for (int x = 0; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) { - v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32; - v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; + v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32); + v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32); if (x + 4 > winSize.width) { - vmask0 = vmask0 & vmask_border0; + vmask0 = v_and(vmask0, vmask_border0); } if (x + 8 > winSize.width) { - vmask1 = vmask1 & vmask_border1; + vmask1 = v_and(vmask1, vmask_border1); } v_int32x4 t0, t1; @@ -207,12 +207,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - t0 = t0 & vmask0; - t1 = t1 & vmask1; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + t0 = v_and(t0, vmask0); + t1 = v_and(t1, vmask1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -223,12 +223,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - v00 = v00 & v_reinterpret_as_s16(vmask0); + v00 = v_and(v00, v_reinterpret_as_s16(vmask0)); v_store(dIptr, v00); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); @@ -249,12 +249,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- v00 = v00 & v_reinterpret_as_s16(vmask1); + v00 = v_and(v00, v_reinterpret_as_s16(vmask1)); v_store(dIptr + 4 * 2, v00); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 8f3c72820..3bc264f3e 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -52,8 +52,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); - v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + v_int16x8 t1 = v_sub(s2, s0); + v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10)); v_store(trow0 + x, t0); v_store(trow1 + x, t1); @@ -90,8 +90,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s3 = v_load(trow1 + x); v_int16x8 s4 = v_load(trow1 + x + cn); - v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + v_int16x8 t0 = v_sub(s1, s0); + v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10)); v_store_interleave((drow + x * 2), t0, t1); } diff --git a/modules/rgbd/src/colored_tsdf.cpp b/modules/rgbd/src/colored_tsdf.cpp index 0247f66d0..7ce2c7428 100644 --- a/modules/rgbd/src/colored_tsdf.cpp +++ b/modules/rgbd/src/colored_tsdf.cpp @@ -194,21 +194,21 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const { // tx, ty, tz = floor(p) v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; @@ -218,15 +218,15 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6])); v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7])); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -276,27 +276,27 @@ inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const { - if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any(p >= v_float32x4((float)(volResolution.x - 2), + if 
(v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2), (float)(volResolution.y - 2), - (float)(volResolution.z - 2), 1.f)) + (float)(volResolution.z - 2), 1.f))) ) return nanv; v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; @@ -314,23 +314,23 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) co v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]); v_float32x4 v1357(vx[1], vx[3], vx[5], vx[7]); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); nv = v0 + tx * (v1 - v0); } v_float32x4 n = v_load_aligned(an); - v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n * n))); + v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n)))); - return Norm.get0() < 0.0001f ? nanv : n / Norm; + return v_get0(Norm) < 0.0001f ? 
nanv : v_div(n, Norm); } #else inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& p) const @@ -388,15 +388,15 @@ inline float ColoredTSDFVolumeCPU::interpolateColor(float tx, float ty, float tz v_float32x4 v0246, v1357; v_load_deinterleave(vx, v0246, v1357); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -427,10 +427,10 @@ inline Point3f ColoredTSDFVolumeCPU::getColorVoxel(const Point3f& _p) const } inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) const { - if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any(p >= v_float32x4((float)(volResolution.x - 2), + if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2), (float)(volResolution.y - 2), - (float)(volResolution.z - 2), 1.f)) + (float)(volResolution.z - 2), 1.f))) ) return nanv; @@ -439,9 +439,9 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; float CV_DECL_ALIGNED(16) rgb[4]; @@ -456,12 +456,12 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con } v_float32x4 vsi(voxelSizeInv, voxelSizeInv, voxelSizeInv, voxelSizeInv); - v_float32x4 ptVox = p * vsi; + v_float32x4 ptVox = v_mul(p, vsi); v_int32x4 iptVox = v_floor(ptVox); - v_float32x4 t = ptVox - v_cvt_f32(iptVox); - float tx = t.get0(); t = v_rotate_right<1>(t); - float ty = t.get0(); t = v_rotate_right<1>(t); - float tz = t.get0(); + v_float32x4 t = v_sub(ptVox, v_cvt_f32(iptVox)); + float tx = v_get0(t); t = v_rotate_right<1>(t); + float ty = v_get0(t); t = v_rotate_right<1>(t); + float tz = v_get0(t); rgb[0] = interpolateColor(tx, ty, tz, r); rgb[1] = interpolateColor(tx, ty, tz, g); rgb[2] = interpolateColor(tx, ty, tz, b); @@ -583,21 +583,21 @@ struct ColorRaycastInvoker : ParallelLoopBody // get direction through pixel in volume space: // 1. reproject (x, y) on projecting plane where z = 1.f - v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy) * vfxy; + v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy); planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f)); // 2. rotate to volume space planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32()); // 3. 
normalize - v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed * planed))); - v_float32x4 dir = planed * invNorm; + v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed)))); + v_float32x4 dir = v_mul(planed, invNorm); // compute intersection of ray with all six bbox planes - v_float32x4 rayinv = v_setall_f32(1.f) / dir; + v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir); // div by zero should be eliminated by these products - v_float32x4 tbottom = rayinv * (boxDown - orig); - v_float32x4 ttop = rayinv * (boxUp - orig); + v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig)); + v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig)); // re-order intersections to find smallest and largest on each axis v_float32x4 minAx = v_min(ttop, tbottom); @@ -618,14 +618,14 @@ struct ColorRaycastInvoker : ParallelLoopBody if (tmin < tmax) { // interpolation optimized a little - orig *= invVoxelSize; - dir *= invVoxelSize; + orig = v_mul(orig, invVoxelSize); + dir = v_mul(dir, invVoxelSize); int xdim = volume.volDims[0]; int ydim = volume.volDims[1]; int zdim = volume.volDims[2]; - v_float32x4 rayStep = dir * v_setall_f32(tstep); - v_float32x4 next = (orig + dir * v_setall_f32(tmin)); + v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep)); + v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin)))); float f = volume.interpolateVoxel(next), fnext = f; //raymarch @@ -633,11 +633,11 @@ struct ColorRaycastInvoker : ParallelLoopBody int nSteps = cvFloor((tmax - tmin) / tstep); for (; steps < nSteps; steps++) { - next += rayStep; + next = v_add(next, rayStep); v_int32x4 ip = v_round(next); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coord = ix * xdim + iy * ydim + iz * zdim; fnext = tsdfToFloat(volume.volume.at(coord).tsdf); @@ -657,7 +657,7 @@ struct ColorRaycastInvoker : ParallelLoopBody // linearly interpolate t between two f values if (f > 0.f && fnext < 0.f) { - v_float32x4 tp = next - rayStep; + v_float32x4 tp = v_sub(next, rayStep); float ft = volume.interpolateVoxel(tp); float ftdt = volume.interpolateVoxel(next); float ts = tmin + tstep * (steps - ft / (ftdt - ft)); @@ -665,7 +665,7 @@ struct ColorRaycastInvoker : ParallelLoopBody // avoid division by zero if (!cvIsNaN(ts) && !cvIsInf(ts)) { - v_float32x4 pv = (orig + dir * v_setall_f32(ts)); + v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts)))); v_float32x4 nv = volume.getNormalVoxel(pv); v_float32x4 cv = volume.getColorVoxel(pv); @@ -675,9 +675,7 @@ struct ColorRaycastInvoker : ParallelLoopBody //convert pv and nv to camera space normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); // interpolation optimized a little - point = v_matmuladd(pv * v_float32x4(volume.voxelSize, - volume.voxelSize, - volume.voxelSize, 1.f), + point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)), volRot0, volRot1, volRot2, volTrans); } } diff --git a/modules/rgbd/src/fast_icp.cpp b/modules/rgbd/src/fast_icp.cpp index 07eb84d19..5f3ab3f0a 100644 --- a/modules/rgbd/src/fast_icp.cpp +++ b/modules/rgbd/src/fast_icp.cpp @@ -138,7 +138,7 @@ bool ICPImpl::estimateTransformT(cv::Affine3f& transform, #if USE_INTRINSICS static inline bool fastCheck(const v_float32x4& p0, const v_float32x4& p1) { - float check = (p0.get0() + 
p1.get0()); + float check = (v_get0(p0) + v_get0(p1)); return !cvIsNaN(check); } @@ -160,7 +160,7 @@ static inline v_float32x4 crossProduct(const v_float32x4& a, const v_float32x4& v_float32x4 ayzx, azxy, byzx, bzxy; getCrossPerm(a, ayzx, azxy); getCrossPerm(b, byzx, bzxy); - return ayzx*bzxy - azxy*byzx; + return v_sub(v_mul(ayzx, bzxy), v_mul(azxy, byzx)); } #else static inline bool fastCheck(const Point3f& p) @@ -235,11 +235,11 @@ struct GetAbInvoker : ParallelLoopBody //find correspondence by projecting the point v_float32x4 oldCoords; - float pz = (v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP))).get0()); + float pz = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP)))); // x, y, 0, 0 - oldCoords = v_muladd(newP/v_setall_f32(pz), vfxy, vcxy); + oldCoords = v_muladd(v_div(newP, v_setall_f32(pz)), vfxy, vcxy); - if(!v_check_all((oldCoords >= v_setzero_f32()) & (oldCoords < vframe))) + if(!v_check_all(v_and(v_ge(oldCoords, v_setzero_f32()), v_lt(oldCoords, vframe)))) continue; // bilinearly interpolate oldPts and oldNrm under oldCoords point @@ -247,12 +247,12 @@ struct GetAbInvoker : ParallelLoopBody v_float32x4 oldN; { v_int32x4 ixy = v_floor(oldCoords); - v_float32x4 txy = oldCoords - v_cvt_f32(ixy); - int xi = ixy.get0(); - int yi = v_rotate_right<1>(ixy).get0(); - v_float32x4 tx = v_setall_f32(txy.get0()); + v_float32x4 txy = v_sub(oldCoords, v_cvt_f32(ixy)); + int xi = v_get0(ixy); + int yi = v_get0(v_rotate_right<1>(ixy)); + v_float32x4 tx = v_setall_f32(v_get0(txy)); txy = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(txy))); - v_float32x4 ty = v_setall_f32(txy.get0()); + v_float32x4 ty = v_setall_f32(v_get0(txy)); const float* prow0 = (const float*)oldPts[yi+0]; const float* prow1 = (const float*)oldPts[yi+1]; @@ -275,23 +275,23 @@ struct GetAbInvoker : ParallelLoopBody // NaN check is done later - v_float32x4 p0 = p00 + tx*(p01 - p00); - v_float32x4 p1 = p10 + tx*(p11 - p10); - oldP = p0 + ty*(p1 - p0); + v_float32x4 p0 = v_add(p00, v_mul(tx, v_sub(p01, p00))); + v_float32x4 p1 = v_add(p10, v_mul(tx, v_sub(p11, p10))); + oldP = v_add(p0, v_mul(ty, v_sub(p1, p0))); - v_float32x4 n0 = n00 + tx*(n01 - n00); - v_float32x4 n1 = n10 + tx*(n11 - n10); - oldN = n0 + ty*(n1 - n0); + v_float32x4 n0 = v_add(n00, v_mul(tx, v_sub(n01, n00))); + v_float32x4 n1 = v_add(n10, v_mul(tx, v_sub(n11, n10))); + oldN = v_add(n0, v_mul(ty, v_sub(n1, n0))); } bool oldPNcheck = fastCheck(oldP, oldN); //filter by distance - v_float32x4 diff = newP - oldP; - bool distCheck = !(v_reduce_sum(diff*diff) > sqThresh); + v_float32x4 diff = v_sub(newP, oldP); + bool distCheck = !(v_reduce_sum(v_mul(diff, diff)) > sqThresh); //filter by angle - bool angleCheck = !(abs(v_reduce_sum(newN*oldN)) < cosThresh); + bool angleCheck = !(abs(v_reduce_sum(v_mul(newN, oldN))) < cosThresh); if(!(oldPNcheck && distCheck && angleCheck)) continue; @@ -299,17 +299,17 @@ struct GetAbInvoker : ParallelLoopBody // build point-wise vector ab = [ A | b ] v_float32x4 VxNv = crossProduct(newP, oldN); Point3f VxN; - VxN.x = VxNv.get0(); - VxN.y = v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0(); - VxN.z = v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0(); + VxN.x = v_get0(VxNv); + VxN.y = v_get0(v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32()))); + VxN.z = v_get0(v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32()))); - float dotp = 
-v_reduce_sum(oldN*diff); + float dotp = -v_reduce_sum(v_mul(oldN, diff)); // build point-wise upper-triangle matrix [ab^T * ab] w/o last row // which is [A^T*A | A^T*b] // and gather sum - v_float32x4 vd = VxNv | v_float32x4(0, 0, 0, dotp); + v_float32x4 vd = v_or(VxNv, v_float32x4(0, 0, 0, dotp)); v_float32x4 n = oldN; v_float32x4 nyzx; { diff --git a/modules/rgbd/src/hash_tsdf.cpp b/modules/rgbd/src/hash_tsdf.cpp index 9cf05e556..e47af731d 100644 --- a/modules/rgbd/src/hash_tsdf.cpp +++ b/modules/rgbd/src/hash_tsdf.cpp @@ -425,15 +425,15 @@ inline float interpolate(float tx, float ty, float tz, float vx[8]) v_float32x4 v0246, v1357; v_load_deinterleave(vx, v0246, v1357); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -598,9 +598,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const v_float32x8 czp = v_lut(vals, idxzp); v_float32x8 czn = v_lut(vals, idxzn); - v_float32x8 vcxv = cxn - cxp; - v_float32x8 vcyv = cyn - cyp; - v_float32x8 vczv = czn - czp; + v_float32x8 vcxv = v_sub(cxn, cxp); + v_float32x8 vcyv = v_sub(cyn, cyp); + v_float32x8 vczv = v_sub(czn, czp); v_store(cxv, vcxv); v_store(cyv, vcyv); @@ -615,9 +615,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4); v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4); - v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1; - v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1; - v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1; + v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1); + v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1); + v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1); v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1); v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1); @@ -1523,9 +1523,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const v_float32x8 czp = v_lut(vals, idxzp); v_float32x8 czn = v_lut(vals, idxzn); - v_float32x8 vcxv = cxn - cxp; - v_float32x8 vcyv = cyn - cyp; - v_float32x8 vczv = czn - czp; + v_float32x8 vcxv = v_sub(cxn, cxp); + v_float32x8 vcyv = v_sub(cyn, cyp); + v_float32x8 vczv = v_sub(czn, czp); v_store(cxv, vcxv); v_store(cyv, vcyv); @@ -1540,9 +1540,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4); v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4); - v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1; - v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1; - v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1; + v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1); + v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1); + 
v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1); v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1); v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1); diff --git a/modules/rgbd/src/tsdf.cpp b/modules/rgbd/src/tsdf.cpp index 7b76985eb..73a39a659 100644 --- a/modules/rgbd/src/tsdf.cpp +++ b/modules/rgbd/src/tsdf.cpp @@ -146,21 +146,21 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const { // tx, ty, tz = floor(p) v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const TsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int iz = v_get0(ip); int coordBase = ix*xdim + iy*ydim + iz*zdim; @@ -170,15 +170,15 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6])); v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7])); - v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx*(v1 - v0); } @@ -228,27 +228,27 @@ inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const { - if(v_check_any (p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any (p >= v_float32x4((float)(volResolution.x-2), + if(v_check_any (v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any (v_ge(p, v_float32x4((float)(volResolution.x-2), (float)(volResolution.y-2), - (float)(volResolution.z-2), 1.f)) + (float)(volResolution.z-2), 1.f))) ) return nanv; v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const TsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix*xdim + iy*ydim + iz*zdim; @@ -266,23 +266,23 @@ inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const v_float32x4 v0246 (vx[0], vx[2], vx[4], vx[6]); v_float32x4 v1357 (vx[1], vx[3], 
vx[5], vx[7]); - v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); nv = v0 + tx*(v1 - v0); } v_float32x4 n = v_load_aligned(an); - v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n*n))); + v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n)))); - return Norm.get0() < 0.0001f ? nanv : n/Norm; + return v_get0(Norm) < 0.0001f ? nanv : v_div(n, Norm); } #else inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& p) const @@ -394,21 +394,21 @@ struct RaycastInvoker : ParallelLoopBody // get direction through pixel in volume space: // 1. reproject (x, y) on projecting plane where z = 1.f - v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy)*vfxy; + v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy); planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f)); // 2. rotate to volume space planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32()); // 3. normalize - v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed*planed))); - v_float32x4 dir = planed*invNorm; + v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed)))); + v_float32x4 dir = v_mul(planed, invNorm); // compute intersection of ray with all six bbox planes - v_float32x4 rayinv = v_setall_f32(1.f)/dir; + v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir); // div by zero should be eliminated by these products - v_float32x4 tbottom = rayinv*(boxDown - orig); - v_float32x4 ttop = rayinv*(boxUp - orig); + v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig)); + v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig)); // re-order intersections to find smallest and largest on each axis v_float32x4 minAx = v_min(ttop, tbottom); @@ -429,14 +429,14 @@ struct RaycastInvoker : ParallelLoopBody if(tmin < tmax) { // interpolation optimized a little - orig *= invVoxelSize; - dir *= invVoxelSize; + orig = v_mul(orig, invVoxelSize); + dir = v_mul(dir, invVoxelSize); int xdim = volume.volDims[0]; int ydim = volume.volDims[1]; int zdim = volume.volDims[2]; - v_float32x4 rayStep = dir * v_setall_f32(tstep); - v_float32x4 next = (orig + dir * v_setall_f32(tmin)); + v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep)); + v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin)))); float f = volume.interpolateVoxel(next), fnext = f; //raymarch @@ -444,11 +444,11 @@ struct RaycastInvoker : ParallelLoopBody int nSteps = cvFloor((tmax - tmin)/tstep); for(; steps < nSteps; steps++) { - next += rayStep; + next = v_add(next, rayStep); v_int32x4 ip = v_round(next); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coord = ix*xdim + iy*ydim + iz*zdim; fnext = tsdfToFloat(volume.volume.at(coord).tsdf); @@ -468,7 +468,7 @@ struct RaycastInvoker : ParallelLoopBody // linearly interpolate t 
between two f values if(f > 0.f && fnext < 0.f) { - v_float32x4 tp = next - rayStep; + v_float32x4 tp = v_sub(next, rayStep); float ft = volume.interpolateVoxel(tp); float ftdt = volume.interpolateVoxel(next); float ts = tmin + tstep*(steps - ft/(ftdt - ft)); @@ -476,7 +476,7 @@ struct RaycastInvoker : ParallelLoopBody // avoid division by zero if(!cvIsNaN(ts) && !cvIsInf(ts)) { - v_float32x4 pv = (orig + dir*v_setall_f32(ts)); + v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts)))); v_float32x4 nv = volume.getNormalVoxel(pv); if(!isNaN(nv)) @@ -484,9 +484,7 @@ struct RaycastInvoker : ParallelLoopBody //convert pv and nv to camera space normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); // interpolation optimized a little - point = v_matmuladd(pv*v_float32x4(volume.voxelSize, - volume.voxelSize, - volume.voxelSize, 1.f), + point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)), volRot0, volRot1, volRot2, volTrans); } } diff --git a/modules/rgbd/src/tsdf_functions.cpp b/modules/rgbd/src/tsdf_functions.cpp index 3f8bc26f0..09ca986a1 100644 --- a/modules/rgbd/src/tsdf_functions.cpp +++ b/modules/rgbd/src/tsdf_functions.cpp @@ -235,35 +235,33 @@ void integrateVolumeUnit( // optimization of the following: //Point3f volPt = Point3f(x, y, z)*voxelSize; //Point3f camSpacePt = vol2cam * volPt; - camSpacePt += zStep; + camSpacePt = v_add(camSpacePt, zStep); - float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0(); + float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt)))); if (zCamSpace <= 0.f) continue; - v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace); + v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace)); v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy); // leave only first 2 lanes - projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0))); depthType v; // bilinearly interpolate depth at projected { const v_float32x4& pt = projected; // check coords >= 0 and < imgSize - v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) | - v_reinterpret_as_u32(pt >= upLimits); - limits = limits | v_rotate_right<1>(limits); - if (limits.get0()) + v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits))); + limits = v_or(limits, v_rotate_right<1>(limits)); + if (v_get0(limits)) continue; // xi, yi = floor(pt) v_int32x4 ip = v_floor(pt); v_int32x4 ipshift = ip; - int xi = ipshift.get0(); + int xi = v_get0(ipshift); ipshift = v_rotate_right<1>(ipshift); - int yi = ipshift.get0(); + int yi = v_get0(ipshift); const depthType* row0 = depth[yi + 0]; const depthType* row1 = depth[yi + 1]; @@ -277,17 +275,17 @@ void integrateVolumeUnit( // assume correct depth is positive // don't fix missing data - if (v_check_all(vall > v_setzero_f32())) + if (v_check_all(v_gt(vall, v_setzero_f32()))) { - v_float32x4 t = pt - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(pt, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - v_float32x4 ty = v_setall_f32(t.get0()); + v_float32x4 ty = v_setall_f32(v_get0(t)); // vx is y-interpolated between rows 0 and 1 - v_float32x4 vx = v001 + ty * (v101 - v001); - float v0 = 
vx.get0(); + v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001))); + float v0 = v_get0(vx); vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx))); - float v1 = vx.get0(); + float v1 = v_get0(vx); v = v0 + tx * (v1 - v0); } else @@ -295,8 +293,8 @@ void integrateVolumeUnit( } // norm(camPixVec) produces double which is too slow - int _u = (int)projected.get0(); - int _v = (int)v_rotate_right<1>(projected).get0(); + int _u = (int)v_get0(projected); + int _v = (int)v_get0(v_rotate_right<1>(projected)); if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows)) continue; float pixNorm = pixNorms.at(_v, _u); @@ -500,35 +498,33 @@ void integrateRGBVolumeUnit( // optimization of the following: //Point3f volPt = Point3f(x, y, z)*voxelSize; //Point3f camSpacePt = vol2cam * volPt; - camSpacePt += zStep; + camSpacePt = v_add(camSpacePt, zStep); - float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0(); + float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt)))); if (zCamSpace <= 0.f) continue; - v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace); + v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace)); v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy); // leave only first 2 lanes - projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0))); depthType v; // bilinearly interpolate depth at projected { const v_float32x4& pt = projected; // check coords >= 0 and < imgSize - v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) | - v_reinterpret_as_u32(pt >= upLimits); - limits = limits | v_rotate_right<1>(limits); - if (limits.get0()) + v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits))); + limits = v_or(limits,v_rotate_right<1>(limits)); + if (v_get0(limits)) continue; // xi, yi = floor(pt) v_int32x4 ip = v_floor(pt); v_int32x4 ipshift = ip; - int xi = ipshift.get0(); + int xi = v_get0(ipshift); ipshift = v_rotate_right<1>(ipshift); - int yi = ipshift.get0(); + int yi = v_get0(ipshift); const depthType* row0 = depth[yi + 0]; const depthType* row1 = depth[yi + 1]; @@ -542,17 +538,17 @@ void integrateRGBVolumeUnit( // assume correct depth is positive // don't fix missing data - if (v_check_all(vall > v_setzero_f32())) + if (v_check_all(v_gt(vall, v_setzero_f32()))) { - v_float32x4 t = pt - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(pt, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - v_float32x4 ty = v_setall_f32(t.get0()); + v_float32x4 ty = v_setall_f32(v_get0(t)); // vx is y-interpolated between rows 0 and 1 - v_float32x4 vx = v001 + ty * (v101 - v001); - float v0 = vx.get0(); + v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001))); + float v0 = v_get0(vx); vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx))); - float v1 = vx.get0(); + float v1 = v_get0(vx); v = v0 + tx * (v1 - v0); } else @@ -561,14 +557,13 @@ void integrateRGBVolumeUnit( v_float32x4 projectedRGB = v_muladd(camPixVec, rgb_vfxy, rgb_vcxy); // leave only first 2 lanes - projectedRGB = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projectedRGB = 
v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0))); // norm(camPixVec) produces double which is too slow - int _u = (int)projected.get0(); - int _v = (int)v_rotate_right<1>(projected).get0(); - int rgb_u = (int)projectedRGB.get0(); - int rgb_v = (int)v_rotate_right<1>(projectedRGB).get0(); + int _u = (int)v_get0(projected); + int _v = (int)v_get0(v_rotate_right<1>(projected)); + int rgb_u = (int)v_get0(projectedRGB); + int rgb_v = (int)v_get0(v_rotate_right<1>(projectedRGB)); if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows && rgb_v >= 0 && rgb_v < color.rows && rgb_u >= 0 && rgb_u < color.cols)) diff --git a/modules/rgbd/src/tsdf_functions.hpp b/modules/rgbd/src/tsdf_functions.hpp index c763f9275..6038ab60c 100644 --- a/modules/rgbd/src/tsdf_functions.hpp +++ b/modules/rgbd/src/tsdf_functions.hpp @@ -20,7 +20,7 @@ namespace kinfu inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num) { v_float32x4 num128 = v_setall_f32(-1.f / 128.f); - return v_cvt_f32(num) * num128; + return v_mul(v_cvt_f32(num), num128); } #endif diff --git a/modules/rgbd/src/utils.hpp b/modules/rgbd/src/utils.hpp index 0b9636753..2bb69713d 100644 --- a/modules/rgbd/src/utils.hpp +++ b/modules/rgbd/src/utils.hpp @@ -68,7 +68,7 @@ inline bool isNaN(cv::Point3f p) #if USE_INTRINSICS static inline bool isNaN(const cv::v_float32x4& p) { - return cv::v_check_any(p != p); + return cv::v_check_any(v_ne(p, p)); } #endif diff --git a/modules/ximgproc/src/anisodiff.cpp b/modules/ximgproc/src/anisodiff.cpp index 996b4ac5b..2b230a712 100644 --- a/modules/ximgproc/src/anisodiff.cpp +++ b/modules/ximgproc/src/anisodiff.cpp @@ -74,8 +74,8 @@ inline v_uint8x16 v_finalize_pix_ch(const v_int16x8& c0, const v_int16x8& c1, v_expand_f32(c0, f0, f1); v_expand_f32(c1, f2, f3); - v_int16x8 d0 = v_pack(v_round(s0*alpha + f0), v_round(s1*alpha + f1)); - v_int16x8 d1 = v_pack(v_round(s2*alpha + f2), v_round(s3*alpha + f3)); + v_int16x8 d0 = v_pack(v_round(v_add(v_mul(s0, alpha), f0)), v_round(v_add(v_mul(s1, alpha), f1))); + v_int16x8 d1 = v_pack(v_round(v_add(v_mul(s2, alpha), f2)), v_round(v_add(v_mul(s3, alpha), f3))); return v_pack_u(d0, d1); } @@ -135,12 +135,12 @@ public: v_expand_s(p1, p10, p11); v_expand_s(p2, p20, p21); - v_int16x8 d00 = p00 - c00, d01 = p01 - c01; - v_int16x8 d10 = p10 - c10, d11 = p11 - c11; - v_int16x8 d20 = p20 - c20, d21 = p21 - c21; + v_int16x8 d00 = v_sub(p00, c00), d01 = v_sub(p01, c01); + v_int16x8 d10 = v_sub(p10, c10), d11 = v_sub(p11, c11); + v_int16x8 d20 = v_sub(p20, c20), d21 = v_sub(p21, c21); - v_uint16x8 n0 = v_abs(d00) + v_abs(d10) + v_abs(d20); - v_uint16x8 n1 = v_abs(d01) + v_abs(d11) + v_abs(d21); + v_uint16x8 n0 = v_add(v_add(v_abs(d00), v_abs(d10)), v_abs(d20)); + v_uint16x8 n1 = v_add(v_add(v_abs(d01), v_abs(d11)), v_abs(d21)); ushort CV_DECL_ALIGNED(16) nbuf[16]; v_store(nbuf, n0); @@ -153,13 +153,13 @@ public: v_expand_f32(d00, fd0, fd1); v_expand_f32(d01, fd2, fd3); - s00 += fd0*w0; s01 += fd1*w1; s02 += fd2*w2; s03 += fd3*w3; + s00 = v_add(s00, v_mul(fd0, w0)); s01 = v_add(s01, v_mul(fd1, w1)); s02 = v_add(s02, v_mul(fd2, w2)); s03 = v_add(s03, v_mul(fd3, w3)); v_expand_f32(d10, fd0, fd1); v_expand_f32(d11, fd2, fd3); - s10 += fd0*w0; s11 += fd1*w1; s12 += fd2*w2; s13 += fd3*w3; + s10 = v_add(s10, v_mul(fd0, w0)); s11 = v_add(s11, v_mul(fd1, w1)); s12 = v_add(s12, v_mul(fd2, w2)); s13 = v_add(s13, v_mul(fd3, w3)); v_expand_f32(d20, fd0, fd1); v_expand_f32(d21, fd2, fd3); - s20 += fd0*w0; s21 += fd1*w1; s22 += fd2*w2; s23 += 
fd3*w3; + s20 = v_add(s20, v_mul(fd0, w0)); s21 = v_add(s21, v_mul(fd1, w1)); s22 = v_add(s22, v_mul(fd2, w2)); s23 = v_add(s23, v_mul(fd3, w3)); } c0 = v_finalize_pix_ch(c00, c01, s00, s01, s02, s03, v_alpha); diff --git a/modules/ximgproc/src/fgs_filter.cpp b/modules/ximgproc/src/fgs_filter.cpp index 5e168da5d..804e9f00a 100644 --- a/modules/ximgproc/src/fgs_filter.cpp +++ b/modules/ximgproc/src/fgs_filter.cpp @@ -303,15 +303,15 @@ void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i) v_float32x4 aux0,aux1,aux2,aux3; #define PROC4(Chor_in,cur_in,coef_prev_in,interD_prev_in,cur_prev_in,interD_out,cur_out,coef_cur_out)\ - coef_cur_out = lambda_reg*Chor_in;\ - aux0 = interD_prev_in*coef_prev_in;\ - aux1 = coef_cur_out+coef_prev_in;\ - aux1 = one_reg-aux1;\ - aux0 = aux1-aux0;\ - interD_out = coef_cur_out/aux0;\ - aux1 = cur_prev_in*coef_prev_in;\ - aux1 = cur_in - aux1;\ - cur_out = aux1/aux0; + coef_cur_out = v_mul(lambda_reg, Chor_in);\ + aux0 = v_mul(interD_prev_in, coef_prev_in);\ + aux1 = v_add(coef_cur_out, coef_prev_in);\ + aux1 = v_sub(one_reg, aux1);\ + aux0 = v_sub(aux1, aux0);\ + interD_out = v_div(coef_cur_out, aux0);\ + aux1 = v_mul(cur_prev_in, coef_prev_in);\ + aux1 = v_sub(cur_in, aux1);\ + cur_out = v_div(aux1, aux0); for(;j v_mul_wrap(v_thresh, v_max1)); - v_m2 = ~(v_mul_wrap(v_max2 - v_min2, v_255) > v_mul_wrap(v_thresh, v_max2)); + v_m1 = v_not(v_gt(v_mul_wrap(v_sub(v_max1, v_min1), v_255), v_mul_wrap(v_thresh, v_max1))); + v_m2 = v_not(v_gt(v_mul_wrap(v_sub(v_max2, v_min2), v_255), v_mul_wrap(v_thresh, v_max2))); // Apply masks - v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); - v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); - v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); + v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2)); + v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2)); + v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2)); // Split and add to the sums: v_expand(v_iB1, v_uint1, v_uint2); - v_SB += v_uint1 + v_uint2; + v_SB = v_add(v_SB, v_add(v_uint1, v_uint2)); v_expand(v_iG1, v_uint1, v_uint2); - v_SG += v_uint1 + v_uint2; + v_SG = v_add(v_SG, v_add(v_uint1, v_uint2)); v_expand(v_iR1, v_uint1, v_uint2); - v_SR += v_uint1 + v_uint2; + v_SR = v_add(v_SR, v_add(v_uint1, v_uint2)); } sumB = v_reduce_sum(v_SB); @@ -197,21 +197,21 @@ void calculateChannelSums(uint64 &sumB, uint64 &sumG, uint64 &sumR, ushort *src_ v_expand(v_max_val, v_max1, v_max2); // Calculate masks - v_m1 = ~((v_max1 - v_min1) * v_65535 > v_thresh * v_max1); - v_m2 = ~((v_max2 - v_min2) * v_65535 > v_thresh * v_max2); + v_m1 = v_not(v_gt(v_mul(v_sub(v_max1, v_min1), v_65535), v_mul(v_thresh, v_max1))); + v_m2 = v_not(v_gt(v_mul(v_sub(v_max2, v_min2), v_65535), v_mul(v_thresh, v_max2))); // Apply masks - v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); - v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); - v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); + v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2)); + v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2)); + v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2)); // Split and add to the sums: v_expand(v_iB1, v_u64_1, v_u64_2); - v_SB += v_u64_1 + v_u64_2; + v_SB = v_add(v_SB, v_add(v_u64_1, v_u64_2)); v_expand(v_iG1, v_u64_1, v_u64_2); - v_SG += v_u64_1 + v_u64_2; + v_SG = v_add(v_SG, v_add(v_u64_1, v_u64_2)); v_expand(v_iR1, v_u64_1, v_u64_2); - v_SR += v_u64_1 + v_u64_2; + v_SR = v_add(v_SR, v_add(v_u64_1, v_u64_2)); } // Perform final reduction @@ -282,12 +282,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai 
v_expand(v_inR, v_sR1, v_sR2); // Multiply by gains - v_sB1 = v_mul_wrap(v_sB1, v_gainB) >> 8; - v_sB2 = v_mul_wrap(v_sB2, v_gainB) >> 8; - v_sG1 = v_mul_wrap(v_sG1, v_gainG) >> 8; - v_sG2 = v_mul_wrap(v_sG2, v_gainG) >> 8; - v_sR1 = v_mul_wrap(v_sR1, v_gainR) >> 8; - v_sR2 = v_mul_wrap(v_sR2, v_gainR) >> 8; + v_sB1 = v_shr(v_mul_wrap(v_sB1, v_gainB), 8); + v_sB2 = v_shr(v_mul_wrap(v_sB2, v_gainB), 8); + v_sG1 = v_shr(v_mul_wrap(v_sG1, v_gainG), 8); + v_sG2 = v_shr(v_mul_wrap(v_sG2, v_gainG), 8); + v_sR1 = v_shr(v_mul_wrap(v_sR1, v_gainR), 8); + v_sR2 = v_shr(v_mul_wrap(v_sR2, v_gainR), 8); // Pack into vectors of v_uint8x16 v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); @@ -325,12 +325,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai v_expand(v_inR, v_sR1, v_sR2); // Multiply by scaling factors - v_sB1 = (v_sB1 * v_gainB) >> 16; - v_sB2 = (v_sB2 * v_gainB) >> 16; - v_sG1 = (v_sG1 * v_gainG) >> 16; - v_sG2 = (v_sG2 * v_gainG) >> 16; - v_sR1 = (v_sR1 * v_gainR) >> 16; - v_sR2 = (v_sR2 * v_gainR) >> 16; + v_sB1 = v_shr(v_mul(v_sB1, v_gainB), 16); + v_sB2 = v_shr(v_mul(v_sB2, v_gainB), 16); + v_sG1 = v_shr(v_mul(v_sG1, v_gainG), 16); + v_sG2 = v_shr(v_mul(v_sG2, v_gainG), 16); + v_sR1 = v_shr(v_mul(v_sR1, v_gainR), 16); + v_sR2 = v_shr(v_mul(v_sR2, v_gainR), 16); // Pack into vectors of v_uint16x8 v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); diff --git a/modules/xphoto/src/learning_based_color_balance.cpp b/modules/xphoto/src/learning_based_color_balance.cpp index bd408e6cb..de1958dcc 100644 --- a/modules/xphoto/src/learning_based_color_balance.cpp +++ b/modules/xphoto/src/learning_based_color_balance.cpp @@ -192,7 +192,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src) v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_global_max = v_max(v_local_max, v_global_max); - v_mask = (v_local_max < v_thresh); + v_mask = (v_lt(v_local_max, v_thresh)); v_store(mask_ptr + i, v_mask); } uchar global_max[16]; @@ -225,7 +225,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src) v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_global_max = v_max(v_local_max, v_global_max); - v_mask = (v_local_max < v_thresh); + v_mask = (v_lt(v_local_max, v_thresh)); v_pack_store(mask_ptr + i, v_mask); } ushort global_max[8]; @@ -270,9 +270,9 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_uint8x16 v_mask = v_load(mask_ptr + i); - v_inB &= v_mask; - v_inG &= v_mask; - v_inR &= v_mask; + v_inB = v_and(v_inB, v_mask); + v_inG = v_and(v_inG, v_mask); + v_inR = v_and(v_inR, v_mask); v_uint16x8 v_sR1, v_sR2, v_sG1, v_sG2, v_sB1, v_sB2; v_expand(v_inB, v_sB1, v_sB2); @@ -280,33 +280,33 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_expand(v_inR, v_sR1, v_sR2); // update the brightest (R,G,B) tuple (process left half): - v_uint16x8 v_sum = v_sB1 + v_sG1 + v_sR1; - v_uint16x8 v_max_mask = (v_sum > v_max_sum); + v_uint16x8 v_sum = v_add(v_add(v_sB1, v_sG1), v_sR1); + v_uint16x8 v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_sB1 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_sG1 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_sR1 & v_max_mask) + 
(v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_sB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_sG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_sR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update the brightest (R,G,B) tuple (process right half): - v_sum = v_sB2 + v_sG2 + v_sR2; - v_max_mask = (v_sum > v_max_sum); + v_sum = v_add(v_add(v_sB2, v_sG2), v_sR2); + v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_sB2 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_sG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_sR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_sB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_sG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_sR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update sums: - v_sB1 = v_sB1 + v_sB2; - v_sG1 = v_sG1 + v_sG2; - v_sR1 = v_sR1 + v_sR2; + v_sB1 = v_add(v_sB1, v_sB2); + v_sG1 = v_add(v_sG1, v_sG2); + v_sR1 = v_add(v_sR1, v_sR2); v_uint32x4 v_uint1, v_uint2; v_expand(v_sB1, v_uint1, v_uint2); - v_SB += v_uint1 + v_uint2; + v_SB = v_add(v_SB, v_add(v_uint1, v_uint2)); v_expand(v_sG1, v_uint1, v_uint2); - v_SG += v_uint1 + v_uint2; + v_SG = v_add(v_SG, v_add(v_uint1, v_uint2)); v_expand(v_sR1, v_uint1, v_uint2); - v_SR += v_uint1 + v_uint2; + v_SR = v_add(v_SR, v_add(v_uint1, v_uint2)); } sumB = v_reduce_sum(v_SB); sumG = v_reduce_sum(v_SG); @@ -361,11 +361,11 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_uint16x8 v_inB, v_inG, v_inR; v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_uint16x8 v_mask = v_load_expand(mask_ptr + i); - v_mask = v_mask | ((v_mask & v_mask_lower) << 8); + v_mask = v_or(v_mask, v_shl<8>(v_and(v_mask, v_mask_lower))); - v_inB &= v_mask; - v_inG &= v_mask; - v_inR &= v_mask; + v_inB = v_and(v_inB, v_mask); + v_inG = v_and(v_inG, v_mask); + v_inR = v_and(v_inR, v_mask); v_uint32x4 v_iR1, v_iR2, v_iG1, v_iG2, v_iB1, v_iB2; v_expand(v_inB, v_iB1, v_iB2); @@ -373,32 +373,32 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_expand(v_inR, v_iR1, v_iR2); // update the brightest (R,G,B) tuple (process left half): - v_uint32x4 v_sum = v_iB1 + v_iG1 + v_iR1; - v_uint32x4 v_max_mask = (v_sum > v_max_sum); + v_uint32x4 v_sum = v_add(v_add(v_iB1, v_iG1), v_iR1); + v_uint32x4 v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_iB1 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_iG1 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_iR1 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_iB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_iG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_iR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update the brightest (R,G,B) tuple (process right half): - v_sum = v_iB2 + v_iG2 + v_iR2; - v_max_mask = (v_sum > v_max_sum); + v_sum = v_add(v_add(v_iB2, v_iG2), v_iR2); + v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_iB2 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_iG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); - 
v_brightestR = (v_iR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_iB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_iG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_iR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update sums: - v_iB1 = v_iB1 + v_iB2; - v_iG1 = v_iG1 + v_iG2; - v_iR1 = v_iR1 + v_iR2; + v_iB1 = v_add(v_iB1, v_iB2); + v_iG1 = v_add(v_iG1, v_iG2); + v_iR1 = v_add(v_iR1, v_iR2); v_uint64x2 v_uint64_1, v_uint64_2; v_expand(v_iB1, v_uint64_1, v_uint64_2); - v_SB += v_uint64_1 + v_uint64_2; + v_SB = v_add(v_SB, v_add(v_uint64_1, v_uint64_2)); v_expand(v_iG1, v_uint64_1, v_uint64_2); - v_SG += v_uint64_1 + v_uint64_2; + v_SG = v_add(v_SG, v_add(v_uint64_1, v_uint64_2)); v_expand(v_iR1, v_uint64_1, v_uint64_2); - v_SR += v_uint64_1 + v_uint64_2; + v_SR = v_add(v_SR, v_add(v_uint64_1, v_uint64_2)); } uint64 sum_arr[2]; v_store(sum_arr, v_SB);
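Note on the pattern above: every hunk in this section performs the same mechanical substitution on the universal-intrinsic vector types. Overloaded operators (+, -, *, /, &, |, ~, shifts, comparisons) are replaced by the named wrappers (v_add, v_sub, v_mul, v_div, v_and, v_or, v_not, v_shl/v_shr, v_lt/v_gt and so on), and the .get0() lane accessor is replaced by the free function v_get0(). The minimal sketch below shows the mapping in isolation; it is illustrative only, its function and variable names are not taken from the modules touched above, and it assumes a SIMD128-capable OpenCV build with opencv2/core/hal/intrin.hpp available:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative sketch: the old operator-based universal intrinsics are shown
// in comments, the equivalent named-function calls in the code.
static inline float old_vs_new_demo(const cv::v_float32x4& a, const cv::v_float32x4& b)
{
    using namespace cv;

    // old: v_float32x4 s = a + b * v_setall_f32(2.f);
    v_float32x4 s = v_add(a, v_mul(b, v_setall_f32(2.f)));

    // old: v_float32x4 m = v_select(a > b, a, b);   per-lane maximum via a mask
    v_float32x4 m = v_select(v_gt(a, b), a, b);

    // old: float first = m.get0();                  extract lane 0
    float first = v_get0(m);

    // old: return first + v_reduce_sum(s * s);
    return first + v_reduce_sum(v_mul(s, s));
}

The same renaming applies unchanged to the integer and wider vector types that appear in the hunks above (v_int16x8, v_uint32x4, v_float32x8, and so on).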