Use new Universal Intrinsic API.

pull/3572/head
Liutong HAN 2 years ago
parent 3af65e6cb1
commit b9a460b857
16 changed files, with per-file change counts in parentheses (a sketch of the migration pattern follows this list):
  1. modules/optflow/src/rlof/berlof_invoker.hpp (224)
  2. modules/optflow/src/rlof/plk_invoker.hpp (74)
  3. modules/optflow/src/rlof/rlof_invoker.hpp (132)
  4. modules/optflow/src/rlof/rlof_invokerbase.hpp (76)
  5. modules/optflow/src/rlof/rlof_localflow.cpp (8)
  6. modules/rgbd/src/colored_tsdf.cpp (120)
  7. modules/rgbd/src/fast_icp.cpp (48)
  8. modules/rgbd/src/hash_tsdf.cpp (32)
  9. modules/rgbd/src/tsdf.cpp (90)
  10. modules/rgbd/src/tsdf_functions.cpp (83)
  11. modules/rgbd/src/tsdf_functions.hpp (2)
  12. modules/rgbd/src/utils.hpp (2)
  13. modules/ximgproc/src/anisodiff.cpp (20)
  14. modules/ximgproc/src/fgs_filter.cpp (58)
  15. modules/xphoto/src/grayworld_white_balance.cpp (56)
  16. modules/xphoto/src/learning_based_color_balance.cpp (82)
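
Every hunk in this commit follows the same mechanical rewrite: the arithmetic, bitwise, shift and comparison operators that the old universal-intrinsic vector types overloaded are replaced by the named functions of the new Universal Intrinsic API (v_add, v_sub, v_mul, v_and, v_shl/v_shr, v_gt/v_lt and so on). In each hunk below the removed operator-based lines appear first, immediately followed by the replacement lines that call the named intrinsics. A minimal sketch of the pattern, assuming OpenCV is built with CV_SIMD128; the helper name is hypothetical, only the intrinsic calls mirror the diff:

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Hypothetical helper illustrating the operator -> named-intrinsic rewrite.
static inline cv::v_int16x8 masked_scaled_diff(const cv::v_int16x8& v11,
                                               const cv::v_int16x8& vI,
                                               const cv::v_int16x8& vmask)
{
    // Old API (operator overloads), removed by this commit:
    //     ((v11 << 5) - vI) & vmask
    // New API (named universal intrinsics), added by this commit:
    return cv::v_and(cv::v_sub(cv::v_shl<5>(v11), vI), vmask);
}
#endif

The rewrite is purely syntactic: each hunk below reads as a one-for-one substitution of an operator expression with its named-intrinsic equivalent.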

@ -296,7 +296,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
@ -304,35 +304,35 @@ public:
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
diff0 = v_pack(t0, t1);
// I*gain.x + gain.x
v_int16x8 diff[4] =
{
((v11 << 5) - vI) & vmask,
((v01 << 5) - vI) & vmask,
((v10 << 5) - vI) & vmask,
((v00 << 5) - vI) & vmask
v_and(v_sub(v_shl<5>(v11), vI), vmask),
v_and(v_sub(v_shl<5>(v01), vI), vmask),
v_and(v_sub(v_shl<5>(v10), vI), vmask),
v_and(v_sub(v_shl<5>(v00), vI), vmask)
};
diff0 = diff0 - vI;
diff0 = diff0 & vmask;
diff0 = v_sub(diff0, vI);
diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
vset2 = vabs_diff < vparam1;
vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero;
v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0);
vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
@ -342,7 +342,7 @@ public:
for (unsigned int mmi = 0; mmi < 4; mmi++)
{
// It == 0 ? |It| > sigma13
diff0 = vset2 & diff[mmi];
diff0 = v_and(vset2, diff[mmi]);
// It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0);
@ -350,16 +350,16 @@ public:
// diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
v_int32x4 diff0_0 = diff_int_0 >> s2bitShift;
v_int32x4 diff0_1 = diff_int_1 >> s2bitShift;
v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift);
v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift);
diff0 = v_pack(diff0_0, diff0_1);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11));
vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
}
if (j == 0)
{
@ -387,8 +387,8 @@ public:
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
@ -402,8 +402,8 @@ public:
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
fxtale = v_mul(fx, vtale_1);
fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
@ -544,7 +544,7 @@ public:
float CV_DECL_ALIGNED(16) bbuf[4];
for (int mmi = 0; mmi < 4; mmi++)
{
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]);
v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b0[mmi] = bbuf[0] + bbuf[2];
_b1[mmi] = bbuf[1] + bbuf[3];
}
@ -960,7 +960,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
@ -968,38 +968,38 @@ public:
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
diff0 = v_pack(t0, t1);
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI;
v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI);
v_int16x8 diff[4] =
{
((v11 << 5) + diff_value) & vmask,
((v01 << 5) + diff_value) & vmask,
((v10 << 5) + diff_value) & vmask,
((v00 << 5) + diff_value) & vmask
v_and(v_add(v_shl<5>(v11), diff_value), vmask),
v_and(v_add(v_shl<5>(v01), diff_value), vmask),
v_and(v_add(v_shl<5>(v10), diff_value), vmask),
v_and(v_add(v_shl<5>(v00), diff_value), vmask)
};
diff0 = diff0 + diff_value;
diff0 = diff0 & vmask;
diff0 = v_add(diff0, diff_value);
diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
vset2 = vabs_diff < vparam1;
vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero;
v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0);
vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
@ -1009,7 +1009,7 @@ public:
for (unsigned int mmi = 0; mmi < 4; mmi++)
{
// It == 0 ? |It| > sigma13
diff0 = vset2 & diff[mmi];
diff0 = v_and(vset2, diff[mmi]);
// It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0);
@ -1017,22 +1017,22 @@ public:
// diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
v_int32x4 diff0_0 = diff_int_0 >> s2bitShift;
v_int32x4 diff0_1 = diff_int_1 >> s2bitShift;
v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift);
v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift);
diff0 = v_pack(diff0_0, diff0_1);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11));
vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
vqb2[mmi] += v_cvt_f32(diff0_0 * vI0);
vqb2[mmi] += v_cvt_f32(diff0_1 * vI1);
vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3[mmi] += v_cvt_f32(diff0_0);
vqb3[mmi] += v_cvt_f32(diff0_1);
vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0));
vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1));
}
if (j == 0)
{
@ -1060,29 +1060,29 @@ public:
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy
vsumIx += fxtale;
vsumIy += fytale;
vsumIx = v_add(vsumIx, fxtale);
vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI
v_float32x4 vI_tale = vI_ps * vtale_0;
vsumI += vI_tale;
v_float32x4 vI_tale = v_mul(vI_ps, vtale_0);
vsumI = v_add(vsumI, vI_tale);
// sumW
vsumW += vtale_0;
vsumW = v_add(vsumW, vtale_0);
// sumDI
vsumDI += vI_ps * vI_tale;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
@ -1092,29 +1092,29 @@ public:
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
fxtale = v_mul(fx, vtale_1);
fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy
vsumIx += fxtale;
vsumIy += fytale;
vsumIx = v_add(vsumIx, fxtale);
vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI
vI_tale = vI_ps * vtale_1;
vsumI += vI_tale;
vI_tale = v_mul(vI_ps, vtale_1);
vsumI = v_add(vsumI, vI_tale);
// sumW
vsumW += vtale_1;
vsumW = v_add(vsumW, vtale_1);
// sumDI
vsumDI += vI_ps * vI_tale;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
}
}
@ -1304,7 +1304,7 @@ public:
float CV_DECL_ALIGNED(16) bbuf[4];
for(int mmi = 0; mmi < 4; mmi++)
{
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]);
v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b0[mmi] = bbuf[0] + bbuf[2];
_b1[mmi] = bbuf[1] + bbuf[3];
_b2[mmi] = v_reduce_sum(vqb2[mmi]);
@ -1655,14 +1655,14 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int16x8 diff[4] =
{
((v00 << 5) - vI) & vmask,
((v01 << 5) - vI) & vmask,
((v10 << 5) - vI) & vmask,
((v11 << 5) - vI) & vmask,
v_and(v_sub(v_shl<5>(v00), vI), vmask),
v_and(v_sub(v_shl<5>(v01), vI), vmask),
v_and(v_sub(v_shl<5>(v10), vI), vmask),
v_and(v_sub(v_shl<5>(v11), vI), vmask),
};
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
@ -1672,8 +1672,8 @@ public:
v_zip(diff[mmi], diff[mmi], diff1, diff0);
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff1, diff0, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11));
vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
}
}
#else
@ -1704,7 +1704,7 @@ public:
float CV_DECL_ALIGNED(16) bbuf[4];
for (int mmi = 0; mmi < 4; mmi++)
{
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]);
v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b1[mmi] = bbuf[0] + bbuf[2];
_b2[mmi] = bbuf[1] + bbuf[3];
}
@ -2071,7 +2071,7 @@ namespace radial {
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
@ -2079,21 +2079,21 @@ namespace radial {
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
diff0 = v_pack(t0, t1);
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI;
v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI);
v_int16x8 diff[4] =
{
((v11 << 5) + diff_value) & vmask,
((v01 << 5) + diff_value) & vmask,
((v10 << 5) + diff_value) & vmask,
((v00 << 5) + diff_value) & vmask
v_and(v_add(v_shl<5>(v11), diff_value), vmask),
v_and(v_add(v_shl<5>(v01), diff_value), vmask),
v_and(v_add(v_shl<5>(v10), diff_value), vmask),
v_and(v_add(v_shl<5>(v00), diff_value), vmask)
};
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8));
@ -2109,14 +2109,14 @@ namespace radial {
v_zip(diff[mmi], diff[mmi], diff2, diff1);
v_zip(diff2, diff1, v00, v01);
vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10));
vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11));
vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10)));
vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11)));
vqb2[mmi] += v_cvt_f32(diff0_0 * vI0);
vqb2[mmi] += v_cvt_f32(diff0_1 * vI1);
vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3[mmi] += v_cvt_f32(diff0_0);
vqb3[mmi] += v_cvt_f32(diff0_1);
vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0));
vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1));
}
if (j == 0)
{
@ -2133,17 +2133,17 @@ namespace radial {
vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy
vsumIx += fx;
vsumIy += fy;
vsumIx = v_add(vsumIx, fx);
vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI
vsumI += vI_ps;
vsumI = v_add(vsumI, vI_ps);
// sumDI
vsumDI += vI_ps * vI_ps;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
@ -2158,17 +2158,17 @@ namespace radial {
vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy
vsumIx += fx;
vsumIy += fy;
vsumIx = v_add(vsumIx, fx);
vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI
vsumI += vI_ps;
vsumI = v_add(vsumI, vI_ps);
// sumDI
vsumDI += vI_ps * vI_ps;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
}
}
@ -2299,7 +2299,7 @@ namespace radial {
float CV_DECL_ALIGNED(16) bbuf[4];
for (int mmi = 0; mmi < 4; mmi++)
{
v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]);
v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi]));
_b0[mmi] = bbuf[0] + bbuf[2];
_b1[mmi] = bbuf[1] + bbuf[3];
_b2[mmi] = v_reduce_sum(vqb2[mmi]);
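
The berlof_invoker.hpp hunks above also show the two remaining flavours of the rewrite: compound assignments (+=) become explicit v_add assignments, and comparisons combined with bitwise masking become v_lt/v_gt plus v_and. A short sketch under the same CV_SIMD128 assumption; the helper names are hypothetical:

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Accumulation: old "acc += v_cvt_f32(v_dotprod(a, b));" has no operator form in the new API.
static inline void accumulate_dotprod(cv::v_float32x4& acc,
                                      const cv::v_int16x8& a,
                                      const cv::v_int16x8& b)
{
    acc = cv::v_add(acc, cv::v_cvt_f32(cv::v_dotprod(a, b)));
}

// Thresholding: old "(vabs_diff < vparam1) & (vabs_diff > vparam0)".
static inline cv::v_int16x8 band_mask(const cv::v_int16x8& vabs_diff,
                                      const cv::v_int16x8& vparam0,
                                      const cv::v_int16x8& vparam1)
{
    return cv::v_and(cv::v_lt(vabs_diff, vparam1), cv::v_gt(vabs_diff, vparam0));
}
#endif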

@ -229,7 +229,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
@ -237,17 +237,17 @@ public:
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
// diff = J - I
diff0 = v_pack(t0, t1) - vI;
diff0 = v_sub(v_pack(t0, t1), vI);
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value;
diff0 = diff0 & vmask;
diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value);
diff0 = v_and(diff0, vmask);
v_zip(diff0, diff0, diff2, diff1);
v_int32x4 diff0_0;
@ -259,16 +259,16 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
v_int32x4 vI0, vI1;
v_expand(vI, vI0, vI1);
vqb2 += v_cvt_f32(diff0_0 * vI0);
vqb2 += v_cvt_f32(diff0_1 * vI1);
vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3 += v_cvt_f32(diff0_0);
vqb3 += v_cvt_f32(diff0_1);
vqb3 = v_add(vqb3, v_cvt_f32(diff0_0));
vqb3 = v_add(vqb3, v_cvt_f32(diff0_1));
if (j == 0)
{
@ -285,17 +285,17 @@ public:
vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy
vsumIx += fx;
vsumIy += fy;
vsumIx = v_add(vsumIx, fx);
vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI
vsumI += vI_ps;
vsumI = v_add(vsumI, vI_ps);
// sumDI
vsumDI += vI_ps * vI_ps;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
@ -309,17 +309,17 @@ public:
vAxx = v_muladd(fx, fx, vAxx);
// sumIx und sumIy
vsumIx += fx;
vsumIy += fy;
vsumIx = v_add(vsumIx, fx);
vsumIy = v_add(vsumIy, fy);
vsumW1 += vI_ps * fx;
vsumW2 += vI_ps * fy;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy));
// sumI
vsumI += vI_ps;
vsumI = v_add(vsumI, vI_ps);
// sumDI
vsumDI += vI_ps * vI_ps;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps));
}
}
#else
@ -388,7 +388,7 @@ public:
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1);
v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
b3 = v_reduce_sum(vqb2);
@ -696,19 +696,19 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
diff0 = v_pack(t0, t1) - diff0;
diff0 = diff0 & vmask;
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
diff0 = v_sub(v_pack(t0, t1), diff0);
diff0 = v_and(diff0, vmask);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
@ -717,8 +717,8 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
}
#else
for( ; x < winSize.width*cn; x++, dIptr += 2 )
@ -737,7 +737,7 @@ public:
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1);
v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
#endif

@ -246,35 +246,35 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
diff0 = v_pack(t0, t1) - diff0;
diff0 = diff0 & vmask;
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
diff0 = v_sub(v_pack(t0, t1), diff0);
diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
vset2 = vabs_diff < vparam1;
vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero;
v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0);
vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
// It == 0 ? |It| > sigma13
diff0 = vset2 & diff0;
diff0 = v_and(vset2, diff0);
// It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0);
@ -282,7 +282,7 @@ public:
// diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
diff0 = v_pack(diff_int_0 >> s2bitShift, diff_int_1 >> s2bitShift);
diff0 = v_pack(v_shr(diff_int_0, s2bitShift), v_shr(diff_int_1, s2bitShift));
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
@ -290,8 +290,8 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
if (j == 0)
{
v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1;
@ -316,8 +316,8 @@ public:
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
@ -330,8 +330,8 @@ public:
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
fxtale = v_mul(fx, vtale_1);
fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
@ -431,7 +431,7 @@ public:
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1);
v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 += bbuf[0] + bbuf[2];
b2 += bbuf[1] + bbuf[3];
#endif
@ -769,7 +769,7 @@ public:
v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x));
v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn));
v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16;
v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16);
v_int32x4 t0, t1;
v_int16x8 t00, t01, t10, t11;
@ -777,33 +777,33 @@ public:
v_zip(v10, v11, t10, t11);
//subpixel interpolation
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
// diff = J - I
diff0 = v_pack(t0, t1) - vI;
diff0 = v_sub(v_pack(t0, t1), vI);
// I*gain.x + gain.x
v_mul_expand(vI, vgain_value, t0, t1);
diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value;
diff0 = diff0 & vmask;
diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value);
diff0 = v_and(diff0, vmask);
v_int16x8 vscale_diff_is_pos = diff0 > vscale;
veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1);
v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale);
veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1));
// since there is no abs vor int16x8 we have to do this hack
v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0));
v_int16x8 vset2, vset1;
// |It| < sigma1 ?
vset2 = vabs_diff < vparam1;
vset2 = v_lt(vabs_diff, vparam1);
// It > 0 ?
v_int16x8 vdiff_is_pos = diff0 > vzero;
v_int16x8 vdiff_is_pos = v_gt(diff0, vzero);
// sigma0 < |It| < sigma1 ?
vset1 = vset2 & (vabs_diff > vparam0);
vset1 = v_and(vset2, v_gt(vabs_diff, vparam0));
// val = |It| -/+ sigma1
v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1);
v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1));
// It == 0 ? |It| > sigma13
diff0 = vset2 & diff0;
diff0 = v_and(vset2, diff0);
// It == val ? sigma0 < |It| < sigma1
diff0 = v_select(vset1, vtmp_param1, diff0);
@ -811,8 +811,8 @@ public:
// diff = diff * sigma2
v_int32x4 diff_int_0, diff_int_1;
v_mul_expand(diff0, tale_, diff_int_0, diff_int_1);
v_int32x4 diff0_0 = diff_int_0 >> s2bitShift;
v_int32x4 diff0_1 = diff_int_1 >> s2bitShift;
v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift);
v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift);
diff0 = v_pack(diff0_0, diff0_1);
v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
@ -821,16 +821,16 @@ public:
v_zip(vIxy_0, vIxy_1, v10, v11);
v_zip(diff2, diff1, v00, v01);
vqb0 += v_cvt_f32(v_dotprod(v00, v10));
vqb1 += v_cvt_f32(v_dotprod(v01, v11));
vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10)));
vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11)));
v_int32x4 vI0, vI1;
v_expand(vI, vI0, vI1);
vqb2 += v_cvt_f32(diff0_0 * vI0);
vqb2 += v_cvt_f32(diff0_1 * vI1);
vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0)));
vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1)));
vqb3 += v_cvt_f32(diff0_0);
vqb3 += v_cvt_f32(diff0_1);
vqb3 = v_add(vqb3, v_cvt_f32(diff0_0));
vqb3 = v_add(vqb3, v_cvt_f32(diff0_1));
if (j == 0)
{
@ -858,29 +858,29 @@ public:
v_float32x4 fx = v_cvt_f32(t1);
// A11 - A22
v_float32x4 fxtale = fx * vtale_0;
v_float32x4 fytale = fy * vtale_0;
v_float32x4 fxtale = v_mul(fx, vtale_0);
v_float32x4 fytale = v_mul(fy, vtale_0);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy
vsumIx += fxtale;
vsumIy += fytale;
vsumIx = v_add(vsumIx, fxtale);
vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI
v_float32x4 vI_tale = vI_ps * vtale_0;
vsumI += vI_tale;
v_float32x4 vI_tale = v_mul(vI_ps, vtale_0);
vsumI = v_add(vsumI, vI_tale);
// sumW
vsumW += vtale_0;
vsumW = v_add(vsumW, vtale_0);
// sumDI
vsumDI += vI_ps * vI_tale;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1))));
v_expand(v01, t1, t0);
@ -890,29 +890,29 @@ public:
fx = v_cvt_f32(t1);
// A11 - A22
fxtale = fx * vtale_1;
fytale = fy * vtale_1;
fxtale = v_mul(fx, vtale_1);
fytale = v_mul(fy, vtale_1);
vAyy = v_muladd(fy, fytale, vAyy);
vAxy = v_muladd(fx, fytale, vAxy);
vAxx = v_muladd(fx, fxtale, vAxx);
// sumIx und sumIy
vsumIx += fxtale;
vsumIy += fytale;
vsumIx = v_add(vsumIx, fxtale);
vsumIy = v_add(vsumIy, fytale);
vsumW1 += vI_ps * fxtale;
vsumW2 += vI_ps * fytale;
vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale));
vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale));
// sumI
vI_tale = vI_ps * vtale_1;
vsumI += vI_tale;
vI_tale = v_mul(vI_ps, vtale_1);
vsumI = v_add(vsumI, vI_tale);
// sumW
vsumW += vtale_1;
vsumW = v_add(vsumW, vtale_1);
// sumDI
vsumDI += vI_ps * vI_tale;
vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale));
}
}
#else
@ -1017,7 +1017,7 @@ public:
}
#if CV_SIMD128
float CV_DECL_ALIGNED(16) bbuf[4];
v_store_aligned(bbuf, vqb0 + vqb1);
v_store_aligned(bbuf, v_add(vqb0, vqb1));
b1 = bbuf[0] + bbuf[2];
b2 = bbuf[1] + bbuf[3];
b3 = v_reduce_sum(vqb2);

@ -71,15 +71,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
for (; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2)
{
v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32;
v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32;
v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32);
v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32);
if (x + 4 > winSize.width)
{
vmask0 = vmask0 & vmask_border_0;
vmask0 = v_and(vmask0, vmask_border_0);
}
if (x + 8 > winSize.width)
{
vmask1 = vmask1 & vmask_border_1;
vmask1 = v_and(vmask1, vmask_border_1);
}
v_int32x4 t0, t1;
@ -91,10 +91,10 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5) & vmask0;
t1 = t1 >> (W_BITS - 5) & vmask1;
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_and(v_shr(t0, W_BITS - 5), vmask0);
t1 = v_and(v_shr(t1, W_BITS - 5), vmask1);
v_store(Iptr + x, v_pack(t0, t1));
v00 = v_reinterpret_as_s16(v_load(dsrc));
@ -105,12 +105,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS);
t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask0);
v00 = v_and(v00, v_reinterpret_as_s16(vmask0));
v_store(dIptr, v00);
v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2));
@ -121,12 +121,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS);
t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask1);
v00 = v_and(v00, v_reinterpret_as_s16(vmask1));
v_store(dIptr + 4 * 2, v00);
}
#else
@ -187,15 +187,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
#if CV_SIMD128
for (int x = 0; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2)
{
v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32;
v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32;
v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32);
v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32);
if (x + 4 > winSize.width)
{
vmask0 = vmask0 & vmask_border0;
vmask0 = v_and(vmask0, vmask_border0);
}
if (x + 8 > winSize.width)
{
vmask1 = vmask1 & vmask_border1;
vmask1 = v_and(vmask1, vmask_border1);
}
v_int32x4 t0, t1;
@ -207,12 +207,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1);
t0 = t0 >> (W_BITS - 5);
t1 = t1 >> (W_BITS - 5);
t0 = t0 & vmask0;
t1 = t1 & vmask1;
t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS - 5);
t1 = v_shr(t1, W_BITS - 5);
t0 = v_and(t0, vmask0);
t1 = v_and(t1, vmask1);
v_store(Iptr + x, v_pack(t0, t1));
v00 = v_reinterpret_as_s16(v_load(dsrc));
@ -223,12 +223,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS);
t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask0);
v00 = v_and(v00, v_reinterpret_as_s16(vmask0));
v_store(dIptr, v00);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
@ -249,12 +249,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11,
v_zip(v00, v01, t00, t01);
v_zip(v10, v11, t10, t11);
t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1);
t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1);
t0 = t0 >> W_BITS;
t1 = t1 >> W_BITS;
t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1));
t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1));
t0 = v_shr(t0, W_BITS);
t1 = v_shr(t1, W_BITS);
v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
v00 = v00 & v_reinterpret_as_s16(vmask1);
v00 = v_and(v00, v_reinterpret_as_s16(vmask1));
v_store(dIptr + 4 * 2, v00);
v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));

@ -52,8 +52,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
v_int16x8 t1 = s2 - s0;
v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10);
v_int16x8 t1 = v_sub(s2, s0);
v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10));
v_store(trow0 + x, t0);
v_store(trow1 + x, t1);
@ -90,8 +90,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
v_int16x8 s3 = v_load(trow1 + x);
v_int16x8 s4 = v_load(trow1 + x + cn);
v_int16x8 t0 = s1 - s0;
v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10);
v_int16x8 t0 = v_sub(s1, s0);
v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10));
v_store_interleave((drow + x * 2), t0, t1);
}
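
The rlof_localflow.cpp hunk above applies the same substitution to the Scharr derivative rows, where v_mul_wrap was already a named call and only the surrounding additions change. A sketch under the same CV_SIMD128 assumption; the helper name and the constant setup are hypothetical:

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Old API:  v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10)
static inline cv::v_int16x8 scharr_row(const cv::v_int16x8& s0,
                                       const cv::v_int16x8& s1,
                                       const cv::v_int16x8& s2)
{
    const cv::v_int16x8 c3 = cv::v_setall_s16(3), c10 = cv::v_setall_s16(10);
    return cv::v_add(cv::v_mul_wrap(cv::v_add(s0, s2), c3),
                     cv::v_mul_wrap(s1, c10));
}
#endif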

@ -194,21 +194,21 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
{
// tx, ty, tz = floor(p)
v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0();
float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0();
float tz = v_get0(t);
int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
int ix = ip.get0();
int ix = v_get0(ip);
ip = v_rotate_right<1>(ip);
int iy = ip.get0();
int iy = v_get0(ip);
ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int iz = v_get0(ip);
int coordBase = ix * xdim + iy * ydim + iz * zdim;
@ -218,15 +218,15 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6]));
v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7]));
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
return v0 + tx * (v1 - v0);
}
@ -276,27 +276,27 @@ inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const
inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
{
if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) ||
v_check_any(p >= v_float32x4((float)(volResolution.x - 2),
if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) ||
v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2),
(float)(volResolution.y - 2),
(float)(volResolution.z - 2), 1.f))
(float)(volResolution.z - 2), 1.f)))
)
return nanv;
v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0();
float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0();
float tz = v_get0(t);
const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
int ix = ip.get0(); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = v_get0(ip);
int coordBase = ix * xdim + iy * ydim + iz * zdim;
@ -314,23 +314,23 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) co
v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]);
v_float32x4 v1357(vx[1], vx[3], vx[5], vx[7]);
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
nv = v0 + tx * (v1 - v0);
}
v_float32x4 n = v_load_aligned(an);
v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n * n)));
v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n))));
return Norm.get0() < 0.0001f ? nanv : n / Norm;
return v_get0(Norm) < 0.0001f ? nanv : v_div(n, Norm);
}
#else
inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& p) const
@ -388,15 +388,15 @@ inline float ColoredTSDFVolumeCPU::interpolateColor(float tx, float ty, float tz
v_float32x4 v0246, v1357;
v_load_deinterleave(vx, v0246, v1357);
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
return v0 + tx * (v1 - v0);
}
@ -427,10 +427,10 @@ inline Point3f ColoredTSDFVolumeCPU::getColorVoxel(const Point3f& _p) const
}
inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) const
{
if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) ||
v_check_any(p >= v_float32x4((float)(volResolution.x - 2),
if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) ||
v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2),
(float)(volResolution.y - 2),
(float)(volResolution.z - 2), 1.f))
(float)(volResolution.z - 2), 1.f)))
)
return nanv;
@ -439,9 +439,9 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con
const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const RGBTsdfVoxel* volData = volume.ptr<RGBTsdfVoxel>();
int ix = ip.get0(); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = v_get0(ip);
int coordBase = ix * xdim + iy * ydim + iz * zdim;
float CV_DECL_ALIGNED(16) rgb[4];
@ -456,12 +456,12 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con
}
v_float32x4 vsi(voxelSizeInv, voxelSizeInv, voxelSizeInv, voxelSizeInv);
v_float32x4 ptVox = p * vsi;
v_float32x4 ptVox = v_mul(p, vsi);
v_int32x4 iptVox = v_floor(ptVox);
v_float32x4 t = ptVox - v_cvt_f32(iptVox);
float tx = t.get0(); t = v_rotate_right<1>(t);
float ty = t.get0(); t = v_rotate_right<1>(t);
float tz = t.get0();
v_float32x4 t = v_sub(ptVox, v_cvt_f32(iptVox));
float tx = v_get0(t); t = v_rotate_right<1>(t);
float ty = v_get0(t); t = v_rotate_right<1>(t);
float tz = v_get0(t);
rgb[0] = interpolateColor(tx, ty, tz, r);
rgb[1] = interpolateColor(tx, ty, tz, g);
rgb[2] = interpolateColor(tx, ty, tz, b);
@ -583,21 +583,21 @@ struct ColorRaycastInvoker : ParallelLoopBody
// get direction through pixel in volume space:
// 1. reproject (x, y) on projecting plane where z = 1.f
v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy) * vfxy;
v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy);
planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f));
// 2. rotate to volume space
planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32());
// 3. normalize
v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed * planed)));
v_float32x4 dir = planed * invNorm;
v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed))));
v_float32x4 dir = v_mul(planed, invNorm);
// compute intersection of ray with all six bbox planes
v_float32x4 rayinv = v_setall_f32(1.f) / dir;
v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir);
// div by zero should be eliminated by these products
v_float32x4 tbottom = rayinv * (boxDown - orig);
v_float32x4 ttop = rayinv * (boxUp - orig);
v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig));
v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig));
// re-order intersections to find smallest and largest on each axis
v_float32x4 minAx = v_min(ttop, tbottom);
@ -618,14 +618,14 @@ struct ColorRaycastInvoker : ParallelLoopBody
if (tmin < tmax)
{
// interpolation optimized a little
orig *= invVoxelSize;
dir *= invVoxelSize;
orig = v_mul(orig, invVoxelSize);
dir = v_mul(dir, invVoxelSize);
int xdim = volume.volDims[0];
int ydim = volume.volDims[1];
int zdim = volume.volDims[2];
v_float32x4 rayStep = dir * v_setall_f32(tstep);
v_float32x4 next = (orig + dir * v_setall_f32(tmin));
v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep));
v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin))));
float f = volume.interpolateVoxel(next), fnext = f;
//raymarch
@ -633,11 +633,11 @@ struct ColorRaycastInvoker : ParallelLoopBody
int nSteps = cvFloor((tmax - tmin) / tstep);
for (; steps < nSteps; steps++)
{
next += rayStep;
next = v_add(next, rayStep);
v_int32x4 ip = v_round(next);
int ix = ip.get0(); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = v_get0(ip);
int coord = ix * xdim + iy * ydim + iz * zdim;
fnext = tsdfToFloat(volume.volume.at<RGBTsdfVoxel>(coord).tsdf);
@ -657,7 +657,7 @@ struct ColorRaycastInvoker : ParallelLoopBody
// linearly interpolate t between two f values
if (f > 0.f && fnext < 0.f)
{
v_float32x4 tp = next - rayStep;
v_float32x4 tp = v_sub(next, rayStep);
float ft = volume.interpolateVoxel(tp);
float ftdt = volume.interpolateVoxel(next);
float ts = tmin + tstep * (steps - ft / (ftdt - ft));
@ -665,7 +665,7 @@ struct ColorRaycastInvoker : ParallelLoopBody
// avoid division by zero
if (!cvIsNaN(ts) && !cvIsInf(ts))
{
v_float32x4 pv = (orig + dir * v_setall_f32(ts));
v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts))));
v_float32x4 nv = volume.getNormalVoxel(pv);
v_float32x4 cv = volume.getColorVoxel(pv);
@ -675,9 +675,7 @@ struct ColorRaycastInvoker : ParallelLoopBody
//convert pv and nv to camera space
normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32());
// interpolation optimized a little
point = v_matmuladd(pv * v_float32x4(volume.voxelSize,
volume.voxelSize,
volume.voxelSize, 1.f),
point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)),
volRot0, volRot1, volRot2, volTrans);
}
}
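
Beyond the operator substitutions, the rgbd hunks above replace the member accessor .get0() with the free function v_get0() and vector division with v_div(). A sketch of the normalisation idiom used in getNormalVoxel, under the same CV_SIMD128 assumption; the helper name is hypothetical:

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
// Old API:  Norm = v_sqrt(v_setall_f32(v_reduce_sum(n * n)));
//           return Norm.get0() < 0.0001f ? nanv : n / Norm;
static inline cv::v_float32x4 normalize_or_nan(const cv::v_float32x4& n,
                                               const cv::v_float32x4& nanv)
{
    cv::v_float32x4 Norm = cv::v_sqrt(cv::v_setall_f32(cv::v_reduce_sum(cv::v_mul(n, n))));
    return cv::v_get0(Norm) < 0.0001f ? nanv : cv::v_div(n, Norm);
}
#endif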

@ -138,7 +138,7 @@ bool ICPImpl::estimateTransformT(cv::Affine3f& transform,
#if USE_INTRINSICS
static inline bool fastCheck(const v_float32x4& p0, const v_float32x4& p1)
{
float check = (p0.get0() + p1.get0());
float check = (v_get0(p0) + v_get0(p1));
return !cvIsNaN(check);
}
@ -160,7 +160,7 @@ static inline v_float32x4 crossProduct(const v_float32x4& a, const v_float32x4&
v_float32x4 ayzx, azxy, byzx, bzxy;
getCrossPerm(a, ayzx, azxy);
getCrossPerm(b, byzx, bzxy);
return ayzx*bzxy - azxy*byzx;
return v_sub(v_mul(ayzx, bzxy), v_mul(azxy, byzx));
}
#else
static inline bool fastCheck(const Point3f& p)
@ -235,11 +235,11 @@ struct GetAbInvoker : ParallelLoopBody
//find correspondence by projecting the point
v_float32x4 oldCoords;
float pz = (v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP))).get0());
float pz = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP))));
// x, y, 0, 0
oldCoords = v_muladd(newP/v_setall_f32(pz), vfxy, vcxy);
oldCoords = v_muladd(v_div(newP, v_setall_f32(pz)), vfxy, vcxy);
if(!v_check_all((oldCoords >= v_setzero_f32()) & (oldCoords < vframe)))
if(!v_check_all(v_and(v_ge(oldCoords, v_setzero_f32()), v_lt(oldCoords, vframe))))
continue;
// bilinearly interpolate oldPts and oldNrm under oldCoords point
@ -247,12 +247,12 @@ struct GetAbInvoker : ParallelLoopBody
v_float32x4 oldN;
{
v_int32x4 ixy = v_floor(oldCoords);
v_float32x4 txy = oldCoords - v_cvt_f32(ixy);
int xi = ixy.get0();
int yi = v_rotate_right<1>(ixy).get0();
v_float32x4 tx = v_setall_f32(txy.get0());
v_float32x4 txy = v_sub(oldCoords, v_cvt_f32(ixy));
int xi = v_get0(ixy);
int yi = v_get0(v_rotate_right<1>(ixy));
v_float32x4 tx = v_setall_f32(v_get0(txy));
txy = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(txy)));
v_float32x4 ty = v_setall_f32(txy.get0());
v_float32x4 ty = v_setall_f32(v_get0(txy));
const float* prow0 = (const float*)oldPts[yi+0];
const float* prow1 = (const float*)oldPts[yi+1];
@ -275,23 +275,23 @@ struct GetAbInvoker : ParallelLoopBody
// NaN check is done later
v_float32x4 p0 = p00 + tx*(p01 - p00);
v_float32x4 p1 = p10 + tx*(p11 - p10);
oldP = p0 + ty*(p1 - p0);
v_float32x4 p0 = v_add(p00, v_mul(tx, v_sub(p01, p00)));
v_float32x4 p1 = v_add(p10, v_mul(tx, v_sub(p11, p10)));
oldP = v_add(p0, v_mul(ty, v_sub(p1, p0)));
v_float32x4 n0 = n00 + tx*(n01 - n00);
v_float32x4 n1 = n10 + tx*(n11 - n10);
oldN = n0 + ty*(n1 - n0);
v_float32x4 n0 = v_add(n00, v_mul(tx, v_sub(n01, n00)));
v_float32x4 n1 = v_add(n10, v_mul(tx, v_sub(n11, n10)));
oldN = v_add(n0, v_mul(ty, v_sub(n1, n0)));
}
bool oldPNcheck = fastCheck(oldP, oldN);
//filter by distance
v_float32x4 diff = newP - oldP;
bool distCheck = !(v_reduce_sum(diff*diff) > sqThresh);
v_float32x4 diff = v_sub(newP, oldP);
bool distCheck = !(v_reduce_sum(v_mul(diff, diff)) > sqThresh);
//filter by angle
bool angleCheck = !(abs(v_reduce_sum(newN*oldN)) < cosThresh);
bool angleCheck = !(abs(v_reduce_sum(v_mul(newN, oldN))) < cosThresh);
if(!(oldPNcheck && distCheck && angleCheck))
continue;
@ -299,17 +299,17 @@ struct GetAbInvoker : ParallelLoopBody
// build point-wise vector ab = [ A | b ]
v_float32x4 VxNv = crossProduct(newP, oldN);
Point3f VxN;
VxN.x = VxNv.get0();
VxN.y = v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0();
VxN.z = v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0();
VxN.x = v_get0(VxNv);
VxN.y = v_get0(v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32())));
VxN.z = v_get0(v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32())));
float dotp = -v_reduce_sum(oldN*diff);
float dotp = -v_reduce_sum(v_mul(oldN, diff));
// build point-wise upper-triangle matrix [ab^T * ab] w/o last row
// which is [A^T*A | A^T*b]
// and gather sum
v_float32x4 vd = VxNv | v_float32x4(0, 0, 0, dotp);
v_float32x4 vd = v_or(VxNv, v_float32x4(0, 0, 0, dotp));
v_float32x4 n = oldN;
v_float32x4 nyzx;
{

@ -425,15 +425,15 @@ inline float interpolate(float tx, float ty, float tz, float vx[8])
v_float32x4 v0246, v1357;
v_load_deinterleave(vx, v0246, v1357);
v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
return v0 + tx * (v1 - v0);
}
@ -598,9 +598,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const
v_float32x8 czp = v_lut(vals, idxzp);
v_float32x8 czn = v_lut(vals, idxzn);
v_float32x8 vcxv = cxn - cxp;
v_float32x8 vcyv = cyn - cyp;
v_float32x8 vczv = czn - czp;
v_float32x8 vcxv = v_sub(cxn, cxp);
v_float32x8 vcyv = v_sub(cyn, cyp);
v_float32x8 vczv = v_sub(czn, czp);
v_store(cxv, vcxv);
v_store(cyv, vcyv);
@ -615,9 +615,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const
v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4);
v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4);
v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1;
v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1;
v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1;
v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1);
v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1);
v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1);
v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1);
v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1);
@ -1523,9 +1523,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const
v_float32x8 czp = v_lut(vals, idxzp);
v_float32x8 czn = v_lut(vals, idxzn);
v_float32x8 vcxv = cxn - cxp;
v_float32x8 vcyv = cyn - cyp;
v_float32x8 vczv = czn - czp;
v_float32x8 vcxv = v_sub(cxn, cxp);
v_float32x8 vcyv = v_sub(cyn, cyp);
v_float32x8 vczv = v_sub(czn, czp);
v_store(cxv, vcxv);
v_store(cyv, vcyv);
@ -1540,9 +1540,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const
v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4);
v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4);
v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1;
v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1;
v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1;
v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1);
v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1);
v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1);
v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1);
v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1);

@ -146,21 +146,21 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
{
// tx, ty, tz = floor(p)
v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0();
float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0();
float tz = v_get0(t);
int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
int ix = ip.get0();
int ix = v_get0(ip);
ip = v_rotate_right<1>(ip);
int iy = ip.get0();
int iy = v_get0(ip);
ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int iz = v_get0(ip);
int coordBase = ix*xdim + iy*ydim + iz*zdim;
@ -170,15 +170,15 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const
v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6]));
v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7]));
v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
return v0 + tx*(v1 - v0);
}
@ -228,27 +228,27 @@ inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const
inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
{
if(v_check_any (p < v_float32x4(1.f, 1.f, 1.f, 0.f)) ||
v_check_any (p >= v_float32x4((float)(volResolution.x-2),
if(v_check_any (v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) ||
v_check_any (v_ge(p, v_float32x4((float)(volResolution.x-2),
(float)(volResolution.y-2),
(float)(volResolution.z-2), 1.f))
(float)(volResolution.z-2), 1.f)))
)
return nanv;
v_int32x4 ip = v_floor(p);
v_float32x4 t = p - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(p, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float ty = t.get0();
float ty = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
float tz = t.get0();
float tz = v_get0(t);
const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2];
const TsdfVoxel* volData = volume.ptr<TsdfVoxel>();
int ix = ip.get0(); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = v_get0(ip);
int coordBase = ix*xdim + iy*ydim + iz*zdim;
@ -266,23 +266,23 @@ inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const
v_float32x4 v0246 (vx[0], vx[2], vx[4], vx[6]);
v_float32x4 v1357 (vx[1], vx[3], vx[5], vx[7]);
v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246);
v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));
v_float32x4 v00_10 = vxx;
v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10);
float v0 = v0_1.get0();
v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));
float v0 = v_get0(v0_1);
v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
float v1 = v0_1.get0();
float v1 = v_get0(v0_1);
nv = v0 + tx*(v1 - v0);
}
v_float32x4 n = v_load_aligned(an);
v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n*n)));
v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n))));
return Norm.get0() < 0.0001f ? nanv : n/Norm;
return v_get0(Norm) < 0.0001f ? nanv : v_div(n, Norm);
}
#else
inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& p) const
@ -394,21 +394,21 @@ struct RaycastInvoker : ParallelLoopBody
// get direction through pixel in volume space:
// 1. reproject (x, y) on projecting plane where z = 1.f
v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy)*vfxy;
v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy);
planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f));
// 2. rotate to volume space
planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32());
// 3. normalize
v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed*planed)));
v_float32x4 dir = planed*invNorm;
v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed))));
v_float32x4 dir = v_mul(planed, invNorm);
// compute intersection of ray with all six bbox planes
v_float32x4 rayinv = v_setall_f32(1.f)/dir;
v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir);
// div by zero should be eliminated by these products
v_float32x4 tbottom = rayinv*(boxDown - orig);
v_float32x4 ttop = rayinv*(boxUp - orig);
v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig));
v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig));
// re-order intersections to find smallest and largest on each axis
v_float32x4 minAx = v_min(ttop, tbottom);
@ -429,14 +429,14 @@ struct RaycastInvoker : ParallelLoopBody
if(tmin < tmax)
{
// interpolation optimized a little
orig *= invVoxelSize;
dir *= invVoxelSize;
orig = v_mul(orig, invVoxelSize);
dir = v_mul(dir, invVoxelSize);
int xdim = volume.volDims[0];
int ydim = volume.volDims[1];
int zdim = volume.volDims[2];
v_float32x4 rayStep = dir * v_setall_f32(tstep);
v_float32x4 next = (orig + dir * v_setall_f32(tmin));
v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep));
v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin))));
float f = volume.interpolateVoxel(next), fnext = f;
//raymarch
@ -444,11 +444,11 @@ struct RaycastInvoker : ParallelLoopBody
int nSteps = cvFloor((tmax - tmin)/tstep);
for(; steps < nSteps; steps++)
{
next += rayStep;
next = v_add(next, rayStep);
v_int32x4 ip = v_round(next);
int ix = ip.get0(); ip = v_rotate_right<1>(ip);
int iy = ip.get0(); ip = v_rotate_right<1>(ip);
int iz = ip.get0();
int ix = v_get0(ip); ip = v_rotate_right<1>(ip);
int iy = v_get0(ip); ip = v_rotate_right<1>(ip);
int iz = v_get0(ip);
int coord = ix*xdim + iy*ydim + iz*zdim;
fnext = tsdfToFloat(volume.volume.at<TsdfVoxel>(coord).tsdf);
@ -468,7 +468,7 @@ struct RaycastInvoker : ParallelLoopBody
// linearly interpolate t between two f values
if(f > 0.f && fnext < 0.f)
{
v_float32x4 tp = next - rayStep;
v_float32x4 tp = v_sub(next, rayStep);
float ft = volume.interpolateVoxel(tp);
float ftdt = volume.interpolateVoxel(next);
float ts = tmin + tstep*(steps - ft/(ftdt - ft));
@ -476,7 +476,7 @@ struct RaycastInvoker : ParallelLoopBody
// avoid division by zero
if(!cvIsNaN(ts) && !cvIsInf(ts))
{
v_float32x4 pv = (orig + dir*v_setall_f32(ts));
v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts))));
v_float32x4 nv = volume.getNormalVoxel(pv);
if(!isNaN(nv))
@ -484,9 +484,7 @@ struct RaycastInvoker : ParallelLoopBody
//convert pv and nv to camera space
normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32());
// interpolation optimized a little
point = v_matmuladd(pv*v_float32x4(volume.voxelSize,
volume.voxelSize,
volume.voxelSize, 1.f),
point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)),
volRot0, volRot1, volRot2, volTrans);
}
}

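The interpolateVoxel/getNormalVoxel hunks above also switch lane access from the .get0() member to the free function v_get0(); the next lane is reached by rotating the vector. A condensed, self-contained sketch of that idiom, assuming CV_SIMD128; splitXYZ is an illustrative name only:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// Extract the first three lanes of a 4-float vector, as the TSDF code does for
// the fractional coordinates tx/ty/tz and the integer indices ix/iy/iz.
static void splitXYZ(const v_float32x4& p, float& x, float& y, float& z)
{
    v_float32x4 t = p;
    x = v_get0(t);
    t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
    y = v_get0(t);
    t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
    z = v_get0(t);
}
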
@ -235,35 +235,33 @@ void integrateVolumeUnit(
// optimization of the following:
//Point3f volPt = Point3f(x, y, z)*voxelSize;
//Point3f camSpacePt = vol2cam * volPt;
camSpacePt += zStep;
camSpacePt = v_add(camSpacePt, zStep);
float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0();
float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))));
if (zCamSpace <= 0.f)
continue;
v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace);
v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace));
v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy);
// leave only first 2 lanes
projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) &
v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0)));
depthType v;
// bilinearly interpolate depth at projected
{
const v_float32x4& pt = projected;
// check coords >= 0 and < imgSize
v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) |
v_reinterpret_as_u32(pt >= upLimits);
limits = limits | v_rotate_right<1>(limits);
if (limits.get0())
v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits)));
limits = v_or(limits, v_rotate_right<1>(limits));
if (v_get0(limits))
continue;
// xi, yi = floor(pt)
v_int32x4 ip = v_floor(pt);
v_int32x4 ipshift = ip;
int xi = ipshift.get0();
int xi = v_get0(ipshift);
ipshift = v_rotate_right<1>(ipshift);
int yi = ipshift.get0();
int yi = v_get0(ipshift);
const depthType* row0 = depth[yi + 0];
const depthType* row1 = depth[yi + 1];
@ -277,17 +275,17 @@ void integrateVolumeUnit(
// assume correct depth is positive
// don't fix missing data
if (v_check_all(vall > v_setzero_f32()))
if (v_check_all(v_gt(vall, v_setzero_f32())))
{
v_float32x4 t = pt - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(pt, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
v_float32x4 ty = v_setall_f32(t.get0());
v_float32x4 ty = v_setall_f32(v_get0(t));
// vx is y-interpolated between rows 0 and 1
v_float32x4 vx = v001 + ty * (v101 - v001);
float v0 = vx.get0();
v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001)));
float v0 = v_get0(vx);
vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx)));
float v1 = vx.get0();
float v1 = v_get0(vx);
v = v0 + tx * (v1 - v0);
}
else
@ -295,8 +293,8 @@ void integrateVolumeUnit(
}
// norm(camPixVec) produces double which is too slow
int _u = (int)projected.get0();
int _v = (int)v_rotate_right<1>(projected).get0();
int _u = (int)v_get0(projected);
int _v = (int)v_get0(v_rotate_right<1>(projected));
if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows))
continue;
float pixNorm = pixNorms.at<float>(_v, _u);
@ -500,35 +498,33 @@ void integrateRGBVolumeUnit(
// optimization of the following:
//Point3f volPt = Point3f(x, y, z)*voxelSize;
//Point3f camSpacePt = vol2cam * volPt;
camSpacePt += zStep;
camSpacePt = v_add(camSpacePt, zStep);
float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0();
float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))));
if (zCamSpace <= 0.f)
continue;
v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace);
v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace));
v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy);
// leave only first 2 lanes
projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) &
v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0)));
depthType v;
// bilinearly interpolate depth at projected
{
const v_float32x4& pt = projected;
// check coords >= 0 and < imgSize
v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) |
v_reinterpret_as_u32(pt >= upLimits);
limits = limits | v_rotate_right<1>(limits);
if (limits.get0())
v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits)));
limits = v_or(limits,v_rotate_right<1>(limits));
if (v_get0(limits))
continue;
// xi, yi = floor(pt)
v_int32x4 ip = v_floor(pt);
v_int32x4 ipshift = ip;
int xi = ipshift.get0();
int xi = v_get0(ipshift);
ipshift = v_rotate_right<1>(ipshift);
int yi = ipshift.get0();
int yi = v_get0(ipshift);
const depthType* row0 = depth[yi + 0];
const depthType* row1 = depth[yi + 1];
@ -542,17 +538,17 @@ void integrateRGBVolumeUnit(
// assume correct depth is positive
// don't fix missing data
if (v_check_all(vall > v_setzero_f32()))
if (v_check_all(v_gt(vall, v_setzero_f32())))
{
v_float32x4 t = pt - v_cvt_f32(ip);
float tx = t.get0();
v_float32x4 t = v_sub(pt, v_cvt_f32(ip));
float tx = v_get0(t);
t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t)));
v_float32x4 ty = v_setall_f32(t.get0());
v_float32x4 ty = v_setall_f32(v_get0(t));
// vx is y-interpolated between rows 0 and 1
v_float32x4 vx = v001 + ty * (v101 - v001);
float v0 = vx.get0();
v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001)));
float v0 = v_get0(vx);
vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx)));
float v1 = vx.get0();
float v1 = v_get0(vx);
v = v0 + tx * (v1 - v0);
}
else
@ -561,14 +557,13 @@ void integrateRGBVolumeUnit(
v_float32x4 projectedRGB = v_muladd(camPixVec, rgb_vfxy, rgb_vcxy);
// leave only first 2 lanes
projectedRGB = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) &
v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0));
projectedRGB = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)));
// norm(camPixVec) produces double which is too slow
int _u = (int)projected.get0();
int _v = (int)v_rotate_right<1>(projected).get0();
int rgb_u = (int)projectedRGB.get0();
int rgb_v = (int)v_rotate_right<1>(projectedRGB).get0();
int _u = (int)v_get0(projected);
int _v = (int)v_get0(v_rotate_right<1>(projected));
int rgb_u = (int)v_get0(projectedRGB);
int rgb_v = (int)v_get0(v_rotate_right<1>(projectedRGB));
if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows &&
rgb_v >= 0 && rgb_v < color.rows && rgb_u >= 0 && rgb_u < color.cols))

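The bilinear depth fetch in both integrate functions reduces to lane-wise lerps that are now spelled with v_add/v_mul/v_sub. A short sketch of that lerp, assuming CV_SIMD128; lerp4 is an illustrative name, and v_muladd would be an equivalent, slightly shorter spelling:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// a + t * (b - a), lane-wise
static v_float32x4 lerp4(const v_float32x4& a, const v_float32x4& b, float t)
{
    // old API: a + v_setall_f32(t) * (b - a)
    return v_add(a, v_mul(v_setall_f32(t), v_sub(b, a)));
    // equivalently: return v_muladd(v_setall_f32(t), v_sub(b, a), a);
}
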
@ -20,7 +20,7 @@ namespace kinfu
inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num)
{
v_float32x4 num128 = v_setall_f32(-1.f / 128.f);
return v_cvt_f32(num) * num128;
return v_mul(v_cvt_f32(num), num128);
}
#endif

@ -68,7 +68,7 @@ inline bool isNaN(cv::Point3f p)
#if USE_INTRINSICS
static inline bool isNaN(const cv::v_float32x4& p)
{
return cv::v_check_any(p != p);
return cv::v_check_any(v_ne(p, p));
}
#endif

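The isNaN overload above keeps the usual NaN != NaN trick, now written with v_ne and v_check_any. A tiny self-contained check, assuming CV_SIMD128; demoNanCheck is an illustrative name:

#include <opencv2/core/hal/intrin.hpp>
#include <cmath>
using namespace cv;

static bool demoNanCheck()
{
    v_float32x4 ok (0.f, 1.f, 2.f, 3.f);
    v_float32x4 bad(0.f, std::nanf(""), 2.f, 3.f);
    // v_ne yields an all-ones lane wherever the value is NaN
    return !v_check_any(v_ne(ok, ok)) && v_check_any(v_ne(bad, bad)); // true
}
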
@ -74,8 +74,8 @@ inline v_uint8x16 v_finalize_pix_ch(const v_int16x8& c0, const v_int16x8& c1,
v_expand_f32(c0, f0, f1);
v_expand_f32(c1, f2, f3);
v_int16x8 d0 = v_pack(v_round(s0*alpha + f0), v_round(s1*alpha + f1));
v_int16x8 d1 = v_pack(v_round(s2*alpha + f2), v_round(s3*alpha + f3));
v_int16x8 d0 = v_pack(v_round(v_add(v_mul(s0, alpha), f0)), v_round(v_add(v_mul(s1, alpha), f1)));
v_int16x8 d1 = v_pack(v_round(v_add(v_mul(s2, alpha), f2)), v_round(v_add(v_mul(s3, alpha), f3)));
return v_pack_u(d0, d1);
}
@ -135,12 +135,12 @@ public:
v_expand_s(p1, p10, p11);
v_expand_s(p2, p20, p21);
v_int16x8 d00 = p00 - c00, d01 = p01 - c01;
v_int16x8 d10 = p10 - c10, d11 = p11 - c11;
v_int16x8 d20 = p20 - c20, d21 = p21 - c21;
v_int16x8 d00 = v_sub(p00, c00), d01 = v_sub(p01, c01);
v_int16x8 d10 = v_sub(p10, c10), d11 = v_sub(p11, c11);
v_int16x8 d20 = v_sub(p20, c20), d21 = v_sub(p21, c21);
v_uint16x8 n0 = v_abs(d00) + v_abs(d10) + v_abs(d20);
v_uint16x8 n1 = v_abs(d01) + v_abs(d11) + v_abs(d21);
v_uint16x8 n0 = v_add(v_add(v_abs(d00), v_abs(d10)), v_abs(d20));
v_uint16x8 n1 = v_add(v_add(v_abs(d01), v_abs(d11)), v_abs(d21));
ushort CV_DECL_ALIGNED(16) nbuf[16];
v_store(nbuf, n0);
@ -153,13 +153,13 @@ public:
v_expand_f32(d00, fd0, fd1);
v_expand_f32(d01, fd2, fd3);
s00 += fd0*w0; s01 += fd1*w1; s02 += fd2*w2; s03 += fd3*w3;
s00 = v_add(s00, v_mul(fd0, w0)); s01 = v_add(s01, v_mul(fd1, w1)); s02 = v_add(s02, v_mul(fd2, w2)); s03 = v_add(s03, v_mul(fd3, w3));
v_expand_f32(d10, fd0, fd1);
v_expand_f32(d11, fd2, fd3);
s10 += fd0*w0; s11 += fd1*w1; s12 += fd2*w2; s13 += fd3*w3;
s10 = v_add(s10, v_mul(fd0, w0)); s11 = v_add(s11, v_mul(fd1, w1)); s12 = v_add(s12, v_mul(fd2, w2)); s13 = v_add(s13, v_mul(fd3, w3));
v_expand_f32(d20, fd0, fd1);
v_expand_f32(d21, fd2, fd3);
s20 += fd0*w0; s21 += fd1*w1; s22 += fd2*w2; s23 += fd3*w3;
s20 = v_add(s20, v_mul(fd0, w0)); s21 = v_add(s21, v_mul(fd1, w1)); s22 = v_add(s22, v_mul(fd2, w2)); s23 = v_add(s23, v_mul(fd3, w3));
}
c0 = v_finalize_pix_ch(c00, c01, s00, s01, s02, s03, v_alpha);

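In the anisodiff changes above, v_finalize_pix_ch rounds the float accumulators, packs them to int16 and finally saturates to uint8; only the arithmetic spelling changes, the packing chain stays the same. A stripped-down sketch of that chain, assuming CV_SIMD128; packFloatsToU8 is an illustrative name:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static v_uint8x16 packFloatsToU8(const v_float32x4& f0, const v_float32x4& f1,
                                 const v_float32x4& f2, const v_float32x4& f3)
{
    v_int16x8 lo = v_pack(v_round(f0), v_round(f1));  // 8 x int16
    v_int16x8 hi = v_pack(v_round(f2), v_round(f3));  // 8 x int16
    return v_pack_u(lo, hi);                          // 16 x uint8, saturating
}
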
@ -303,15 +303,15 @@ void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i)
v_float32x4 aux0,aux1,aux2,aux3;
#define PROC4(Chor_in,cur_in,coef_prev_in,interD_prev_in,cur_prev_in,interD_out,cur_out,coef_cur_out)\
coef_cur_out = lambda_reg*Chor_in;\
aux0 = interD_prev_in*coef_prev_in;\
aux1 = coef_cur_out+coef_prev_in;\
aux1 = one_reg-aux1;\
aux0 = aux1-aux0;\
interD_out = coef_cur_out/aux0;\
aux1 = cur_prev_in*coef_prev_in;\
aux1 = cur_in - aux1;\
cur_out = aux1/aux0;
coef_cur_out = v_mul(lambda_reg, Chor_in);\
aux0 = v_mul(interD_prev_in, coef_prev_in);\
aux1 = v_add(coef_cur_out, coef_prev_in);\
aux1 = v_sub(one_reg, aux1);\
aux0 = v_sub(aux1, aux0);\
interD_out = v_div(coef_cur_out, aux0);\
aux1 = v_mul(cur_prev_in, coef_prev_in);\
aux1 = v_sub(cur_in, aux1);\
cur_out = v_div(aux1, aux0);
for(;j<w-3;j+=4)
{
@ -406,14 +406,14 @@ void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i)
aux3 = v_load(cur_row_next3+j);
v_transpose4x4(aux0,aux1,aux2,aux3,b0,b1,b2,b3);
aux0 = a3*cur_next_reg;
b3 = b3-aux0;
aux0 = a2*b3;
b2 = b2-aux0;
aux0 = a1*b2;
b1 = b1-aux0;
aux0 = a0*b1;
b0 = b0-aux0;
aux0 = v_mul(a3, cur_next_reg);
b3 = v_sub(b3, aux0);
aux0 = v_mul(a2, b3);
b2 = v_sub(b2, aux0);
aux0 = v_mul(a1, b2);
b1 = v_sub(b1, aux0);
aux0 = v_mul(a0, b1);
b0 = v_sub(b0, aux0);
cur_next_reg = b0;
@ -522,24 +522,24 @@ void FastGlobalSmootherFilterImpl::VerticalPass_ParBody::operator()(const Range&
{
a = v_load(Cvert_row_prev+j);
b = v_load(Cvert_row+j);
coef_prev_reg = lambda_reg*a;
coef_cur_reg = lambda_reg*b;
coef_prev_reg = v_mul(lambda_reg, a);
coef_cur_reg = v_mul(lambda_reg, b);
a = v_load(interD_row_prev+j);
a = a*coef_prev_reg;
a = v_mul(a, coef_prev_reg);
b = coef_prev_reg+coef_cur_reg;
b = b+a;
a = one_reg-b; //computed denom
b = v_add(coef_prev_reg, coef_cur_reg);
b = v_add(b, a);
a = v_sub(one_reg, b); //computed denom
b = coef_cur_reg/a; //computed interD_row
b = v_div(coef_cur_reg, a); //computed interD_row
c = v_load(cur_row_prev+j);
c = c*coef_prev_reg;
c = v_mul(c, coef_prev_reg);
d = v_load(cur_row+j);
d = d-c;
d = d/a; //computed cur_row
d = v_sub(d, c);
d = v_div(d, a); //computed cur_row
v_store(interD_row+j,b);
v_store(cur_row+j,d);
@ -570,10 +570,10 @@ void FastGlobalSmootherFilterImpl::VerticalPass_ParBody::operator()(const Range&
{
a = v_load(interD_row+j);
b = v_load(cur_row_next+j);
b = a*b;
b = v_mul(a, b);
a = v_load(cur_row+j);
b = a-b;
b = v_sub(a, b);
v_store(cur_row+j,b);
}
#endif

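The PROC4 macro and the vertical pass above are essentially forward-elimination steps of the per-row/per-column tridiagonal solve the fast global smoother performs; the patch only changes how the arithmetic is written. The same step as a plain function, assuming CV_SIMD128; thomasStep and its parameter names are illustrative, not taken from the patch:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static void thomasStep(const v_float32x4& lambda, const v_float32x4& Chor,
                       const v_float32x4& coefPrev, const v_float32x4& interDPrev,
                       const v_float32x4& curPrev, const v_float32x4& cur,
                       v_float32x4& coefOut, v_float32x4& interDOut,
                       v_float32x4& curOut)
{
    v_float32x4 one = v_setall_f32(1.f);
    coefOut = v_mul(lambda, Chor);
    // denom = 1 - (coefOut + coefPrev) - interDPrev * coefPrev
    v_float32x4 denom = v_sub(v_sub(one, v_add(coefOut, coefPrev)),
                              v_mul(interDPrev, coefPrev));
    interDOut = v_div(coefOut, denom);
    curOut    = v_div(v_sub(cur, v_mul(curPrev, coefPrev)), denom);
}
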
@ -130,21 +130,21 @@ void calculateChannelSums(uint &sumB, uint &sumG, uint &sumR, uchar *src_data, i
v_expand(v_max_val, v_max1, v_max2);
// Calculate masks
v_m1 = ~(v_mul_wrap(v_max1 - v_min1, v_255) > v_mul_wrap(v_thresh, v_max1));
v_m2 = ~(v_mul_wrap(v_max2 - v_min2, v_255) > v_mul_wrap(v_thresh, v_max2));
v_m1 = v_not(v_gt(v_mul_wrap(v_sub(v_max1, v_min1), v_255), v_mul_wrap(v_thresh, v_max1)));
v_m2 = v_not(v_gt(v_mul_wrap(v_sub(v_max2, v_min2), v_255), v_mul_wrap(v_thresh, v_max2)));
// Apply masks
v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2);
v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2);
v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2);
v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2));
v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2));
v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2));
// Split and add to the sums:
v_expand(v_iB1, v_uint1, v_uint2);
v_SB += v_uint1 + v_uint2;
v_SB = v_add(v_SB, v_add(v_uint1, v_uint2));
v_expand(v_iG1, v_uint1, v_uint2);
v_SG += v_uint1 + v_uint2;
v_SG = v_add(v_SG, v_add(v_uint1, v_uint2));
v_expand(v_iR1, v_uint1, v_uint2);
v_SR += v_uint1 + v_uint2;
v_SR = v_add(v_SR, v_add(v_uint1, v_uint2));
}
sumB = v_reduce_sum(v_SB);
@ -197,21 +197,21 @@ void calculateChannelSums(uint64 &sumB, uint64 &sumG, uint64 &sumR, ushort *src_
v_expand(v_max_val, v_max1, v_max2);
// Calculate masks
v_m1 = ~((v_max1 - v_min1) * v_65535 > v_thresh * v_max1);
v_m2 = ~((v_max2 - v_min2) * v_65535 > v_thresh * v_max2);
v_m1 = v_not(v_gt(v_mul(v_sub(v_max1, v_min1), v_65535), v_mul(v_thresh, v_max1)));
v_m2 = v_not(v_gt(v_mul(v_sub(v_max2, v_min2), v_65535), v_mul(v_thresh, v_max2)));
// Apply masks
v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2);
v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2);
v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2);
v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2));
v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2));
v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2));
// Split and add to the sums:
v_expand(v_iB1, v_u64_1, v_u64_2);
v_SB += v_u64_1 + v_u64_2;
v_SB = v_add(v_SB, v_add(v_u64_1, v_u64_2));
v_expand(v_iG1, v_u64_1, v_u64_2);
v_SG += v_u64_1 + v_u64_2;
v_SG = v_add(v_SG, v_add(v_u64_1, v_u64_2));
v_expand(v_iR1, v_u64_1, v_u64_2);
v_SR += v_u64_1 + v_u64_2;
v_SR = v_add(v_SR, v_add(v_u64_1, v_u64_2));
}
// Perform final reduction
@ -282,12 +282,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai
v_expand(v_inR, v_sR1, v_sR2);
// Multiply by gains
v_sB1 = v_mul_wrap(v_sB1, v_gainB) >> 8;
v_sB2 = v_mul_wrap(v_sB2, v_gainB) >> 8;
v_sG1 = v_mul_wrap(v_sG1, v_gainG) >> 8;
v_sG2 = v_mul_wrap(v_sG2, v_gainG) >> 8;
v_sR1 = v_mul_wrap(v_sR1, v_gainR) >> 8;
v_sR2 = v_mul_wrap(v_sR2, v_gainR) >> 8;
v_sB1 = v_shr(v_mul_wrap(v_sB1, v_gainB), 8);
v_sB2 = v_shr(v_mul_wrap(v_sB2, v_gainB), 8);
v_sG1 = v_shr(v_mul_wrap(v_sG1, v_gainG), 8);
v_sG2 = v_shr(v_mul_wrap(v_sG2, v_gainG), 8);
v_sR1 = v_shr(v_mul_wrap(v_sR1, v_gainR), 8);
v_sR2 = v_shr(v_mul_wrap(v_sR2, v_gainR), 8);
// Pack into vectors of v_uint8x16
v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2));
@ -325,12 +325,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai
v_expand(v_inR, v_sR1, v_sR2);
// Multiply by scaling factors
v_sB1 = (v_sB1 * v_gainB) >> 16;
v_sB2 = (v_sB2 * v_gainB) >> 16;
v_sG1 = (v_sG1 * v_gainG) >> 16;
v_sG2 = (v_sG2 * v_gainG) >> 16;
v_sR1 = (v_sR1 * v_gainR) >> 16;
v_sR2 = (v_sR2 * v_gainR) >> 16;
v_sB1 = v_shr(v_mul(v_sB1, v_gainB), 16);
v_sB2 = v_shr(v_mul(v_sB2, v_gainB), 16);
v_sG1 = v_shr(v_mul(v_sG1, v_gainG), 16);
v_sG2 = v_shr(v_mul(v_sG2, v_gainG), 16);
v_sR1 = v_shr(v_mul(v_sR1, v_gainR), 16);
v_sR2 = v_shr(v_mul(v_sR2, v_gainR), 16);
// Pack into vectors of v_uint16x8
v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2));

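Both overloads of calculateChannelSums above follow one pattern: build a saturation mask with a comparison, zero out rejected pixels with v_and, widen, then accumulate. The pattern for a single channel, assuming CV_SIMD128; maskedAccumulate is an illustrative name:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

// keepMask lanes are expected to be 0 or 0xFFFF (the result of a comparison)
static v_uint32x4 maskedAccumulate(const v_uint16x8& px,
                                   const v_uint16x8& keepMask,
                                   v_uint32x4 acc)
{
    v_uint16x8 kept = v_and(px, keepMask);
    v_uint32x4 lo, hi;
    v_expand(kept, lo, hi);          // widen before summing to avoid overflow
    return v_add(acc, v_add(lo, hi));
}
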
@ -192,7 +192,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src)
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_local_max = v_max(v_inB, v_max(v_inG, v_inR));
v_global_max = v_max(v_local_max, v_global_max);
v_mask = (v_local_max < v_thresh);
v_mask = (v_lt(v_local_max, v_thresh));
v_store(mask_ptr + i, v_mask);
}
uchar global_max[16];
@ -225,7 +225,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src)
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_local_max = v_max(v_inB, v_max(v_inG, v_inR));
v_global_max = v_max(v_local_max, v_global_max);
v_mask = (v_local_max < v_thresh);
v_mask = (v_lt(v_local_max, v_thresh));
v_pack_store(mask_ptr + i, v_mask);
}
ushort global_max[8];
@ -270,9 +270,9 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_uint8x16 v_mask = v_load(mask_ptr + i);
v_inB &= v_mask;
v_inG &= v_mask;
v_inR &= v_mask;
v_inB = v_and(v_inB, v_mask);
v_inG = v_and(v_inG, v_mask);
v_inR = v_and(v_inR, v_mask);
v_uint16x8 v_sR1, v_sR2, v_sG1, v_sG2, v_sB1, v_sB2;
v_expand(v_inB, v_sB1, v_sB2);
@ -280,33 +280,33 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_expand(v_inR, v_sR1, v_sR2);
// update the brightest (R,G,B) tuple (process left half):
v_uint16x8 v_sum = v_sB1 + v_sG1 + v_sR1;
v_uint16x8 v_max_mask = (v_sum > v_max_sum);
v_uint16x8 v_sum = v_add(v_add(v_sB1, v_sG1), v_sR1);
v_uint16x8 v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_sB1 & v_max_mask) + (v_brightestB & (~v_max_mask));
v_brightestG = (v_sG1 & v_max_mask) + (v_brightestG & (~v_max_mask));
v_brightestR = (v_sR1 & v_max_mask) + (v_brightestR & (~v_max_mask));
v_brightestB = v_add(v_and(v_sB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = v_add(v_and(v_sG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = v_add(v_and(v_sR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update the brightest (R,G,B) tuple (process right half):
v_sum = v_sB2 + v_sG2 + v_sR2;
v_max_mask = (v_sum > v_max_sum);
v_sum = v_add(v_add(v_sB2, v_sG2), v_sR2);
v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_sB2 & v_max_mask) + (v_brightestB & (~v_max_mask));
v_brightestG = (v_sG2 & v_max_mask) + (v_brightestG & (~v_max_mask));
v_brightestR = (v_sR2 & v_max_mask) + (v_brightestR & (~v_max_mask));
v_brightestB = v_add(v_and(v_sB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = v_add(v_and(v_sG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = v_add(v_and(v_sR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update sums:
v_sB1 = v_sB1 + v_sB2;
v_sG1 = v_sG1 + v_sG2;
v_sR1 = v_sR1 + v_sR2;
v_sB1 = v_add(v_sB1, v_sB2);
v_sG1 = v_add(v_sG1, v_sG2);
v_sR1 = v_add(v_sR1, v_sR2);
v_uint32x4 v_uint1, v_uint2;
v_expand(v_sB1, v_uint1, v_uint2);
v_SB += v_uint1 + v_uint2;
v_SB = v_add(v_SB, v_add(v_uint1, v_uint2));
v_expand(v_sG1, v_uint1, v_uint2);
v_SG += v_uint1 + v_uint2;
v_SG = v_add(v_SG, v_add(v_uint1, v_uint2));
v_expand(v_sR1, v_uint1, v_uint2);
v_SR += v_uint1 + v_uint2;
v_SR = v_add(v_SR, v_add(v_uint1, v_uint2));
}
sumB = v_reduce_sum(v_SB);
sumG = v_reduce_sum(v_SG);
@ -361,11 +361,11 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_uint16x8 v_inB, v_inG, v_inR;
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
v_uint16x8 v_mask = v_load_expand(mask_ptr + i);
v_mask = v_mask | ((v_mask & v_mask_lower) << 8);
v_mask = v_or(v_mask, v_shl<8>(v_and(v_mask, v_mask_lower)));
v_inB &= v_mask;
v_inG &= v_mask;
v_inR &= v_mask;
v_inB = v_and(v_inB, v_mask);
v_inG = v_and(v_inG, v_mask);
v_inR = v_and(v_inR, v_mask);
v_uint32x4 v_iR1, v_iR2, v_iG1, v_iG2, v_iB1, v_iB2;
v_expand(v_inB, v_iB1, v_iB2);
@ -373,32 +373,32 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
v_expand(v_inR, v_iR1, v_iR2);
// update the brightest (R,G,B) tuple (process left half):
v_uint32x4 v_sum = v_iB1 + v_iG1 + v_iR1;
v_uint32x4 v_max_mask = (v_sum > v_max_sum);
v_uint32x4 v_sum = v_add(v_add(v_iB1, v_iG1), v_iR1);
v_uint32x4 v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_iB1 & v_max_mask) + (v_brightestB & (~v_max_mask));
v_brightestG = (v_iG1 & v_max_mask) + (v_brightestG & (~v_max_mask));
v_brightestR = (v_iR1 & v_max_mask) + (v_brightestR & (~v_max_mask));
v_brightestB = v_add(v_and(v_iB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = v_add(v_and(v_iG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = v_add(v_and(v_iR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update the brightest (R,G,B) tuple (process right half):
v_sum = v_iB2 + v_iG2 + v_iR2;
v_max_mask = (v_sum > v_max_sum);
v_sum = v_add(v_add(v_iB2, v_iG2), v_iR2);
v_max_mask = (v_gt(v_sum, v_max_sum));
v_max_sum = v_max(v_sum, v_max_sum);
v_brightestB = (v_iB2 & v_max_mask) + (v_brightestB & (~v_max_mask));
v_brightestG = (v_iG2 & v_max_mask) + (v_brightestG & (~v_max_mask));
v_brightestR = (v_iR2 & v_max_mask) + (v_brightestR & (~v_max_mask));
v_brightestB = v_add(v_and(v_iB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask)));
v_brightestG = v_add(v_and(v_iG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask)));
v_brightestR = v_add(v_and(v_iR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask)));
// update sums:
v_iB1 = v_iB1 + v_iB2;
v_iG1 = v_iG1 + v_iG2;
v_iR1 = v_iR1 + v_iR2;
v_iB1 = v_add(v_iB1, v_iB2);
v_iG1 = v_add(v_iG1, v_iG2);
v_iR1 = v_add(v_iR1, v_iR2);
v_uint64x2 v_uint64_1, v_uint64_2;
v_expand(v_iB1, v_uint64_1, v_uint64_2);
v_SB += v_uint64_1 + v_uint64_2;
v_SB = v_add(v_SB, v_add(v_uint64_1, v_uint64_2));
v_expand(v_iG1, v_uint64_1, v_uint64_2);
v_SG += v_uint64_1 + v_uint64_2;
v_SG = v_add(v_SG, v_add(v_uint64_1, v_uint64_2));
v_expand(v_iR1, v_uint64_1, v_uint64_2);
v_SR += v_uint64_1 + v_uint64_2;
v_SR = v_add(v_SR, v_add(v_uint64_1, v_uint64_2));
}
uint64 sum_arr[2];
v_store(sum_arr, v_SB);

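The brightest-pixel update in getAverageAndBrightestColorChromaticity keeps a running maximum and its (R,G,B) payload without branches: a v_gt mask picks between the new and the old values. The same idea in isolation, assuming CV_SIMD128; updateBrightest is an illustrative name, and v_select is an equivalent spelling of the and/not combination:

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static void updateBrightest(const v_uint16x8& sum, const v_uint16x8& chan,
                            v_uint16x8& maxSum, v_uint16x8& bestChan)
{
    v_uint16x8 m = v_gt(sum, maxSum);  // all-ones lanes where a new maximum appears
    maxSum   = v_max(sum, maxSum);
    bestChan = v_add(v_and(chan, m), v_and(bestChan, v_not(m)));
    // equivalently: bestChan = v_select(m, chan, bestChan);
}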