From a287605c3e690aebd92080576d52f875cdb01242 Mon Sep 17 00:00:00 2001 From: Liutong HAN Date: Fri, 13 Oct 2023 19:23:30 +0800 Subject: [PATCH] Clean up the Universal Intrinsic API. --- .../core/include/opencv2/core/hal/intrin.hpp | 70 ++- .../include/opencv2/core/hal/intrin_neon.hpp | 317 ++++++------ modules/core/src/convert.hpp | 18 +- modules/core/src/mathfuncs_core.simd.hpp | 122 ++--- modules/core/src/matmul.simd.hpp | 30 +- modules/core/src/matrix_transform.cpp | 36 +- modules/core/src/minmax.cpp | 270 +++++----- modules/core/test/test_intrin_utils.hpp | 5 - .../dnn/src/int8layers/convolution_layer.cpp | 20 +- .../src/int8layers/fully_connected_layer.cpp | 4 +- modules/dnn/src/int8layers/pooling_layer.cpp | 16 +- .../src/layers/cpu_kernels/conv_depthwise.cpp | 28 +- .../layers/cpu_kernels/conv_winograd_f63.cpp | 176 +++---- modules/dnn/src/layers/elementwise_layers.cpp | 24 +- .../dnn/src/layers/fully_connected_layer.cpp | 2 +- modules/dnn/src/layers/pooling_layer.cpp | 22 +- modules/features2d/src/fast.cpp | 36 +- modules/features2d/src/fast_score.cpp | 6 +- .../fluid/gfluidimgproc_func.simd.hpp | 26 +- .../fluid/gfluidimgproc_simd_avx2.hpp | 2 +- modules/imgproc/src/bilateral_filter.simd.hpp | 34 +- modules/imgproc/src/box_filter.simd.hpp | 64 +-- modules/imgproc/src/color_lab.cpp | 299 +++++------ modules/imgproc/src/color_rgb.simd.hpp | 12 +- modules/imgproc/src/demosaicing.cpp | 294 +++++------ modules/imgproc/src/filter.simd.hpp | 38 +- modules/imgproc/src/histogram.cpp | 8 +- modules/imgproc/src/imgwarp.cpp | 272 +++++----- modules/imgproc/src/median_blur.simd.hpp | 42 +- modules/imgproc/src/moments.cpp | 20 +- modules/imgproc/src/pyramids.cpp | 14 +- modules/imgproc/src/resize.cpp | 34 +- modules/imgproc/src/sumpixels.simd.hpp | 468 +++++++++--------- modules/objdetect/src/hog.cpp | 92 ++-- modules/video/src/dis_flow.cpp | 40 +- modules/video/src/lkpyramid.cpp | 48 +- modules/video/src/optflowgf.cpp | 28 +- modules/video/src/variational_refinement.cpp | 101 ++-- samples/cpp/simd_basic.cpp | 4 +- .../core/univ_intrin/univ_intrin.cpp | 8 +- 40 files changed, 1615 insertions(+), 1535 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index bf9a247054..904b05e405 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -723,7 +723,7 @@ namespace CV__SIMD_NAMESPACE { /** @brief SIMD processing state cleanup call */ inline void vx_cleanup() { VXPREFIX(_cleanup)(); } -#if !CV_SIMD_SCALABLE +#if !CV_SIMD_SCALABLE && !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP)) // Compatibility layer template struct VTraits { @@ -1148,6 +1148,74 @@ namespace CV__SIMD_NAMESPACE { #endif //!CV_SIMD_SCALABLE +#if (CV_NEON /* || CV_others */) && !defined(CV_FORCE_SIMD128_CPP) +// Compatibility layer for the backend that cleaned up. + #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ + template \ + inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... 
vf) { \ + return v_add(v_add(f1, f2), vf...); \ + } + + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) + #endif + + #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ + template \ + inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ + return v_mul(v_mul(f1, f2), vf...); \ + } + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) + #endif + + #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \ + inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \ + { \ + return v_extract_n::nlanes-1>(v); \ + } + + OPENCV_HAL_WRAP_EXTRACT(v_uint8) + OPENCV_HAL_WRAP_EXTRACT(v_int8) + OPENCV_HAL_WRAP_EXTRACT(v_uint16) + OPENCV_HAL_WRAP_EXTRACT(v_int16) + OPENCV_HAL_WRAP_EXTRACT(v_uint32) + OPENCV_HAL_WRAP_EXTRACT(v_int32) + OPENCV_HAL_WRAP_EXTRACT(v_uint64) + OPENCV_HAL_WRAP_EXTRACT(v_int64) + OPENCV_HAL_WRAP_EXTRACT(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_EXTRACT(v_float64) + #endif + + #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \ + inline _Tpvec v_broadcast_highest(const _Tpvec& v) \ + { \ + return v_broadcast_element::nlanes-1>(v); \ + } + + OPENCV_HAL_WRAP_BROADCAST(v_uint32) + OPENCV_HAL_WRAP_BROADCAST(v_int32) + OPENCV_HAL_WRAP_BROADCAST(v_float32) + +#endif //CV_NEON + //! 
@cond IGNORED // backward compatibility diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 14eb180819..ee9934135a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -131,13 +131,22 @@ OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64) OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64) #endif +//////////// Compatibility layer //////////// +template struct VTraits { + static inline int vlanes() { return T::nlanes; } + enum { max_nlanes = T::nlanes, nlanes = T::nlanes }; + using lane_type = typename T::lane_type; +}; + +template +inline typename VTraits::lane_type v_get0(const T& v) \ +{ \ + return v.get0(); \ +} //////////// Types //////////// struct v_uint8x16 { - typedef uchar lane_type; - enum { nlanes = 16 }; - v_uint8x16() {} explicit v_uint8x16(uint8x16_t v) : val(v) {} v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, @@ -146,19 +155,22 @@ struct v_uint8x16 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; val = vld1q_u8(v); } + uint8x16_t val; + +private: + friend struct VTraits; + enum { nlanes = 16 }; + typedef uchar lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint8x16& v); uchar get0() const { return vgetq_lane_u8(val, 0); } - - uint8x16_t val; }; struct v_int8x16 { - typedef schar lane_type; - enum { nlanes = 16 }; - v_int8x16() {} explicit v_int8x16(int8x16_t v) : val(v) {} v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, @@ -167,19 +179,22 @@ struct v_int8x16 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; val = vld1q_s8(v); } + int8x16_t val; + +private: + friend struct VTraits; + enum { nlanes = 16 }; + typedef schar lane_type; + + friend typename VTraits::lane_type v_get0(const v_int8x16& v); schar get0() const { return vgetq_lane_s8(val, 0); } - - int8x16_t val; }; struct v_uint16x8 { - typedef ushort lane_type; - enum { nlanes = 8 }; - v_uint16x8() {} explicit v_uint16x8(uint16x8_t v) : val(v) {} v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) @@ -187,19 +202,22 @@ struct v_uint16x8 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; val = vld1q_u16(v); } + uint16x8_t val; + +private: + friend struct VTraits; + enum { nlanes = 8 }; + typedef ushort lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint16x8& v); ushort get0() const { return vgetq_lane_u16(val, 0); } - - uint16x8_t val; }; struct v_int16x8 { - typedef short lane_type; - enum { nlanes = 8 }; - v_int16x8() {} explicit v_int16x8(int16x8_t v) : val(v) {} v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) @@ -207,19 +225,22 @@ struct v_int16x8 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; val = vld1q_s16(v); } + int16x8_t val; + +private: + friend struct VTraits; + enum { nlanes = 8 }; + typedef short lane_type; + + friend typename VTraits::lane_type v_get0(const v_int16x8& v); short get0() const { return vgetq_lane_s16(val, 0); } - - int16x8_t val; }; struct v_uint32x4 { - typedef unsigned lane_type; - enum { nlanes = 4 }; - v_uint32x4() {} explicit v_uint32x4(uint32x4_t v) : val(v) {} v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) @@ -227,19 +248,22 @@ struct v_uint32x4 unsigned v[] = {v0, v1, v2, v3}; val = vld1q_u32(v); } + uint32x4_t val; + 
+private: + friend struct VTraits; + enum { nlanes = 4 }; + typedef unsigned lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint32x4& v); unsigned get0() const { return vgetq_lane_u32(val, 0); } - - uint32x4_t val; }; struct v_int32x4 { - typedef int lane_type; - enum { nlanes = 4 }; - v_int32x4() {} explicit v_int32x4(int32x4_t v) : val(v) {} v_int32x4(int v0, int v1, int v2, int v3) @@ -247,18 +271,22 @@ struct v_int32x4 int v[] = {v0, v1, v2, v3}; val = vld1q_s32(v); } + int32x4_t val; + +private: + friend struct VTraits; + enum { nlanes = 4 }; + typedef int lane_type; + + friend typename VTraits::lane_type v_get0(const v_int32x4& v); int get0() const { return vgetq_lane_s32(val, 0); } - int32x4_t val; }; struct v_float32x4 { - typedef float lane_type; - enum { nlanes = 4 }; - v_float32x4() {} explicit v_float32x4(float32x4_t v) : val(v) {} v_float32x4(float v0, float v1, float v2, float v3) @@ -266,18 +294,22 @@ struct v_float32x4 float v[] = {v0, v1, v2, v3}; val = vld1q_f32(v); } + float32x4_t val; + +private: + friend struct VTraits; + enum { nlanes = 4 }; + typedef float lane_type; + + friend typename VTraits::lane_type v_get0(const v_float32x4& v); float get0() const { return vgetq_lane_f32(val, 0); } - float32x4_t val; }; struct v_uint64x2 { - typedef uint64 lane_type; - enum { nlanes = 2 }; - v_uint64x2() {} explicit v_uint64x2(uint64x2_t v) : val(v) {} v_uint64x2(uint64 v0, uint64 v1) @@ -285,18 +317,21 @@ struct v_uint64x2 uint64 v[] = {v0, v1}; val = vld1q_u64(v); } + uint64x2_t val; +private: + friend struct VTraits; + enum { nlanes = 2 }; + typedef uint64 lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint64x2& v); uint64 get0() const { return vgetq_lane_u64(val, 0); } - uint64x2_t val; }; struct v_int64x2 { - typedef int64 lane_type; - enum { nlanes = 2 }; - v_int64x2() {} explicit v_int64x2(int64x2_t v) : val(v) {} v_int64x2(int64 v0, int64 v1) @@ -304,19 +339,23 @@ struct v_int64x2 int64 v[] = {v0, v1}; val = vld1q_s64(v); } + int64x2_t val; + +private: + friend struct VTraits; + enum { nlanes = 2 }; + typedef int64 lane_type; + + friend typename VTraits::lane_type v_get0(const v_int64x2& v); int64 get0() const { return vgetq_lane_s64(val, 0); } - int64x2_t val; }; #if CV_SIMD128_64F struct v_float64x2 { - typedef double lane_type; - enum { nlanes = 2 }; - v_float64x2() {} explicit v_float64x2(float64x2_t v) : val(v) {} v_float64x2(double v0, double v1) @@ -324,11 +363,18 @@ struct v_float64x2 double v[] = {v0, v1}; val = vld1q_f64(v); } + + float64x2_t val; +private: + friend struct VTraits; + enum { nlanes = 2 }; + typedef double lane_type; + + friend typename VTraits::lane_type v_get0(const v_float64x2& v); double get0() const { return vgetq_lane_f64(val, 0); } - float64x2_t val; }; #endif @@ -460,71 +506,56 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ -} - -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16) 
-OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64) +} + +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint8x16, vqaddq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint8x16, vqsubq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int8x16, vqaddq_s8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int8x16, vqsubq_s8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint16x8, vqaddq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint16x8, vqsubq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int16x8, vqaddq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int16x8, vqsubq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int32x4, vaddq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int32x4, vsubq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int64x2, vaddq_s64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int64x2, vsubq_s64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint64x2, vaddq_u64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint64x2, vsubq_u64) #if CV_SIMD128_64F -OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64) -OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float32x4, vdivq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float64x2, vaddq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float64x2, vsubq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float64x2, vmulq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float64x2, vdivq_f64) #else -inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_div (const v_float32x4& a, const v_float32x4& b) { float32x4_t reciprocal = vrecpeq_f32(b.val); reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); return v_float32x4(vmulq_f32(a.val, reciprocal)); } -inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) -{ - float32x4_t reciprocal = vrecpeq_f32(b.val); - reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); - a.val = vmulq_f32(a.val, reciprocal); - return a; -} #endif // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * 
(const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul (const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8) @@ -698,7 +729,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { - return v_dotprod_expand(a, b) + c; + return v_add(v_dotprod_expand(a, b), c); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) @@ -715,7 +746,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { - return v_dotprod_expand(a, b) + c; + return v_add(v_dotprod_expand(a, b), c); } #endif // 16 >> 64 @@ -735,7 +766,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) return v_uint64x2(vaddq_u64(s0, s1)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -752,7 +783,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f #if CV_SIMD128_64F @@ -760,7 +791,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } #endif //////// Fast Dot Product //////// @@ -850,7 +881,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b } inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { - return v_dotprod_expand_fast(a, b) + c; + return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) @@ -861,7 +892,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { - return v_dotprod_expand_fast(a, b) + c; + return v_add(v_dotprod_expand_fast(a, b), c); } #endif @@ -875,7 +906,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b return v_uint64x2(vaddq_u64(s0, s1)); } inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) { @@ -884,22 +915,22 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod))); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 
>> 64f #if CV_SIMD128_64F inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod_fast(a, b)); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } #endif #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(v_and, _Tpvec, vandq_##suffix) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(v_or, _Tpvec, vorrq_##suffix) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(v_xor, _Tpvec, veorq_##suffix) \ + inline _Tpvec v_not (const _Tpvec& a) \ { \ return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \ } @@ -914,21 +945,16 @@ OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64) OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64) #define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \ -inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \ { \ return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \ -} \ -inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ -{ \ - a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \ - return a; \ } -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32) -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32) -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_and, vandq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_or, vorrq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_xor, veorq_s32) -inline v_float32x4 operator ~ (const v_float32x4& a) +inline v_float32x4 v_not (const v_float32x4& a) { return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val)))); } @@ -942,7 +968,7 @@ inline v_float32x4 v_sqrt(const v_float32x4& x) inline v_float32x4 v_invsqrt(const v_float32x4& x) { v_float32x4 one = v_setall_f32(1.0f); - return one / v_sqrt(x); + return v_div(one, v_sqrt(x)); } #else inline v_float32x4 v_sqrt(const v_float32x4& x) @@ -975,21 +1001,16 @@ inline v_float32x4 v_abs(v_float32x4 x) #if CV_SIMD128_64F #define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \ -inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \ { \ return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \ -} \ -inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ -{ \ - a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \ - return a; \ } -OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64) -OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64) -OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64) +OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_and, vandq_s64) +OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_or, vorrq_s64) +OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_xor, veorq_s64) -inline v_float64x2 operator ~ (const v_float64x2& a) +inline v_float64x2 v_not (const v_float64x2& a) { return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val)))); } @@ -1002,7 +1023,7 @@ inline v_float64x2 v_sqrt(const v_float64x2& x) 
inline v_float64x2 v_invsqrt(const v_float64x2& x) { v_float64x2 one = v_setall_f64(1.0f); - return one / v_sqrt(x); + return v_div(one, v_sqrt(x)); } inline v_float64x2 v_abs(v_float64x2 x) @@ -1037,17 +1058,17 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64) #endif #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); } OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8) @@ -1065,22 +1086,22 @@ static inline uint64x2_t vmvnq_u64(uint64x2_t a) } //OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64) //OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64) -static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b) +static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b) { return v_uint64x2(vceqq_u64(a.val, b.val)); } -static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b) +static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b) { return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); } -static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b) +static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b) { return v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); } -static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b) +static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b) { return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); } #else -static inline v_uint64x2 operator == (const v_uint64x2& a, const v_uint64x2& b) +static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b) { uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val)); uint32x4_t swapped = vrev64q_u32(cmp); return v_uint64x2(vreinterpretq_u64_u32(vandq_u32(cmp, swapped))); } -static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b) +static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b) { uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val)); uint32x4_t swapped = vrev64q_u32(cmp); @@ -1088,13 +1109,13 @@ static inline v_uint64x2 operator != (const v_uint64x2& a, const v_uint64x2& b) uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF)); return v_uint64x2(veorq_u64(v_eq, vx)); } -static inline v_int64x2 operator == (const v_int64x2& a, const v_int64x2& b) +static inline 
v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b) { - return v_reinterpret_as_s64(v_reinterpret_as_u64(a) == v_reinterpret_as_u64(b)); + return v_reinterpret_as_s64(v_eq(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); } -static inline v_int64x2 operator != (const v_int64x2& a, const v_int64x2& b) +static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b) { - return v_reinterpret_as_s64(v_reinterpret_as_u64(a) != v_reinterpret_as_u64(b)); + return v_reinterpret_as_s64(v_ne(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); } #endif #if CV_SIMD128_64F @@ -1207,9 +1228,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_ // trade efficiency for convenience #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ +inline _Tpvec v_shl (const _Tpvec& a, int n) \ { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ +inline _Tpvec v_shr (const _Tpvec& a, int n) \ { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \ template inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \ @@ -1231,13 +1252,13 @@ OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64) template inline _Tpvec v_rotate_right(const _Tpvec& a) \ { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \ template inline _Tpvec v_rotate_left(const _Tpvec& a) \ -{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \ +{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, VTraits<_Tpvec>::nlanes - n)); } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ { return a; } \ template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \ template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \ +{ return _Tpvec(vextq_##suffix(b.val, a.val, VTraits<_Tpvec>::nlanes - n)); } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ { CV_UNUSED(b); return a; } diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp index c689276218..65a998bd8f 100644 --- a/modules/core/src/convert.hpp +++ b/modules/core/src/convert.hpp @@ -358,8 +358,8 @@ static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_f static inline void vx_load_as(const double* ptr, v_float32& a) { - const int VECSZ = v_float32::nlanes; - float buf[VECSZ*2]; + const int VECSZ = VTraits::vlanes(); + float buf[VTraits::max_nlanes*2]; for( int i = 0; i < VECSZ; i++ ) buf[i] = saturate_cast(ptr[i]); @@ -369,19 +369,19 @@ static inline void vx_load_as(const double* ptr, v_float32& a) template static inline void vx_load_pair_as(const double* ptr, _Tdvec& a, _Tdvec& b) { - const int VECSZ = _Tdvec::nlanes; - typename _Tdvec::lane_type buf[VECSZ*2]; + const int VECSZ = VTraits<_Tdvec>::vlanes(); + typename VTraits<_Tdvec>::lane_type buf[VTraits<_Tdvec>::max_nlanes*2]; for( int i = 0; i < VECSZ*2; i++ ) - buf[i] = saturate_cast(ptr[i]); + buf[i] = saturate_cast::lane_type>(ptr[i]); a = vx_load(buf); b = vx_load(buf + VECSZ); } static inline void v_store_as(double* ptr, const v_float32& a) { - const int VECSZ = v_float32::nlanes; - float buf[VECSZ]; + const int VECSZ = VTraits::vlanes(); + float buf[VTraits::max_nlanes]; v_store(buf, a); for( int i = 0; i < VECSZ; i++ ) @@ -391,8 
+391,8 @@ static inline void v_store_as(double* ptr, const v_float32& a) template static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b) { - const int VECSZ = _Tsvec::nlanes; - typename _Tsvec::lane_type buf[VECSZ*2]; + const int VECSZ = VTraits<_Tsvec>::vlanes(); + typename VTraits<_Tsvec>::lane_type buf[VTraits<_Tsvec>::max_nlanes*2]; v_store(buf, a); v_store(buf + VECSZ, b); for( int i = 0; i < VECSZ*2; i++ ) diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp index 1bf36bb174..2aa107b9be 100644 --- a/modules/core/src/mathfuncs_core.simd.hpp +++ b/modules/core/src/mathfuncs_core.simd.hpp @@ -93,13 +93,13 @@ struct v_atan_f32 { v_float32 ax = v_abs(x); v_float32 ay = v_abs(y); - v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + eps); - v_float32 cc = c * c; - v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c; - a = v_select(ax >= ay, a, val90 - a); - a = v_select(x < z, val180 - a, a); - a = v_select(y < z, val360 - a, a); - return a * s; + v_float32 c = v_div(v_min(ax, ay), v_add(v_max(ax, ay), this->eps)); + v_float32 cc = v_mul(c, c); + v_float32 a = v_mul(v_fma(v_fma(v_fma(cc, this->p7, this->p5), cc, this->p3), cc, this->p1), c); + a = v_select(v_ge(ax, ay), a, v_sub(this->val90, a)); + a = v_select(v_lt(x, this->z), v_sub(this->val180, a), a); + a = v_select(v_lt(y, this->z), v_sub(this->val360, a), a); + return v_mul(a, this->s); } v_float32 eps; @@ -125,7 +125,7 @@ static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, float scale = angleInDegrees ? 1.f : (float)(CV_PI/180); int i = 0; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); v_atan_f32 v(scale); for( ; i < len; i += VECSZ*2 ) @@ -198,7 +198,7 @@ void magnitude32f(const float* x, const float* y, float* mag, int len) int i = 0; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); for( ; i < len; i += VECSZ*2 ) { if( i + VECSZ*2 > len ) @@ -209,8 +209,8 @@ void magnitude32f(const float* x, const float* y, float* mag, int len) } v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ); v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ); - x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); - x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); + x0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0))); + x1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1))); v_store(mag + i, x0); v_store(mag + i + VECSZ, x1); } @@ -231,7 +231,7 @@ void magnitude64f(const double* x, const double* y, double* mag, int len) int i = 0; #if CV_SIMD_64F - const int VECSZ = v_float64::nlanes; + const int VECSZ = VTraits::vlanes(); for( ; i < len; i += VECSZ*2 ) { if( i + VECSZ*2 > len ) @@ -242,8 +242,8 @@ void magnitude64f(const double* x, const double* y, double* mag, int len) } v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ); v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ); - x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); - x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); + x0 = v_sqrt(v_muladd(x0, x0, v_mul(y0, y0))); + x1 = v_sqrt(v_muladd(x1, x1, v_mul(y1, y1))); v_store(mag + i, x0); v_store(mag + i + VECSZ, x1); } @@ -265,7 +265,7 @@ void invSqrt32f(const float* src, float* dst, int len) int i = 0; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); for( ; i < len; i += VECSZ*2 ) { if( i + VECSZ*2 > len ) @@ -293,7 +293,7 @@ void invSqrt64f(const double* src, double* dst, int len) int i = 0; #if CV_SIMD_64F - const int VECSZ = v_float64::nlanes; + const int 
VECSZ = VTraits::vlanes(); for ( ; i < len; i += VECSZ*2) { if( i + VECSZ*2 > len ) @@ -321,7 +321,7 @@ void sqrt32f(const float* src, float* dst, int len) int i = 0; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); for( ; i < len; i += VECSZ*2 ) { if( i + VECSZ*2 > len ) @@ -350,7 +350,7 @@ void sqrt64f(const double* src, double* dst, int len) int i = 0; #if CV_SIMD_64F - const int VECSZ = v_float64::nlanes; + const int VECSZ = VTraits::vlanes(); for( ; i < len; i += VECSZ*2 ) { if( i + VECSZ*2 > len ) @@ -452,7 +452,7 @@ void exp32f( const float *_x, float *y, int n ) float postscale = (float)exp_postscale; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); const v_float32 vprescale = vx_setall_f32((float)exp_prescale); const v_float32 vpostscale = vx_setall_f32((float)exp_postscale); const v_float32 vminval = vx_setall_f32(minval); @@ -481,26 +481,26 @@ void exp32f( const float *_x, float *y, int n ) xf0 = v_min(v_max(xf0, vminval), vmaxval); xf1 = v_min(v_max(xf1, vminval), vmaxval); - xf0 *= vprescale; - xf1 *= vprescale; + xf0 = v_mul(xf0, vprescale); + xf1 = v_mul(xf1, vprescale); v_int32 xi0 = v_round(xf0); v_int32 xi1 = v_round(xf1); - xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale; - xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale; + xf0 = v_mul(v_sub(xf0, v_cvt_f32(xi0)), vpostscale); + xf1 = v_mul(v_sub(xf1, v_cvt_f32(xi1)), vpostscale); - v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask); - v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask); + v_float32 yf0 = v_lut(expTab_f, v_and(xi0, vidxmask)); + v_float32 yf1 = v_lut(expTab_f, v_and(xi1, vidxmask)); v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255); - xi0 = v_min(v_max(v_shr(xi0) + v127, v0), v255); - xi1 = v_min(v_max(v_shr(xi1) + v127, v0), v255); + xi0 = v_min(v_max(v_add(v_shr<6>(xi0), v127), v0), v255); + xi1 = v_min(v_max(v_add(v_shr<6>(xi1), v127), v0), v255); - yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0)); - yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1)); + yf0 = v_mul(yf0, v_reinterpret_as_f32(v_shl<23>(xi0))); + yf1 = v_mul(yf1, v_reinterpret_as_f32(v_shl<23>(xi1))); - v_float32 zf0 = xf0 + vA1; - v_float32 zf1 = xf1 + vA1; + v_float32 zf0 = v_add(xf0, vA1); + v_float32 zf1 = v_add(xf1, vA1); zf0 = v_fma(zf0, xf0, vA2); zf1 = v_fma(zf1, xf1, vA2); @@ -511,8 +511,8 @@ void exp32f( const float *_x, float *y, int n ) zf0 = v_fma(zf0, xf0, vA4); zf1 = v_fma(zf1, xf1, vA4); - zf0 *= yf0; - zf1 *= yf1; + zf0 = v_mul(zf0, yf0); + zf1 = v_mul(zf1, yf1); if( y_aligned ) { @@ -566,7 +566,7 @@ void exp64f( const double *_x, double *y, int n ) double maxval = (exp_max_val/exp_prescale); #if CV_SIMD_64F - const int VECSZ = v_float64::nlanes; + const int VECSZ = VTraits::vlanes(); const v_float64 vprescale = vx_setall_f64(exp_prescale); const v_float64 vpostscale = vx_setall_f64(exp_postscale); const v_float64 vminval = vx_setall_f64(minval); @@ -596,30 +596,30 @@ void exp64f( const double *_x, double *y, int n ) xf0 = v_min(v_max(xf0, vminval), vmaxval); xf1 = v_min(v_max(xf1, vminval), vmaxval); - xf0 *= vprescale; - xf1 *= vprescale; + xf0 = v_mul(xf0, vprescale); + xf1 = v_mul(xf1, vprescale); v_int32 xi0 = v_round(xf0); v_int32 xi1 = v_round(xf1); - xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale; - xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale; + xf0 = v_mul(v_sub(xf0, v_cvt_f64(xi0)), vpostscale); + xf1 = v_mul(v_sub(xf1, v_cvt_f64(xi1)), vpostscale); - v_float64 yf0 = v_lut(expTab, xi0 & vidxmask); - v_float64 yf1 = v_lut(expTab, xi1 & 
vidxmask); + v_float64 yf0 = v_lut(expTab, v_and(xi0, vidxmask)); + v_float64 yf1 = v_lut(expTab, v_and(xi1, vidxmask)); v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047); - xi0 = v_min(v_max(v_shr(xi0) + v1023, v0), v2047); - xi1 = v_min(v_max(v_shr(xi1) + v1023, v0), v2047); + xi0 = v_min(v_max(v_add(v_shr<6>(xi0), v1023), v0), v2047); + xi1 = v_min(v_max(v_add(v_shr<6>(xi1), v1023), v0), v2047); v_int64 xq0, xq1, dummy; v_expand(xi0, xq0, dummy); v_expand(xi1, xq1, dummy); - yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0)); - yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1)); + yf0 = v_mul(yf0, v_reinterpret_as_f64(v_shl<52>(xq0))); + yf1 = v_mul(yf1, v_reinterpret_as_f64(v_shl<52>(xq1))); - v_float64 zf0 = xf0 + vA1; - v_float64 zf1 = xf1 + vA1; + v_float64 zf0 = v_add(xf0, vA1); + v_float64 zf1 = v_add(xf1, vA1); zf0 = v_fma(zf0, xf0, vA2); zf1 = v_fma(zf1, xf1, vA2); @@ -633,8 +633,8 @@ void exp64f( const double *_x, double *y, int n ) zf0 = v_fma(zf0, xf0, vA5); zf1 = v_fma(zf1, xf1, vA5); - zf0 *= yf0; - zf1 *= yf1; + zf0 = v_mul(zf0, yf0); + zf1 = v_mul(zf1, yf1); if( y_aligned ) { @@ -696,7 +696,7 @@ void log32f( const float *_x, float *y, int n ) const int* x = (const int*)_x; #if CV_SIMD - const int VECSZ = v_float32::nlanes; + const int VECSZ = VTraits::vlanes(); const v_float32 vln2 = vx_setall_f32((float)ln_2); const v_float32 v1 = vx_setall_f32(1.f); const v_float32 vshift = vx_setall_f32(-1.f/512); @@ -715,18 +715,18 @@ void log32f( const float *_x, float *y, int n ) } v_int32 h0 = vx_load(x + i); - v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127); - v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23); + v_int32 yi0 = v_sub(v_and(v_shr<23>(h0), vx_setall_s32(255)), vx_setall_s32(127)); + v_int32 xi0 = v_or(v_and(h0, vx_setall_s32(LOGTAB_MASK2_32F)), vx_setall_s32(127 << 23)); - h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2); + h0 = v_and(v_shr<23 - 8 - 1>(h0), vx_setall_s32(((1 << 8) - 1) * 2)); v_float32 yf0, xf0; v_lut_deinterleave(logTab_f, h0, yf0, xf0); yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0); - v_float32 delta = v_select(v_reinterpret_as_f32(h0 == vx_setall_s32(510)), vshift, vx_setall(0)); - xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta); + v_float32 delta = v_select(v_reinterpret_as_f32(v_eq(h0, vx_setall_s32(510))), vshift, vx_setall(0)); + xf0 = v_fma((v_sub(v_reinterpret_as_f32(xi0), v1)), xf0, delta); v_float32 zf0 = v_fma(xf0, vA0, vA1); zf0 = v_fma(zf0, xf0, vA2); @@ -771,7 +771,7 @@ void log64f( const double *x, double *y, int n ) int i = 0; #if CV_SIMD_64F - const int VECSZ = v_float64::nlanes; + const int VECSZ = VTraits::vlanes(); const v_float64 vln2 = vx_setall_f64(ln_2); const v_float64 @@ -791,20 +791,20 @@ void log64f( const double *x, double *y, int n ) v_int64 h0 = vx_load((const int64*)x + i); v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64()); - yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023); + yi0 = v_sub(v_and(yi0, vx_setall_s32(2047)), vx_setall_s32(1023)); - v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52); + v_int64 xi0 = v_or(v_and(h0, vx_setall_s64(LOGTAB_MASK2_64F)), vx_setall_s64((int64)1023 << 52)); h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0); - v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2); + v_int32 idx = v_and(v_pack(h0, h0), vx_setall_s32(((1 << 8) - 1) * 2)); v_float64 xf0, yf0; v_lut_deinterleave(logTab, idx, yf0, xf0); yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0); - 
v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512); - xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta); + v_float64 delta = v_mul(v_cvt_f64(v_eq(idx, vx_setall_s32(510))), vx_setall_f64(1. / 512)); + xf0 = v_fma(v_sub(v_reinterpret_as_f64(xi0), vx_setall_f64(1.)), xf0, delta); - v_float64 xq = xf0*xf0; + v_float64 xq = v_mul(xf0, xf0); v_float64 zf0 = v_fma(xq, vA0, vA2); v_float64 zf1 = v_fma(xq, vA1, vA3); zf0 = v_fma(zf0, xq, vA4); diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp index 3a9dbd9be8..058666485a 100644 --- a/modules/core/src/matmul.simd.hpp +++ b/modules/core/src/matmul.simd.hpp @@ -1584,7 +1584,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, v_float32x4 _m2h = v_rotate_left<1>(_m2l); v_float32x4 _m3h = v_rotate_left<1>(_m3l); v_int16x8 _delta(0, -32768, -32768, -32768, -32768, -32768, -32768, 0); - for( ; x <= len*3 - v_uint16x8::nlanes; x += 3*v_uint16x8::nlanes/4 ) + for( ; x <= len*3 - VTraits::vlanes(); x += 3*VTraits::vlanes()/4 ) v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack( v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x ))), _m0h, _m1h, _m2h, _m3h)), v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta)))); @@ -1664,10 +1664,10 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i v_float32x4 _m2 = v_load(m + 10); v_float32x4 _m3 = v_load(m + 15); v_float32x4 _m4(m[4], m[9], m[14], m[19]); - for( ; x < len*4; x += v_float32x4::nlanes ) + for( ; x < len*4; x += VTraits::vlanes() ) { v_float32x4 v_src = v_load(src + x); - v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4); + v_store(dst + x, v_add(v_reduce_sum4(v_mul(v_src, _m0), v_mul(v_src, _m1), v_mul(v_src, _m2), v_mul(v_src, _m3)), _m4)); } #else // CV_SIMD_WIDTH >= 16 && !CV_SIMD128 for( ; x < len*4; x += 4 ) @@ -2113,12 +2113,12 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double for( k = 0; k < size.height; k++, tsrc += srcstep ) { v_float64x2 a = v_setall_f64((double)col_buf[k]); - s0 += a * v_load(tsrc+0); - s1 += a * v_load(tsrc+2); + s0 = v_add(s0, v_mul(a, v_load(tsrc + 0))); + s1 = v_add(s1, v_mul(a, v_load(tsrc + 2))); } - v_store((double*)(tdst+j), s0*v_scale); - v_store((double*)(tdst+j+2), s1*v_scale); + v_store((double*)(tdst+j), v_mul(s0, v_scale)); + v_store((double*)(tdst+j+2), v_mul(s1, v_scale)); } else #endif { @@ -2174,12 +2174,12 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) { v_float64x2 a = v_setall_f64((double)col_buf[k]); - s0 += a * (v_load(tsrc+0) - v_load(d+0)); - s1 += a * (v_load(tsrc+2) - v_load(d+2)); + s0 = v_add(s0, v_mul(a, v_sub(v_load(tsrc + 0), v_load(d + 0)))); + s1 = v_add(s1, v_mul(a, v_sub(v_load(tsrc + 2), v_load(d + 2)))); } - v_store((double*)(tdst+j), s0*v_scale); - v_store((double*)(tdst+j+2), s1*v_scale); + v_store((double*)(tdst+j), v_mul(s0, v_scale)); + v_store((double*)(tdst+j+2), v_mul(s1, v_scale)); } else #endif @@ -2249,8 +2249,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double v_float64x2 v_s = v_setzero_f64(); for( k = 0; k <= size.width - 4; k += 4 ) - v_s += (v_load(v_tsrc1+k) * v_load(v_tsrc2+k)) + - (v_load(v_tsrc1+k+2) * v_load(v_tsrc2+k+2)); + v_s = v_add(v_s, 
v_add(v_mul(v_load(v_tsrc1 + k), v_load(v_tsrc2 + k)), v_mul(v_load(v_tsrc1 + k + 2), v_load(v_tsrc2 + k + 2)))); s += v_reduce_sum(v_s); } else @@ -2303,8 +2302,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double v_float64x2 v_s = v_setzero_f64(); for( k = 0; k <= size.width - 4; k += 4, v_tdelta2 += delta_shift ) - v_s += ((v_load(v_tsrc2+k) - v_load(v_tdelta2)) * v_load(v_row_buf+k)) + - ((v_load(v_tsrc2+k+2) - v_load(v_tdelta2+2)) * v_load(v_row_buf+k+2)); + v_s = v_add(v_s, v_add(v_mul(v_sub(v_load(v_tsrc2 + k), v_load(v_tdelta2)), v_load(v_row_buf + k)), v_mul(v_sub(v_load(v_tsrc2 + k + 2), v_load(v_tdelta2 + 2)), v_load(v_row_buf + k + 2)))); s += v_reduce_sum(v_s); tdelta2 = (const dT *)(v_tdelta2); @@ -2566,7 +2564,7 @@ double dotProd_32s(const int* src1, const int* src2, int len) v_sum0 = v_dotprod_expand_fast(v_src10, v_src20, v_sum0); v_sum1 = v_dotprod_expand_fast(v_src11, v_src21, v_sum1); } - v_sum0 += v_sum1; + v_sum0 = v_add(v_sum0, v_sum1); #endif for (; i < len - step; i += step, src1 += step, src2 += step) { diff --git a/modules/core/src/matrix_transform.cpp b/modules/core/src/matrix_transform.cpp index c4c7a73b4c..5a80ac8ca7 100644 --- a/modules/core/src/matrix_transform.cpp +++ b/modules/core/src/matrix_transform.cpp @@ -356,10 +356,10 @@ void transposeND(InputArray src_, const std::vector& order, OutputArray dst #if CV_SIMD128 template CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz ) { - typedef typename V::lane_type T; + typedef typename VTraits::lane_type T; int end = (int)(size.width*esz); int width = (end + 1)/2; - int width_1 = width & -v_uint8x16::nlanes; + int width_1 = width & -VTraits::vlanes(); int i, j; #if CV_STRONG_ALIGNMENT @@ -368,15 +368,15 @@ template CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, s for( ; size.height--; src += sstep, dst += dstep ) { - for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes ) + for( i = 0, j = end; i < width_1; i += VTraits::vlanes(), j -= VTraits::vlanes() ) { V t0, t1; t0 = v_load((T*)((uchar*)src + i)); - t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes)); + t1 = v_load((T*)((uchar*)src + j - VTraits::vlanes())); t0 = v_reverse(t0); t1 = v_reverse(t1); - v_store((T*)(dst + j - v_uint8x16::nlanes), t0); + v_store((T*)(dst + j - VTraits::vlanes()), t0); v_store((T*)(dst + i), t1); } if (isAligned(src, dst)) @@ -446,14 +446,14 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, #if CV_STRONG_ALIGNMENT size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep; #endif - if (esz == 2 * v_uint8x16::nlanes) + if (esz == 2 * (size_t)VTraits::vlanes()) { int end = (int)(size.width*esz); int width = end/2; for( ; size.height--; src += sstep, dst += dstep ) { - for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes ) + for( int i = 0, j = end - 2 * VTraits::vlanes(); i < width; i += 2 * VTraits::vlanes(), j -= 2 * VTraits::vlanes() ) { #if CV_SIMD256 v_uint8x32 t0, t1; @@ -466,25 +466,25 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, v_uint8x16 t0, t1, t2, t3; t0 = v_load((uchar*)src + i); - t1 = v_load((uchar*)src + i + v_uint8x16::nlanes); + t1 = v_load((uchar*)src + i + VTraits::vlanes()); t2 = v_load((uchar*)src + j); - t3 = v_load((uchar*)src + j + v_uint8x16::nlanes); + t3 = v_load((uchar*)src + j + VTraits::vlanes()); v_store(dst + j, 
t0); - v_store(dst + j + v_uint8x16::nlanes, t1); + v_store(dst + j + VTraits::vlanes(), t1); v_store(dst + i, t2); - v_store(dst + i + v_uint8x16::nlanes, t3); + v_store(dst + i + VTraits::vlanes(), t3); #endif } } } - else if (esz == v_uint8x16::nlanes) + else if (esz == (size_t)VTraits::vlanes()) { int end = (int)(size.width*esz); int width = end/2; for( ; size.height--; src += sstep, dst += dstep ) { - for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes ) + for( int i = 0, j = end - VTraits::vlanes(); i < width; i += VTraits::vlanes(), j -= VTraits::vlanes() ) { v_uint8x16 t0, t1; @@ -534,19 +534,19 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, for( ; size.height--; src += sstep, dst += dstep ) { - for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) ) + for ( int i = 0, j = end; i < width; i += VTraits::vlanes() + sizeof(uint64_t), j -= VTraits::vlanes() + sizeof(uint64_t) ) { v_uint8x16 t0, t1; uint64_t t2, t3; t0 = v_load((uchar*)src + i); - t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes)); - t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t)); + t2 = *((uint64_t*)((uchar*)src + i + VTraits::vlanes())); + t1 = v_load((uchar*)src + j - VTraits::vlanes() - sizeof(uint64_t)); t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t))); - v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0); + v_store(dst + j - VTraits::vlanes() - sizeof(uint64_t), t0); *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2; v_store(dst + i, t1); - *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3; + *((uint64_t*)(dst + i + VTraits::vlanes())) = t3; } } } diff --git a/modules/core/src/minmax.cpp b/modules/core/src/minmax.cpp index 3a5be11a37..ff3786886e 100644 --- a/modules/core/src/minmax.cpp +++ b/modules/core/src/minmax.cpp @@ -141,7 +141,7 @@ CV_ALWAYS_INLINE uint64_t v_reduce_min(const v_uint64x2& a) CV_ALWAYS_INLINE v_uint64x2 v_select(const v_uint64x2& mask, const v_uint64x2& a, const v_uint64x2& b) { - return b ^ ((a ^ b) & mask); + return v_xor(b, v_and(v_xor(a, b), mask)); } #endif @@ -151,16 +151,16 @@ minMaxIdx_reduce_##suffix( VT &valMin, VT &valMax, IT &idxMin, IT &idxMax, IT &n T &minVal, T &maxVal, size_t &minIdx, size_t &maxIdx, \ size_t delta ) \ { \ - if ( v_check_any(idxMin != none) ) \ + if ( v_check_any(v_ne(idxMin, none)) ) \ { \ minVal = v_reduce_min(valMin); \ - minIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)minVal) == valMin), \ + minIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_eq(v_setall_##suffix((IR)minVal), valMin)), \ idxMin, v_setall_##suffix2(maxLimit))) + delta; \ } \ - if ( v_check_any(idxMax != none) ) \ + if ( v_check_any(v_ne(idxMax, none)) ) \ { \ maxVal = v_reduce_max(valMax); \ - maxIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_setall_##suffix((IR)maxVal) == valMax), \ + maxIdx = (size_t)v_reduce_min(v_select(v_reinterpret_as_##suffix2(v_eq(v_setall_##suffix((IR)maxVal), valMax)), \ idxMax, v_setall_##suffix2(maxLimit))) + delta; \ } \ } @@ -210,18 +210,18 @@ static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= v_uint8x16::nlanes ) + if ( len >= VTraits::vlanes() ) { int j, len0; int minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, 
maxVal, minIdx, maxIdx, - (int)0, (int)UCHAR_MAX, v_uint8x16::nlanes, len, startidx, j, len0 ); + (int)0, (int)UCHAR_MAX, VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - v_uint8x16::nlanes ) + if ( j <= len0 - VTraits::vlanes() ) { - v_uint8x16 inc = v_setall_u8(v_uint8x16::nlanes); + v_uint8x16 inc = v_setall_u8((uchar)VTraits::vlanes()); v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1)); v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -235,31 +235,31 @@ static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* if ( !mask ) { - for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes ) + for( ; k < std::min(len0, j + 15 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_uint8x16 data = v_load(src + k); - v_uint8x16 cmpMin = (data < valMin); - v_uint8x16 cmpMax = (data > valMax); + v_uint8x16 cmpMin = (v_lt(data, valMin)); + v_uint8x16 cmpMax = (v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 15 * v_uint8x16::nlanes); k += v_uint8x16::nlanes ) + for( ; k < std::min(len0, j + 15 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_uint8x16 data = v_load(src + k); - v_uint8x16 maskVal = v_load(mask + k) != v_setzero_u8(); - v_uint8x16 cmpMin = (data < valMin) & maskVal; - v_uint8x16 cmpMax = (data > valMax) & maskVal; + v_uint8x16 maskVal = v_ne(v_load(mask + k), v_setzero_u8()); + v_uint8x16 cmpMin = v_and(v_lt(data, valMin), maskVal); + v_uint8x16 cmpMax = v_and(v_gt(data, valMax), maskVal); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(cmpMin, data, valMin); valMax = v_select(cmpMax, data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -287,18 +287,18 @@ static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= v_int8x16::nlanes ) + if ( len >= VTraits::vlanes() ) { int j, len0; int minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, - (int)SCHAR_MIN, (int)SCHAR_MAX, v_int8x16::nlanes, len, startidx, j, len0 ); + (int)SCHAR_MIN, (int)SCHAR_MAX, VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - v_int8x16::nlanes ) + if ( j <= len0 - VTraits::vlanes() ) { - v_uint8x16 inc = v_setall_u8(v_int8x16::nlanes); + v_uint8x16 inc = v_setall_u8((uchar)VTraits::vlanes()); v_uint8x16 none = v_reinterpret_as_u8(v_setall_s8(-1)); v_uint8x16 idxStart(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -312,31 +312,31 @@ static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* if ( !mask ) { - for( ; k < std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes ) + for( ; k < std::min(len0, j + 15 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_int8x16 data = v_load(src + k); - v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin); - v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax); + v_uint8x16 cmpMin = v_reinterpret_as_u8(v_lt(data, valMin)); + v_uint8x16 cmpMax = v_reinterpret_as_u8(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < 
std::min(len0, j + 15 * v_int8x16::nlanes); k += v_int8x16::nlanes ) + for( ; k < std::min(len0, j + 15 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_int8x16 data = v_load(src + k); - v_uint8x16 maskVal = v_load(mask + k) != v_setzero_u8(); - v_uint8x16 cmpMin = v_reinterpret_as_u8(data < valMin) & maskVal; - v_uint8x16 cmpMax = v_reinterpret_as_u8(data > valMax) & maskVal; + v_uint8x16 maskVal = v_ne(v_load(mask + k), v_setzero_u8()); + v_uint8x16 cmpMin = v_and(v_reinterpret_as_u8(v_lt(data, valMin)), maskVal); + v_uint8x16 cmpMax = v_and(v_reinterpret_as_u8(v_gt(data, valMax)), maskVal); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_s8(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_s8(cmpMax), data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -364,18 +364,18 @@ static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= v_uint16x8::nlanes ) + if ( len >= VTraits::vlanes() ) { int j, len0; int minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, - (int)0, (int)USHRT_MAX, v_uint16x8::nlanes, len, startidx, j, len0 ); + (int)0, (int)USHRT_MAX, VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - v_uint16x8::nlanes ) + if ( j <= len0 - VTraits::vlanes() ) { - v_uint16x8 inc = v_setall_u16(v_uint16x8::nlanes); + v_uint16x8 inc = v_setall_u16((uchar)VTraits::vlanes()); v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1)); v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7); @@ -389,31 +389,31 @@ static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int if ( !mask ) { - for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes ) + for( ; k < std::min(len0, j + 8191 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_uint16x8 data = v_load(src + k); - v_uint16x8 cmpMin = (data < valMin); - v_uint16x8 cmpMax = (data > valMax); + v_uint16x8 cmpMin = (v_lt(data, valMin)); + v_uint16x8 cmpMax = (v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 8191 * v_uint16x8::nlanes); k += v_uint16x8::nlanes ) + for( ; k < std::min(len0, j + 8191 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_uint16x8 data = v_load(src + k); - v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); - v_uint16x8 cmpMin = (data < valMin) & maskVal; - v_uint16x8 cmpMax = (data > valMax) & maskVal; + v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16()); + v_uint16x8 cmpMin = v_and(v_lt(data, valMin), maskVal); + v_uint16x8 cmpMax = v_and(v_gt(data, valMax), maskVal); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(cmpMin, data, valMin); valMax = v_select(cmpMax, data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -441,18 +441,18 @@ static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= v_int16x8::nlanes ) + if ( len >= VTraits::vlanes() ) { int j, len0; int minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, - (int)SHRT_MIN, 
(int)SHRT_MAX, v_int16x8::nlanes, len, startidx, j, len0 ); + (int)SHRT_MIN, (int)SHRT_MAX, VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - v_int16x8::nlanes ) + if ( j <= len0 - VTraits::vlanes() ) { - v_uint16x8 inc = v_setall_u16(v_int16x8::nlanes); + v_uint16x8 inc = v_setall_u16((uchar)VTraits::vlanes()); v_uint16x8 none = v_reinterpret_as_u16(v_setall_s16(-1)); v_uint16x8 idxStart(0, 1, 2, 3, 4, 5, 6, 7); @@ -466,31 +466,31 @@ static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* if ( !mask ) { - for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes ) + for( ; k < std::min(len0, j + 8191 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_int16x8 data = v_load(src + k); - v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin); - v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax); + v_uint16x8 cmpMin = v_reinterpret_as_u16(v_lt(data, valMin)); + v_uint16x8 cmpMax = v_reinterpret_as_u16(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 8191 * v_int16x8::nlanes); k += v_int16x8::nlanes ) + for( ; k < std::min(len0, j + 8191 * VTraits::vlanes()); k += VTraits::vlanes() ) { v_int16x8 data = v_load(src + k); - v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); - v_uint16x8 cmpMin = v_reinterpret_as_u16(data < valMin) & maskVal; - v_uint16x8 cmpMax = v_reinterpret_as_u16(data > valMax) & maskVal; + v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16()); + v_uint16x8 cmpMin = v_and(v_reinterpret_as_u16(v_lt(data, valMin)), maskVal); + v_uint16x8 cmpMax = v_and(v_reinterpret_as_u16(v_gt(data, valMax)), maskVal); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_s16(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_s16(cmpMax), data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -518,14 +518,14 @@ static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* m size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= 2 * v_int32x4::nlanes ) + if ( len >= 2 * VTraits::vlanes() ) { - int j = 0, len0 = len & -(2 * v_int32x4::nlanes); + int j = 0, len0 = len & -(2 * VTraits::vlanes()); int minVal = *minval, maxVal = *maxval; size_t minIdx = *minidx, maxIdx = *maxidx; { - v_uint32x4 inc = v_setall_u32(v_int32x4::nlanes); + v_uint32x4 inc = v_setall_u32(VTraits::vlanes()); v_uint32x4 none = v_reinterpret_as_u32(v_setall_s32(-1)); v_uint32x4 idxStart(0, 1, 2, 3); @@ -539,49 +539,49 @@ static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* m if ( !mask ) { - for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes ) + for( ; k < std::min(len0, j + 32766 * 2 * VTraits::vlanes()); k += 2 * VTraits::vlanes() ) { v_int32x4 data = v_load(src + k); - v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin); - v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax); + v_uint32x4 cmpMin = v_reinterpret_as_u32(v_lt(data, valMin)); + v_uint32x4 cmpMax = v_reinterpret_as_u32(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; - data = v_load(src + k + v_int32x4::nlanes); - cmpMin = v_reinterpret_as_u32(data < valMin); - 
cmpMax = v_reinterpret_as_u32(data > valMax); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u32(v_lt(data, valMin)); + cmpMax = v_reinterpret_as_u32(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 32766 * 2 * v_int32x4::nlanes); k += 2 * v_int32x4::nlanes ) + for( ; k < std::min(len0, j + 32766 * 2 * VTraits::vlanes()); k += 2 * VTraits::vlanes() ) { v_int32x4 data = v_load(src + k); - v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); + v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16()); v_int32x4 maskVal1, maskVal2; v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2); - v_uint32x4 cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal1); - v_uint32x4 cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal1); + v_uint32x4 cmpMin = v_reinterpret_as_u32(v_and(v_lt(data, valMin), maskVal1)); + v_uint32x4 cmpMax = v_reinterpret_as_u32(v_and(v_gt(data, valMax), maskVal1)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_s32(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax); - idx += inc; - data = v_load(src + k + v_int32x4::nlanes); - cmpMin = v_reinterpret_as_u32((data < valMin) & maskVal2); - cmpMax = v_reinterpret_as_u32((data > valMax) & maskVal2); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u32(v_and(v_lt(data, valMin), maskVal2)); + cmpMax = v_reinterpret_as_u32(v_and(v_gt(data, valMax), maskVal2)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_s32(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_s32(cmpMax), data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -609,18 +609,18 @@ static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, fl size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128 - if ( len >= 2 * v_float32x4::nlanes ) + if ( len >= 2 * VTraits::vlanes() ) { int j, len0; float minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, - FLT_MIN, FLT_MAX, 2 * v_float32x4::nlanes, len, startidx, j, len0 ); + FLT_MIN, FLT_MAX, 2 * VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - 2 * v_float32x4::nlanes ) + if ( j <= len0 - 2 * VTraits::vlanes() ) { - v_uint32x4 inc = v_setall_u32(v_float32x4::nlanes); + v_uint32x4 inc = v_setall_u32(VTraits::vlanes()); v_uint32x4 none = v_reinterpret_as_u32(v_setall_s32(-1)); v_uint32x4 idxStart(0, 1, 2, 3); @@ -634,49 +634,49 @@ static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, fl if ( !mask ) { - for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes ) + for( ; k < std::min(len0, j + 32766 * 2 * VTraits::vlanes()); k += 2 * VTraits::vlanes() ) { v_float32x4 data = v_load(src + k); - v_uint32x4 cmpMin = v_reinterpret_as_u32(data < valMin); - v_uint32x4 cmpMax = v_reinterpret_as_u32(data > valMax); + v_uint32x4 cmpMin = v_reinterpret_as_u32(v_lt(data, valMin)); + v_uint32x4 cmpMax = v_reinterpret_as_u32(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); 
valMax = v_max(data, valMax); - idx += inc; - data = v_load(src + k + v_float32x4::nlanes); - cmpMin = v_reinterpret_as_u32(data < valMin); - cmpMax = v_reinterpret_as_u32(data > valMax); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u32(v_lt(data, valMin)); + cmpMax = v_reinterpret_as_u32(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 32766 * 2 * v_float32x4::nlanes); k += 2 * v_float32x4::nlanes ) + for( ; k < std::min(len0, j + 32766 * 2 * VTraits::vlanes()); k += 2 * VTraits::vlanes() ) { v_float32x4 data = v_load(src + k); - v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); + v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16()); v_int32x4 maskVal1, maskVal2; v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2); - v_uint32x4 cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal1); - v_uint32x4 cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal1); + v_uint32x4 cmpMin = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_lt(data, valMin)), maskVal1)); + v_uint32x4 cmpMax = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_gt(data, valMax)), maskVal1)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax); - idx += inc; - data = v_load(src + k + v_float32x4::nlanes); - cmpMin = v_reinterpret_as_u32(v_reinterpret_as_s32(data < valMin) & maskVal2); - cmpMax = v_reinterpret_as_u32(v_reinterpret_as_s32(data > valMax) & maskVal2); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_lt(data, valMin)), maskVal2)); + cmpMax = v_reinterpret_as_u32(v_and(v_reinterpret_as_s32(v_gt(data, valMax)), maskVal2)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_f32(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f32(cmpMax), data, valMax); - idx += inc; + idx = v_add(idx, inc); } } @@ -704,18 +704,18 @@ static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, size_t* minidx, size_t* maxidx, int len, size_t startidx ) { #if CV_SIMD128_64F - if ( len >= 4 * v_float64x2::nlanes ) + if ( len >= 4 * VTraits::vlanes() ) { int j, len0; double minVal, maxVal; size_t minIdx, maxIdx; minMaxIdx_init( src, mask, minval, maxval, minidx, maxidx, minVal, maxVal, minIdx, maxIdx, - DBL_MIN, DBL_MAX, 4 * v_float64x2::nlanes, len, startidx, j, len0 ); + DBL_MIN, DBL_MAX, 4 * VTraits::vlanes(), len, startidx, j, len0 ); - if ( j <= len0 - 4 * v_float64x2::nlanes ) + if ( j <= len0 - 4 * VTraits::vlanes() ) { - v_uint64x2 inc = v_setall_u64(v_float64x2::nlanes); + v_uint64x2 inc = v_setall_u64(VTraits::vlanes()); v_uint64x2 none = v_reinterpret_as_u64(v_setall_s64(-1)); v_uint64x2 idxStart(0, 1); @@ -729,84 +729,84 @@ static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, if ( !mask ) { - for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes ) + for( ; k < std::min(len0, j + 32764 * 4 * VTraits::vlanes()); k += 4 * VTraits::vlanes() ) { v_float64x2 data = v_load(src + k); - v_uint64x2 cmpMin = 
v_reinterpret_as_u64(data < valMin); - v_uint64x2 cmpMax = v_reinterpret_as_u64(data > valMax); + v_uint64x2 cmpMin = v_reinterpret_as_u64(v_lt(data, valMin)); + v_uint64x2 cmpMax = v_reinterpret_as_u64(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; - data = v_load(src + k + v_float64x2::nlanes); - cmpMin = v_reinterpret_as_u64(data < valMin); - cmpMax = v_reinterpret_as_u64(data > valMax); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u64(v_lt(data, valMin)); + cmpMax = v_reinterpret_as_u64(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; - data = v_load(src + k + 2 * v_float64x2::nlanes); - cmpMin = v_reinterpret_as_u64(data < valMin); - cmpMax = v_reinterpret_as_u64(data > valMax); + idx = v_add(idx, inc); + data = v_load(src + k + 2 * VTraits::vlanes()); + cmpMin = v_reinterpret_as_u64(v_lt(data, valMin)); + cmpMax = v_reinterpret_as_u64(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; - data = v_load(src + k + 3 * v_float64x2::nlanes); - cmpMin = v_reinterpret_as_u64(data < valMin); - cmpMax = v_reinterpret_as_u64(data > valMax); + idx = v_add(idx, inc); + data = v_load(src + k + 3 * VTraits::vlanes()); + cmpMin = v_reinterpret_as_u64(v_lt(data, valMin)); + cmpMax = v_reinterpret_as_u64(v_gt(data, valMax)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_min(data, valMin); valMax = v_max(data, valMax); - idx += inc; + idx = v_add(idx, inc); } } else { - for( ; k < std::min(len0, j + 32764 * 4 * v_float64x2::nlanes); k += 4 * v_float64x2::nlanes ) + for( ; k < std::min(len0, j + 32764 * 4 * VTraits::vlanes()); k += 4 * VTraits::vlanes() ) { v_float64x2 data = v_load(src + k); - v_uint16x8 maskVal = v_load_expand(mask + k) != v_setzero_u16(); + v_uint16x8 maskVal = v_ne(v_load_expand(mask + k), v_setzero_u16()); v_int32x4 maskVal1, maskVal2; v_expand(v_reinterpret_as_s16(maskVal), maskVal1, maskVal2); v_int64x2 maskVal3, maskVal4; v_expand(maskVal1, maskVal3, maskVal4); - v_uint64x2 cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3); - v_uint64x2 cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3); + v_uint64x2 cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal3)); + v_uint64x2 cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal3)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); - idx += inc; - data = v_load(src + k + v_float64x2::nlanes); - cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4); - cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4); + idx = v_add(idx, inc); + data = v_load(src + k + VTraits::vlanes()); + cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal4)); + cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal4)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = 
v_select(v_reinterpret_as_f64(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); - idx += inc; - data = v_load(src + k + 2 * v_float64x2::nlanes); + idx = v_add(idx, inc); + data = v_load(src + k + 2 * VTraits::vlanes()); v_expand(maskVal2, maskVal3, maskVal4); - cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal3); - cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal3); + cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal3)); + cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal3)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); - idx += inc; - data = v_load(src + k + 3 * v_float64x2::nlanes); - cmpMin = v_reinterpret_as_u64(v_reinterpret_as_s64(data < valMin) & maskVal4); - cmpMax = v_reinterpret_as_u64(v_reinterpret_as_s64(data > valMax) & maskVal4); + idx = v_add(idx, inc); + data = v_load(src + k + 3 * VTraits::vlanes()); + cmpMin = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_lt(data, valMin)), maskVal4)); + cmpMax = v_reinterpret_as_u64(v_and(v_reinterpret_as_s64(v_gt(data, valMax)), maskVal4)); idxMin = v_select(cmpMin, idx, idxMin); idxMax = v_select(cmpMax, idx, idxMax); valMin = v_select(v_reinterpret_as_f64(cmpMin), data, valMin); valMax = v_select(v_reinterpret_as_f64(cmpMax), data, valMax); - idx += inc; + idx = v_add(idx, inc); } } diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 1ece6de82f..38b8d10f7b 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -1745,13 +1745,8 @@ template struct TheTest R a = dataA; R b = dataB; -#if CV_SIMD_SCALABLE Data dataEQ = v_eq(a, b); Data dataNE = v_ne(a, b); -#else - Data dataEQ = (a == b); - Data dataNE = (a != b); -#endif for (int i = 0; i < VTraits::vlanes(); ++i) { diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp index 60301a406c..ba9b31fe35 100644 --- a/modules/dnn/src/int8layers/convolution_layer.cpp +++ b/modules/dnn/src/int8layers/convolution_layer.cpp @@ -29,10 +29,10 @@ static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b, v_int32x4 t0, t1; v_mul_expand(a0, b0, t0, t1); - out0 += t0; out1 += t1; + out0 = v_add(out0, t0); out1 = v_add(out1, t1); v_mul_expand(a1, b1, t0, t1); - out2 += t0; out3 += t1; + out2 = v_add(out2, t0); out3 = v_add(out3, t1); } #endif @@ -1055,10 +1055,10 @@ public: v_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); v_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); - vout0 = voutzp + v_round(v_cvt_f32(vout0)*vmult); - vout1 = voutzp + v_round(v_cvt_f32(vout1)*vmult); - vout2 = voutzp + v_round(v_cvt_f32(vout2)*vmult); - vout3 = voutzp + v_round(v_cvt_f32(vout3)*vmult); + vout0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout0), vmult))); + vout1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout1), vmult))); + vout2 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout2), vmult))); + vout3 = v_add(voutzp, v_round(v_mul(v_cvt_f32(vout3), vmult))); vout0 = v_min(v_max(vout0, outmin), outmax); vout1 = v_min(v_max(vout1, outmin), outmax); @@ -1408,12 +1408,12 @@ public: vs12 = v_dotprod_expand_fast(w1, r2, vs12); vs13 = v_dotprod_expand_fast(w1, r3, vs13); } - s0 += v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), 
v_reduce_sum(vs02), v_reduce_sum(vs03)); - s1 += v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13)); + s0 = v_add(s0, v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03))); + s1 = v_add(s1, v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13))); if( cn1 == inpCn ) { - s0 = voutzp + v_round(v_cvt_f32(s0)*vmult0); - s1 = voutzp + v_round(v_cvt_f32(s1)*vmult1); + s0 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s0), vmult0))); + s1 = v_add(voutzp, v_round(v_mul(v_cvt_f32(s1), vmult1))); s0 = v_min(v_max(s0, outmin), outmax); s1 = v_min(v_max(s1, outmin), outmax); diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp index b8e3bd6ee5..ba5b0d79c1 100644 --- a/modules/dnn/src/int8layers/fully_connected_layer.cpp +++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp @@ -323,8 +323,8 @@ public: vs3 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*3 + k), vs3); } - s += v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3)); - v_int32x4 out = outzp + v_round(v_cvt_f32(s)*mult); + s = v_add(s, v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3))); + v_int32x4 out = v_add(outzp, v_round(v_mul(v_cvt_f32(s), mult))); v_store(dptr + i, v_min(v_max(out, outmin), outmax)); } #endif diff --git a/modules/dnn/src/int8layers/pooling_layer.cpp b/modules/dnn/src/int8layers/pooling_layer.cpp index bfff3d34c5..b321d730f7 100644 --- a/modules/dnn/src/int8layers/pooling_layer.cpp +++ b/modules/dnn/src/int8layers/pooling_layer.cpp @@ -631,17 +631,17 @@ public: (int)srcData[index + stride_w*10], (int)srcData[index + stride_w*11]); v_int32x4 v3((int)srcData[index + stride_w*12], (int)srcData[index + stride_w*13], (int)srcData[index + stride_w*14], (int)srcData[index + stride_w*15]); - sum_val0 += v0; - sum_val1 += v1; - sum_val2 += v2; - sum_val3 += v3; + sum_val0 = v_add(sum_val0, v0); + sum_val1 = v_add(sum_val1, v1); + sum_val2 = v_add(sum_val2, v2); + sum_val3 = v_add(sum_val3, v3); } } - sum_val0 = v_round(v_cvt_f32(sum_val0)*ikarea) + voutzp; - sum_val1 = v_round(v_cvt_f32(sum_val1)*ikarea) + voutzp; - sum_val2 = v_round(v_cvt_f32(sum_val2)*ikarea) + voutzp; - sum_val3 = v_round(v_cvt_f32(sum_val3)*ikarea) + voutzp; + sum_val0 = v_add(v_round(v_mul(v_cvt_f32(sum_val0), ikarea)), voutzp); + sum_val1 = v_add(v_round(v_mul(v_cvt_f32(sum_val1), ikarea)), voutzp); + sum_val2 = v_add(v_round(v_mul(v_cvt_f32(sum_val2), ikarea)), voutzp); + sum_val3 = v_add(v_round(v_mul(v_cvt_f32(sum_val3), ikarea)), voutzp); v_store(dstData + x0, v_pack(v_pack(sum_val0, sum_val1), v_pack(sum_val2, sum_val3))); x0 += 15; diff --git a/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp index 3e969336ad..59f069eeaa 100644 --- a/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp @@ -236,13 +236,11 @@ void depthWiseBlockConv2D(const float* wptr, v21 = v_load(imgptr2 + in_j + dilation_w), v22 = v_load(imgptr2 + in_j + dilation_w*2); - v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + - v10*vw10 + v11*vw11 + v12*vw12 + - v20*vw20 + v21*vw21 + v22*vw22 + vbias; + v_float32x4 vout = v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), v_mul(v10, vw10)), v_mul(v11, vw11)), v_mul(v12, vw12)), v_mul(v20, vw20)), v_mul(v21, 
vw21)), v_mul(v22, vw22)), vbias); if (fusedAdd) - vout = v_load(outptr + out_j) + vout; + vout = v_add(v_load(outptr + out_j), vout); if (relu) - vout = v_select(vout > z, vout, vout*vrc); + vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc)); v_store(outptr + out_j, vout); } } @@ -268,14 +266,12 @@ void depthWiseBlockConv2D(const float* wptr, v_load_deinterleave(imgptr2 + in_j, v20, v21); v_load_deinterleave(imgptr2 + in_j + 2, v22, unused); - v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + - v10 * vw10 + v11 * vw11 + v12 * vw12 + - v20 * vw20 + v21 * vw21 + v22 * vw22 + vbias; + v_float32x4 vout = v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), v_mul(v10, vw10)), v_mul(v11, vw11)), v_mul(v12, vw12)), v_mul(v20, vw20)), v_mul(v21, vw21)), v_mul(v22, vw22)), vbias); if (fusedAdd) - vout = v_load(outptr + out_j) + vout; + vout = v_add(v_load(outptr + out_j), vout); if (relu) - vout = v_select(vout > z, vout, vout*vrc); + vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc)); v_store(outptr + out_j, vout); } } @@ -381,11 +377,11 @@ void depthWiseBlockConv1D(const float* wptr, v01 = v_load(imgptr0 + in_j + dilation_w), v02 = v_load(imgptr0 + in_j + dilation_w*2); - v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + vbias; + v_float32x4 vout = v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), vbias); if (fusedAdd) - vout = v_load(outptr + out_j) + vout; + vout = v_add(v_load(outptr + out_j), vout); if (relu) - vout = v_select(vout > z, vout, vout*vrc); + vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc)); v_store(outptr + out_j, vout); } } @@ -407,13 +403,13 @@ void depthWiseBlockConv1D(const float* wptr, v_load_deinterleave(imgptr0 + in_j, v00, v01); v_load_deinterleave(imgptr0 + in_j + 2, v02, unused); - v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + vbias; + v_float32x4 vout = v_add(v_add(v_add(v_mul(v00, vw00), v_mul(v01, vw01)), v_mul(v02, vw02)), vbias); if (fusedAdd) - vout = v_load(outptr + out_j) + vout; + vout = v_add(v_load(outptr + out_j), vout); if (relu) - vout = v_select(vout > z, vout, vout*vrc); + vout = v_select(v_gt(vout, z), vout, v_mul(vout, vrc)); v_store(outptr + out_j, vout); } } diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp index a18943994c..605cf37949 100644 --- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp @@ -430,32 +430,32 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11; - t00 = x40 - x20; - t01 = x41 - x21; - t10 = x30 - x50; - t11 = x31 - x51; - v_float32x4 y00 = v_fma(t00, q5_25, x00 - x60); - v_float32x4 y01 = v_fma(t01, q5_25, x01 - x61); - v_float32x4 y70 = v_fma(t10, q5_25, x70 - x10); - v_float32x4 y71 = v_fma(t11, q5_25, x71 - x11); + t00 = v_sub(x40, x20); + t01 = v_sub(x41, x21); + t10 = v_sub(x30, x50); + t11 = v_sub(x31, x51); + v_float32x4 y00 = v_fma(t00, q5_25, v_sub(x00, x60)); + v_float32x4 y01 = v_fma(t01, q5_25, v_sub(x01, x61)); + v_float32x4 y70 = v_fma(t10, q5_25, v_sub(x70, x10)); + v_float32x4 y71 = v_fma(t11, q5_25, v_sub(x71, x11)); /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ v_float32x4 
qm4_25 = v_setall_f32(-4.25f); - t00 = v_fma(x30, qm4_25, x10 + x50); - t01 = v_fma(x31, qm4_25, x11 + x51); - t10 = v_fma(x40, qm4_25, x20 + x60); - t11 = v_fma(x41, qm4_25, x21 + x61); + t00 = v_fma(x30, qm4_25, v_add(x10, x50)); + t01 = v_fma(x31, qm4_25, v_add(x11, x51)); + t10 = v_fma(x40, qm4_25, v_add(x20, x60)); + t11 = v_fma(x41, qm4_25, v_add(x21, x61)); - v_float32x4 y10 = t00 + t10, y11 = t01 + t11; - v_float32x4 y20 = t10 - t00, y21 = t11 - t01; + v_float32x4 y10 = v_add(t00, t10), y11 = v_add(t01, t11); + v_float32x4 y20 = v_sub(t10, t00), y21 = v_sub(t11, t01); /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f); v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f); - t00 = v_fma(x10, q0_5, x50 + x50); - t01 = v_fma(x11, q0_5, x51 + x51); + t00 = v_fma(x10, q0_5, v_add(x50, x50)); + t01 = v_fma(x11, q0_5, v_add(x51, x51)); t10 = v_fma(x20, q0_25, x60); t11 = v_fma(x21, q0_25, x61); t00 = v_fma(x30, qm2_5, t00); @@ -463,14 +463,14 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, t10 = v_fma(x40, qm1_25, t10); t11 = v_fma(x41, qm1_25, t11); - v_float32x4 y30 = t00 + t10, y31 = t01 + t11; - v_float32x4 y40 = t10 - t00, y41 = t11 - t01; + v_float32x4 y30 = v_add(t00, t10), y31 = v_add(t01, t11); + v_float32x4 y40 = v_sub(t10, t00), y41 = v_sub(t11, t01); /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f); - t00 = v_fma(x50, q0_5, x10 + x10); - t01 = v_fma(x51, q0_5, x11 + x11); + t00 = v_fma(x50, q0_5, v_add(x10, x10)); + t01 = v_fma(x51, q0_5, v_add(x11, x11)); t10 = v_fma(x20, q4 , x60); t11 = v_fma(x21, q4 , x61); t00 = v_fma(x30, qm2_5, t00); @@ -478,8 +478,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, t10 = v_fma(x40, qm5 , t10); t11 = v_fma(x41, qm5 , t11); - v_float32x4 y50 = t00 + t10, y51 = t01 + t11; - v_float32x4 y60 = t10 - t00, y61 = t11 - t01; + v_float32x4 y50 = v_add(t00, t10), y51 = v_add(t01, t11); + v_float32x4 y60 = v_sub(t10, t00), y61 = v_sub(t11, t01); /* transpose 8x8 matrix with v_transpose4x4 */ @@ -491,29 +491,29 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ - t00 = y010 - y200; - t01 = y410 - y600; - t10 = y300 - y110; - t11 = y700 - y510; - z00 = v_fma(t00, q5_25, y000 - y210); - z01 = v_fma(t01, q5_25, y400 - y610); - z70 = v_fma(t10, q5_25, y310 - y100); - z71 = v_fma(t11, q5_25, y710 - y500); + t00 = v_sub(y010, y200); + t01 = v_sub(y410, y600); + t10 = v_sub(y300, y110); + t11 = v_sub(y700, y510); + z00 = v_fma(t00, q5_25, v_sub(y000, y210)); + z01 = v_fma(t01, q5_25, v_sub(y400, y610)); + z70 = v_fma(t10, q5_25, v_sub(y310, y100)); + z71 = v_fma(t11, q5_25, v_sub(y710, y500)); /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ - t00 = v_fma(y300, qm4_25, y100 + y110); - t01 = v_fma(y700, qm4_25, y500 + y510); - t10 = v_fma(y010, qm4_25, y200 + y210); - t11 = v_fma(y410, qm4_25, y600 + y610); + t00 = v_fma(y300, qm4_25, v_add(y100, y110)); + t01 = v_fma(y700, qm4_25, v_add(y500, y510)); + t10 = v_fma(y010, qm4_25, v_add(y200, y210)); + t11 = v_fma(y410, qm4_25, v_add(y600, y610)); - z10 = t00 + t10; z11 = t01 + t11; 
- z20 = t10 - t00; z21 = t11 - t01; + z10 = v_add(t00, t10); z11 = v_add(t01, t11); + z20 = v_sub(t10, t00); z21 = v_sub(t11, t01); /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ - t00 = v_fma(y100, q0_5, y110 + y110); - t01 = v_fma(y500, q0_5, y510 + y510); + t00 = v_fma(y100, q0_5, v_add(y110, y110)); + t01 = v_fma(y500, q0_5, v_add(y510, y510)); t10 = v_fma(y200, q0_25, y210); t11 = v_fma(y600, q0_25, y610); t00 = v_fma(y300, qm2_5, t00); @@ -521,13 +521,13 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, t10 = v_fma(y010, qm1_25, t10); t11 = v_fma(y410, qm1_25, t11); - z30 = t00 + t10; z31 = t01 + t11; - z40 = t10 - t00; z41 = t11 - t01; + z30 = v_add(t00, t10); z31 = v_add(t01, t11); + z40 = v_sub(t10, t00); z41 = v_sub(t11, t01); /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ - t00 = v_fma(y110, q0_5, y100 + y100); - t01 = v_fma(y510, q0_5, y500 + y500); + t00 = v_fma(y110, q0_5, v_add(y100, y100)); + t01 = v_fma(y510, q0_5, v_add(y500, y500)); t10 = v_fma(y200, q4, y210); t11 = v_fma(y600, q4, y610); t00 = v_fma(y300, qm2_5, t00); @@ -535,8 +535,8 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, t10 = v_fma(y010, qm5, t10); t11 = v_fma(y410, qm5, t11); - z50 = t00 + t10; z51 = t01 + t11; - z60 = t10 - t00; z61 = t11 - t01; + z50 = v_add(t00, t10); z51 = v_add(t01, t11); + z60 = v_sub(t10, t00); z61 = v_sub(t11, t01); } const int outstep = winoIblock*winoAtomF32*Cg; @@ -601,12 +601,12 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, { v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; - s12_0 = x10 + x20; s12_1 = x11 + x21; - s34_0 = x30 + x40; s34_1 = x31 + x41; - s56_0 = x50 + x60; s56_1 = x51 + x61; + s12_0 = v_add(x10, x20); s12_1 = v_add(x11, x21); + s34_0 = v_add(x30, x40); s34_1 = v_add(x31, x41); + s56_0 = v_add(x50, x60); s56_1 = v_add(x51, x61); - v_float32x4 y00 = x00 + s12_0 + s34_0 + s56_0; - v_float32x4 y01 = x01 + s12_1 + s34_1 + s56_1; + v_float32x4 y00 = v_add(v_add(v_add(x00, s12_0), s34_0), s56_0); + v_float32x4 y01 = v_add(v_add(v_add(x01, s12_1), s34_1), s56_1); v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); @@ -616,13 +616,13 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - s12_0 = x10 - x20; s12_1 = x11 - x21; - s34_0 = x30 - x40; s34_1 = x31 - x41; - s56_0 = x50 - x60; s56_1 = x51 - x61; + s12_0 = v_sub(x10, x20); s12_1 = v_sub(x11, x21); + s34_0 = v_sub(x30, x40); s34_1 = v_sub(x31, x41); + s56_0 = v_sub(x50, x60); s56_1 = v_sub(x51, x61); a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f); - v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, x70 + s12_0)); - v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, x71 + s12_1)); + v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(x70, s12_0))); + v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(x71, s12_1))); a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f); v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); @@ -642,12 +642,12 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700); v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710); - s12_0 = y100 + y200; s12_1 = y500 + y600; - s34_0 = y300 + y010; s34_1 
= y700 + y410; - s56_0 = y110 + y210; s56_1 = y510 + y610; + s12_0 = v_add(y100, y200); s12_1 = v_add(y500, y600); + s34_0 = v_add(y300, y010); s34_1 = v_add(y700, y410); + s56_0 = v_add(y110, y210); s56_1 = v_add(y510, y610); - z00 = y000 + s12_0 + s34_0 + s56_0; - z01 = y400 + s12_1 + s34_1 + s56_1; + z00 = v_add(v_add(v_add(y000, s12_0), s34_0), s56_0); + z01 = v_add(v_add(v_add(y400, s12_1), s34_1), s56_1); a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); @@ -657,13 +657,13 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - s12_0 = y100 - y200; s12_1 = y500 - y600; - s34_0 = y300 - y010; s34_1 = y700 - y410; - s56_0 = y110 - y210; s56_1 = y510 - y610; + s12_0 = v_sub(y100, y200); s12_1 = v_sub(y500, y600); + s34_0 = v_sub(y300, y010); s34_1 = v_sub(y700, y410); + s56_0 = v_sub(y110, y210); s56_1 = v_sub(y510, y610); a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f); - z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, y310 + s12_0)); - z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, y710 + s12_1)); + z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(y310, s12_0))); + z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(y710, s12_1))); a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f); z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); @@ -673,34 +673,34 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); v_float32x4 vbias = v_setall_f32(bias); - z00 += vbias; - z01 += vbias; - z10 += vbias; - z11 += vbias; - z20 += vbias; - z21 += vbias; - z30 += vbias; - z31 += vbias; - z40 += vbias; - z41 += vbias; - z50 += vbias; - z51 += vbias; + z00 = v_add(z00, vbias); + z01 = v_add(z01, vbias); + z10 = v_add(z10, vbias); + z11 = v_add(z11, vbias); + z20 = v_add(z20, vbias); + z21 = v_add(z21, vbias); + z30 = v_add(z30, vbias); + z31 = v_add(z31, vbias); + z40 = v_add(z40, vbias); + z41 = v_add(z41, vbias); + z50 = v_add(z50, vbias); + z51 = v_add(z51, vbias); } if (bpptr) { - z00 += v_load(bpptr); - z01 += v_load_low(bpptr + 4); - z10 += v_load(bpptr + bpstep); - z11 += v_load_low(bpptr + bpstep + 4); - z20 += v_load(bpptr + bpstep*2); - z21 += v_load_low(bpptr + bpstep*2 + 4); - z30 += v_load(bpptr + bpstep*3); - z31 += v_load_low(bpptr + bpstep*3 + 4); - z40 += v_load(bpptr + bpstep*4); - z41 += v_load_low(bpptr + bpstep*4 + 4); - z50 += v_load(bpptr + bpstep*5); - z51 += v_load_low(bpptr + bpstep*5 + 4); + z00 = v_add(z00, v_load(bpptr)); + z01 = v_add(z01, v_load_low(bpptr + 4)); + z10 = v_add(z10, v_load(bpptr + bpstep)); + z11 = v_add(z11, v_load_low(bpptr + bpstep + 4)); + z20 = v_add(z20, v_load(bpptr + bpstep * 2)); + z21 = v_add(z21, v_load_low(bpptr + bpstep * 2 + 4)); + z30 = v_add(z30, v_load(bpptr + bpstep * 3)); + z31 = v_add(z31, v_load_low(bpptr + bpstep * 3 + 4)); + z40 = v_add(z40, v_load(bpptr + bpstep * 4)); + z41 = v_add(z41, v_load_low(bpptr + bpstep * 4 + 4)); + z50 = v_add(z50, v_load(bpptr + bpstep * 5)); + z51 = v_add(z51, v_load_low(bpptr + bpstep * 5 + 4)); } if (ifMinMaxAct) diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 815bc2dda4..2a2245b909 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -370,10 +370,10 @@ struct ReLUFunctor : public BaseFunctor v_float32x4 x1 = v_load(srcptr + i + 4); v_float32x4 
x2 = v_load(srcptr + i + 8); v_float32x4 x3 = v_load(srcptr + i + 12); - x0 = v_select(x0 >= z, x0, x0*s4); - x1 = v_select(x1 >= z, x1, x1*s4); - x2 = v_select(x2 >= z, x2, x2*s4); - x3 = v_select(x3 >= z, x3, x3*s4); + x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s4)); + x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s4)); + x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s4)); + x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s4)); v_store(dstptr + i, x0); v_store(dstptr + i + 4, x1); v_store(dstptr + i + 8, x2); @@ -2493,10 +2493,10 @@ struct ChannelsPReLUFunctor : public BaseFunctor v_float32x4 x1 = v_load(srcptr + i + 4); v_float32x4 x2 = v_load(srcptr + i + 8); v_float32x4 x3 = v_load(srcptr + i + 12); - x0 = v_select(x0 >= z, x0, x0*s4); - x1 = v_select(x1 >= z, x1, x1*s4); - x2 = v_select(x2 >= z, x2, x2*s4); - x3 = v_select(x3 >= z, x3, x3*s4); + x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s4)); + x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s4)); + x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s4)); + x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s4)); v_store(dstptr + i, x0); v_store(dstptr + i + 4, x1); v_store(dstptr + i + 8, x2); @@ -2649,10 +2649,10 @@ struct PReLUFunctor : public ChannelsPReLUFunctor v_float32x4 s1 = v_load(scaleptr + i + 4); v_float32x4 s2 = v_load(scaleptr + i + 8); v_float32x4 s3 = v_load(scaleptr + i + 12); - x0 = v_select(x0 >= z, x0, x0*s0); - x1 = v_select(x1 >= z, x1, x1*s1); - x2 = v_select(x2 >= z, x2, x2*s2); - x3 = v_select(x3 >= z, x3, x3*s3); + x0 = v_select(v_ge(x0, z), x0, v_mul(x0, s0)); + x1 = v_select(v_ge(x1, z), x1, v_mul(x1, s1)); + x2 = v_select(v_ge(x2, z), x2, v_mul(x2, s2)); + x3 = v_select(v_ge(x3, z), x3, v_mul(x3, s3)); v_store(dstptr + i, x0); v_store(dstptr + i + 4, x1); v_store(dstptr + i + 8, x2); diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index f03af7c1fb..1c27043f1a 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -308,7 +308,7 @@ public: } v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3); - s += v_load(biasptr + i); + s = v_add(s, v_load(biasptr + i)); v_store(dptr + i, s); } #endif diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index a75382d8a5..fb980c4152 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -898,25 +898,25 @@ public: v_float32x4 max_idx0 = v_setall_f32(-1.f); v_float32x4 max_idx1 = max_idx0; int index0 = ystart * inp_width + xstart; - v_float32x4 idx0 = idx00 + v_setall_f32((float)index0); - v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4)); + v_float32x4 idx0 = v_add(idx00, v_setall_f32((float)index0)); + v_float32x4 idx1 = v_add(idx0, v_setall_f32((float)(stride_w * 4))); for (int y = ystart; y < yend; ++y) { - for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones) + for (int x = xstart; x < xend; ++x, idx0 = v_add(idx0, ones), idx1 = v_add(idx1, ones)) { const int index = y * inp_width + x; v_float32x4 v0(srcData[index], srcData[index + stride_w], srcData[index + stride_w*2], srcData[index + stride_w*3]); v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5], srcData[index + stride_w*6], srcData[index + stride_w*7]); - max_idx0 = v_select(v0 > max_val0, idx0, max_idx0); - max_idx1 = v_select(v1 > max_val1, idx1, max_idx1); + max_idx0 = v_select(v_gt(v0, max_val0), idx0, max_idx0); + max_idx1 = v_select(v_gt(v1, max_val1), idx1, max_idx1); max_val0 = v_max(max_val0, 
v0); max_val1 = v_max(max_val1, v1); } - idx0 += idx_delta; - idx1 += idx_delta; + idx0 = v_add(idx0, idx_delta); + idx1 = v_add(idx1, idx_delta); } v_store(dstData + x0, max_val0); v_store(dstData + x0 + 4, max_val1); @@ -1069,12 +1069,12 @@ public: srcData[index + stride_w*2], srcData[index + stride_w*3]); v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5], srcData[index + stride_w*6], srcData[index + stride_w*7]); - sum_val0 += v0; - sum_val1 += v1; + sum_val0 = v_add(sum_val0, v0); + sum_val1 = v_add(sum_val1, v1); } } - v_store(dstData + x0, sum_val0*ikarea); - v_store(dstData + x0 + 4, sum_val1*ikarea); + v_store(dstData + x0, v_mul(sum_val0, ikarea)); + v_store(dstData + x0 + 4, v_mul(sum_val1, ikarea)); x0 += 7; } else diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp index 163f02717e..cb088eb535 100644 --- a/modules/features2d/src/fast.cpp +++ b/modules/features2d/src/fast.cpp @@ -120,8 +120,8 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo for (; j < img.cols - 16 - 3; j += 16, ptr += 16) { v_uint8x16 v = v_load(ptr); - v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta); - v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta); + v_int8x16 v0 = v_reinterpret_as_s8(v_xor(v_add(v, t), delta)); + v_int8x16 v1 = v_reinterpret_as_s8(v_xor(v_sub(v, t), delta)); v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta)); v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta)); @@ -129,15 +129,15 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta)); v_int8x16 m0, m1; - m0 = (v0 < x0) & (v0 < x1); - m1 = (x0 < v1) & (x1 < v1); - m0 = m0 | ((v0 < x1) & (v0 < x2)); - m1 = m1 | ((x1 < v1) & (x2 < v1)); - m0 = m0 | ((v0 < x2) & (v0 < x3)); - m1 = m1 | ((x2 < v1) & (x3 < v1)); - m0 = m0 | ((v0 < x3) & (v0 < x0)); - m1 = m1 | ((x3 < v1) & (x0 < v1)); - m0 = m0 | m1; + m0 = v_and(v_lt(v0, x0), v_lt(v0, x1)); + m1 = v_and(v_lt(x0, v1), v_lt(x1, v1)); + m0 = v_or(m0, v_and(v_lt(v0, x1), v_lt(v0, x2))); + m1 = v_or(m1, v_and(v_lt(x1, v1), v_lt(x2, v1))); + m0 = v_or(m0, v_and(v_lt(v0, x2), v_lt(v0, x3))); + m1 = v_or(m1, v_and(v_lt(x2, v1), v_lt(x3, v1))); + m0 = v_or(m0, v_and(v_lt(v0, x3), v_lt(v0, x0))); + m1 = v_or(m1, v_and(v_lt(x3, v1), v_lt(x0, v1))); + m0 = v_or(m0, m1); if( !v_check_any(m0) ) continue; @@ -154,18 +154,18 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo v_uint8x16 max1 = v_setzero_u8(); for( k = 0; k < N; k++ ) { - v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta); - m0 = v0 < x; - m1 = x < v1; + v_int8x16 x = v_reinterpret_as_s8(v_xor(v_load((ptr + pixel[k])), delta)); + m0 = v_lt(v0, x); + m1 = v_lt(x, v1); - c0 = v_sub_wrap(c0, m0) & m0; - c1 = v_sub_wrap(c1, m1) & m1; + c0 = v_and(v_sub_wrap(c0, m0), m0); + c1 = v_and(v_sub_wrap(c1, m1), m1); max0 = v_max(max0, v_reinterpret_as_u8(c0)); max1 = v_max(max1, v_reinterpret_as_u8(c1)); } - max0 = K16 < v_max(max0, max1); + max0 = v_lt(K16, v_max(max0, max1)); unsigned int m = v_signmask(v_reinterpret_as_s8(max0)); for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) @@ -190,7 +190,7 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo a1 = v_min(a1, v_nms); b1 = v_max(b1, v_nms); } - curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_setzero_s16() - v_min(b0, b1))) - 1); + curr[j + k] = 
(uchar)(v_reduce_max(v_max(v_max(a0, a1), v_sub(v_setzero_s16(), v_min(b0, b1)))) - 1); } } } diff --git a/modules/features2d/src/fast_score.cpp b/modules/features2d/src/fast_score.cpp index 0bc011af49..0c43ad5552 100644 --- a/modules/features2d/src/fast_score.cpp +++ b/modules/features2d/src/fast_score.cpp @@ -160,7 +160,7 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold) q0 = v_max(q0, v_min(a, v0)); q1 = v_min(q1, v_max(b, v0)); } - q0 = v_max(q0, v_setzero_s16() - q1); + q0 = v_max(q0, v_sub(v_setzero_s16(), q1)); threshold = v_reduce_max(q0) - 1; } else @@ -251,7 +251,7 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold) q0 = v_max(q0, v_min(a, v0)); q1 = v_min(q1, v_max(b, v0)); } - q0 = v_max(q0, v_setzero_s16() - q1); + q0 = v_max(q0, v_sub(v_setzero_s16(), q1)); threshold = v_reduce_max(q0) - 1; } else @@ -323,7 +323,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold) v0 = v_load(d + 5); q0 = v_max(q0, v_min(a, v0)); q1 = v_min(q1, v_max(b, v0)); - q0 = v_max(q0, v_setzero_s16() - q1); + q0 = v_max(q0, v_sub(v_setzero_s16(), q1)); threshold = v_reduce_max(q0) - 1; } else diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp index 6c517b1f57..927f08d30a 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp @@ -335,7 +335,7 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], // divide and calculate s according to above feature v_uint32x4 ss[4]; - v_uint32x4 vadd = v_setall_u32(1) << (hsv_shift - 1); + v_uint32x4 vadd = v_shl(v_setall_u32(1), (hsv_shift - 1)); v_uint32x4 v_diff_exp[4]; v_diff_exp[0] = v_reinterpret_as_u32(v_and(v_reinterpret_as_u8(v_diff), mask1)); @@ -406,16 +406,16 @@ void run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], // start computing H-ch //h = (_vr & (g - b)) + (~_vr & ((_vg & (b - r + 2 * diff)) + ((~_vg) & (r - g + 4 * diff)))); v_int32x4 hh[4]; - hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(gg[0] - bb[0]), + hh[0] = v_reinterpret_as_s32(v_select(e[0], v_reinterpret_as_s32(v_sub(gg[0], bb[0])), v_select(p[0], v_reinterpret_as_s32(v_add(v_sub(bb[0], rr[0]), v_mul(v_setall_u32(2), vdd[0]))), v_reinterpret_as_s32(v_add(v_sub(rr[0], gg[0]), v_mul(v_setall_u32(4), vdd[0])))))); - hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(gg[1] - bb[1]), + hh[1] = v_reinterpret_as_s32(v_select(e[1], v_reinterpret_as_s32(v_sub(gg[1], bb[1])), v_select(p[1], v_reinterpret_as_s32(v_add(v_sub(bb[1], rr[1]), v_mul(v_setall_u32(2), vdd[1]))), v_reinterpret_as_s32(v_add(v_sub(rr[1], gg[1]), v_mul(v_setall_u32(4), vdd[1])))))); - hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(gg[2] - bb[2]), + hh[2] = v_reinterpret_as_s32(v_select(e[2], v_reinterpret_as_s32(v_sub(gg[2], bb[2])), v_select(p[2], v_reinterpret_as_s32(v_add(v_sub(bb[2], rr[2]), v_mul(v_setall_u32(2), vdd[2]))), v_reinterpret_as_s32(v_add(v_sub(rr[2], gg[2]), v_mul(v_setall_u32(4), vdd[2])))))); - hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(gg[3] - bb[3]), + hh[3] = v_reinterpret_as_s32(v_select(e[3], v_reinterpret_as_s32(v_sub(gg[3], bb[3])), v_select(p[3], v_reinterpret_as_s32(v_add(v_sub(bb[3], rr[3]), v_mul(v_setall_u32(2), vdd[3]))), v_reinterpret_as_s32(v_add(v_sub(rr[3], gg[3]), v_mul(v_setall_u32(4), vdd[3])))))); @@ -433,16 +433,16 @@ void 
run_rgb2hsv_impl(uchar out[], const uchar in[], const int sdiv_table[], // check for negative H v_int32x4 v_h_less_0[4]; - v_h_less_0[0] = (hh[0] < v_setall_s32(0)); - v_h_less_0[1] = (hh[1] < v_setall_s32(0)); - v_h_less_0[2] = (hh[2] < v_setall_s32(0)); - v_h_less_0[3] = (hh[3] < v_setall_s32(0)); + v_h_less_0[0] = (v_lt(hh[0], v_setall_s32(0))); + v_h_less_0[1] = (v_lt(hh[1], v_setall_s32(0))); + v_h_less_0[2] = (v_lt(hh[2], v_setall_s32(0))); + v_h_less_0[3] = (v_lt(hh[3], v_setall_s32(0))); v_int32x4 v_h_180[4]; - v_h_180[0] = hh[0] + v_setall_s32(180); - v_h_180[1] = hh[1] + v_setall_s32(180); - v_h_180[2] = hh[2] + v_setall_s32(180); - v_h_180[3] = hh[3] + v_setall_s32(180); + v_h_180[0] = v_add(hh[0], v_setall_s32(180)); + v_h_180[1] = v_add(hh[1], v_setall_s32(180)); + v_h_180[2] = v_add(hh[2], v_setall_s32(180)); + v_h_180[3] = v_add(hh[3], v_setall_s32(180)); hh[0] = v_select(v_h_less_0[0], v_h_180[0], hh[0]); hh[1] = v_select(v_h_less_0[1], v_h_180[1], hh[1]); diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp index e246f0613b..f7a502f150 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp @@ -64,7 +64,7 @@ CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(float *dst[], bool xRatioEq1 = inSz.width == outSz.width; bool yRatioEq1 = inSz.height == outSz.height; - constexpr int nlanes = v_float32x8::nlanes; + const int nlanes = VTraits::vlanes(); if (!xRatioEq1 && !yRatioEq1) { diff --git a/modules/imgproc/src/bilateral_filter.simd.hpp b/modules/imgproc/src/bilateral_filter.simd.hpp index 332b36646c..77e0328678 100644 --- a/modules/imgproc/src/bilateral_filter.simd.hpp +++ b/modules/imgproc/src/bilateral_filter.simd.hpp @@ -140,9 +140,9 @@ public: #if CV_SIMD128 v_uint32x4 rval = v_setall_u32(sptr[j]); v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]); - v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); + v_float32x4 w = v_mul(kweight4, v_lut(this->color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)))); wsum[j] += v_reduce_sum(w); - sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w); + sum[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(val)), w)); #else int rval = sptr[j]; @@ -407,11 +407,11 @@ public: v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]); v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]); v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]); - v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr))); + v_float32x4 w = v_mul(kweight4, v_lut(this->color_weight, v_reinterpret_as_s32(v_add(v_add(v_absdiff(b, rb), v_absdiff(g, rg)), v_absdiff(r, rr))))); wsum[j] += v_reduce_sum(w); - sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w); - sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w); - sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w); + sum_b[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(b)), w)); + sum_g[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(g)), w)); + sum_r[j] += v_reduce_sum(v_mul(v_cvt_f32(v_reinterpret_as_s32(r)), w)); #else int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; @@ -661,12 +661,12 @@ public: v_float32x4 rval = v_setall_f32(sptr[j]); v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]); v_float32x4 knan = v_not_nan(val); - v_float32x4 alpha = 
(v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan; + v_float32x4 alpha = v_and(v_and(v_mul(v_absdiff(val, rval), sindex4), v_not_nan(rval)), knan); v_int32x4 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan; + alpha = v_sub(alpha, v_cvt_f32(idx)); + v_float32x4 w = v_and(v_mul(kweight4, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one4, alpha)))), knan); wsum[j] += v_reduce_sum(w); - sum[j] += v_reduce_sum((val & knan) * w); + sum[j] += v_reduce_sum(v_mul(v_and(val, knan), w)); #else float rval = sptr[j]; @@ -862,15 +862,15 @@ public: v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]); v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]); v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]); - v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + v_float32x4 knan = v_and(v_and(v_not_nan(kb), v_not_nan(kg)), v_not_nan(kr)); + v_float32x4 alpha = v_and(v_and(v_and(v_and(v_mul(v_add(v_add(v_absdiff(kb, rb), v_absdiff(kg, rg)), v_absdiff(kr, rr)), sindex4), v_not_nan(rb)), v_not_nan(rg)), v_not_nan(rr)), knan); v_int32x4 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan; + alpha = v_sub(alpha, v_cvt_f32(idx)); + v_float32x4 w = v_and(v_mul(kweight4, v_muladd(v_lut(this->expLUT + 1, idx), alpha, v_mul(v_lut(this->expLUT, idx), v_sub(v_one4, alpha)))), knan); wsum[j] += v_reduce_sum(w); - sum_b[j] += v_reduce_sum((kb & knan) * w); - sum_g[j] += v_reduce_sum((kg & knan) * w); - sum_r[j] += v_reduce_sum((kr & knan) * w); + sum_b[j] += v_reduce_sum(v_mul(v_and(kb, knan), w)); + sum_g[j] += v_reduce_sum(v_mul(v_and(kg, knan), w)); + sum_r[j] += v_reduce_sum(v_mul(v_and(kr, knan), w)); #else float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr); diff --git a/modules/imgproc/src/box_filter.simd.hpp b/modules/imgproc/src/box_filter.simd.hpp index 735935c04f..f7c8f66a35 100644 --- a/modules/imgproc/src/box_filter.simd.hpp +++ b/modules/imgproc/src/box_filter.simd.hpp @@ -315,7 +315,7 @@ struct ColumnSum : v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + for (; i <= width - VTraits::vlanes(); i += VTraits::vlanes()) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -357,10 +357,10 @@ struct ColumnSum : } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale))); v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale))); @@ -369,7 +369,7 @@ struct ColumnSum : v_pack_store(D + i, v_dst); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - 
v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -396,16 +396,16 @@ struct ColumnSum : v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits::vlanes()))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); v_pack_store(D + i, v_dst); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -486,7 +486,7 @@ public BaseColumnFilter v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes ) + for( ; i <= width - VTraits::vlanes(); i += VTraits::vlanes() ) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -546,13 +546,13 @@ public BaseColumnFilter v_uint32x4 ds4 = v_setall_u32((unsigned)ds); v_uint16x8 dd8 = v_setall_u16((ushort)dd); - for( ; i <= width-v_uint8x16::nlanes; i+=v_uint8x16::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_uint16x8 _sm0 = v_load(Sm + i); - v_uint16x8 _sm1 = v_load(Sm + i + v_uint16x8::nlanes); + v_uint16x8 _sm1 = v_load(Sm + i + VTraits::vlanes()); v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i)); - v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + v_uint16x8::nlanes), v_load(Sp + i + v_uint16x8::nlanes)); + v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_uint32x4 _s00, _s01, _s10, _s11; @@ -572,7 +572,7 @@ public BaseColumnFilter v_store(D + i, v_pack_u(r0, r1)); v_store(SUM + i, _s0); - v_store(SUM + i + v_uint16x8::nlanes, _s1); + v_store(SUM + i + VTraits::vlanes(), _s1); } #endif #endif @@ -649,7 +649,7 @@ struct ColumnSum : v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width - VTraits::vlanes(); i+=VTraits::vlanes() ) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -689,17 +689,17 @@ struct ColumnSum : } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale)); v_int32x4 v_s01d = v_round(v_mul(v_cvt_f32(v_s01), v_scale)); v_store(D + i, v_pack(v_s0d, v_s01d)); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + 
VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -725,15 +725,15 @@ struct ColumnSum : v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits::vlanes()))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_store(D + i, v_pack(v_s0, v_s01)); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -798,7 +798,7 @@ struct ColumnSum : v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + for (; i <= width - VTraits::vlanes(); i += VTraits::vlanes()) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -838,17 +838,17 @@ struct ColumnSum : } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s0), v_scale))); v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_mul(v_cvt_f32(v_s01), v_scale))); v_store(D + i, v_pack(v_s0d, v_s01d)); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -874,15 +874,15 @@ struct ColumnSum : v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, vx_load(Sm + i + VTraits::vlanes()))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); - v_int32x4 v_s01 = v_add(v_load(SUM + i + v_int32x4::nlanes), v_load(Sp + i + v_int32x4::nlanes)); + v_int32x4 v_s01 = v_add(v_load(SUM + i + VTraits::vlanes()), v_load(Sp + i + VTraits::vlanes())); v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); v_store(SUM + i, v_sub(v_s0, v_load(Sm + i))); - v_store(SUM + i + v_int32x4::nlanes, v_sub(v_s01, v_load(Sm + i + v_int32x4::nlanes))); + v_store(SUM + i + VTraits::vlanes(), v_sub(v_s01, v_load(Sm + i + VTraits::vlanes()))); } #endif #endif @@ -945,7 +945,7 @@ struct ColumnSum : v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width - VTraits::vlanes(); i+=VTraits::vlanes() ) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -981,7 +981,7 @@ struct ColumnSum : } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 v_float32x4 
v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); v_int32x4 v_s0d = v_round(v_mul(v_cvt_f32(v_s0), v_scale)); @@ -1010,7 +1010,7 @@ struct ColumnSum : v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); @@ -1079,7 +1079,7 @@ struct ColumnSum : v_store(SUM + i, v_add(vx_load(SUM + i), vx_load(Sp + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width - VTraits::vlanes(); i+=VTraits::vlanes() ) { v_store(SUM + i, v_add(v_load(SUM + i), v_load(Sp + i))); } @@ -1115,7 +1115,7 @@ struct ColumnSum : } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 v_float32x4 v_scale = v_setall_f32((float)_scale); - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + for (; i <= width - VTraits::vlanes(); i += VTraits::vlanes()) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); v_store(D + i, v_mul(v_cvt_f32(v_s0), v_scale)); @@ -1142,7 +1142,7 @@ struct ColumnSum : v_store(SUM + i, v_sub(v_s0, vx_load(Sm + i))); } #if !CV_SIMD_SCALABLE && CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + for( ; i <= width-VTraits::vlanes(); i+=VTraits::vlanes() ) { v_int32x4 v_s0 = v_add(v_load(SUM + i), v_load(Sp + i)); v_store(D + i, v_cvt_f32(v_s0)); diff --git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp index d111efdc47..6a8a0ea7f9 100644 --- a/modules/imgproc/src/color_lab.cpp +++ b/modules/imgproc/src/color_lab.cpp @@ -66,7 +66,7 @@ template static inline cv::v_float32 splineInterpolate(const cv::v ix = v_shl<2>(ix); v_float32 t0, t1, t2, t3; - // assume that v_float32::nlanes == v_int32::nlanes + // assume that VTraits::vlanes() == VTraits::vlanes() if(VTraits::vlanes() == 4) { int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx[4]; @@ -1388,16 +1388,16 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin v_uint16x8& outA, v_uint16x8& outB, v_uint16x8& outC) { //LUT idx of origin pt of cube - v_uint16x8 idxsX = inX >> (lab_base_shift - lab_lut_shift); - v_uint16x8 idxsY = inY >> (lab_base_shift - lab_lut_shift); - v_uint16x8 idxsZ = inZ >> (lab_base_shift - lab_lut_shift); + v_uint16x8 idxsX = v_shr(inX); + v_uint16x8 idxsY = v_shr(inY); + v_uint16x8 idxsZ = v_shr(inZ); //x, y, z are [0; TRILINEAR_BASE) const uint16_t bitMask = (1 << trilinear_shift) - 1; v_uint16x8 bitMaskReg = v_setall_u16(bitMask); - v_uint16x8 fracX = (inX >> (lab_base_shift - 8 - 1)) & bitMaskReg; - v_uint16x8 fracY = (inY >> (lab_base_shift - 8 - 1)) & bitMaskReg; - v_uint16x8 fracZ = (inZ >> (lab_base_shift - 8 - 1)) & bitMaskReg; + v_uint16x8 fracX = v_and(v_shr(inX), bitMaskReg); + v_uint16x8 fracY = v_and(v_shr(inY), bitMaskReg); + v_uint16x8 fracZ = v_and(v_shr(inZ), bitMaskReg); //load values to interpolate for pix0, pix1, .., pix7 v_int16x8 a0, a1, a2, a3, a4, a5, a6, a7; @@ -1407,9 +1407,9 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin v_uint32x4 addrDw0, addrDw1, addrDw10, addrDw11; v_mul_expand(v_setall_u16(3*8), idxsX, addrDw0, addrDw1); v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM), idxsY, addrDw10, addrDw11); - addrDw0 += addrDw10; 
addrDw1 += addrDw11; + addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11); v_mul_expand(v_setall_u16(3*8*LAB_LUT_DIM*LAB_LUT_DIM), idxsZ, addrDw10, addrDw11); - addrDw0 += addrDw10; addrDw1 += addrDw11; + addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11); uint32_t CV_DECL_ALIGNED(16) addrofs[8]; v_store_aligned(addrofs, addrDw0); @@ -1431,9 +1431,9 @@ static inline void trilinearPackedInterpolate(const v_uint16x8& inX, const v_uin v_int16x8 w0, w1, w2, w3, w4, w5, w6, w7; v_mul_expand(v_setall_u16(8), fracX, addrDw0, addrDw1); v_mul_expand(v_setall_u16(8*TRILINEAR_BASE), fracY, addrDw10, addrDw11); - addrDw0 += addrDw10; addrDw1 += addrDw11; + addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11); v_mul_expand(v_setall_u16(8*TRILINEAR_BASE*TRILINEAR_BASE), fracZ, addrDw10, addrDw11); - addrDw0 += addrDw10; addrDw1 += addrDw11; + addrDw0 = v_add(addrDw0, addrDw10); addrDw1 = v_add(addrDw1, addrDw11); v_store_aligned(addrofs, addrDw0); v_store_aligned(addrofs + 4, addrDw1); @@ -1476,7 +1476,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 const int16_t* LUT, v_uint16& outA, v_uint16& outB, v_uint16& outC) { - const int vsize = VTraits::max_nlanes; + const int vsize = VTraits::vlanes(); + const int vsize_max = VTraits::max_nlanes; // LUT idx of origin pt of cube v_uint16 tx = v_shr(inX); @@ -1492,7 +1493,7 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 baseIdx0 = v_add(v_add(btmp00, btmp10), btmp20); baseIdx1 = v_add(v_add(btmp01, btmp11), btmp21); - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vbaseIdx[vsize_max]; v_store_aligned(vbaseIdx + 0*vsize/2, baseIdx0); v_store_aligned(vbaseIdx + 1*vsize/2, baseIdx1); @@ -1513,13 +1514,13 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1 trilinearIdx0 = v_add(v_add(v_shl<3>(fracX0), v_shl<3 + trilinear_shift>(fracY0)), v_shl<3 + trilinear_shift * 2>(fracZ0)); trilinearIdx1 = v_add(v_add(v_shl<3>(fracX1), v_shl<3 + trilinear_shift>(fracY1)), v_shl<3 + trilinear_shift * 2>(fracZ1)); - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vtrilinearIdx[vsize_max]; v_store_aligned(vtrilinearIdx + 0*vsize/2, trilinearIdx0); v_store_aligned(vtrilinearIdx + 1*vsize/2, trilinearIdx1); v_uint32 a0, a1, b0, b1, c0, c1; - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) va[vsize], vb[vsize], vc[vsize]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) va[vsize_max], vb[vsize_max], vc[vsize_max]; for(int j = 0; j < vsize; j++) { const int16_t* baseLUT = LUT + vbaseIdx[j]; @@ -1649,11 +1650,11 @@ struct RGB2Lab_b vL = v_shr(vL); /* int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );*/ - va = v_fma(vfX - vfY, v_setall_s32(500), v_setall_s32(abShift+labDescaleShift)); + va = v_fma(v_sub(vfX, vfY), v_setall_s32(500), v_setall_s32(abShift+labDescaleShift)); va = v_shr(va); /* int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );*/ - vb = v_fma(vfY - vfZ, v_setall_s32(200), v_setall_s32(abShift+labDescaleShift)); + vb = v_fma(v_sub(vfY, vfZ), v_setall_s32(200), v_setall_s32(abShift+labDescaleShift)); vb = v_shr(vb); } #endif // CV_NEON @@ -1675,8 +1676,8 @@ struct RGB2Lab_b #if CV_NEON // On each loop, we load nlanes of RGB/A v_uint8s and store nlanes of // Lab v_uint8s - for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes, - src += scn*v_uint8::nlanes, dst += 3*v_uint8::nlanes 
) + for(; i <= n - VTraits::vlanes(); i += VTraits::vlanes(), + src += scn*VTraits::vlanes(), dst += 3*VTraits::vlanes() ) { // Load 4 batches of 4 src v_uint8 vRi, vGi, vBi; @@ -1712,7 +1713,7 @@ struct RGB2Lab_b #endif // CV_NEON #if CV_SIMD - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); const int xyzDescaleShift = 1 << (lab_shift - 1); v_int16 vXYZdescale = vx_setall_s16(xyzDescaleShift); v_int16 cxrg, cxb1, cyrg, cyb1, czrg, czb1; @@ -1752,7 +1753,7 @@ struct RGB2Lab_b v_expand(drgb[k], qrgb[k*2+0], qrgb[k*2+1]); } - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vdrgb[vsize*3]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vdrgb[VTraits::max_nlanes*3]; for(int k = 0; k < 12; k++) { v_store_aligned(vdrgb + k*vsize/4, qrgb[k]); @@ -1784,14 +1785,14 @@ struct RGB2Lab_b v_uint32 x[4], y[4], z[4]; for(int j = 0; j < 4; j++) { - x[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cxrg) + v_dotprod(bd[j], cxb1)) >> lab_shift; - y[j] = v_reinterpret_as_u32(v_dotprod(rg[j], cyrg) + v_dotprod(bd[j], cyb1)) >> lab_shift; - z[j] = v_reinterpret_as_u32(v_dotprod(rg[j], czrg) + v_dotprod(bd[j], czb1)) >> lab_shift; + x[j] = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], cxrg), v_dotprod(bd[j], cxb1)))); + y[j] = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], cyrg), v_dotprod(bd[j], cyb1)))); + z[j] = v_shr(v_reinterpret_as_u32(v_add(v_dotprod(rg[j], czrg), v_dotprod(bd[j], czb1)))); } // [fX, fY, fZ] = LabCbrtTab_b[vx, vy, vz] // [4 per X, 4 per Y, 4 per Z] - uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vxyz[vsize*3]; + uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vxyz[VTraits::max_nlanes*3]; for(int j = 0; j < 4; j++) { v_store_aligned(vxyz + (0*4+j)*vsize/4, x[j]); @@ -1822,7 +1823,7 @@ struct RGB2Lab_b v_uint32 vLshift = vx_setall_u32((uint32_t)(Lshift + labDescaleShift)); for(int k = 0; k < 4; k++) { - vL[k] = (vL[k] + vLshift) >> lab_shift2; + vL[k] = v_shr(v_add(vL[k], vLshift)); } v_uint16 L0, L1; L0 = v_pack(vL[0], vL[1]); @@ -1846,7 +1847,7 @@ struct RGB2Lab_b v_int32 abShift = vx_setall_s32(128*(1 << lab_shift2) + labDescaleShift); for(int k = 0; k < 8; k++) { - ab[k] = (ab[k] + abShift) >> lab_shift2; + ab[k] = v_shr(v_add(ab[k], abShift)); } v_int16 a0, a1, b0, b1; a0 = v_pack(ab[0], ab[1]); a1 = v_pack(ab[2], ab[3]); @@ -1941,7 +1942,7 @@ struct RGB2Lab_f #if CV_SIMD if(enablePackedLab) { - const int vsize = v_float32::nlanes; + const int vsize = VTraits::vlanes(); static const int nPixels = vsize*2; for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels) { @@ -1973,8 +1974,8 @@ struct RGB2Lab_f #undef clipv /* int iR = R*LAB_BASE, iG = G*LAB_BASE, iB = B*LAB_BASE, iL, ia, ib; */ v_float32 basef = vx_setall_f32(LAB_BASE); - rvec0 *= basef, gvec0 *= basef, bvec0 *= basef; - rvec1 *= basef, gvec1 *= basef, bvec1 *= basef; + rvec0 = v_mul(rvec0, basef), gvec0 = v_mul(gvec0, basef), bvec0 = v_mul(bvec0, basef); + rvec1 = v_mul(rvec1, basef), gvec1 = v_mul(gvec1, basef), bvec1 = v_mul(bvec1, basef); v_int32 irvec0, igvec0, ibvec0, irvec1, igvec1, ibvec1; irvec0 = v_round(rvec0); irvec1 = v_round(rvec1); @@ -2004,8 +2005,8 @@ struct RGB2Lab_f /* dst[i] = L*100.0f */ v_float32 v100dBase = vx_setall_f32(100.0f/LAB_BASE); - l_vec0 = l_vec0*v100dBase; - l_vec1 = l_vec1*v100dBase; + l_vec0 = v_mul(l_vec0, v100dBase); + l_vec1 = v_mul(l_vec1, v100dBase); /* dst[i + 1] = a*256.0f - 128.0f; dst[i + 2] = b*256.0f - 128.0f; @@ -2043,8 +2044,8 @@ struct RGB2Lab_f static const float _a = (softfloat(16) / softfloat(116)); int i = 0; #if CV_SIMD - const int vsize = v_float32::nlanes; - const 
int nrepeats = vsize == 4 ? 2 : 1; + const int vsize = VTraits::vlanes(); + const int nrepeats = VTraits::nlanes == 4 ? 2 : 1; v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4), vc5 = vx_setall_f32(C5); v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8); @@ -2080,9 +2081,9 @@ struct RGB2Lab_f v_float32 vgscale = vx_setall_f32(gscale); for (int k = 0; k < nrepeats; k++) { - R[k] = splineInterpolate(R[k]*vgscale, gammaTab, GAMMA_TAB_SIZE); - G[k] = splineInterpolate(G[k]*vgscale, gammaTab, GAMMA_TAB_SIZE); - B[k] = splineInterpolate(B[k]*vgscale, gammaTab, GAMMA_TAB_SIZE); + R[k] = splineInterpolate(v_mul(R[k], vgscale), gammaTab, GAMMA_TAB_SIZE); + G[k] = splineInterpolate(v_mul(G[k], vgscale), gammaTab, GAMMA_TAB_SIZE); + B[k] = splineInterpolate(v_mul(B[k], vgscale), gammaTab, GAMMA_TAB_SIZE); } } @@ -2090,26 +2091,26 @@ struct RGB2Lab_f v_float32 FX[nrepeats], FY[nrepeats], FZ[nrepeats]; for (int k = 0; k < nrepeats; k++) { - X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2)); - Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5)); - Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8)); + X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, v_mul(B[k], vc2))); + Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, v_mul(B[k], vc5))); + Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, v_mul(B[k], vc8))); // use spline interpolation instead of direct calculation v_float32 vTabScale = vx_setall_f32(LabCbrtTabScale); - FX[k] = splineInterpolate(X[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE); - FY[k] = splineInterpolate(Y[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE); - FZ[k] = splineInterpolate(Z[k]*vTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE); + FX[k] = splineInterpolate(v_mul(X[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE); + FY[k] = splineInterpolate(v_mul(Y[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE); + FZ[k] = splineInterpolate(v_mul(Z[k], vTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE); } v_float32 L[nrepeats], a[nrepeats], b[nrepeats]; for (int k = 0; k < nrepeats; k++) { // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3 - v_float32 mask = Y[k] > (vx_setall_f32(0.008856f)); + v_float32 mask = v_gt(Y[k], (vx_setall_f32(0.008856f))); v_float32 v116 = vx_setall_f32(116.f), vm16 = vx_setall_f32(-16.f); - L[k] = v_select(mask, v_fma(v116, FY[k], vm16), vx_setall_f32(903.3f)*Y[k]); - a[k] = vx_setall_f32(500.f) * (FX[k] - FY[k]); - b[k] = vx_setall_f32(200.f) * (FY[k] - FZ[k]); + L[k] = v_select(mask, v_fma(v116, FY[k], vm16), v_mul(vx_setall_f32(903.3f),Y[k])); + a[k] = v_mul(vx_setall_f32(500.F), v_sub(FX[k], FY[k])); + b[k] = v_mul(vx_setall_f32(200.F), v_sub(FY[k], FZ[k])); v_store_interleave(dst + k*3*vsize, L[k], a[k], b[k]); } @@ -2204,7 +2205,7 @@ struct Lab2RGBfloat float alpha = ColorChannel::max(); #if CV_SIMD - const int vsize = v_float32::nlanes; + const int vsize = VTraits::vlanes(); const int nrepeats = 2; v_float32 v16_116 = vx_setall_f32(16.0f / 116.0f); for( ; i <= n-vsize*nrepeats; @@ -2221,14 +2222,14 @@ struct Lab2RGBfloat v_float32 vlThresh = vx_setall_f32(lThresh); for(int k = 0; k < nrepeats; k++) { - limask[k] = li[k] <= vlThresh; + limask[k] = v_le(li[k], vlThresh); } v_float32 ylo[nrepeats], yhi[nrepeats], fylo[nrepeats], fyhi[nrepeats]; // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4) v_float32 vinv903 = vx_setall_f32(1.f/903.3f); for(int k = 0; k < nrepeats; k++) { - ylo[k] = li[k] * vinv903; + ylo[k] = v_mul(li[k], vinv903); } v_float32 v7787 = 
vx_setall_f32(7.787f); for(int k = 0; k < nrepeats; k++) @@ -2238,11 +2239,11 @@ struct Lab2RGBfloat v_float32 v16 = vx_setall_f32(16.0f), vinv116 = vx_setall_f32(1.f/116.0f); for(int k = 0; k < nrepeats; k++) { - fyhi[k] = (li[k] + v16) * vinv116; + fyhi[k] = v_mul(v_add(li[k], v16), vinv116); } for(int k = 0; k < nrepeats; k++) { - yhi[k] = fyhi[k] * fyhi[k] * fyhi[k]; + yhi[k] = v_mul(fyhi[k], fyhi[k], fyhi[k]); } for(int k = 0; k < nrepeats; k++) { @@ -2265,9 +2266,9 @@ struct Lab2RGBfloat for (int j = 0; j < 2; j++) { v_float32 f = fxz[k*2+j]; - v_float32 fmask = f <= vfTresh; - v_float32 flo = (f - v16_116) * vinv7787; - v_float32 fhi = f*f*f; + v_float32 fmask = v_le(f, vfTresh); + v_float32 flo = v_mul(v_sub(f, v16_116), vinv7787); + v_float32 fhi = v_mul(v_mul(f, f), f); fxz[k*2+j] = v_select(fmask, flo, fhi); } } @@ -2281,9 +2282,9 @@ struct Lab2RGBfloat v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8); for(int k = 0; k < nrepeats; k++) { - ro[k] = v_fma(vc0, x[k], v_fma(vc1, y[k], vc2 * z[k])); - go[k] = v_fma(vc3, x[k], v_fma(vc4, y[k], vc5 * z[k])); - bo[k] = v_fma(vc6, x[k], v_fma(vc7, y[k], vc8 * z[k])); + ro[k] = v_fma(vc0, x[k], v_fma(vc1, y[k], v_mul(vc2, z[k]))); + go[k] = v_fma(vc3, x[k], v_fma(vc4, y[k], v_mul(vc5, z[k]))); + bo[k] = v_fma(vc6, x[k], v_fma(vc7, y[k], v_mul(vc8, z[k]))); } v_float32 one = vx_setall_f32(1.f), zero = vx_setzero_f32(); for(int k = 0; k < nrepeats; k++) @@ -2298,9 +2299,9 @@ struct Lab2RGBfloat v_float32 vgscale = vx_setall_f32(gscale); for(int k = 0; k < nrepeats; k++) { - ro[k] *= vgscale; - go[k] *= vgscale; - bo[k] *= vgscale; + ro[k] = v_mul(ro[k], vgscale); + go[k] = v_mul(go[k], vgscale); + bo[k] = v_mul(bo[k], vgscale); } for(int k = 0; k < nrepeats; k++) @@ -2500,8 +2501,8 @@ struct Lab2RGBinteger for(int k = 0; k < 4; k++) { yf[k] = v_lut((const int*)LabToYF_b, lq[k]); - y[k] = yf[k] & mask16; - ify[k] = v_reinterpret_as_s32(v_reinterpret_as_u32(yf[k]) >> 16); + y[k] = v_and(yf[k], mask16); + ify[k] = v_reinterpret_as_s32(v_shr(v_reinterpret_as_u32(yf[k]), 16)); } v_int16 ify0, ify1; @@ -2516,18 +2517,18 @@ struct Lab2RGBinteger v_uint16 mulA = vx_setall_u16(53687); v_uint32 ma[4]; v_uint32 addA = vx_setall_u32(1 << 7); - v_mul_expand((a0 + (a0 << 2)), mulA, ma[0], ma[1]); - v_mul_expand((a1 + (a1 << 2)), mulA, ma[2], ma[3]); - adiv0 = v_reinterpret_as_s16(v_pack(((ma[0] + addA) >> 13), ((ma[1] + addA) >> 13))); - adiv1 = v_reinterpret_as_s16(v_pack(((ma[2] + addA) >> 13), ((ma[3] + addA) >> 13))); + v_mul_expand((v_add(a0, v_shl<2>(a0))), mulA, ma[0], ma[1]); + v_mul_expand((v_add(a1, v_shl<2>(a1))), mulA, ma[2], ma[3]); + adiv0 = v_reinterpret_as_s16(v_pack((v_shr<13>(v_add(ma[0], addA))), (v_shr<13>(v_add(ma[1], addA))))); + adiv1 = v_reinterpret_as_s16(v_pack((v_shr<13>(v_add(ma[2], addA))), (v_shr<13>(v_add(ma[3], addA))))); v_uint16 mulB = vx_setall_u16(41943); v_uint32 mb[4]; v_uint32 addB = vx_setall_u32(1 << 4); v_mul_expand(b0, mulB, mb[0], mb[1]); v_mul_expand(b1, mulB, mb[2], mb[3]); - bdiv0 = v_reinterpret_as_s16(v_pack((mb[0] + addB) >> 9, (mb[1] + addB) >> 9)); - bdiv1 = v_reinterpret_as_s16(v_pack((mb[2] + addB) >> 9, (mb[3] + addB) >> 9)); + bdiv0 = v_reinterpret_as_s16(v_pack(v_shr<9>(v_add(mb[0], addB)), v_shr<9>(v_add(mb[1], addB)))); + bdiv1 = v_reinterpret_as_s16(v_pack(v_shr<9>(v_add(mb[2], addB)), v_shr<9>(v_add(mb[3], addB)))); // 0 <= adiv <= 8356, 0 <= bdiv <= 20890 /* x = ifxz[0]; y = y; z = ifxz[1]; */ @@ -2570,7 +2571,7 @@ struct Lab2RGBinteger { bool srgb = issRGB; 
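// Illustrative sketch (not taken from this patch) of the rewrite applied throughout these
// hunks: overloaded operators on universal-intrinsic vectors become the named wrappers
// (v_add, v_mul, v_div, v_fma, ...) and the compile-time ::nlanes becomes
// VTraits<...>::vlanes(), which may only be known at run time on scalable backends.
// It assumes CV_SIMD is enabled; the helper name axpb_f32 is made up for illustration.
#if CV_SIMD
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;
static inline void axpb_f32(const float* a, const float* x, float b, float* y, int n)
{
    const int vl = VTraits<v_float32>::vlanes();   // was: v_float32::nlanes
    const v_float32 vb = vx_setall_f32(b);
    int i = 0;
    for (; i <= n - vl; i += vl)
    {
        v_float32 av = vx_load(a + i), xv = vx_load(x + i);
        v_store(y + i, v_fma(av, xv, vb));         // was: av * xv + vb
    }
    for (; i < n; i++)                             // scalar tail
        y[i] = a[i] * x[i] + b;
}
#endif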
ushort* tab = sRGBInvGammaTab_b; - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); v_uint8 valpha = vx_setall_u8(alpha); v_int32 vc[9]; for(int k = 0; k < 9; k++) @@ -2592,9 +2593,9 @@ struct Lab2RGBinteger v_int32 rq[4], gq[4], bq[4]; for(int k = 0; k < 4; k++) { - rq[k] = (vc[0] * xq[k] + vc[1] * yq[k] + vc[2] * zq[k] + vdescale) >> shift; - gq[k] = (vc[3] * xq[k] + vc[4] * yq[k] + vc[5] * zq[k] + vdescale) >> shift; - bq[k] = (vc[6] * xq[k] + vc[7] * yq[k] + vc[8] * zq[k] + vdescale) >> shift; + rq[k] = v_shr(v_add(v_add(v_add(v_mul(vc[0], xq[k]), v_mul(vc[1], yq[k])), v_mul(vc[2], zq[k])), vdescale)); + gq[k] = v_shr(v_add(v_add(v_add(v_mul(vc[3], xq[k]), v_mul(vc[4], yq[k])), v_mul(vc[5], zq[k])), vdescale)); + bq[k] = v_shr(v_add(v_add(v_add(v_mul(vc[6], xq[k]), v_mul(vc[7], yq[k])), v_mul(vc[8], zq[k])), vdescale)); } //limit indices in table and then substitute @@ -2611,7 +2612,7 @@ struct Lab2RGBinteger if(srgb) { // [RRR... , GGG... , BBB...] - int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vidx[vsize*3]; + int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vidx[VTraits::max_nlanes*3]; for (int k = 0; k < 4; k++) v_store_aligned(vidx + 0*vsize + k*vsize/4, rq[k]); for (int k = 0; k < 4; k++) @@ -2631,9 +2632,9 @@ struct Lab2RGBinteger // rgb = (rgb*255) >> inv_gamma_shift for(int k = 0; k < 4; k++) { - rq[k] = ((rq[k] << 8) - rq[k]) >> inv_gamma_shift; - gq[k] = ((gq[k] << 8) - gq[k]) >> inv_gamma_shift; - bq[k] = ((bq[k] << 8) - bq[k]) >> inv_gamma_shift; + rq[k] = v_shr((v_sub(v_shl(rq[k], 8), rq[k])), inv_gamma_shift); + gq[k] = v_shr((v_sub(v_shl(gq[k], 8), gq[k])), inv_gamma_shift); + bq[k] = v_shr((v_sub(v_shl(bq[k], 8), bq[k])), inv_gamma_shift); } rgb[0] = v_reinterpret_as_u16(v_pack(rq[0], rq[1])); rgb[1] = v_reinterpret_as_u16(v_pack(rq[2], rq[3])); @@ -2730,13 +2731,13 @@ struct Lab2RGB_b static const softfloat fl = softfloat(100)/f255; #if CV_SIMD - const int fsize = v_float32::nlanes; + const int fsize = VTraits::vlanes(); v_float32 vl = vx_setall_f32((float)fl); v_float32 va = vx_setall_f32(1.f); v_float32 vb = vx_setall_f32(1.f); v_float32 vaLow = vx_setall_f32(-128.f), vbLow = vx_setall_f32(-128.f); //TODO: fix that when v_interleave is available - float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3]; + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits::max_nlanes*3], interTmpA[VTraits::max_nlanes*3]; v_store_interleave(interTmpM, vl, va, vb); v_store_interleave(interTmpA, vx_setzero_f32(), vaLow, vbLow); v_float32 mluv[3], aluv[3]; @@ -2754,7 +2755,7 @@ struct Lab2RGB_b j = 0; #if CV_SIMD - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); for( ; j <= (dn - vsize)*3; j += 3*vsize ) { v_uint8 s0, s1, s2; @@ -2808,7 +2809,7 @@ struct Lab2RGB_b v_int32 vi[4*3]; for(int k = 0; k < 4*3; k++) { - vi[k] = v_round(vf[k]*v255); + vi[k] = v_round(v_mul(vf[k], v255)); } v_uint8 rgb[3]; @@ -2830,7 +2831,7 @@ struct Lab2RGB_b for(int k = 0; k < 4; k++) { vf[k] = vx_load_aligned(buf + j + k*fsize); - vi[k] = v_round(vf[k]*v255); + vi[k] = v_round(v_mul(vf[k], v255)); } v_store(dst, v_pack_u(v_pack(vi[0], vi[1]), v_pack(vi[2], vi[3]))); } @@ -2910,8 +2911,8 @@ struct RGB2Luvfloat C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; #if CV_SIMD - const int vsize = v_float32::nlanes; - const int nrepeats = vsize == 4 ? 2 : 1; + const int vsize = VTraits::vlanes(); + const int nrepeats = VTraits::nlanes == 4 ? 
2 : 1; for( ; i <= n-vsize*nrepeats; i+= vsize*nrepeats, src += scn*vsize*nrepeats, dst += 3*vsize*nrepeats) { @@ -2944,9 +2945,9 @@ struct RGB2Luvfloat v_float32 vgscale = vx_setall_f32(gscale); for (int k = 0; k < nrepeats; k++) { - R[k] *= vgscale; - G[k] *= vgscale; - B[k] *= vgscale; + R[k] = v_mul(R[k], vgscale); + G[k] = v_mul(G[k], vgscale); + B[k] = v_mul(B[k], vgscale); } for (int k = 0; k < nrepeats; k++) @@ -2963,27 +2964,27 @@ struct RGB2Luvfloat v_float32 vc6 = vx_setall_f32(C6), vc7 = vx_setall_f32(C7), vc8 = vx_setall_f32(C8); for (int k = 0; k < nrepeats; k++) { - X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, B[k]*vc2)); - Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, B[k]*vc5)); - Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, B[k]*vc8)); + X[k] = v_fma(R[k], vc0, v_fma(G[k], vc1, v_mul(B[k], vc2))); + Y[k] = v_fma(R[k], vc3, v_fma(G[k], vc4, v_mul(B[k], vc5))); + Z[k] = v_fma(R[k], vc6, v_fma(G[k], vc7, v_mul(B[k], vc8))); } v_float32 L[nrepeats], u[nrepeats], v[nrepeats]; v_float32 vmun = vx_setall_f32(-un), vmvn = vx_setall_f32(-vn); for (int k = 0; k < nrepeats; k++) { - L[k] = splineInterpolate(Y[k]*vx_setall_f32(LabCbrtTabScale), LabCbrtTab, LAB_CBRT_TAB_SIZE); + L[k] = splineInterpolate(v_mul(Y[k], vx_setall_f32(LabCbrtTabScale)), LabCbrtTab, LAB_CBRT_TAB_SIZE); // L = 116.f*L - 16.f; L[k] = v_fma(L[k], vx_setall_f32(116.f), vx_setall_f32(-16.f)); v_float32 d; // d = (4*13) / max(X + 15 * Y + 3 * Z, FLT_EPSILON) d = v_fma(Y[k], vx_setall_f32(15.f), v_fma(Z[k], vx_setall_f32(3.f), X[k])); - d = vx_setall_f32(4.f*13.f) / v_max(d, vx_setall_f32(FLT_EPSILON)); + d = v_div(vx_setall_f32(4.F * 13.F), v_max(d, vx_setall_f32(FLT_EPSILON))); // u = L*(X*d - un) - u[k] = L[k]*v_fma(X[k], d, vmun); + u[k] = v_mul(L[k], v_fma(X[k], d, vmun)); // v = L*((9*0.25f)*Y*d - vn); - v[k] = L[k]*v_fma(vx_setall_f32(9.f*0.25f)*Y[k], d, vmvn); + v[k] = v_mul(L[k], v_fma(v_mul(vx_setall_f32(9.F * 0.25F), Y[k]), d, vmvn)); } for (int k = 0; k < nrepeats; k++) @@ -3099,8 +3100,8 @@ struct Luv2RGBfloat float _un = un, _vn = vn; #if CV_SIMD - const int vsize = v_float32::nlanes; - const int nrepeats = vsize == 4 ? 2 : 1; + const int vsize = VTraits::vlanes(); + const int nrepeats = VTraits::nlanes == 4 ? 2 : 1; for( ; i <= n - vsize*nrepeats; i += vsize*nrepeats, src += vsize*3*nrepeats, dst += dcn*vsize*nrepeats) { @@ -3120,13 +3121,13 @@ struct Luv2RGBfloat v_float32 Ylo, Yhi; // ((L + 16)/116)^3 - Ylo = (L[k] + v16) * v116inv; - Ylo = Ylo*Ylo*Ylo; + Ylo = v_mul(v_add(L[k], v16), v116inv); + Ylo = v_mul(v_mul(Ylo, Ylo), Ylo); // L*(3./29.)^3 - Yhi = L[k] * v903inv; + Yhi = v_mul(L[k], v903inv); // Y = (L <= 8) ? 
Y0 : Y1; - Y[k] = v_select(L[k] >= vx_setall_f32(8.f), Ylo, Yhi); + Y[k] = v_select(v_ge(L[k], vx_setall_f32(8.f)), Ylo, Yhi); } v_float32 v4inv = vx_setall_f32(0.25f), v3 = vx_setall_f32(3.f); @@ -3135,18 +3136,18 @@ struct Luv2RGBfloat v_float32 up, vp; // up = 3*(u + L*_un); - up = v3*(v_fma(L[k], vx_setall_f32(_un), u[k])); + up = v_mul(v3, v_fma(L[k], vx_setall_f32(_un), u[k])); // vp = 0.25/(v + L*_vn); - vp = v4inv/(v_fma(L[k], vx_setall_f32(_vn), v[k])); + vp = v_div(v4inv, v_fma(L[k], vx_setall_f32(_vn), v[k])); // vp = max(-0.25, min(0.25, vp)); vp = v_max(vx_setall_f32(-0.25f), v_min(v4inv, vp)); //X = 3*up*vp; // (*Y) is done later - X[k] = v3*up*vp; + X[k] = v_mul(v_mul(v3, up), vp); //Z = ((12*13*L - up)*vp - 5); // (*Y) is done later // xor flips the sign, works like unary minus - Z[k] = v_fma(v_fma(L[k], vx_setall_f32(12.f*13.f), (vx_setall_f32(-0.f) ^ up)), vp, vx_setall_f32(-5.f)); + Z[k] = v_fma(v_fma(L[k], vx_setall_f32(12.f*13.f), (v_xor(vx_setall_f32(-0.F), up))), vp, vx_setall_f32(-5.f)); } v_float32 R[nrepeats], G[nrepeats], B[nrepeats]; @@ -3156,9 +3157,9 @@ struct Luv2RGBfloat for(int k = 0; k < nrepeats; k++) { // R = (X*C0 + C1 + Z*C2)*Y; // here (*Y) is done - R[k] = v_fma(Z[k], vc2, v_fma(X[k], vc0, vc1))*Y[k]; - G[k] = v_fma(Z[k], vc5, v_fma(X[k], vc3, vc4))*Y[k]; - B[k] = v_fma(Z[k], vc8, v_fma(X[k], vc6, vc7))*Y[k]; + R[k] = v_mul(v_fma(Z[k], vc2, v_fma(X[k], vc0, vc1)), Y[k]); + G[k] = v_mul(v_fma(Z[k], vc5, v_fma(X[k], vc3, vc4)), Y[k]); + B[k] = v_mul(v_fma(Z[k], vc8, v_fma(X[k], vc6, vc7)), Y[k]); } v_float32 vzero = vx_setzero_f32(), v1 = vx_setall_f32(1.f); @@ -3174,9 +3175,9 @@ struct Luv2RGBfloat v_float32 vgscale = vx_setall_f32(gscale); for(int k = 0; k < nrepeats; k++) { - R[k] *= vgscale; - G[k] *= vgscale; - B[k] *= vgscale; + R[k] = v_mul(R[k], vgscale); + G[k] = v_mul(G[k], vgscale); + B[k] = v_mul(B[k], vgscale); } for(int k = 0; k < nrepeats; k++) { @@ -3285,7 +3286,7 @@ struct RGB2Luvinterpolate #if CV_SIMD if(enablePackedRGB2Luv) { - const int vsize = v_uint16::nlanes; + const int vsize = VTraits::vlanes(); static const int nPixels = vsize*2; for(; i < n - 3*nPixels; i += 3*nPixels, src += scn*nPixels) { @@ -3315,9 +3316,9 @@ struct RGB2Luvinterpolate v_expand(r, r0, r1); v_expand(g, g0, g1); v_expand(b, b0, b1); - r0 = r0 << (lab_base_shift - 8); r1 = r1 << (lab_base_shift - 8); - g0 = g0 << (lab_base_shift - 8); g1 = g1 << (lab_base_shift - 8); - b0 = b0 << (lab_base_shift - 8); b1 = b1 << (lab_base_shift - 8); + r0 = v_shl(r0); r1 = v_shl(r1); + g0 = v_shl(g0); g1 = v_shl(g1); + b0 = v_shl(b0); b1 = v_shl(b1); /* int L, u, v; @@ -3332,9 +3333,9 @@ struct RGB2Luvinterpolate dst[i+1] = saturate_cast(u/baseDiv); dst[i+2] = saturate_cast(v/baseDiv); */ - l0 = l0 >> (lab_base_shift - 8); l1 = l1 >> (lab_base_shift - 8); - u0 = u0 >> (lab_base_shift - 8); u1 = u1 >> (lab_base_shift - 8); - v0 = v0 >> (lab_base_shift - 8); v1 = v1 >> (lab_base_shift - 8); + l0 = v_shr(l0); l1 = v_shr(l1); + u0 = v_shr(u0); u1 = v_shr(u1); + v0 = v_shr(v0); v1 = v_shr(v1); v_uint8 l = v_pack(l0, l1); v_uint8 u = v_pack(u0, u1); v_uint8 v = v_pack(v0, v1); @@ -3405,12 +3406,12 @@ struct RGB2Luv_b static const softfloat su = -uLow*f255/uRange; static const softfloat sv = -vLow*f255/vRange; #if CV_SIMD - const int fsize = v_float32::nlanes; + const int fsize = VTraits::vlanes(); v_float32 ml = vx_setall_f32((float)fL), al = vx_setzero_f32(); v_float32 mu = vx_setall_f32((float)fu), au = vx_setall_f32((float)su); v_float32 mv = vx_setall_f32((float)fv), av = 
vx_setall_f32((float)sv); //TODO: fix that when v_interleave is available - float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3]; + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits::max_nlanes*3], interTmpA[VTraits::max_nlanes*3]; v_store_interleave(interTmpM, ml, mu, mv); v_store_interleave(interTmpA, al, au, av); v_float32 mluv[3], aluv[3]; @@ -3452,7 +3453,7 @@ struct RGB2Luv_b v_float32 f[3*4]; for(int k = 0; k < 3*4; k++) { - f[k] = v_cvt_f32(q[k])*v255inv; + f[k] = v_mul(v_cvt_f32(q[k]), v255inv); } for(int k = 0; k < 4; k++) @@ -3478,8 +3479,8 @@ struct RGB2Luv_b v_int32 q0, q1; v_expand(v_reinterpret_as_s16(d), q0, q1); - v_store_aligned(buf + j + 0*fsize, v_cvt_f32(q0)*v255inv); - v_store_aligned(buf + j + 1*fsize, v_cvt_f32(q1)*v255inv); + v_store_aligned(buf + j + 0*fsize, v_mul(v_cvt_f32(q0), v255inv)); + v_store_aligned(buf + j + 1*fsize, v_mul(v_cvt_f32(q1), v255inv)); } for( ; j < dn*bufChannels; j++, src++ ) { @@ -3633,7 +3634,8 @@ struct Luv2RGBinteger inline void processLuvToXYZ(const v_uint8& lv, const v_uint8& uv, const v_uint8& vv, v_int32 (&x)[4], v_int32 (&y)[4], v_int32 (&z)[4]) const { - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); + const int vsize_max = VTraits::max_nlanes; v_uint16 lv0, lv1; v_expand(lv, lv0, lv1); @@ -3646,7 +3648,7 @@ struct Luv2RGBinteger v_int32 mask16 = vx_setall_s32(0xFFFF); for(int k = 0; k < 4; k++) { - y[k] = v_lut((const int*)LabToYF_b, v_reinterpret_as_s32(lq[k])) & mask16; + y[k] = v_and(v_lut((const int *)LabToYF_b, v_reinterpret_as_s32(lq[k])), mask16); } v_int32 up[4], vp[4]; @@ -3657,10 +3659,10 @@ struct Luv2RGBinteger v_expand(vv, vv0, vv1); // LL*256 v_uint16 ll0, ll1; - ll0 = lv0 << 8; ll1 = lv1 << 8; + ll0 = v_shl<8>(lv0); ll1 = v_shl<8>(lv1); v_uint16 upidx0, upidx1, vpidx0, vpidx1; - upidx0 = ll0 + uv0; upidx1 = ll1 + uv1; - vpidx0 = ll0 + vv0; vpidx1 = ll1 + vv1; + upidx0 = v_add(ll0, uv0); upidx1 = v_add(ll1, uv1); + vpidx0 = v_add(ll0, vv0); vpidx1 = v_add(ll1, vv1); v_uint32 upidx[4], vpidx[4]; v_expand(upidx0, upidx[0], upidx[1]); v_expand(upidx1, upidx[2], upidx[3]); v_expand(vpidx0, vpidx[0], vpidx[1]); v_expand(vpidx1, vpidx[2], vpidx[3]); @@ -3672,7 +3674,7 @@ struct Luv2RGBinteger // long long int vpl = LUVLUT.LvToVpl_b[LL*256+v]; v_int64 vpl[8]; - int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vpidxstore[vsize]; + int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vpidxstore[vsize_max]; for(int k = 0; k < 4; k++) { v_store_aligned(vpidxstore + k*vsize/4, v_reinterpret_as_s32(vpidx[k])); @@ -3684,12 +3686,13 @@ struct Luv2RGBinteger // not all 64-bit arithmetic is available in univ. 
intrinsics // need to handle it with scalar code - int64_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vvpl[vsize]; + int64_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vvpl[vsize_max]; for(int k = 0; k < 8; k++) { v_store_aligned(vvpl + k*vsize/8, vpl[k]); } - int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vup[vsize], vvp[vsize], vx[vsize], vy[vsize], vzm[vsize]; + int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) vup[vsize_max], vvp[vsize_max], + vx[vsize_max], vy[vsize_max], vzm[vsize_max]; for(int k = 0; k < 4; k++) { v_store_aligned(vup + k*vsize/4, up[k]); @@ -3724,7 +3727,7 @@ struct Luv2RGBinteger // z = zm/256 + zm/65536; for (int k = 0; k < 4; k++) { - z[k] = (zm[k] >> 8) + (zm[k] >> 16); + z[k] = v_add(v_shr<8>(zm[k]), v_shr<16>(zm[k])); } // (x, z) = clip((x, z), min=0, max=2*BASE) @@ -3751,7 +3754,7 @@ struct Luv2RGBinteger { ushort* tab = sRGBInvGammaTab_b; bool srgb = issRGB; - static const int vsize = v_uint8::nlanes; + static const int vsize = VTraits::vlanes(); const int descaleShift = 1 << (shift-1); v_int16 vdescale = vx_setall_s16(descaleShift); v_int16 vc[9]; @@ -3771,12 +3774,12 @@ struct Luv2RGBinteger // fixing 16bit signed multiplication // by subtracting 2^(base_shift-1) and then adding result back v_int32 dummy32, fm[3]; - v_expand(vc[0]+vc[1]+vc[2], fm[0], dummy32); - v_expand(vc[3]+vc[4]+vc[5], fm[1], dummy32); - v_expand(vc[6]+vc[7]+vc[8], fm[2], dummy32); - fm[0] = fm[0] << (base_shift-1); - fm[1] = fm[1] << (base_shift-1); - fm[2] = fm[2] << (base_shift-1); + v_expand(v_add(vc[0],vc[1],vc[2]), fm[0], dummy32); + v_expand(v_add(vc[3],vc[4],vc[5]), fm[1], dummy32); + v_expand(v_add(vc[6],vc[7],vc[8]), fm[2], dummy32); + fm[0] = v_shl(fm[0], (base_shift-1)); + fm[1] = v_shl(fm[1], (base_shift-1)); + fm[2] = v_shl(fm[2], (base_shift-1)); for (; i <= n-vsize; i += vsize, src += 3*vsize, dst += dcn*vsize) { @@ -3816,15 +3819,15 @@ struct Luv2RGBinteger // a bit faster than one loop for all for(int k = 0; k < 4; k++) { - i_rgb[k+4*0] = (v_dotprod(xy[k], crxy) + v_dotprod(zd[k], crz1) + fm[0]) >> shift; + i_rgb[k+4*0] = v_shr(v_add(v_add(v_dotprod(xy[k], crxy), v_dotprod(zd[k], crz1)), fm[0])); } for(int k = 0; k < 4; k++) { - i_rgb[k+4*1] = (v_dotprod(xy[k], cgxy) + v_dotprod(zd[k], cgz1) + fm[1]) >> shift; + i_rgb[k+4*1] = v_shr(v_add(v_add(v_dotprod(xy[k], cgxy), v_dotprod(zd[k], cgz1)), fm[1])); } for(int k = 0; k < 4; k++) { - i_rgb[k+4*2] = (v_dotprod(xy[k], cbxy) + v_dotprod(zd[k], cbz1) + fm[2]) >> shift; + i_rgb[k+4*2] = v_shr(v_add(v_add(v_dotprod(xy[k], cbxy), v_dotprod(zd[k], cbz1)), fm[2])); } // [rrggbb] @@ -3842,7 +3845,7 @@ struct Luv2RGBinteger if(srgb) { // [rr.., gg.., bb..] 
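// The buffer-size changes in these hunks follow one rule: VTraits<T>::vlanes() is the
// run-time lane count used for loop strides and indexing, while stack arrays must be
// sized with the compile-time upper bound VTraits<T>::max_nlanes, since vlanes() is not
// a constant expression on scalable backends. A minimal sketch of that idiom, assuming
// CV_SIMD, v_int32, and (as elsewhere in this file) intrin.hpp included inside namespace cv;
// the helper name is illustrative only.
#if CV_SIMD
static inline int sum_lane_indices()
{
    const int vl = VTraits<v_int32>::vlanes();                            // run-time stride
    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_int32>::max_nlanes]; // compile-time size
    for (int k = 0; k < vl; k++)
        buf[k] = k;
    return v_reduce_sum(vx_load_aligned(buf));
}
#endif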
- int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) rgbshifts[3*vsize]; + int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) rgbshifts[3*VTraits::max_nlanes]; for(int k = 0; k < 12; k++) { v_store_aligned(rgbshifts + k*vsize/4, i_rgb[k]); @@ -3857,7 +3860,7 @@ struct Luv2RGBinteger // rgb = (rgb*255) >> inv_gamma_shift for(int k = 0; k < 12; k++) { - i_rgb[k] = ((i_rgb[k] << 8) - i_rgb[k]) >> inv_gamma_shift; + i_rgb[k] = v_shr((v_sub((v_shl(i_rgb[k], 8)), i_rgb[k])), inv_gamma_shift); } for(int k = 0; k < 6; k++) @@ -3940,13 +3943,13 @@ struct Luv2RGB_b static const softfloat fv = vRange/f255; #if CV_SIMD - const int fsize = v_float32::nlanes; + const int fsize = VTraits::vlanes(); v_float32 vl = vx_setall_f32((float)fl); v_float32 vu = vx_setall_f32((float)fu); v_float32 vv = vx_setall_f32((float)fv); v_float32 vuLow = vx_setall_f32((float)uLow), vvLow = vx_setall_f32((float)vLow); //TODO: fix that when v_interleave is available - float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[fsize*3], interTmpA[fsize*3]; + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) interTmpM[VTraits::max_nlanes*3], interTmpA[VTraits::max_nlanes*3]; v_store_interleave(interTmpM, vl, vu, vv); v_store_interleave(interTmpA, vx_setzero_f32(), vuLow, vvLow); v_float32 mluv[3], aluv[3]; @@ -3964,7 +3967,7 @@ struct Luv2RGB_b j = 0; #if CV_SIMD - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); for( ; j <= (dn - vsize)*3; j += 3*vsize ) { v_uint8 s0, s1, s2; @@ -4017,7 +4020,7 @@ struct Luv2RGB_b v_int32 vi[4*3]; for(int k = 0; k < 4*3; k++) { - vi[k] = v_round(vf[k]*v255); + vi[k] = v_round(v_mul(vf[k], v255)); } v_uint8 rgb[3]; @@ -4039,7 +4042,7 @@ struct Luv2RGB_b for(int k = 0; k < 4; k++) { vf[k] = vx_load_aligned(buf + j + k*fsize); - vi[k] = v_round(vf[k]*v255); + vi[k] = v_round(v_mul(vf[k], v255)); } v_store(dst, v_pack_u(v_pack(vi[0], vi[1]), v_pack(vi[2], vi[3]))); } diff --git a/modules/imgproc/src/color_rgb.simd.hpp b/modules/imgproc/src/color_rgb.simd.hpp index 67e2febd5b..ca39d8a908 100644 --- a/modules/imgproc/src/color_rgb.simd.hpp +++ b/modules/imgproc/src/color_rgb.simd.hpp @@ -882,7 +882,7 @@ struct RGBA2mRGBA int i = 0; #if CV_SIMD - const int vsize = v_uint8::nlanes; + const int vsize = VTraits::vlanes(); v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); v_uint16 vh = vx_setall_u16(half_val+1); @@ -901,27 +901,27 @@ struct RGBA2mRGBA v_uint16 a16[4]; for(int j = 0; j < 4; j++) - a16[j] = v_reinterpret_as_u16(v[j] & amask); + a16[j] = v_reinterpret_as_u16(v_and(v[j], amask)); v_uint32 a32[4]; for(int j = 0; j < 4; j++) - a32[j] = v_reinterpret_as_u32(a16[j] | (a16[j] >> 8)); + a32[j] = v_reinterpret_as_u32(v_or(a16[j], (v_shr(a16[j], 8)))); v_uint8 a[4]; for(int j = 0; j < 4; j++) - a[j] = v_reinterpret_as_u8(a32[j] | (a32[j] >> 16)); + a[j] = v_reinterpret_as_u8(v_or(a32[j], (v_shr(a32[j], 16)))); v_uint16 m[8]; for(int j = 0; j < 4; j++) v_mul_expand(v[j], a[j], m[j], m[j+4]); for(int j = 0; j < 8; j++) - m[j] += vh; + m[j] = v_add(m[j], vh); // div 255: (v+1+(v>>8))>8 // +1 is in vh, has no effect on (v>>8) for(int j = 0; j < 8; j++) - m[j] = (m[j] + (m[j] >> 8)) >> 8; + m[j] = v_shr((v_add(m[j], (v_shr(m[j], 8)))), 8); v_uint8 d[4]; for(int j = 0; j < 4; j++) diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp index 627c052aea..148df552e4 100644 --- a/modules/imgproc/src/demosaicing.cpp +++ b/modules/imgproc/src/demosaicing.cpp @@ -188,21 +188,21 @@ public: v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step)); v_uint16x8 r2 = 
v_reinterpret_as_u16(v_load(bayer+bayer_step*2)); - v_uint16x8 b1 = ((r0 << 8) >> 7) + ((r2 << 8) >> 7); - v_uint16x8 b0 = v_rotate_right<1>(b1) + b1; - b1 = v_rotate_right<1>(b1) << 1; + v_uint16x8 b1 = v_add(v_shr<7>(v_shl<8>(r0)), v_shr<7>(v_shl<8>(r2))); + v_uint16x8 b0 = v_add(v_rotate_right<1>(b1), b1); + b1 = v_shl<1>(v_rotate_right<1>(b1)); - v_uint16x8 g0 = (r0 >> 7) + (r2 >> 7); - v_uint16x8 g1 = (r1 << 8) >> 7; - g0 += v_rotate_right<1>(g1) + g1; - g1 = v_rotate_right<1>(g1) << 2; + v_uint16x8 g0 = v_add(v_shr<7>(r0), v_shr<7>(r2)); + v_uint16x8 g1 = v_shr<7>(v_shl<8>(r1)); + g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1)); + g1 = v_shl<2>(v_rotate_right<1>(g1)); - r0 = r1 >> 8; - r1 = (v_rotate_right<1>(r0) + r0) << 2; - r0 = r0 << 3; + r0 = v_shr<8>(r1); + r1 = v_shl<2>(v_add(v_rotate_right<1>(r0), r0)); + r0 = v_shl<3>(r0); - g0 = (v_mul_hi(b0, _b2y) + v_mul_hi(g0, _g2y) + v_mul_hi(r0, _r2y)) >> 2; - g1 = (v_mul_hi(b1, _b2y) + v_mul_hi(g1, _g2y) + v_mul_hi(r1, _r2y)) >> 2; + g0 = v_shr<2>(v_add(v_add(v_mul_hi(b0, _b2y), v_mul_hi(g0, _g2y)), v_mul_hi(r0, _r2y))); + g1 = v_shr<2>(v_add(v_add(v_mul_hi(b1, _b2y), v_mul_hi(g1, _g2y)), v_mul_hi(r1, _r2y))); v_uint8x16 pack_lo, pack_hi; v_zip(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g0)), v_pack_u(v_reinterpret_as_s16(g1), v_reinterpret_as_s16(g1)), @@ -269,31 +269,31 @@ public: v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step)); v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2)); - v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo); + v_uint16x8 b1 = v_add(v_and(r0, masklo), v_and(r2, masklo)); v_uint16x8 nextb1 = v_rotate_right<1>(b1); - v_uint16x8 b0 = b1 + nextb1; - b1 = (nextb1 + delta1) >> 1; - b0 = (b0 + delta2) >> 2; + v_uint16x8 b0 = v_add(b1, nextb1); + b1 = v_shr<1>(v_add(nextb1, delta1)); + b0 = v_shr<2>(v_add(b0, delta2)); // b0 b2 ... b14 b1 b3 ... b15 b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1))); - v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8); - v_uint16x8 g1 = r1 & masklo; - g0 += v_rotate_right<1>(g1) + g1; + v_uint16x8 g0 = v_add(v_shr<8>(r0), v_shr<8>(r2)); + v_uint16x8 g1 = v_and(r1, masklo); + g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1)); g1 = v_rotate_right<1>(g1); - g0 = (g0 + delta2) >> 2; + g0 = v_shr<2>(v_add(g0, delta2)); // g0 g2 ... g14 g1 g3 ... g15 g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1))); - r0 = r1 >> 8; - r1 = v_rotate_right<1>(r0) + r0; - r1 = (r1 + delta1) >> 1; + r0 = v_shr<8>(r1); + r1 = v_add(v_rotate_right<1>(r0), r0); + r1 = v_shr<1>(v_add(r1, delta1)); // r0 r2 ... r14 r1 r3 ... r15 r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1))); - b1 = (b0 ^ r0) & mask; - b0 = b0 ^ b1; - r0 = r0 ^ b1; + b1 = v_and(v_xor(b0, r0), mask); + b0 = v_xor(b0, b1); + r0 = v_xor(r0, b1); // b1 g1 b3 g3 b5 g5... v_uint8x16 pack_lo, pack_hi; @@ -402,31 +402,31 @@ public: v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step)); v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2)); - v_uint16x8 b1 = (r0 & masklo) + (r2 & masklo); + v_uint16x8 b1 = v_add(v_and(r0, masklo), v_and(r2, masklo)); v_uint16x8 nextb1 = v_rotate_right<1>(b1); - v_uint16x8 b0 = b1 + nextb1; - b1 = (nextb1 + delta1) >> 1; - b0 = (b0 + delta2) >> 2; + v_uint16x8 b0 = v_add(b1, nextb1); + b1 = v_shr<1>(v_add(nextb1, delta1)); + b0 = v_shr<2>(v_add(b0, delta2)); // b0 b2 ... b14 b1 b3 ... 
b15 b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1))); - v_uint16x8 g0 = (r0 >> 8) + (r2 >> 8); - v_uint16x8 g1 = r1 & masklo; - g0 += v_rotate_right<1>(g1) + g1; + v_uint16x8 g0 = v_add(v_shr<8>(r0), v_shr<8>(r2)); + v_uint16x8 g1 = v_and(r1, masklo); + g0 = v_add(g0, v_add(v_rotate_right<1>(g1), g1)); g1 = v_rotate_right<1>(g1); - g0 = (g0 + delta2) >> 2; + g0 = v_shr<2>(v_add(g0, delta2)); // g0 g2 ... g14 g1 g3 ... g15 g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(g1))); - r0 = r1 >> 8; - r1 = v_rotate_right<1>(r0) + r0; - r1 = (r1 + delta1) >> 1; + r0 = v_shr<8>(r1); + r1 = v_add(v_rotate_right<1>(r0), r0); + r1 = v_shr<1>(v_add(r1, delta1)); // r0 r2 ... r14 r1 r3 ... r15 r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1))); - b1 = (b0 ^ r0) & mask; - b0 = b0 ^ b1; - r0 = r0 ^ b1; + b1 = v_and(v_xor(b0, r0), mask); + b0 = v_xor(b0, b1); + r0 = v_xor(r0, b1); // b1 g1 b3 g3 b5 g5... v_uint8x16 pack_lo, pack_hi; @@ -498,40 +498,40 @@ public: v_uint16x8 r1 = v_reinterpret_as_u16(v_load(bayer+bayer_step)); v_uint16x8 r2 = v_reinterpret_as_u16(v_load(bayer+bayer_step*2)); - v_uint16x8 b1 = (r0 & masklow) + (r2 & masklow); + v_uint16x8 b1 = v_add(v_and(r0, masklow), v_and(r2, masklow)); v_uint16x8 nextb1 = v_rotate_right<1>(b1); - v_uint16x8 b0 = b1 + nextb1; - b1 = (nextb1 + delta1) >> 1; - b0 = (b0 + delta2) >> 2; + v_uint16x8 b0 = v_add(b1, nextb1); + b1 = v_shr<1>(v_add(nextb1, delta1)); + b0 = v_shr<2>(v_add(b0, delta2)); // b0 b2 ... b14 b1 b3 ... b15 b0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(b1))); // vertical sum - v_uint16x8 r0g = r0 >> 8; - v_uint16x8 r2g = r2 >> 8; - v_uint16x8 sumv = ((r0g + r2g) + delta1) >> 1; + v_uint16x8 r0g = v_shr<8>(r0); + v_uint16x8 r2g = v_shr<8>(r2); + v_uint16x8 sumv = v_shr<1>(v_add(v_add(r0g, r2g), delta1)); // horizontal sum - v_uint16x8 g1 = r1 & masklow; + v_uint16x8 g1 = v_and(r1, masklow); v_uint16x8 nextg1 = v_rotate_right<1>(g1); - v_uint16x8 sumg = (g1 + nextg1 + delta1) >> 1; + v_uint16x8 sumg = v_shr<1>(v_add(v_add(g1, nextg1), delta1)); // gradients - v_uint16x8 gradv = (r0g - r2g) + (r2g - r0g); - v_uint16x8 gradg = (nextg1 - g1) + (g1 - nextg1); - v_uint16x8 gmask = gradg > gradv; - v_uint16x8 g0 = (gmask & sumv) + (sumg & (gmask ^ full)); + v_uint16x8 gradv = v_add(v_sub(r0g, r2g), v_sub(r2g, r0g)); + v_uint16x8 gradg = v_add(v_sub(nextg1, g1), v_sub(g1, nextg1)); + v_uint16x8 gmask = v_gt(gradg, gradv); + v_uint16x8 g0 = v_add(v_and(gmask, sumv), v_and(sumg, v_xor(gmask, full))); // g0 g2 ... g14 g1 g3 ... g0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(g0), v_reinterpret_as_s16(nextg1))); - r0 = r1 >> 8; - r1 = v_rotate_right<1>(r0) + r0; - r1 = (r1 + delta1) >> 1; + r0 = v_shr<8>(r1); + r1 = v_add(v_rotate_right<1>(r0), r0); + r1 = v_shr<1>(v_add(r1, delta1)); // r0 r2 ... r14 r1 r3 ... r15 r0 = v_reinterpret_as_u16(v_pack_u(v_reinterpret_as_s16(r0), v_reinterpret_as_s16(r1))); - b1 = (b0 ^ r0) & mask; - b0 = b0 ^ b1; - r0 = r0 ^ b1; + b1 = v_and(v_xor(b0, r0), mask); + b0 = v_xor(b0, b1); + r0 = v_xor(r0, b1); // b1 g1 b3 g3 b5 g5... 
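// The v_xor/v_and triple in the hunk above implements a lane-masked swap of the packed
// R and B values: where `mask` is all-ones the two vectors exchange lanes, elsewhere they
// are left untouched. The same idiom in isolation (v_uint16x8 and the CV_SIMD128 guard
// are assumed, matching the surrounding code; the helper name is illustrative):
#if CV_SIMD128
static inline void swap_masked(v_uint16x8& a, v_uint16x8& b, const v_uint16x8& mask)
{
    v_uint16x8 t = v_and(v_xor(a, b), mask);  // differing bits, kept only in selected lanes
    a = v_xor(a, t);                          // a takes b's bits where mask is set
    b = v_xor(b, t);                          // b takes a's bits where mask is set
}
#endif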
v_uint8x16 pack_lo, pack_hi; @@ -1060,19 +1060,19 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) v_uint16x8 b0, b1, b2, b3, b4, b5, b6; - b0 = (v_absdiff(s2, s8)<<1) + v_absdiff(s1, s7) + v_absdiff(s3, s9); - b1 = (v_absdiff(s4, s6)<<1) + v_absdiff(s1, s3) + v_absdiff(s7, s9); - b2 = v_absdiff(s3, s7)<<1; - b3 = v_absdiff(s1, s9)<<1; + b0 = v_add(v_add(v_shl<1>(v_absdiff(s2, s8)), v_absdiff(s1, s7)), v_absdiff(s3, s9)); + b1 = v_add(v_add(v_shl<1>(v_absdiff(s4, s6)), v_absdiff(s1, s3)), v_absdiff(s7, s9)); + b2 = v_shl<1>(v_absdiff(s3, s7)); + b3 = v_shl<1>(v_absdiff(s1, s9)); v_store(brow, b0); v_store(brow + N, b1); v_store(brow + N2, b2); v_store(brow + N3, b3); - b4 = b2 + v_absdiff(s2, s4) + v_absdiff(s6, s8); - b5 = b3 + v_absdiff(s2, s6) + v_absdiff(s4, s8); - b6 = (s2 + s4 + s6 + s8)>>1; + b4 = v_add(v_add(b2, v_absdiff(s2, s4)), v_absdiff(s6, s8)); + b5 = v_add(v_add(b3, v_absdiff(s2, s6)), v_absdiff(s4, s8)); + b6 = v_shr<1>(v_add(v_add(v_add(s2, s4), s6), s8)); v_store(brow + N4, b4); v_store(brow + N5, b5); @@ -1279,7 +1279,7 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) v_uint16x8 one = v_setall_u16(1), z = v_setzero_u16(); v_float32x4 _0_5 = v_setall_f32(0.5f); - #define v_merge_u16(a, b) (((a) & v_reinterpret_as_u16(emask)) | ((b) & v_reinterpret_as_u16(omask))) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA) + #define v_merge_u16(a, b) (v_or((v_and((a), v_reinterpret_as_u16(emask))), (v_and((b), v_reinterpret_as_u16(omask))))) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA) #define v_cvt_s16f32_lo(a) v_cvt_f32(v_expand_low(v_reinterpret_as_s16(a))) //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f) #define v_cvt_s16f32_hi(a) v_cvt_f32(v_expand_high(v_reinterpret_as_s16(a))) //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f) @@ -1287,16 +1287,16 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 ) { //int gradN = brow0[0] + brow1[0]; - v_uint16x8 gradN = v_load(brow0) + v_load(brow1); + v_uint16x8 gradN = v_add(v_load(brow0), v_load(brow1)); //int gradS = brow1[0] + brow2[0]; - v_uint16x8 gradS = v_load(brow1) + v_load(brow2); + v_uint16x8 gradS = v_add(v_load(brow1), v_load(brow2)); //int gradW = brow1[N-1] + brow1[N]; - v_uint16x8 gradW = v_load(brow1+N-1) + v_load(brow1+N); + v_uint16x8 gradW = v_add(v_load(brow1 + N - 1), v_load(brow1 + N)); //int gradE = brow1[N+1] + brow1[N]; - v_uint16x8 gradE = v_load(brow1+N+1) + v_load(brow1+N); + v_uint16x8 gradE = v_add(v_load(brow1 + N + 1), v_load(brow1 + N)); //int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE); //int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE); @@ -1307,14 +1307,14 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) //int gradNE = brow0[N4+1] + brow1[N4]; //int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1]; - grad0 = v_load(brow0+N4+1) + v_load(brow1+N4); - grad1 = v_load(brow0+N2) + v_load(brow0+N2+1) + v_load(brow1+N2) + v_load(brow1+N2+1); + grad0 = v_add(v_load(brow0 + N4 + 1), v_load(brow1 + N4)); + grad1 = v_add(v_add(v_add(v_load(brow0 + N2), v_load(brow0 + N2 + 1)), v_load(brow1 + N2)), v_load(brow1 + N2 + 1)); v_uint16x8 gradNE = v_merge_u16(grad0, grad1); //int gradSW = brow1[N4] + brow2[N4-1]; //int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1]; - grad0 = v_load(brow2+N4-1) + v_load(brow1+N4); - grad1 = v_load(brow2+N2) + v_load(brow2+N2-1) + v_load(brow1+N2) + 
v_load(brow1+N2-1); + grad0 = v_add(v_load(brow2 + N4 - 1), v_load(brow1 + N4)); + grad1 = v_add(v_add(v_add(v_load(brow2 + N2), v_load(brow2 + N2 - 1)), v_load(brow1 + N2)), v_load(brow1 + N2 - 1)); v_uint16x8 gradSW = v_merge_u16(grad0, grad1); minGrad = v_min(v_min(minGrad, gradNE), gradSW); @@ -1322,21 +1322,21 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) //int gradNW = brow0[N5-1] + brow1[N5]; //int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1]; - grad0 = v_load(brow0+N5-1) + v_load(brow1+N5); - grad1 = v_load(brow0+N3) + v_load(brow0+N3-1) + v_load(brow1+N3) + v_load(brow1+N3-1); + grad0 = v_add(v_load(brow0 + N5 - 1), v_load(brow1 + N5)); + grad1 = v_add(v_add(v_add(v_load(brow0 + N3), v_load(brow0 + N3 - 1)), v_load(brow1 + N3)), v_load(brow1 + N3 - 1)); v_uint16x8 gradNW = v_merge_u16(grad0, grad1); //int gradSE = brow1[N5] + brow2[N5+1]; //int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1]; - grad0 = v_load(brow2+N5+1) + v_load(brow1+N5); - grad1 = v_load(brow2+N3) + v_load(brow2+N3+1) + v_load(brow1+N3) + v_load(brow1+N3+1); + grad0 = v_add(v_load(brow2 + N5 + 1), v_load(brow1 + N5)); + grad1 = v_add(v_add(v_add(v_load(brow2 + N3), v_load(brow2 + N3 + 1)), v_load(brow1 + N3)), v_load(brow1 + N3 + 1)); v_uint16x8 gradSE = v_merge_u16(grad0, grad1); minGrad = v_min(v_min(minGrad, gradNW), gradSE); maxGrad = v_max(v_max(maxGrad, gradNW), gradSE); //int T = minGrad + maxGrad/2; - v_uint16x8 T = v_max((maxGrad >> 1), one) + minGrad; + v_uint16x8 T = v_add(v_max((v_shr<1>(maxGrad)), one), minGrad); v_uint16x8 RGs = z, GRs = z, Bs = z, ng = z; @@ -1361,133 +1361,135 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) v_uint16x8 t0, t1, mask; // gradN *********************************************** - mask = (T > gradN); // mask = T>gradN - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradN) + mask = (v_gt(T, gradN)); // mask = T>gradN + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradN) - t0 = (x3 << 1); // srow[-bstep]*2 - t1 = v_load_expand(srow - bstep*2) + x0; // srow[-bstep*2] + srow[0] + t0 = (v_shl<1>(x3)); // srow[-bstep]*2 + t1 = v_add(v_load_expand(srow - bstep * 2), x0); // srow[-bstep*2] + srow[0] // RGs += (srow[-bstep*2] + srow[0]) * (T>gradN) - RGs += (t1 & mask); + RGs = v_add(RGs, v_and(t1, mask)); // GRs += {srow[-bstep]*2; (srow[-bstep*2-1] + srow[-bstep*2+1])} * (T>gradN) - GRs += (v_merge_u16(t0, x2 + x4) & mask); + GRs = v_add(GRs, (v_and(v_merge_u16(t0, v_add(x2, x4)), mask))); // Bs += {(srow[-bstep-1]+srow[-bstep+1]); srow[-bstep]*2 } * (T>gradN) - Bs += (v_merge_u16(x1 + x5, t0) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(v_add(x1, x5), t0), mask)); // gradNE ********************************************** - mask = (T > gradNE); // mask = T>gradNE - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradNE) + mask = (v_gt(T, gradNE)); // mask = T>gradNE + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradNE) - t0 = (x5 << 1); // srow[-bstep+1]*2 - t1 = v_load_expand(srow - bstep*2+2) + x0; // srow[-bstep*2+2] + srow[0] + t0 = (v_shl<1>(x5)); // srow[-bstep+1]*2 + t1 = v_add(v_load_expand(srow - bstep * 2 + 2), x0); // srow[-bstep*2+2] + srow[0] // RGs += {(srow[-bstep*2+2] + srow[0]); srow[-bstep+1]*2} * (T>gradNE) - RGs += (v_merge_u16(t1, t0) & mask); + RGs = v_add(RGs, 
v_and(v_merge_u16(t1, t0), mask)); // GRs += {brow0[N6+1]; (srow[-bstep*2+1] + srow[1])} * (T>gradNE) - GRs += (v_merge_u16(v_load(brow0+N6+1), x4 + x7) & mask); + GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow0+N6+1), v_add(x4, x7)), mask)); // Bs += {srow[-bstep+1]*2; (srow[-bstep] + srow[-bstep+2])} * (T>gradNE) - Bs += (v_merge_u16(t0, x3 + x6) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(t0, v_add(x3, x6)), mask)); // gradE *********************************************** - mask = (T > gradE); // mask = T>gradE - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradE) + mask = (v_gt(T, gradE)); // mask = T>gradE + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradE) - t0 = (x7 << 1); // srow[1]*2 - t1 = v_load_expand(srow +2) + x0; // srow[2] + srow[0] + t0 = (v_shl<1>(x7)); // srow[1]*2 + t1 = v_add(v_load_expand(srow + 2), x0); // srow[2] + srow[0] // RGs += (srow[2] + srow[0]) * (T>gradE) - RGs += (t1 & mask); + RGs = v_add(RGs, v_and(t1, mask)); // GRs += (srow[1]*2) * (T>gradE) - GRs += (t0 & mask); + GRs = v_add(GRs, v_and(t0, mask)); // Bs += {(srow[-bstep+1]+srow[bstep+1]); (srow[-bstep+2]+srow[bstep+2])} * (T>gradE) - Bs += (v_merge_u16(x5 + x9, x6 + x8) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(v_add(x5, x9), v_add(x6, x8)), mask)); // gradSE ********************************************** - mask = (T > gradSE); // mask = T>gradSE - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradSE) + mask = (v_gt(T, gradSE)); // mask = T>gradSE + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradSE) - t0 = (x9 << 1); // srow[bstep+1]*2 - t1 = v_load_expand(srow + bstep*2+2) + x0; // srow[bstep*2+2] + srow[0] + t0 = (v_shl<1>(x9)); // srow[bstep+1]*2 + t1 = v_add(v_load_expand(srow + bstep * 2 + 2), x0); // srow[bstep*2+2] + srow[0] // RGs += {(srow[bstep*2+2] + srow[0]); srow[bstep+1]*2} * (T>gradSE) - RGs += (v_merge_u16(t1, t0) & mask); + RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask)); // GRs += {brow2[N6+1]; (srow[1]+srow[bstep*2+1])} * (T>gradSE) - GRs += (v_merge_u16(v_load(brow2+N6+1), x7 + x10) & mask); + GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow2+N6+1), v_add(x7, x10)), mask)); // Bs += {srow[bstep+1]*2; (srow[bstep+2]+srow[bstep])} * (T>gradSE) - Bs += (v_merge_u16((x9 << 1), x8 + x11) & mask); + Bs = v_add(Bs, v_and(v_merge_u16((v_shl<1>(x9)), v_add(x8, x11)), mask)); // gradS *********************************************** - mask = (T > gradS); // mask = T>gradS - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradS) + mask = (v_gt(T, gradS)); // mask = T>gradS + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradS) - t0 = (x11 << 1); // srow[bstep]*2 - t1 = v_load_expand(srow + bstep*2) + x0; // srow[bstep*2]+srow[0] + t0 = (v_shl<1>(x11)); // srow[bstep]*2 + t1 = v_add(v_load_expand(srow + bstep * 2), x0); // srow[bstep*2]+srow[0] // RGs += (srow[bstep*2]+srow[0]) * (T>gradS) - RGs += (t1 & mask); + RGs = v_add(RGs, v_and(t1, mask)); // GRs += {srow[bstep]*2; (srow[bstep*2+1]+srow[bstep*2-1])} * (T>gradS) - GRs += (v_merge_u16(t0, x10 + x12) & mask); + GRs = v_add(GRs, v_and(v_merge_u16(t0, v_add(x10, x12)), mask)); // Bs += {(srow[bstep+1]+srow[bstep-1]); srow[bstep]*2} * (T>gradS) - Bs += (v_merge_u16(x9 + x13, t0) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(v_add(x9, 
x13), t0), mask)); // gradSW ********************************************** - mask = (T > gradSW); // mask = T>gradSW - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradSW) + mask = (v_gt(T, gradSW)); // mask = T>gradSW + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradSW) - t0 = (x13 << 1); // srow[bstep-1]*2 - t1 = v_load_expand(srow + bstep*2-2) + x0; // srow[bstep*2-2]+srow[0] + t0 = (v_shl<1>(x13)); // srow[bstep-1]*2 + t1 = v_add(v_load_expand(srow + bstep * 2 - 2), x0); // srow[bstep*2-2]+srow[0] // RGs += {(srow[bstep*2-2]+srow[0]); srow[bstep-1]*2} * (T>gradSW) - RGs += (v_merge_u16(t1, t0) & mask); + RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask)); // GRs += {brow2[N6-1]; (srow[bstep*2-1]+srow[-1])} * (T>gradSW) - GRs += (v_merge_u16(v_load(brow2+N6-1), x12 + x15) & mask); + GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow2+N6-1), v_add(x12, x15)), mask)); // Bs += {srow[bstep-1]*2; (srow[bstep]+srow[bstep-2])} * (T>gradSW) - Bs += (v_merge_u16(t0, x11 + x14) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(t0, v_add(x11, x14)), mask)); // gradW *********************************************** - mask = (T > gradW); // mask = T>gradW - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradW) + mask = (v_gt(T, gradW)); // mask = T>gradW + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradW) - t0 = (x15 << 1); // srow[-1]*2 - t1 = v_load_expand(srow -2) + x0; // srow[-2]+srow[0] + t0 = (v_shl<1>(x15)); // srow[-1]*2 + t1 = v_add(v_load_expand(srow - 2), x0); // srow[-2]+srow[0] // RGs += (srow[-2]+srow[0]) * (T>gradW) - RGs += (t1 & mask); + RGs = v_add(RGs, v_and(t1, mask)); // GRs += (srow[-1]*2) * (T>gradW) - GRs += (t0 & mask); + GRs = v_add(GRs, v_and(t0, mask)); // Bs += {(srow[-bstep-1]+srow[bstep-1]); (srow[bstep-2]+srow[-bstep-2])} * (T>gradW) - Bs += (v_merge_u16(x1 + x13, x14 + x16) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(v_add(x1, x13), v_add(x14, x16)), mask)); // gradNW ********************************************** - mask = (T > gradNW); // mask = T>gradNW - ng = v_reinterpret_as_u16(v_reinterpret_as_s16(ng) - v_reinterpret_as_s16(mask)); // ng += (T>gradNW) + mask = (v_gt(T, gradNW)); // mask = T>gradNW + ng = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(ng), v_reinterpret_as_s16(mask))); // ng += (T>gradNW) - t0 = (x1 << 1); // srow[-bstep-1]*2 - t1 = v_load_expand(srow -bstep*2-2) + x0; // srow[-bstep*2-2]+srow[0] + t0 = (v_shl<1>(x1)); // srow[-bstep-1]*2 + t1 = v_add(v_load_expand(srow - bstep * 2 - 2), x0); // srow[-bstep*2-2]+srow[0] // RGs += {(srow[-bstep*2-2]+srow[0]); srow[-bstep-1]*2} * (T>gradNW) - RGs += (v_merge_u16(t1, t0) & mask); + RGs = v_add(RGs, v_and(v_merge_u16(t1, t0), mask)); // GRs += {brow0[N6-1]; (srow[-bstep*2-1]+srow[-1])} * (T>gradNW) - GRs += (v_merge_u16(v_load(brow0+N6-1), x2 + x15) & mask); + GRs = v_add(GRs, v_and(v_merge_u16(v_load(brow0+N6-1), v_add(x2, x15)), mask)); // Bs += {srow[-bstep-1]*2; (srow[-bstep]+srow[-bstep-2])} * (T>gradNW) - Bs += (v_merge_u16((x1 << 1), x3 + x16) & mask); + Bs = v_add(Bs, v_and(v_merge_u16(v_shl<1>(x1), v_add(x3, x16)), mask)); - v_float32x4 ngf0 = _0_5 / v_cvt_s16f32_lo(ng); - v_float32x4 ngf1 = _0_5 / v_cvt_s16f32_hi(ng); + v_float32x4 ngf0 = v_div(_0_5, v_cvt_s16f32_lo(ng)); + v_float32x4 ngf1 = v_div(_0_5, v_cvt_s16f32_hi(ng)); // now interpolate r, g & b - t0 = 
v_reinterpret_as_u16(v_reinterpret_as_s16(GRs) - v_reinterpret_as_s16(RGs)); - t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(Bs) - v_reinterpret_as_s16(RGs)); + t0 = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(GRs), v_reinterpret_as_s16(RGs))); + t1 = v_reinterpret_as_u16(v_sub(v_reinterpret_as_s16(Bs), v_reinterpret_as_s16(RGs))); - t0 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) + + t0 = v_reinterpret_as_u16( + v_add(v_reinterpret_as_s16(x0), v_pack( - v_round(v_cvt_s16f32_lo(t0) * ngf0), - v_round(v_cvt_s16f32_hi(t0) * ngf1))); + v_round(v_mul(v_cvt_s16f32_lo(t0), ngf0)), + v_round(v_mul(v_cvt_s16f32_hi(t0), ngf1))))); - t1 = v_reinterpret_as_u16(v_reinterpret_as_s16(x0) + + t1 = v_reinterpret_as_u16( + v_add(v_reinterpret_as_s16(x0), v_pack( - v_round(v_cvt_s16f32_lo(t1) * ngf0), - v_round(v_cvt_s16f32_hi(t1) * ngf1))); + v_round(v_mul(v_cvt_s16f32_lo(t1), ngf0)), + v_round(v_mul(v_cvt_s16f32_hi(t1), ngf1))))); x1 = v_merge_u16(x0, t0); x2 = v_merge_u16(t0, x0); diff --git a/modules/imgproc/src/filter.simd.hpp b/modules/imgproc/src/filter.simd.hpp index 06053e63fe..9306c78a30 100644 --- a/modules/imgproc/src/filter.simd.hpp +++ b/modules/imgproc/src/filter.simd.hpp @@ -1084,9 +1084,9 @@ struct SymmColumnVec_32s8u i += VTraits::vlanes(); } #if CV_SIMD_WIDTH > 16 - while( i <= width - 4 /*v_int32x4::nlanes*/ ) + while( i <= width - 4 /*VTraits::vlanes()*/ ) #else - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) #endif { v_float32 s0 = v_muladd(v_cvt_f32(vx_load(src[0] + i)), vx_setall_f32(ky[0]), vx_setall_f32(delta)); @@ -1140,9 +1140,9 @@ struct SymmColumnVec_32s8u i += VTraits::vlanes(); } #if CV_SIMD_WIDTH > 16 - while( i <= width - 4 /*v_int32x4::nlanes*/ ) + while( i <= width - 4 /*VTraits::vlanes()*/ ) #else - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) #endif { v_float32 s0 = v_muladd(v_cvt_f32(v_sub(vx_load(src[1] + i), vx_load(src[-1] + i))), vx_setall_f32(ky[1]), vx_setall_f32(delta)); @@ -1321,23 +1321,23 @@ struct SymmColumnSmallVec_32s16s { v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]); v_int32 d4 = vx_setall_s32(d); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) + for( ; i <= width - 2*VTraits::vlanes(); i += 2*VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), - v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); - v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)), - v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4)))); + v_store(dst + i, v_pack(v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)), + v_muladd(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), k1, v_muladd(vx_load(S1 + i + VTraits::vlanes()), k0, d4)))); + v_store(dst + i + VTraits::vlanes(), v_pack(v_muladd(v_add(vx_load(S0 + i + 2 * VTraits::vlanes()), vx_load(S2 + i + 2 * VTraits::vlanes())), k1, v_muladd(vx_load(S1 + i + 2*VTraits::vlanes()), k0, d4)), + v_muladd(v_add(vx_load(S0 + i + 3 * VTraits::vlanes()), vx_load(S2 + i + 3 * VTraits::vlanes())), k1, v_muladd(vx_load(S1 + i + 3*VTraits::vlanes()), k0, d4)))); } - if( i <= width - 
v_int16::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), - v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); - i += v_int16::nlanes; + v_store(dst + i, v_pack(v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4)), + v_muladd(v_add(vx_load(S0 + i + VTraits::vlanes()), vx_load(S2 + i + VTraits::vlanes())), k1, v_muladd(vx_load(S1 + i + VTraits::vlanes()), k0, d4)))); + i += VTraits::vlanes(); } - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) { - v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); - i += v_int32::nlanes; + v_pack_store(dst + i, v_muladd(v_add(vx_load(S0 + i), vx_load(S2 + i)), k1, v_muladd(vx_load(S1 + i), k0, d4))); + i += VTraits::vlanes(); } } #endif @@ -2237,9 +2237,9 @@ struct FilterVec_8u i += VTraits::vlanes(); } #if CV_SIMD_WIDTH > 16 - while( i <= width - 4 /*v_int32x4::nlanes*/ ) + while( i <= width - 4 /*VTraits::vlanes()*/ ) #else - if( i <= width - v_int32::nlanes ) + if( i <= width - VTraits::vlanes() ) #endif { v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), vx_setall_f32(kf[0]), vx_setall_f32(delta)); @@ -2248,7 +2248,7 @@ struct FilterVec_8u v_int32 s32 = v_round(s0); v_int16 s16 = v_pack(s32, s32); *(unaligned_int*)(dst + i) = v_get0(v_reinterpret_as_s32(v_pack_u(s16, s16))); - i += 4 /*v_int32x4::nlanes*/ ; + i += 4 /*VTraits::vlanes()*/ ; } return i; } diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index cbd60550e0..7a52d0f3fe 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -2093,7 +2093,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) v_float32 v_s11 = vx_setzero_f32(); v_float32 v_s12 = vx_setzero_f32(); v_float32 v_s22 = vx_setzero_f32(); - for (; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for (; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_a = vx_load(h1 + j); v_float32 v_b = vx_load(h2 + j); @@ -2134,10 +2134,10 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) result += v_reduce_sum(v_result); #elif CV_SIMD v_float32 v_result = vx_setzero_f32(); - for (; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for (; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j)); - v_result += v_src; + v_result = v_add(v_result, v_src); } result += v_reduce_sum(v_result); #endif @@ -2174,7 +2174,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) v_float32 v_s1 = vx_setzero_f32(); v_float32 v_s2 = vx_setzero_f32(); v_float32 v_result = vx_setzero_f32(); - for (; j <= len - v_float32::nlanes; j += v_float32::nlanes) + for (; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) { v_float32 v_a = vx_load(h1 + j); v_float32 v_b = vx_load(h2 + j); diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index bbeb8223f1..fc55b0f642 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -455,7 +455,7 @@ struct RemapVec_8u v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2); v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16))); int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4]; - const uchar* 
src_limit_8bytes = _src.datalimit - v_int16x8::nlanes; + const uchar* src_limit_8bytes = _src.datalimit - VTraits::vlanes(); #define CV_PICK_AND_PACK_RGB(ptr, offset, result) \ { \ const uchar* const p = ((const uchar*)ptr) + (offset); \ @@ -483,7 +483,7 @@ struct RemapVec_8u v_uint8x16 rrggbbaa, dummy; \ v_uint16x8 rrggbbaa8, dummy8; \ v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \ - v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + v_int32x4::nlanes), 0, 0, 0)); \ + v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + VTraits::vlanes()), 0, 0, 0)); \ v_zip(rgba0, rgba1, rrggbbaa, dummy); \ v_expand(rrggbbaa, rrggbbaa8, dummy8); \ result = v_reinterpret_as_s16(rrggbbaa8); \ @@ -534,8 +534,8 @@ struct RemapVec_8u v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta); v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3); - v0 = v0 >> INTER_REMAP_COEF_BITS; - v2 = v2 >> INTER_REMAP_COEF_BITS; + v0 = v_shr(v0); + v2 = v_shr(v2); v_pack_u_store(D + x, v_pack(v0, v2)); } } @@ -563,8 +563,8 @@ struct RemapVec_8u CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1); CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1); - v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; - v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + v_int32x4 result0 = v_shr(v_dotprod(u0, w00, v_dotprod(v0, w01, delta))); + v_int32x4 result1 = v_shr(v_dotprod(u1, w10, v_dotprod(v1, w11, delta))); result0 = v_rotate_left<1>(result0); v_int16x8 result8 = v_pack(result0, result1); @@ -581,8 +581,8 @@ struct RemapVec_8u CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1); CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1); - result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; - result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + result0 = v_shr(v_dotprod(u0, w00, v_dotprod(v0, w01, delta))); + result1 = v_shr(v_dotprod(u1, w10, v_dotprod(v1, w11, delta))); result0 = v_rotate_left<1>(result0); result8 = v_pack(result0, result1); @@ -613,8 +613,8 @@ struct RemapVec_8u CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1); CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1); - v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; - v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + v_int32x4 result0 = v_shr(v_dotprod(u0, w00, v_dotprod(v0, w01, delta))); + v_int32x4 result1 = v_shr(v_dotprod(u1, w10, v_dotprod(v1, w11, delta))); v_int16x8 result8 = v_pack(result0, result1); v_pack_u_store(D, result8); @@ -627,8 +627,8 @@ struct RemapVec_8u CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1); CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1); - result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; - result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + result0 = v_shr(v_dotprod(u0, w00, v_dotprod(v0, w01, delta))); + result1 = v_shr(v_dotprod(u1, w10, v_dotprod(v1, w11, delta))); result8 = v_pack(result0, result1); v_pack_u_store(D + 8, result8); } @@ -1164,7 +1164,7 @@ public: #if CV_SIMD128 { - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bcols - span * 2; x1 += span * 2 ) { v_int32x4 ix0 = v_round(v_load(sX + x1)); @@ -1206,9 +1206,9 @@ public: #if CV_SIMD128 { v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1); - int span = v_uint16x8::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bcols - 
span; x1 += span ) - v_store((unsigned short*)(A + x1), v_load(sA + x1) & v_scale); + v_store((unsigned short*)(A + x1), v_and(v_load(sA + x1), v_scale)); } #endif for( ; x1 < bcols; x1++ ) @@ -1224,16 +1224,16 @@ public: { v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1); - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bcols - span * 2; x1 += span * 2 ) { - v_int32x4 v_sx0 = v_round(v_scale * v_load(sX + x1)); - v_int32x4 v_sy0 = v_round(v_scale * v_load(sY + x1)); - v_int32x4 v_sx1 = v_round(v_scale * v_load(sX + x1 + span)); - v_int32x4 v_sy1 = v_round(v_scale * v_load(sY + x1 + span)); - v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_sx0 & v_scale2, v_sx1 & v_scale2)); - v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_sy0 & v_scale2, v_sy1 & v_scale2)); - v_uint16x8 v_v = v_shl(v_sy8) | (v_sx8); + v_int32x4 v_sx0 = v_round(v_mul(v_scale, v_load(sX + x1))); + v_int32x4 v_sy0 = v_round(v_mul(v_scale, v_load(sY + x1))); + v_int32x4 v_sx1 = v_round(v_mul(v_scale, v_load(sX + x1 + span))); + v_int32x4 v_sy1 = v_round(v_mul(v_scale, v_load(sY + x1 + span))); + v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_and(v_sx0, v_scale2), v_and(v_sx1, v_scale2))); + v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_and(v_sy0, v_scale2), v_and(v_sy1, v_scale2))); + v_uint16x8 v_v = v_or(v_shl(v_sy8), v_sx8); v_store(A + x1, v_v); v_int16x8 v_d0 = v_pack(v_shr(v_sx0), v_shr(v_sx1)); @@ -1261,18 +1261,18 @@ public: { v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE); - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bcols - span * 2; x1 += span * 2 ) { v_float32x4 v_fx, v_fy; v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy); - v_int32x4 v_sx0 = v_round(v_fx * v_scale); - v_int32x4 v_sy0 = v_round(v_fy * v_scale); + v_int32x4 v_sx0 = v_round(v_mul(v_fx, v_scale)); + v_int32x4 v_sy0 = v_round(v_mul(v_fy, v_scale)); v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy); - v_int32x4 v_sx1 = v_round(v_fx * v_scale); - v_int32x4 v_sy1 = v_round(v_fy * v_scale); - v_int32x4 v_v0 = v_muladd(v_scale3, (v_sy0 & v_scale2), (v_sx0 & v_scale2)); - v_int32x4 v_v1 = v_muladd(v_scale3, (v_sy1 & v_scale2), (v_sx1 & v_scale2)); + v_int32x4 v_sx1 = v_round(v_mul(v_fx, v_scale)); + v_int32x4 v_sy1 = v_round(v_mul(v_fy, v_scale)); + v_int32x4 v_v0 = v_muladd(v_scale3, (v_and(v_sy0, v_scale2)), (v_and(v_sx0, v_scale2))); + v_int32x4 v_v1 = v_muladd(v_scale3, (v_and(v_sy1, v_scale2)), (v_and(v_sx1, v_scale2))); v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1)); v_store(A + x1, v_v8); v_int16x8 v_dx = v_pack(v_shr(v_sx0), v_shr(v_sx1)); @@ -1941,7 +1941,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { #if CV_SIMD128 { - int span = v_int16x8::nlanes; + int span = VTraits::vlanes(); for( ; x <= size.width - span; x += span ) { v_int16x8 v_dst[2]; @@ -1973,21 +1973,21 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE); - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x <= size.width - span * 2; x += span * 2 ) { - v_int32x4 v_ix0 = v_round(v_scale * (v_load(src1f + x))); - v_int32x4 v_ix1 = v_round(v_scale * (v_load(src1f + x + span))); - v_int32x4 v_iy0 = v_round(v_scale * 
(v_load(src2f + x))); - v_int32x4 v_iy1 = v_round(v_scale * (v_load(src2f + x + span))); + v_int32x4 v_ix0 = v_round(v_mul(v_scale, v_load(src1f + x))); + v_int32x4 v_ix1 = v_round(v_mul(v_scale, v_load(src1f + x + span))); + v_int32x4 v_iy0 = v_round(v_mul(v_scale, v_load(src2f + x))); + v_int32x4 v_iy1 = v_round(v_mul(v_scale, v_load(src2f + x + span))); v_int16x8 v_dst[2]; v_dst[0] = v_pack(v_shr(v_ix0), v_shr(v_ix1)); v_dst[1] = v_pack(v_shr(v_iy0), v_shr(v_iy1)); v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]); - v_int32x4 v_dst0 = v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)); - v_int32x4 v_dst1 = v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask)); + v_int32x4 v_dst0 = v_muladd(v_scale3, (v_and(v_iy0, v_mask)), (v_and(v_ix0, v_mask))); + v_int32x4 v_dst1 = v_muladd(v_scale3, (v_and(v_iy1, v_mask)), (v_and(v_ix1, v_mask))); v_store(dst2 + x, v_pack_u(v_dst0, v_dst1)); } } @@ -2008,7 +2008,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, if( nninterpolate ) { #if CV_SIMD128 - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); { for( ; x <= (size.width << 1) - span * 2; x += span * 2 ) v_store(dst1 + x, v_pack(v_round(v_load(src1f + x)), @@ -2034,16 +2034,16 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE); - int span = v_uint16x8::nlanes; + int span = VTraits::vlanes(); for (; x <= size.width - span; x += span ) { v_float32x4 v_src0[2], v_src1[2]; v_load_deinterleave(src1f + (x << 1), v_src0[0], v_src0[1]); v_load_deinterleave(src1f + (x << 1) + span, v_src1[0], v_src1[1]); - v_int32x4 v_ix0 = v_round(v_src0[0] * v_scale); - v_int32x4 v_ix1 = v_round(v_src1[0] * v_scale); - v_int32x4 v_iy0 = v_round(v_src0[1] * v_scale); - v_int32x4 v_iy1 = v_round(v_src1[1] * v_scale); + v_int32x4 v_ix0 = v_round(v_mul(v_src0[0], v_scale)); + v_int32x4 v_ix1 = v_round(v_mul(v_src1[0], v_scale)); + v_int32x4 v_iy0 = v_round(v_mul(v_src0[1], v_scale)); + v_int32x4 v_iy1 = v_round(v_mul(v_src1[1], v_scale)); v_int16x8 v_dst[2]; v_dst[0] = v_pack(v_shr(v_ix0), v_shr(v_ix1)); @@ -2051,8 +2051,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]); v_store(dst2 + x, v_pack_u( - v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)), - v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask)))); + v_muladd(v_scale3, (v_and(v_iy0, v_mask)), (v_and(v_ix0, v_mask))), + v_muladd(v_scale3, (v_and(v_iy1, v_mask)), (v_and(v_ix1, v_mask))))); } } #endif @@ -2074,13 +2074,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_uint16x8 v_mask2 = v_setall_u16(INTER_TAB_SIZE2-1); v_uint32x4 v_zero = v_setzero_u32(), v_mask = v_setall_u32(INTER_TAB_SIZE-1); v_float32x4 v_scale = v_setall_f32(scale); - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x <= size.width - span * 2; x += span * 2 ) { v_uint32x4 v_fxy1, v_fxy2; if ( src2 ) { - v_uint16x8 v_src2 = v_load(src2 + x) & v_mask2; + v_uint16x8 v_src2 = v_and(v_load(src2 + x), v_mask2); v_expand(v_src2, v_fxy1, v_fxy2); } else @@ -2091,9 +2091,9 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]); v_expand(v_src[0], v_src0[0], v_src0[1]); v_expand(v_src[1], v_src1[0], v_src1[1]); - #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) & v_mask)),\ + #define 
CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_and((FXY), v_mask))),\ v_cvt_f32(v_reinterpret_as_s32(X))) - #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) >> INTER_BITS)),\ + #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32(v_shr((FXY)))),\ v_cvt_f32(v_reinterpret_as_s32(Y))) v_float32x4 v_dst1 = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1); v_float32x4 v_dst2 = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1); @@ -2123,13 +2123,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_int16x8 v_mask2 = v_setall_s16(INTER_TAB_SIZE2-1); v_int32x4 v_zero = v_setzero_s32(), v_mask = v_setall_s32(INTER_TAB_SIZE-1); v_float32x4 v_scale = v_setall_f32(scale); - int span = v_int16x8::nlanes; + int span = VTraits::vlanes(); for( ; x <= size.width - span; x += span ) { v_int32x4 v_fxy1, v_fxy2; if (src2) { - v_int16x8 v_src2 = v_load((short *)src2 + x) & v_mask2; + v_int16x8 v_src2 = v_and(v_load((short *)src2 + x), v_mask2); v_expand(v_src2, v_fxy1, v_fxy2); } else @@ -2142,8 +2142,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, v_expand(v_src[0], v_src0[0], v_src0[1]); v_expand(v_src[1], v_src1[0], v_src1[1]); - #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32((FXY) & v_mask), v_cvt_f32(X)) - #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32((FXY) >> INTER_BITS), v_cvt_f32(Y)) + #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_and((FXY), v_mask)), v_cvt_f32(X)) + #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_shr((FXY))), v_cvt_f32(Y)) v_dst[0] = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1); v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1); v_store_interleave(dst1f + (x << 1), v_dst[0], v_dst[1]); @@ -2234,12 +2234,12 @@ public: #if CV_SIMD128 { v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0); - int span = v_uint16x8::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bw - span; x1 += span ) { v_int16x8 v_dst[2]; - #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr(shift+v_load(ptr + offset)),\ - v_shr(shift+v_load(ptr + offset + 4))) + #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr(v_add(shift,v_load(ptr + offset))),\ + v_shr(v_add(shift,v_load(ptr + offset + 4)))) v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0); v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0); #undef CV_CONVERT_MAP @@ -2272,21 +2272,21 @@ public: { v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0); v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); - int span = v_float32x4::nlanes; + int span = VTraits::vlanes(); for( ; x1 <= bw - span * 2; x1 += span * 2 ) { - v_int32x4 v_X0 = v_shr(v__X0 + v_load(adelta + x + x1)); - v_int32x4 v_Y0 = v_shr(v__Y0 + v_load(bdelta + x + x1)); - v_int32x4 v_X1 = v_shr(v__X0 + v_load(adelta + x + x1 + span)); - v_int32x4 v_Y1 = v_shr(v__Y0 + v_load(bdelta + x + x1 + span)); + v_int32x4 v_X0 = v_shr(v_add(v__X0, v_load(this->adelta + x + x1))); + v_int32x4 v_Y0 = v_shr(v_add(v__Y0, v_load(this->bdelta + x + x1))); + v_int32x4 v_X1 = v_shr(v_add(v__X0, v_load(this->adelta + x + x1 + span))); + v_int32x4 v_Y1 = v_shr(v_add(v__Y0, v_load(this->bdelta + x + x1 + span))); v_int16x8 v_xy[2]; v_xy[0] = v_pack(v_shr(v_X0), v_shr(v_X1)); v_xy[1] = v_pack(v_shr(v_Y0), v_shr(v_Y1)); v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]); - v_int32x4 v_alpha0 = v_shl(v_Y0 & v_mask) | (v_X0 & v_mask); - v_int32x4 v_alpha1 = v_shl(v_Y1 & v_mask) | (v_X1 & v_mask); + v_int32x4 v_alpha0 = v_or(v_shl(v_and(v_Y0, v_mask)), v_and(v_X0, 
v_mask)); + v_int32x4 v_alpha1 = v_or(v_shl(v_and(v_Y1, v_mask)), v_and(v_X1, v_mask)); v_store(alpha + x1, v_pack(v_alpha0, v_alpha1)); } } @@ -2866,16 +2866,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0 v_int32x4 v_X0, v_Y0; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X0 = v_round(v_fX0, v_fX1); v_Y0 = v_round(v_fY0, v_fY1); @@ -2885,16 +2885,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0 v_int32x4 v_X1, v_Y1; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X1 = v_round(v_fX0, v_fX1); v_Y1 = v_round(v_fY0, v_fY1); @@ -2904,16 +2904,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0 v_int32x4 v_X2, v_Y2; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W 
= v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X2 = v_round(v_fX0, v_fX1); v_Y2 = v_round(v_fY0, v_fY1); @@ -2923,16 +2923,16 @@ void WarpPerspectiveLine_ProcessNN_CV_SIMD(const double *M, short* xy, double X0 v_int32x4 v_X3, v_Y3; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_1 / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_1, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X3 = v_round(v_fX0, v_fX1); v_Y3 = v_round(v_fY0, v_fY1); @@ -2987,16 +2987,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph v_int32x4 v_X0, v_Y0; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X0 = v_round(v_fX0, v_fX1); v_Y0 = v_round(v_fY0, v_fY1); @@ -3006,16 +3006,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph v_int32x4 v_X1, v_Y1; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, 
v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X1 = v_round(v_fX0, v_fX1); v_Y1 = v_round(v_fY0, v_fY1); @@ -3025,16 +3025,16 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph v_int32x4 v_X2, v_Y2; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_X2 = v_round(v_fX0, v_fX1); v_Y2 = v_round(v_fY0, v_fY1); @@ -3044,35 +3044,35 @@ void WarpPerspectiveLine_Process_CV_SIMD(const double *M, short* xy, short* alph v_int32x4 v_X3, v_Y3; { v_float64x2 v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), v_W))); + v_float64x2 v_fY0 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W))); + v_x1 = v_add(v_x1, v_2); v_W = v_muladd(v_M6, v_x1, v_W0); - v_W = v_select(v_W != v_zero, v_its / v_W, v_zero); - v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M0, v_x1, v_X0d) * v_W)); - v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_muladd(v_M3, v_x1, v_Y0d) * v_W)); - v_x1 += v_2; + v_W = v_select(v_ne(v_W, v_zero), v_div(v_its, v_W), v_zero); + v_float64x2 v_fX1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M0, v_x1, v_X0d), 
v_W)));
+            v_float64x2 v_fY1 = v_max(v_intmin, v_min(v_intmax, v_mul(v_muladd(v_M3, v_x1, v_Y0d), v_W)));
+            v_x1 = v_add(v_x1, v_2);
             v_X3 = v_round(v_fX0, v_fX1);
             v_Y3 = v_round(v_fY0, v_fY1);
         }
         // store alpha
-        v_int32x4 v_alpha0 = ((v_Y0 & v_itsi1) << INTER_BITS) + (v_X0 & v_itsi1);
-        v_int32x4 v_alpha1 = ((v_Y1 & v_itsi1) << INTER_BITS) + (v_X1 & v_itsi1);
+        v_int32x4 v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y0, v_itsi1)), v_and(v_X0, v_itsi1));
+        v_int32x4 v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y1, v_itsi1)), v_and(v_X1, v_itsi1));
         v_store((alpha + x1), v_pack(v_alpha0, v_alpha1));
-        v_alpha0 = ((v_Y2 & v_itsi1) << INTER_BITS) + (v_X2 & v_itsi1);
-        v_alpha1 = ((v_Y3 & v_itsi1) << INTER_BITS) + (v_X3 & v_itsi1);
+        v_alpha0 = v_add(v_shl<INTER_BITS>(v_and(v_Y2, v_itsi1)), v_and(v_X2, v_itsi1));
+        v_alpha1 = v_add(v_shl<INTER_BITS>(v_and(v_Y3, v_itsi1)), v_and(v_X3, v_itsi1));
         v_store((alpha + x1 + 8), v_pack(v_alpha0, v_alpha1));
         // convert to 16s
-        v_X0 = v_reinterpret_as_s32(v_pack(v_X0 >> INTER_BITS, v_X1 >> INTER_BITS));
-        v_X1 = v_reinterpret_as_s32(v_pack(v_X2 >> INTER_BITS, v_X3 >> INTER_BITS));
-        v_Y0 = v_reinterpret_as_s32(v_pack(v_Y0 >> INTER_BITS, v_Y1 >> INTER_BITS));
-        v_Y1 = v_reinterpret_as_s32(v_pack(v_Y2 >> INTER_BITS, v_Y3 >> INTER_BITS));
+        v_X0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1)));
+        v_X1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_X2), v_shr<INTER_BITS>(v_X3)));
+        v_Y0 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1)));
+        v_Y1 = v_reinterpret_as_s32(v_pack(v_shr<INTER_BITS>(v_Y2), v_shr<INTER_BITS>(v_Y3)));
         v_store_interleave(xy + x1 * 2, (v_reinterpret_as_s16)(v_X0), (v_reinterpret_as_s16)(v_Y0));
         v_store_interleave(xy + x1 * 2 + 16, (v_reinterpret_as_s16)(v_X1), (v_reinterpret_as_s16)(v_Y1));
diff --git a/modules/imgproc/src/median_blur.simd.hpp b/modules/imgproc/src/median_blur.simd.hpp
index 7d8423d322..1fe2e4060c 100644
--- a/modules/imgproc/src/median_blur.simd.hpp
+++ b/modules/imgproc/src/median_blur.simd.hpp
@@ -179,10 +179,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
             for (k = 0; k < 16; ++k)
             {
 #if CV_SIMD256
-                v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H.fine[k]));
+                v_store(H.fine[k], v_add(v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)), v256_load(H.fine[k])));
 #elif CV_SIMD128
-                v_store(H.fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k]));
-                v_store(H.fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k] + 8));
+                v_store(H.fine[k], v_add(v_mul_wrap(v_load(h_fine + 16 * n * (16 * c + k)), v_setall_u16((ushort)(2 * r + 1))), v_load(H.fine[k])));
+                v_store(H.fine[k] + 8, v_add(v_mul_wrap(v_load(h_fine + 16 * n * (16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))), v_load(H.fine[k] + 8)));
 #else
                 for (int ind = 0; ind < 16; ++ind)
                     H.fine[k][ind] = (HT)(H.fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]);
@@ -199,10 +199,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                 for( j = 0; j < 2*r; ++j, px += 16 )
                 {
 #if CV_SIMD256
-                    v_coarse += v256_load(px);
+                    v_coarse = v_add(v_coarse, v256_load(px));
 #elif CV_SIMD128
-                    v_coarsel += v_load(px);
-                    v_coarseh += v_load(px + 8);
+                    v_coarsel = v_add(v_coarsel, v_load(px));
+                    v_coarseh = v_add(v_coarseh, v_load(px + 8));
 #else
                     for (int ind = 0; ind < 16; ++ind)
                         H.coarse[ind] += px[ind];
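Every hunk in this file follows the same mechanical rewrite: infix operators on vector registers become the named wrappers (v_add, v_sub, v_and, v_gt, ...), and compound assignments such as a += b become a = v_add(a, b). As a reading aid, here is a minimal sketch in that style; it is illustrative only, assumes a CV_SIMD-enabled build, and its function and buffer names are made up rather than taken from the patch:

    // Illustrative sketch: the operator -> named-intrinsic style this patch migrates to.
    #include <opencv2/core/hal/intrin.hpp>

    static int sumAboveThreshold(const int* data, int len, int thr)
    {
        using namespace cv;
        v_int32 acc  = vx_setzero_s32();
        v_int32 vthr = vx_setall_s32(thr);
        int i = 0;
        for (; i <= len - VTraits<v_int32>::vlanes(); i += VTraits<v_int32>::vlanes())
        {
            v_int32 x    = vx_load(data + i);
            v_int32 mask = v_gt(x, vthr);         // was: x > vthr
            acc = v_add(acc, v_and(x, mask));     // was: acc += (x & mask)
        }
        int s = v_reduce_sum(acc);                // horizontal sum of the lanes
        for (; i < len; ++i)                      // scalar tail
            if (data[i] > thr)
                s += data[i];
        return s;
    }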
@@ -216,11 +216,11 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                 px = h_coarse + 16 * (n*c + std::min(j + r, n - 1));
 #if CV_SIMD256
-                v_coarse += v256_load(px);
+                v_coarse = v_add(v_coarse, v256_load(px));
                 v_store(H.coarse, v_coarse);
 #elif CV_SIMD128
-                v_coarsel += v_load(px);
-                v_coarseh += v_load(px + 8);
+                v_coarsel = v_add(v_coarsel, v_load(px));
+                v_coarseh = v_add(v_coarseh, v_load(px + 8));
                 v_store(H.coarse, v_coarsel);
                 v_store(H.coarse + 8, v_coarseh);
 #else
@@ -261,10 +261,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     for (luc[k] = HT(j - r); luc[k] < MIN(j + r + 1, n); ++luc[k], px += 16)
                     {
 #if CV_SIMD256
-                        v_fine += v256_load(px);
+                        v_fine = v_add(v_fine, v256_load(px));
 #elif CV_SIMD128
-                        v_finel += v_load(px);
-                        v_fineh += v_load(px + 8);
+                        v_finel = v_add(v_finel, v_load(px));
+                        v_fineh = v_add(v_fineh, v_load(px + 8));
 #else
                         for (int ind = 0; ind < 16; ++ind)
                             H.fine[k][ind] += px[ind];
@@ -275,10 +275,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     {
                         px = h_fine + 16 * (n*(16 * c + k) + (n - 1));
 #if CV_SIMD256
-                        v_fine += v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n));
+                        v_fine = v_add(v_fine, v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n)));
 #elif CV_SIMD128
-                        v_finel += v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n)));
-                        v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n)));
+                        v_finel = v_add(v_finel, v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n))));
+                        v_fineh = v_add(v_fineh, v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n))));
 #else
                         for (int ind = 0; ind < 16; ++ind)
                             H.fine[k][ind] = (HT)(H.fine[k][ind] + (j + r + 1 - n) * px[ind]);
@@ -298,10 +298,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     for ( ; luc[k] < j+r+1; ++luc[k] )
                     {
 #if CV_SIMD256
-                        v_fine = v_fine + v256_load(px + 16 * MIN(luc[k], n - 1)) - v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
+                        v_fine = v_sub(v_add(v_fine, v256_load(px + 16 * MIN(luc[k], n - 1))), v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)));
 #elif CV_SIMD128
-                        v_finel = v_finel + v_load(px + 16 * MIN(luc[k], n - 1) ) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
-                        v_fineh = v_fineh + v_load(px + 16 * MIN(luc[k], n - 1) + 8) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8);
+                        v_finel = v_sub(v_add(v_finel, v_load(px + 16 * MIN(luc[k], n - 1) )), v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)));
+                        v_fineh = v_sub(v_add(v_fineh, v_load(px + 16 * MIN(luc[k], n - 1) + 8)), v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8));
 #else
                         for (int ind = 0; ind < 16; ++ind)
                             H.fine[k][ind] += px[16 * MIN(luc[k], n - 1) + ind] - px[16 * MAX(luc[k] - 2 * r - 1, 0) + ind];
@@ -312,12 +312,12 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     px = h_coarse + 16 * (n*c + MAX(j - r, 0));
 #if CV_SIMD256
                     v_store(H.fine[k], v_fine);
-                    v_coarse -= v256_load(px);
+                    v_coarse = v_sub(v_coarse, v256_load(px));
 #elif CV_SIMD128
                     v_store(H.fine[k], v_finel);
                     v_store(H.fine[k] + 8, v_fineh);
-                    v_coarsel -= v_load(px);
-                    v_coarseh -= v_load(px + 8);
+                    v_coarsel = v_sub(v_coarsel, v_load(px));
+                    v_coarseh = v_sub(v_coarseh, v_load(px + 8));
 #else
                     for (int ind = 0; ind < 16; ++ind)
                         H.coarse[ind] -= px[ind];
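The files that follow (moments.cpp, pyramids.cpp, resize.cpp, sumpixels.simd.hpp) add two more recurring substitutions: the compile-time lane count T::nlanes becomes the call VTraits<T>::vlanes(), which also works when the vector length is only known at run time, and member-style element access such as .get0() becomes the free function v_get0(). A small illustrative loop, not taken from the patch (the buffer name is hypothetical), assuming a CV_SIMD-enabled build:

    // Illustrative sketch: VTraits<T>::vlanes() loop bound and v_get0() element access.
    #include <opencv2/core/hal/intrin.hpp>

    static int vectorSumLane0(const int* row, int width)
    {
        using namespace cv;
        v_int32 acc = vx_setzero_s32();
        int x = 0;
        // was: for (; x <= width - v_int32::nlanes; x += v_int32::nlanes)
        for (; x <= width - VTraits<v_int32>::vlanes(); x += VTraits<v_int32>::vlanes())
            acc = v_add(acc, vx_load(row + x));
        // was: acc.get0()
        return v_get0(acc);   // lane 0 of the running vector sum
    }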
diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp
index 204c8654af..523ea586d4 100644
--- a/modules/imgproc/src/moments.cpp
+++ b/modules/imgproc/src/moments.cpp
@@ -236,12 +236,12 @@ struct MomentsInTile_SIMD
                 v_int16x8 p = v_reinterpret_as_s16(v_load_expand(ptr + x));
                 v_int16x8 sx = v_mul_wrap(qx, qx);
-                qx0 += v_reinterpret_as_u32(p);
+                qx0 = v_add(qx0, v_reinterpret_as_u32(p));
                 qx1 = v_reinterpret_as_u32(v_dotprod(p, qx, v_reinterpret_as_s32(qx1)));
                 qx2 = v_reinterpret_as_u32(v_dotprod(p, sx, v_reinterpret_as_s32(qx2)));
                 qx3 = v_reinterpret_as_u32(v_dotprod(v_mul_wrap(p, qx), sx, v_reinterpret_as_s32(qx3)));
-                qx += dx;
+                qx = v_add(qx, dx);
             }
             x0 = v_reduce_sum(qx0);
@@ -276,19 +276,19 @@ struct MomentsInTile_SIMD
             {
                 v_int32x4 v_src = v_reinterpret_as_s32(v_load_expand(ptr + x));
-                v_x0 += v_reinterpret_as_u32(v_src);
-                v_x1 += v_reinterpret_as_u32(v_src * v_ix0);
+                v_x0 = v_add(v_x0, v_reinterpret_as_u32(v_src));
+                v_x1 = v_add(v_x1, v_reinterpret_as_u32(v_mul(v_src, v_ix0)));
-                v_int32x4 v_ix1 = v_ix0 * v_ix0;
-                v_x2 += v_reinterpret_as_u32(v_src * v_ix1);
+                v_int32x4 v_ix1 = v_mul(v_ix0, v_ix0);
+                v_x2 = v_add(v_x2, v_reinterpret_as_u32(v_mul(v_src, v_ix1)));
-                v_ix1 = v_ix0 * v_ix1;
-                v_src = v_src * v_ix1;
+                v_ix1 = v_mul(v_ix0, v_ix1);
+                v_src = v_mul(v_src, v_ix1);
                 v_uint64x2 v_lo, v_hi;
                 v_expand(v_reinterpret_as_u32(v_src), v_lo, v_hi);
-                v_x3 += v_lo + v_hi;
+                v_x3 = v_add(v_x3, v_add(v_lo, v_hi));
-                v_ix0 += v_delta;
+                v_ix0 = v_add(v_ix0, v_delta);
             }
             x0 = v_reduce_sum(v_x0);
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index dae09564d3..f65ae62158 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -463,7 +463,7 @@ template<> int PyrDownVecV<int, uchar>(int** src, uchar* dst, int width)
     }
 #if CV_SIMD128
     typedef int CV_DECL_ALIGNED(1) unaligned_int;
-    for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+    for ( ; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
     {
         v_int32x4 r0, r1, r2, r3, r4, t0;
         r0 = v_load(row0 + x);
         r4 = v_load(row4 + x);
         t0 = v_add(v_add(v_add(r0, r4), v_add(r2, r2)), v_shl<2>(v_add(v_add(r1, r3), r2)));
-        *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
+        *((unaligned_int*) (dst + x)) = v_get0(v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())));
     }
 #else
     for (; x <= width - 1; x += 1)
@@ -615,15 +615,15 @@ template <> int PyrUpVecV<int, uchar>(int** src, uchar** dst, int width)
     }
 #if CV_SIMD128
     typedef int CV_DECL_ALIGNED(1) unaligned_int;
-    for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+    for (; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x);
         v_int32 v_2r10 = v_add(v_r10, v_r10);
         v_int16 d = v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20)));
-        *(unaligned_int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0();
-        *(unaligned_int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0();
+        *(unaligned_int*)(dst0 + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())));
+        *(unaligned_int*)(dst1 + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())));
     }
 #else
     for (; x <= width - 1; x += 1)
@@ -754,14 +754,14 @@ template <> int PyrUpVecVOneRow<int, uchar>(int** src, uchar* dst, int width)
     }
 #if CV_SIMD128
     typedef int CV_DECL_ALIGNED(1) unaligned_int;
-    for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+    for (; x <= width - VTraits<v_int32x4>::vlanes(); x += VTraits<v_int32x4>::vlanes())
     {
         v_int32 v_r00 = vx_load(row0 + x), v_r10 = vx_load(row1 + x), v_r20 = vx_load(row2 + x);
         v_int32 v_2r10 = v_add(v_r10, v_r10);
         v_int16 d =
v_pack(v_add(v_add(v_r00, v_r20), v_add(v_add(v_2r10, v_2r10), v_2r10)), v_shl<2>(v_add(v_r10, v_r20))); - *(unaligned_int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0(); + *(unaligned_int*)(dst + x) = v_get0(v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16()))); } #else for (; x <= width - 1; x += 1) diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 1ad8e8932d..4668f0bdf3 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -2473,7 +2473,7 @@ public: v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_uint16 bl, gl, rl; #if CV_SIMD_WIDTH == 16 - bl = t0 + t3; gl = t1 + t4; rl = t2 + t5; + bl = v_add(t0, t3); gl = v_add(t1, t4); rl = v_add(t2, t5); #elif CV_SIMD_WIDTH == 32 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bl = v_add(s0, s3); gl = v_add(s1, s4); rl = v_add(s2, s5); @@ -2493,7 +2493,7 @@ public: v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5); v_uint16 bh, gh, rh; #if CV_SIMD_WIDTH == 16 - bh = t0 + t3; gh = t1 + t4; rh = t2 + t5; + bh = v_add(t0, t3); gh = v_add(t1, t4); rh = v_add(t2, t5); #elif CV_SIMD_WIDTH == 32 v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5); bh = v_add(s0, s3); gh = v_add(s1, s4); rh = v_add(s2, s5); @@ -2566,7 +2566,7 @@ public: v_rshr_pack_store<2>(D, r0 + v_rotate_left<1>(r1, r0)); } #else - v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); + v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3))); #endif #elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) @@ -2609,7 +2609,7 @@ public: } #elif CV_SIMD_WIDTH >= 64 v_uint32 masklow = vx_setall_u32(0x0000ffff); - for ( ; dx <= w - 3*v_uint16::nlanes; dx += 3*v_uint16::nlanes, S0 += 6*v_uint16::nlanes, S1 += 6*v_uint16::nlanes, D += 3*v_uint16::nlanes) + for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { v_uint16 b0, g0, r0, b1, g1, r1; v_load_deinterleave(S0, b0, g0, r0); @@ -2617,8 +2617,8 @@ public: v_uint32 bl = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow); v_uint32 gl = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow); v_uint32 rl = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + (v_reinterpret_as_u32(r1) & masklow); - v_load_deinterleave(S0 + 3*v_uint16::nlanes, b0, g0, r0); - v_load_deinterleave(S1 + 3*v_uint16::nlanes, b1, g1, r1); + v_load_deinterleave(S0 + 3*VTraits::vlanes(), b0, g0, r0); + v_load_deinterleave(S1 + 3*VTraits::vlanes(), b1, g1, r1); v_uint32 bh = (v_reinterpret_as_u32(b0) >> 16) + (v_reinterpret_as_u32(b0) & masklow) + (v_reinterpret_as_u32(b1) >> 16) + (v_reinterpret_as_u32(b1) & masklow); v_uint32 gh = (v_reinterpret_as_u32(g0) >> 16) + (v_reinterpret_as_u32(g0) & masklow) + (v_reinterpret_as_u32(g1) >> 16) + (v_reinterpret_as_u32(g1) & masklow); v_uint32 rh = (v_reinterpret_as_u32(r0) >> 16) + (v_reinterpret_as_u32(r0) & masklow) + (v_reinterpret_as_u32(r1) >> 16) + 
(v_reinterpret_as_u32(r1) & masklow); @@ -2630,7 +2630,7 @@ public: { CV_Assert(cn == 4); #if CV_SIMD_WIDTH >= 64 - for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes) + for ( ; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2*VTraits::vlanes(), S1 += 2*VTraits::vlanes(), D += VTraits::vlanes()) { v_uint64 r00, r01, r10, r11; v_load_deinterleave((uint64_t*)S0, r00, r01); @@ -2652,7 +2652,7 @@ public: r0 = v_add(r0, r2); r1 = v_add(r1, r3); v_uint32 v_d; #if CV_SIMD_WIDTH == 16 - v_d = r0 + r1; + v_d = v_add(r0, r1); #elif CV_SIMD_WIDTH == 32 v_uint32 t0, t1; v_recombine(r0, r1, t0, t1); @@ -2697,7 +2697,7 @@ public: { #if CV_SIMD_WIDTH == 16 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) - v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3)); + v_rshr_pack_store<2>(D, v_add(v_add(v_add(v_load_expand(S0), v_load_expand(S0 + 3)), v_load_expand(S1)), v_load_expand(S1 + 3))); #elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64 for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { @@ -2738,7 +2738,7 @@ public: v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh)); } #elif CV_SIMD_WIDTH >= 64 - for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes) + for ( ; dx <= w - 3*VTraits::vlanes(); dx += 3*VTraits::vlanes(), S0 += 6*VTraits::vlanes(), S1 += 6*VTraits::vlanes(), D += 3*VTraits::vlanes()) { v_int16 b0, g0, r0, b1, g1, r1; v_load_deinterleave(S0, b0, g0, r0); @@ -2746,8 +2746,8 @@ public: v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16); v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16); v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16); - v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0); - v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1); + v_load_deinterleave(S0 + 3*VTraits::vlanes(), b0, g0, r0); + v_load_deinterleave(S1 + 3*VTraits::vlanes(), b1, g1, r1); v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16); v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16); v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16); @@ -2779,7 +2779,7 @@ public: r3 = v_add(vx_load_expand(S0 + 3 * VTraits::vlanes()), vx_load_expand(S1 + 3 * VTraits::vlanes())); v_int32 dl, dh; #if CV_SIMD_WIDTH == 16 - dl = r0 + r1; dh = r2 + r3; + dl = v_add(r0, r1); dh = v_add(r2, r3); #elif CV_SIMD_WIDTH == 32 v_int32 t0, t1, t2, t3; v_recombine(r0, r1, t0, t1); v_recombine(r2, r3, t2, t3); @@ -2829,14 +2829,14 @@ struct ResizeAreaFastVec_SIMD_32f { #if CV_SIMD_WIDTH == 16 v_float32 v_025 = vx_setall_f32(0.25f); - for (; dx <= w - v_float32::nlanes; dx += 
v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes) - v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025); + for (; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2*VTraits::vlanes(), S1 += 2*VTraits::vlanes(), D += VTraits::vlanes()) + v_store(D, v_mul(v_add(v_add(vx_load(S0), vx_load(S0 + VTraits::vlanes())), v_add(vx_load(S1), vx_load(S1 + VTraits::vlanes()))), v_025)); #elif CV_SIMD256 v_float32x8 v_025 = v256_setall_f32(0.25f); - for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes) + for (; dx <= w - VTraits::vlanes(); dx += VTraits::vlanes(), S0 += 2*VTraits::vlanes(), S1 += 2*VTraits::vlanes(), D += VTraits::vlanes()) { v_float32x8 dst0, dst1; - v_recombine(v_add(v256_load(S0), v256_load(S1)), v_add(v256_load(S0 + v_float32x8::nlanes), v256_load(S1 + v_float32x8::nlanes)), dst0, dst1); + v_recombine(v_add(v256_load(S0), v256_load(S1)), v_add(v256_load(S0 + VTraits::vlanes()), v256_load(S1 + VTraits::vlanes())), dst0, dst1); v_store(D, v_mul(v_add(dst0, dst1), v_025)); } #endif diff --git a/modules/imgproc/src/sumpixels.simd.hpp b/modules/imgproc/src/sumpixels.simd.hpp index f5f3a92d85..208ffc1231 100644 --- a/modules/imgproc/src/sumpixels.simd.hpp +++ b/modules/imgproc/src/sumpixels.simd.hpp @@ -114,7 +114,7 @@ struct Integral_SIMD v_int32 prev = vx_setzero_s32(); int j = 0; - for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for ( ; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_int32 el4l, el4h; @@ -127,8 +127,8 @@ struct Integral_SIMD el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask)); prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask); #else - el8 += v_rotate_left<1>(el8); - el8 += v_rotate_left<2>(el8); + el8 = v_add(el8, v_rotate_left<1>(el8)); + el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 @@ -136,12 +136,12 @@ struct Integral_SIMD #endif #endif v_expand(el8, el4l, el4h); - el4l += prev; - el4h += el4l; - prev = v_broadcast_element(el4h); + el4l = v_add(el4l, prev); + el4h = v_add(el4h, el4l); + prev = v_broadcast_highest(el4h); #endif - v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes)); + v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits::vlanes()))); } for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j) @@ -162,11 +162,11 @@ struct Integral_SIMD v_int32 prev_1 = vx_setzero_s32(), prev_2 = vx_setzero_s32(); int j = 0; - for ( ; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + for ( ; j + VTraits::vlanes() * cn <= width; j += VTraits::vlanes() * cn) { v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j)); - v_int16 el8_1 = v_src_row & mask; - v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8); + v_int16 el8_1 = v_and(v_src_row, mask); + v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row))); v_int32 el4l_1, el4h_1, el4l_2, el4h_2; #if CV_AVX2 && CV_SIMD_WIDTH == 32 __m256i vsum_1 = _mm256_add_epi16(el8_1.val, 
_mm256_slli_si256(el8_1.val, 2)); @@ -183,10 +183,10 @@ struct Integral_SIMD prev_1.val = _mm256_permutevar8x32_epi32(el4h_1.val, shmask); prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2)); + el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 el8_1 += v_rotate_left<4>(el8_1); el8_2 += v_rotate_left<4>(el8_2); @@ -197,20 +197,20 @@ struct Integral_SIMD #endif v_expand(el8_1, el4l_1, el4h_1); v_expand(el8_2, el4l_2, el4h_2); - el4l_1 += prev_1; - el4l_2 += prev_2; - el4h_1 += el4l_1; - el4h_2 += el4l_2; - prev_1 = v_broadcast_element(el4h_1); - prev_2 = v_broadcast_element(el4h_2); + el4l_1 = v_add(el4l_1, prev_1); + el4l_2 = v_add(el4l_2, prev_2); + el4h_1 = v_add(el4h_1, el4l_1); + el4h_2 = v_add(el4h_2, el4l_2); + prev_1 = v_broadcast_highest(el4h_1); + prev_2 = v_broadcast_highest(el4h_2); #endif v_int32 el4_1, el4_2, el4_3, el4_4; v_zip(el4l_1, el4l_2, el4_1, el4_2); v_zip(el4h_1, el4h_2, el4_3, el4_4); - v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_int32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_int32::nlanes )); - v_store(sum_row + j + v_int32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2)); - v_store(sum_row + j + v_int32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_int32::nlanes * 3)); + v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); } for (int v2 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -230,7 +230,7 @@ struct Integral_SIMD const uchar * src_row = src + _srcstep * i; int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + cn; int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + cn; - int row_cache[v_int32::nlanes * 6]; + int row_cache[VTraits::max_nlanes * 6]; sum_row[-1] = sum_row[-2] = sum_row[-3] = 0; @@ -238,10 +238,10 @@ struct Integral_SIMD prev_3 = vx_setzero_s32(); int j = 0; const int j_max = - ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height) - ? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave() - : width - v_uint16::nlanes * cn; // v_expand_low - for ( ; j <= j_max; j += v_uint16::nlanes * cn) + ((_srcstep * i + (width - VTraits::vlanes() * cn + VTraits::vlanes() * cn)) >= _srcstep * height) + ? 
width - VTraits::vlanes() * cn // uint8 in v_load_deinterleave() + : width - VTraits::vlanes() * cn; // v_expand_low + for ( ; j <= j_max; j += VTraits::vlanes() * cn) { v_uint8 v_src_row_1, v_src_row_2, v_src_row_3; v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3); @@ -270,49 +270,49 @@ struct Integral_SIMD prev_2.val = _mm256_permutevar8x32_epi32(el4h_2.val, shmask); prev_3.val = _mm256_permutevar8x32_epi32(el4h_3.val, shmask); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_3 += v_rotate_left<1>(el8_3); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); - el8_3 += v_rotate_left<2>(el8_3); + el8_1 = v_add(el8_1,v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2,v_rotate_left<1>(el8_2)); + el8_3 = v_add(el8_3,v_rotate_left<1>(el8_3)); + el8_1 = v_add(el8_1,v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2,v_rotate_left<2>(el8_2)); + el8_3 = v_add(el8_3,v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 - el8_1 += v_rotate_left<4>(el8_1); - el8_2 += v_rotate_left<4>(el8_2); - el8_3 += v_rotate_left<4>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<4>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<4>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<4>(el8_3)); #if CV_SIMD_WIDTH == 64 - el8_1 += v_rotate_left<8>(el8_1); - el8_2 += v_rotate_left<8>(el8_2); - el8_3 += v_rotate_left<8>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<8>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<8>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<8>(el8_3)); #endif #endif v_expand(el8_1, el4l_1, el4h_1); v_expand(el8_2, el4l_2, el4h_2); v_expand(el8_3, el4l_3, el4h_3); - el4l_1 += prev_1; - el4l_2 += prev_2; - el4l_3 += prev_3; - el4h_1 += el4l_1; - el4h_2 += el4l_2; - el4h_3 += el4l_3; - prev_1 = v_broadcast_element(el4h_1); - prev_2 = v_broadcast_element(el4h_2); - prev_3 = v_broadcast_element(el4h_3); + el4l_1 = v_add(el4l_1, prev_1); + el4l_2 = v_add(el4l_2, prev_2); + el4l_3 = v_add(el4l_3, prev_3); + el4h_1 = v_add(el4h_1, el4l_1); + el4h_2 = v_add(el4h_2, el4l_2); + el4h_3 = v_add(el4h_3, el4l_3); + prev_1 = v_broadcast_highest(el4h_1); + prev_2 = v_broadcast_highest(el4h_2); + prev_3 = v_broadcast_highest(el4h_3); #endif v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3); - v_store_interleave(row_cache + v_int32::nlanes * 3, el4h_1, el4h_2, el4h_3); + v_store_interleave(row_cache + VTraits::vlanes() * 3, el4h_1, el4h_2, el4h_3); el4l_1 = vx_load(row_cache ); - el4l_2 = vx_load(row_cache + v_int32::nlanes ); - el4l_3 = vx_load(row_cache + v_int32::nlanes * 2); - el4h_1 = vx_load(row_cache + v_int32::nlanes * 3); - el4h_2 = vx_load(row_cache + v_int32::nlanes * 4); - el4h_3 = vx_load(row_cache + v_int32::nlanes * 5); - v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_int32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_int32::nlanes )); - v_store(sum_row + j + v_int32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 2)); - v_store(sum_row + j + v_int32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_int32::nlanes * 3)); - v_store(sum_row + j + v_int32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_int32::nlanes * 4)); - v_store(sum_row + j + v_int32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_int32::nlanes * 5)); + el4l_2 = vx_load(row_cache + VTraits::vlanes() ); + el4l_3 = vx_load(row_cache + VTraits::vlanes() * 2); + el4h_1 = vx_load(row_cache + VTraits::vlanes() * 3); + el4h_2 = vx_load(row_cache + VTraits::vlanes() * 4); + el4h_3 = vx_load(row_cache + 
VTraits::vlanes() * 5); + v_store(sum_row + j , v_add(el4l_1, vx_load(prev_sum_row + j ))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4l_2, vx_load(prev_sum_row + j + VTraits::vlanes() ))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4l_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4h_1, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); + v_store(sum_row + j + VTraits::vlanes() * 4, v_add(el4h_2, vx_load(prev_sum_row + j + VTraits::vlanes() * 4))); + v_store(sum_row + j + VTraits::vlanes() * 5, v_add(el4h_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 5))); } for (int v3 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -339,7 +339,7 @@ struct Integral_SIMD v_int32 prev = vx_setzero_s32(); int j = 0; - for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for ( ; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_int32 el4l, el4h; @@ -356,8 +356,8 @@ struct Integral_SIMD #endif #endif v_expand(el8, el4l, el4h); - el4l += prev; - el4h += el4l; + el4l = v_add(el4l, prev); + el4h = v_add(el4h, el4l); #if CV_SIMD_WIDTH == 16 prev = el4h; #elif CV_SIMD_WIDTH == 32 @@ -368,8 +368,8 @@ struct Integral_SIMD prev = v_combine_low(t, t); #endif #endif - v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes)); + v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits::vlanes()))); } for (int v4 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -426,7 +426,7 @@ struct Integral_SIMD v_float32 prev = vx_setzero_f32(); int j = 0; - for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for (; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_float32 el4l, el4h; @@ -439,8 +439,8 @@ struct Integral_SIMD el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask)); prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask); #else - el8 += v_rotate_left<1>(el8); - el8 += v_rotate_left<2>(el8); + el8 = v_add(el8, v_rotate_left<1>(el8)); + el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 @@ -449,12 +449,12 @@ struct Integral_SIMD #endif v_int32 el4li, el4hi; v_expand(el8, el4li, el4hi); - el4l = v_cvt_f32(el4li) + prev; - el4h = v_cvt_f32(el4hi) + el4l; - prev = v_broadcast_element(el4h); + el4l = v_add(v_cvt_f32(el4li), prev); + el4h = v_add(v_cvt_f32(el4hi), el4l); + prev = v_broadcast_highest(el4h); #endif - v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes)); + v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits::vlanes()))); } for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j) @@ -475,11 +475,11 @@ struct Integral_SIMD v_float32 prev_1 = vx_setzero_f32(), prev_2 = vx_setzero_f32(); int j = 0; - for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + for (; j + VTraits::vlanes() * cn <= width; j += VTraits::vlanes() * cn) { v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j)); - v_int16 el8_1 = 
v_src_row & mask; - v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8); + v_int16 el8_1 = v_and(v_src_row, mask); + v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row))); v_float32 el4l_1, el4h_1, el4l_2, el4h_2; #if CV_AVX2 && CV_SIMD_WIDTH == 32 __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2)); @@ -496,10 +496,10 @@ struct Integral_SIMD prev_1.val = _mm256_permutevar8x32_ps(el4h_1.val, shmask); prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2)); + el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 el8_1 += v_rotate_left<4>(el8_1); el8_2 += v_rotate_left<4>(el8_2); @@ -511,20 +511,20 @@ struct Integral_SIMD v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; v_expand(el8_1, el4li_1, el4hi_1); v_expand(el8_2, el4li_2, el4hi_2); - el4l_1 = v_cvt_f32(el4li_1) + prev_1; - el4l_2 = v_cvt_f32(el4li_2) + prev_2; - el4h_1 = v_cvt_f32(el4hi_1) + el4l_1; - el4h_2 = v_cvt_f32(el4hi_2) + el4l_2; - prev_1 = v_broadcast_element(el4h_1); - prev_2 = v_broadcast_element(el4h_2); + el4l_1 = v_add(v_cvt_f32(el4li_1), prev_1); + el4l_2 = v_add(v_cvt_f32(el4li_2), prev_2); + el4h_1 = v_add(v_cvt_f32(el4hi_1), el4l_1); + el4h_2 = v_add(v_cvt_f32(el4hi_2), el4l_2); + prev_1 = v_broadcast_highest(el4h_1); + prev_2 = v_broadcast_highest(el4h_2); #endif v_float32 el4_1, el4_2, el4_3, el4_4; v_zip(el4l_1, el4l_2, el4_1, el4_2); v_zip(el4h_1, el4h_2, el4_3, el4_4); - v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float32::nlanes , el4_2 + vx_load(prev_sum_row + j + v_float32::nlanes )); - v_store(sum_row + j + v_float32::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2)); - v_store(sum_row + j + v_float32::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float32::nlanes * 3)); + v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); } for (float v2 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -543,7 +543,7 @@ struct Integral_SIMD const uchar * src_row = src + _srcstep * i; float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + cn; float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + cn; - float row_cache[v_float32::nlanes * 6]; + float row_cache[VTraits::max_nlanes * 6]; sum_row[-1] = sum_row[-2] = sum_row[-3] = 0; @@ -551,10 +551,10 @@ struct Integral_SIMD prev_3 = vx_setzero_f32(); int j = 0; const int j_max = - ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height) - ? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave() - : width - v_uint16::nlanes * cn; // v_expand_low - for ( ; j <= j_max; j += v_uint16::nlanes * cn) + ((_srcstep * i + (width - VTraits::vlanes() * cn + VTraits::vlanes() * cn)) >= _srcstep * height) + ? 
width - VTraits::vlanes() * cn // uint8 in v_load_deinterleave() + : width - VTraits::vlanes() * cn; // v_expand_low + for ( ; j <= j_max; j += VTraits::vlanes() * cn) { v_uint8 v_src_row_1, v_src_row_2, v_src_row_3; v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3); @@ -583,12 +583,12 @@ struct Integral_SIMD prev_2.val = _mm256_permutevar8x32_ps(el4h_2.val, shmask); prev_3.val = _mm256_permutevar8x32_ps(el4h_3.val, shmask); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_3 += v_rotate_left<1>(el8_3); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); - el8_3 += v_rotate_left<2>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<1>(el8_3)); + el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 el8_1 += v_rotate_left<4>(el8_1); el8_2 += v_rotate_left<4>(el8_2); @@ -603,30 +603,30 @@ struct Integral_SIMD v_expand(el8_1, el4li_1, el4hi_1); v_expand(el8_2, el4li_2, el4hi_2); v_expand(el8_3, el4li_3, el4hi_3); - el4l_1 = v_cvt_f32(el4li_1) + prev_1; - el4l_2 = v_cvt_f32(el4li_2) + prev_2; - el4l_3 = v_cvt_f32(el4li_3) + prev_3; - el4h_1 = v_cvt_f32(el4hi_1) + el4l_1; - el4h_2 = v_cvt_f32(el4hi_2) + el4l_2; - el4h_3 = v_cvt_f32(el4hi_3) + el4l_3; - prev_1 = v_broadcast_element(el4h_1); - prev_2 = v_broadcast_element(el4h_2); - prev_3 = v_broadcast_element(el4h_3); + el4l_1 = v_add(v_cvt_f32(el4li_1), prev_1); + el4l_2 = v_add(v_cvt_f32(el4li_2), prev_2); + el4l_3 = v_add(v_cvt_f32(el4li_3), prev_3); + el4h_1 = v_add(v_cvt_f32(el4hi_1), el4l_1); + el4h_2 = v_add(v_cvt_f32(el4hi_2), el4l_2); + el4h_3 = v_add(v_cvt_f32(el4hi_3), el4l_3); + prev_1 = v_broadcast_highest(el4h_1); + prev_2 = v_broadcast_highest(el4h_2); + prev_3 = v_broadcast_highest(el4h_3); #endif v_store_interleave(row_cache , el4l_1, el4l_2, el4l_3); - v_store_interleave(row_cache + v_float32::nlanes * 3, el4h_1, el4h_2, el4h_3); + v_store_interleave(row_cache + VTraits::vlanes() * 3, el4h_1, el4h_2, el4h_3); el4l_1 = vx_load(row_cache ); - el4l_2 = vx_load(row_cache + v_float32::nlanes ); - el4l_3 = vx_load(row_cache + v_float32::nlanes * 2); - el4h_1 = vx_load(row_cache + v_float32::nlanes * 3); - el4h_2 = vx_load(row_cache + v_float32::nlanes * 4); - el4h_3 = vx_load(row_cache + v_float32::nlanes * 5); - v_store(sum_row + j , el4l_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float32::nlanes , el4l_2 + vx_load(prev_sum_row + j + v_float32::nlanes )); - v_store(sum_row + j + v_float32::nlanes * 2, el4l_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 2)); - v_store(sum_row + j + v_float32::nlanes * 3, el4h_1 + vx_load(prev_sum_row + j + v_float32::nlanes * 3)); - v_store(sum_row + j + v_float32::nlanes * 4, el4h_2 + vx_load(prev_sum_row + j + v_float32::nlanes * 4)); - v_store(sum_row + j + v_float32::nlanes * 5, el4h_3 + vx_load(prev_sum_row + j + v_float32::nlanes * 5)); + el4l_2 = vx_load(row_cache + VTraits::vlanes() ); + el4l_3 = vx_load(row_cache + VTraits::vlanes() * 2); + el4h_1 = vx_load(row_cache + VTraits::vlanes() * 3); + el4h_2 = vx_load(row_cache + VTraits::vlanes() * 4); + el4h_3 = vx_load(row_cache + VTraits::vlanes() * 5); + v_store(sum_row + j , v_add(el4l_1, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4l_2, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j 
+ VTraits::vlanes() * 2, v_add(el4l_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4h_1, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); + v_store(sum_row + j + VTraits::vlanes() * 4, v_add(el4h_2, vx_load(prev_sum_row + j + VTraits::vlanes() * 4))); + v_store(sum_row + j + VTraits::vlanes() * 5, v_add(el4h_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 5))); } for (float v3 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -652,7 +652,7 @@ struct Integral_SIMD v_float32 prev = vx_setzero_f32(); int j = 0; - for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for ( ; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_float32 el4l, el4h; @@ -670,8 +670,8 @@ struct Integral_SIMD #endif v_int32 el4li, el4hi; v_expand(el8, el4li, el4hi); - el4l = v_cvt_f32(el4li) + prev; - el4h = v_cvt_f32(el4hi) + el4l; + el4l = v_add(v_cvt_f32(el4li), prev); + el4h = v_add(v_cvt_f32(el4hi), el4l); #if CV_SIMD_WIDTH == 16 prev = el4h; #elif CV_SIMD_WIDTH == 32 @@ -682,8 +682,8 @@ struct Integral_SIMD prev = v_combine_low(t, t); #endif #endif - v_store(sum_row + j , el4l + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes)); + v_store(sum_row + j , v_add(el4l, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes(), v_add(el4h, vx_load(prev_sum_row + j + VTraits::vlanes()))); } for (float v4 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -750,7 +750,7 @@ struct Integral_SIMD v_float64 prev = vx_setzero_f64(); int j = 0; - for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for (; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_float64 el4ll, el4lh, el4hl, el4hh; @@ -767,8 +767,8 @@ struct Integral_SIMD el4hh.val = _mm256_add_pd(_mm256_cvtepi32_pd(_v256_extract_high(el4h_32)), el4d); prev.val = _mm256_permute4x64_pd(el4hh.val, 0xff); #else - el8 += v_rotate_left<1>(el8); - el8 += v_rotate_left<2>(el8); + el8 = v_add(el8, v_rotate_left<1>(el8)); + el8 = v_add(el8, v_rotate_left<2>(el8)); #if CV_SIMD_WIDTH >= 32 el8 += v_rotate_left<4>(el8); #if CV_SIMD_WIDTH == 64 @@ -777,17 +777,17 @@ struct Integral_SIMD #endif v_int32 el4li, el4hi; v_expand(el8, el4li, el4hi); - el4ll = v_cvt_f64(el4li) + prev; - el4lh = v_cvt_f64_high(el4li) + prev; - el4hl = v_cvt_f64(el4hi) + el4ll; - el4hh = v_cvt_f64_high(el4hi) + el4lh; - prev = vx_setall_f64(v_extract_n(el4hh)); -// prev = v_broadcast_element(el4hh); + el4ll = v_add(v_cvt_f64(el4li), prev); + el4lh = v_add(v_cvt_f64_high(el4li), prev); + el4hl = v_add(v_cvt_f64(el4hi), el4ll); + el4hh = v_add(v_cvt_f64_high(el4hi), el4lh); + prev = vx_setall_f64(v_extract_highest(el4hh)); +// prev = v_broadcast_highest(el4hh); #endif - v_store(sum_row + j , el4ll + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float64::nlanes , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes )); - v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2)); - v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3)); + v_store(sum_row + j , v_add(el4ll, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4lh, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4hl, vx_load(prev_sum_row + j + 
VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4hh, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); } for (double v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j) @@ -808,11 +808,11 @@ struct Integral_SIMD v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64(); int j = 0; - for (; j + v_uint16::nlanes * cn <= width; j += v_uint16::nlanes * cn) + for (; j + VTraits::vlanes() * cn <= width; j += VTraits::vlanes() * cn) { v_int16 v_src_row = v_reinterpret_as_s16(vx_load(src_row + j)); - v_int16 el8_1 = v_src_row & mask; - v_int16 el8_2 = v_reinterpret_as_s16(v_reinterpret_as_u16(v_src_row) >> 8); + v_int16 el8_1 = v_and(v_src_row, mask); + v_int16 el8_2 = v_reinterpret_as_s16(v_shr<8>(v_reinterpret_as_u16(v_src_row))); v_float64 el4ll_1, el4lh_1, el4hl_1, el4hh_1, el4ll_2, el4lh_2, el4hl_2, el4hh_2; #if CV_AVX2 && CV_SIMD_WIDTH == 32 __m256i vsum_1 = _mm256_add_epi16(el8_1.val, _mm256_slli_si256(el8_1.val, 2)); @@ -838,10 +838,10 @@ struct Integral_SIMD prev_1.val = _mm256_permute4x64_pd(el4hh_1.val, 0xff); prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); + el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2)); + el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); #if CV_SIMD_WIDTH >= 32 el8_1 += v_rotate_left<4>(el8_1); el8_2 += v_rotate_left<4>(el8_2); @@ -853,32 +853,32 @@ struct Integral_SIMD v_int32 el4li_1, el4hi_1, el4li_2, el4hi_2; v_expand(el8_1, el4li_1, el4hi_1); v_expand(el8_2, el4li_2, el4hi_2); - el4ll_1 = v_cvt_f64(el4li_1) + prev_1; - el4ll_2 = v_cvt_f64(el4li_2) + prev_2; - el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1; - el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2; - el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1; - el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2; - el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1; - el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2; - prev_1 = vx_setall_f64(v_extract_n(el4hh_1)); - prev_2 = vx_setall_f64(v_extract_n(el4hh_2)); -// prev_1 = v_broadcast_element(el4hh_1); -// prev_2 = v_broadcast_element(el4hh_2); + el4ll_1 = v_add(v_cvt_f64(el4li_1), prev_1); + el4ll_2 = v_add(v_cvt_f64(el4li_2), prev_2); + el4lh_1 = v_add(v_cvt_f64_high(el4li_1), prev_1); + el4lh_2 = v_add(v_cvt_f64_high(el4li_2), prev_2); + el4hl_1 = v_add(v_cvt_f64(el4hi_1), el4ll_1); + el4hl_2 = v_add(v_cvt_f64(el4hi_2), el4ll_2); + el4hh_1 = v_add(v_cvt_f64_high(el4hi_1), el4lh_1); + el4hh_2 = v_add(v_cvt_f64_high(el4hi_2), el4lh_2); + prev_1 = vx_setall_f64(v_extract_highest(el4hh_1)); + prev_2 = vx_setall_f64(v_extract_highest(el4hh_2)); +// prev_1 = v_broadcast_highest(el4hh_1); +// prev_2 = v_broadcast_highest(el4hh_2); #endif v_float64 el4_1, el4_2, el4_3, el4_4, el4_5, el4_6, el4_7, el4_8; v_zip(el4ll_1, el4ll_2, el4_1, el4_2); v_zip(el4lh_1, el4lh_2, el4_3, el4_4); v_zip(el4hl_1, el4hl_2, el4_5, el4_6); v_zip(el4hh_1, el4hh_2, el4_7, el4_8); - v_store(sum_row + j , el4_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float64::nlanes , el4_2 + vx_load(prev_sum_row + j + v_float64::nlanes )); - v_store(sum_row + j + v_float64::nlanes * 2, el4_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2)); - v_store(sum_row + j + v_float64::nlanes * 3, el4_4 + vx_load(prev_sum_row + j + v_float64::nlanes * 3)); - v_store(sum_row + j + v_float64::nlanes * 4, el4_5 + vx_load(prev_sum_row + j + 
v_float64::nlanes * 4)); - v_store(sum_row + j + v_float64::nlanes * 5, el4_6 + vx_load(prev_sum_row + j + v_float64::nlanes * 5)); - v_store(sum_row + j + v_float64::nlanes * 6, el4_7 + vx_load(prev_sum_row + j + v_float64::nlanes * 6)); - v_store(sum_row + j + v_float64::nlanes * 7, el4_8 + vx_load(prev_sum_row + j + v_float64::nlanes * 7)); + v_store(sum_row + j , v_add(el4_1, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4_2, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4_4, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); + v_store(sum_row + j + VTraits::vlanes() * 4, v_add(el4_5, vx_load(prev_sum_row + j + VTraits::vlanes() * 4))); + v_store(sum_row + j + VTraits::vlanes() * 5, v_add(el4_6, vx_load(prev_sum_row + j + VTraits::vlanes() * 5))); + v_store(sum_row + j + VTraits::vlanes() * 6, v_add(el4_7, vx_load(prev_sum_row + j + VTraits::vlanes() * 6))); + v_store(sum_row + j + VTraits::vlanes() * 7, v_add(el4_8, vx_load(prev_sum_row + j + VTraits::vlanes() * 7))); } for (double v2 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -897,7 +897,7 @@ struct Integral_SIMD const uchar * src_row = src + _srcstep * i; double * prev_sum_row = (double *)((uchar *)sum + _sumstep * i) + cn; double * sum_row = (double *)((uchar *)sum + _sumstep * (i + 1)) + cn; - double row_cache[v_float64::nlanes * 12]; + double row_cache[VTraits::max_nlanes * 12]; sum_row[-1] = sum_row[-2] = sum_row[-3] = 0; @@ -905,10 +905,10 @@ struct Integral_SIMD prev_3 = vx_setzero_f64(); int j = 0; const int j_max = - ((_srcstep * i + (width - v_uint16::nlanes * cn + v_uint8::nlanes * cn)) >= _srcstep * height) - ? width - v_uint8::nlanes * cn // uint8 in v_load_deinterleave() - : width - v_uint16::nlanes * cn; // v_expand_low - for ( ; j <= j_max; j += v_uint16::nlanes * cn) + ((_srcstep * i + (width - VTraits::vlanes() * cn + VTraits::vlanes() * cn)) >= _srcstep * height) + ? 
width - VTraits::vlanes() * cn // uint8 in v_load_deinterleave() + : width - VTraits::vlanes() * cn; // v_expand_low + for ( ; j <= j_max; j += VTraits::vlanes() * cn) { v_uint8 v_src_row_1, v_src_row_2, v_src_row_3; v_load_deinterleave(src_row + j, v_src_row_1, v_src_row_2, v_src_row_3); @@ -951,12 +951,12 @@ struct Integral_SIMD prev_2.val = _mm256_permute4x64_pd(el4hh_2.val, 0xff); prev_3.val = _mm256_permute4x64_pd(el4hh_3.val, 0xff); #else - el8_1 += v_rotate_left<1>(el8_1); - el8_2 += v_rotate_left<1>(el8_2); - el8_3 += v_rotate_left<1>(el8_3); - el8_1 += v_rotate_left<2>(el8_1); - el8_2 += v_rotate_left<2>(el8_2); - el8_3 += v_rotate_left<2>(el8_3); + el8_1 = v_add(el8_1, v_rotate_left<1>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<1>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<1>(el8_3)); + el8_1 = v_add(el8_1, v_rotate_left<2>(el8_1)); + el8_2 = v_add(el8_2, v_rotate_left<2>(el8_2)); + el8_3 = v_add(el8_3, v_rotate_left<2>(el8_3)); #if CV_SIMD_WIDTH >= 32 el8_1 += v_rotate_left<4>(el8_1); el8_2 += v_rotate_left<4>(el8_2); @@ -971,53 +971,53 @@ struct Integral_SIMD v_expand(el8_1, el4li_1, el4hi_1); v_expand(el8_2, el4li_2, el4hi_2); v_expand(el8_3, el4li_3, el4hi_3); - el4ll_1 = v_cvt_f64(el4li_1) + prev_1; - el4ll_2 = v_cvt_f64(el4li_2) + prev_2; - el4ll_3 = v_cvt_f64(el4li_3) + prev_3; - el4lh_1 = v_cvt_f64_high(el4li_1) + prev_1; - el4lh_2 = v_cvt_f64_high(el4li_2) + prev_2; - el4lh_3 = v_cvt_f64_high(el4li_3) + prev_3; - el4hl_1 = v_cvt_f64(el4hi_1) + el4ll_1; - el4hl_2 = v_cvt_f64(el4hi_2) + el4ll_2; - el4hl_3 = v_cvt_f64(el4hi_3) + el4ll_3; - el4hh_1 = v_cvt_f64_high(el4hi_1) + el4lh_1; - el4hh_2 = v_cvt_f64_high(el4hi_2) + el4lh_2; - el4hh_3 = v_cvt_f64_high(el4hi_3) + el4lh_3; - prev_1 = vx_setall_f64(v_extract_n(el4hh_1)); - prev_2 = vx_setall_f64(v_extract_n(el4hh_2)); - prev_3 = vx_setall_f64(v_extract_n(el4hh_3)); -// prev_1 = v_broadcast_element(el4hh_1); -// prev_2 = v_broadcast_element(el4hh_2); -// prev_3 = v_broadcast_element(el4hh_3); + el4ll_1 = v_add(v_cvt_f64(el4li_1), prev_1); + el4ll_2 = v_add(v_cvt_f64(el4li_2), prev_2); + el4ll_3 = v_add(v_cvt_f64(el4li_3), prev_3); + el4lh_1 = v_add(v_cvt_f64_high(el4li_1), prev_1); + el4lh_2 = v_add(v_cvt_f64_high(el4li_2), prev_2); + el4lh_3 = v_add(v_cvt_f64_high(el4li_3), prev_3); + el4hl_1 = v_add(v_cvt_f64(el4hi_1), el4ll_1); + el4hl_2 = v_add(v_cvt_f64(el4hi_2), el4ll_2); + el4hl_3 = v_add(v_cvt_f64(el4hi_3), el4ll_3); + el4hh_1 = v_add(v_cvt_f64_high(el4hi_1), el4lh_1); + el4hh_2 = v_add(v_cvt_f64_high(el4hi_2), el4lh_2); + el4hh_3 = v_add(v_cvt_f64_high(el4hi_3), el4lh_3); + prev_1 = vx_setall_f64(v_extract_highest(el4hh_1)); + prev_2 = vx_setall_f64(v_extract_highest(el4hh_2)); + prev_3 = vx_setall_f64(v_extract_highest(el4hh_3)); +// prev_1 = v_broadcast_highest(el4hh_1); +// prev_2 = v_broadcast_highest(el4hh_2); +// prev_3 = v_broadcast_highest(el4hh_3); #endif v_store_interleave(row_cache , el4ll_1, el4ll_2, el4ll_3); - v_store_interleave(row_cache + v_float64::nlanes * 3, el4lh_1, el4lh_2, el4lh_3); - v_store_interleave(row_cache + v_float64::nlanes * 6, el4hl_1, el4hl_2, el4hl_3); - v_store_interleave(row_cache + v_float64::nlanes * 9, el4hh_1, el4hh_2, el4hh_3); + v_store_interleave(row_cache + VTraits::vlanes() * 3, el4lh_1, el4lh_2, el4lh_3); + v_store_interleave(row_cache + VTraits::vlanes() * 6, el4hl_1, el4hl_2, el4hl_3); + v_store_interleave(row_cache + VTraits::vlanes() * 9, el4hh_1, el4hh_2, el4hh_3); el4ll_1 = vx_load(row_cache ); - el4ll_2 = vx_load(row_cache + v_float64::nlanes ); - el4ll_3 
= vx_load(row_cache + v_float64::nlanes * 2 ); - el4lh_1 = vx_load(row_cache + v_float64::nlanes * 3 ); - el4lh_2 = vx_load(row_cache + v_float64::nlanes * 4 ); - el4lh_3 = vx_load(row_cache + v_float64::nlanes * 5 ); - el4hl_1 = vx_load(row_cache + v_float64::nlanes * 6 ); - el4hl_2 = vx_load(row_cache + v_float64::nlanes * 7 ); - el4hl_3 = vx_load(row_cache + v_float64::nlanes * 8 ); - el4hh_1 = vx_load(row_cache + v_float64::nlanes * 9 ); - el4hh_2 = vx_load(row_cache + v_float64::nlanes * 10); - el4hh_3 = vx_load(row_cache + v_float64::nlanes * 11); - v_store(sum_row + j , el4ll_1 + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float64::nlanes , el4ll_2 + vx_load(prev_sum_row + j + v_float64::nlanes )); - v_store(sum_row + j + v_float64::nlanes * 2 , el4ll_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 2 )); - v_store(sum_row + j + v_float64::nlanes * 3 , el4lh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 3 )); - v_store(sum_row + j + v_float64::nlanes * 4 , el4lh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 4 )); - v_store(sum_row + j + v_float64::nlanes * 5 , el4lh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 5 )); - v_store(sum_row + j + v_float64::nlanes * 6 , el4hl_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 6 )); - v_store(sum_row + j + v_float64::nlanes * 7 , el4hl_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 7 )); - v_store(sum_row + j + v_float64::nlanes * 8 , el4hl_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 8 )); - v_store(sum_row + j + v_float64::nlanes * 9 , el4hh_1 + vx_load(prev_sum_row + j + v_float64::nlanes * 9 )); - v_store(sum_row + j + v_float64::nlanes * 10, el4hh_2 + vx_load(prev_sum_row + j + v_float64::nlanes * 10)); - v_store(sum_row + j + v_float64::nlanes * 11, el4hh_3 + vx_load(prev_sum_row + j + v_float64::nlanes * 11)); + el4ll_2 = vx_load(row_cache + VTraits::vlanes() ); + el4ll_3 = vx_load(row_cache + VTraits::vlanes() * 2 ); + el4lh_1 = vx_load(row_cache + VTraits::vlanes() * 3 ); + el4lh_2 = vx_load(row_cache + VTraits::vlanes() * 4 ); + el4lh_3 = vx_load(row_cache + VTraits::vlanes() * 5 ); + el4hl_1 = vx_load(row_cache + VTraits::vlanes() * 6 ); + el4hl_2 = vx_load(row_cache + VTraits::vlanes() * 7 ); + el4hl_3 = vx_load(row_cache + VTraits::vlanes() * 8 ); + el4hh_1 = vx_load(row_cache + VTraits::vlanes() * 9 ); + el4hh_2 = vx_load(row_cache + VTraits::vlanes() * 10); + el4hh_3 = vx_load(row_cache + VTraits::vlanes() * 11); + v_store(sum_row + j , v_add(el4ll_1, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4ll_2, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2 , v_add(el4ll_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3 , v_add(el4lh_1, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); + v_store(sum_row + j + VTraits::vlanes() * 4 , v_add(el4lh_2, vx_load(prev_sum_row + j + VTraits::vlanes() * 4))); + v_store(sum_row + j + VTraits::vlanes() * 5 , v_add(el4lh_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 5))); + v_store(sum_row + j + VTraits::vlanes() * 6 , v_add(el4hl_1, vx_load(prev_sum_row + j + VTraits::vlanes() * 6))); + v_store(sum_row + j + VTraits::vlanes() * 7 , v_add(el4hl_2, vx_load(prev_sum_row + j + VTraits::vlanes() * 7))); + v_store(sum_row + j + VTraits::vlanes() * 8 , v_add(el4hl_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 8))); + v_store(sum_row + j + VTraits::vlanes() * 9 , v_add(el4hh_1, vx_load(prev_sum_row + j + VTraits::vlanes() 
* 9))); + v_store(sum_row + j + VTraits::vlanes() * 10, v_add(el4hh_2, vx_load(prev_sum_row + j + VTraits::vlanes() * 10))); + v_store(sum_row + j + VTraits::vlanes() * 11, v_add(el4hh_3, vx_load(prev_sum_row + j + VTraits::vlanes() * 11))); } for (double v3 = sum_row[j - 1] - prev_sum_row[j - 1], @@ -1043,7 +1043,7 @@ struct Integral_SIMD v_float64 prev_1 = vx_setzero_f64(), prev_2 = vx_setzero_f64(); int j = 0; - for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes) + for ( ; j + VTraits::vlanes() <= width; j += VTraits::vlanes()) { v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j)); v_float64 el4ll, el4lh, el4hl, el4hh; @@ -1065,10 +1065,10 @@ struct Integral_SIMD #endif v_int32 el4li, el4hi; v_expand(el8, el4li, el4hi); - el4ll = v_cvt_f64(el4li) + prev_1; - el4lh = v_cvt_f64_high(el4li) + prev_2; - el4hl = v_cvt_f64(el4hi) + el4ll; - el4hh = v_cvt_f64_high(el4hi) + el4lh; + el4ll = v_add(v_cvt_f64(el4li), prev_1); + el4lh = v_add(v_cvt_f64_high(el4li), prev_2); + el4hl = v_add(v_cvt_f64(el4hi), el4ll); + el4hh = v_add(v_cvt_f64_high(el4hi), el4lh); #if CV_SIMD_WIDTH == 16 prev_1 = el4hl; prev_2 = el4hh; @@ -1078,10 +1078,10 @@ struct Integral_SIMD prev_1 = prev_2 = v_combine_high(el4hh, el4hh); #endif #endif - v_store(sum_row + j , el4ll + vx_load(prev_sum_row + j )); - v_store(sum_row + j + v_float64::nlanes , el4lh + vx_load(prev_sum_row + j + v_float64::nlanes )); - v_store(sum_row + j + v_float64::nlanes * 2, el4hl + vx_load(prev_sum_row + j + v_float64::nlanes * 2)); - v_store(sum_row + j + v_float64::nlanes * 3, el4hh + vx_load(prev_sum_row + j + v_float64::nlanes * 3)); + v_store(sum_row + j , v_add(el4ll, vx_load(prev_sum_row + j))); + v_store(sum_row + j + VTraits::vlanes() , v_add(el4lh, vx_load(prev_sum_row + j + VTraits::vlanes()))); + v_store(sum_row + j + VTraits::vlanes() * 2, v_add(el4hl, vx_load(prev_sum_row + j + VTraits::vlanes() * 2))); + v_store(sum_row + j + VTraits::vlanes() * 3, v_add(el4hh, vx_load(prev_sum_row + j + VTraits::vlanes() * 3))); } for (double v4 = sum_row[j - 1] - prev_sum_row[j - 1], diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp index b57e92ff9a..b83263304f 100644 --- a/modules/objdetect/src/hog.cpp +++ b/modules/objdetect/src/hog.cpp @@ -268,13 +268,13 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp for ( i = 0; i < 256; i += 4) { v_store(_data + i, v_sqrt(idx)); - idx += ifour; + idx = v_add(idx, ifour); } else for ( i = 0; i < 256; i += 4) { v_store(_data + i, idx); - idx += ifour; + idx = v_add(idx, ifour); } #else if( gammaCorrection ) @@ -320,7 +320,7 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp for ( ; x <= end - 4; x += 4) { v_int32x4 mul_res = v_load(xmap + x); - mul_res += mul_res + mul_res; + mul_res = v_add(mul_res, v_add(mul_res, mul_res)); v_store(xmap + x, mul_res); } #endif @@ -444,34 +444,34 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp { int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3]; - v_float32x4 _dx0 = v_load(lutCurr+x+widthP2*0+2) - v_load(lutCurr+x+widthP2*0); - v_float32x4 _dx1 = v_load(lutCurr+x+widthP2*1+2) - v_load(lutCurr+x+widthP2*1); - v_float32x4 _dx2 = v_load(lutCurr+x+widthP2*2+2) - v_load(lutCurr+x+widthP2*2); + v_float32x4 _dx0 = v_sub(v_load(lutCurr + x + widthP2 * 0 + 2), v_load(lutCurr + x + widthP2 * 0)); + v_float32x4 _dx1 = v_sub(v_load(lutCurr + x + widthP2 * 1 + 2), v_load(lutCurr + x + widthP2 * 1)); + v_float32x4 _dx2 = 
v_sub(v_load(lutCurr + x + widthP2 * 2 + 2), v_load(lutCurr + x + widthP2 * 2)); v_float32x4 _dy00 = v_float32x4(lut[nextPtr[x0+0]], lut[nextPtr[x1+0]], lut[nextPtr[x2+0]], lut[nextPtr[x3+0]]); - v_float32x4 _dy0 = _dy00 - v_load(lutPrev+x+widthP2*0+1); + v_float32x4 _dy0 = v_sub(_dy00, v_load(lutPrev + x + widthP2 * 0 + 1)); v_store(lutNext+x+widthP2*0+1, _dy00); v_float32x4 _dy10 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]); - v_float32x4 _dy1 = _dy10 - v_load(lutPrev+x+widthP2*1+1); + v_float32x4 _dy1 = v_sub(_dy10, v_load(lutPrev + x + widthP2 * 1 + 1)); v_store(lutNext+x+widthP2*1+1, _dy10); v_float32x4 _dy20 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]); - v_float32x4 _dy2 = _dy20 - v_load(lutPrev+x+widthP2*2+1); + v_float32x4 _dy2 = v_sub(_dy20, v_load(lutPrev + x + widthP2 * 2 + 1)); v_store(lutNext+x+widthP2*2+1, _dy20); - v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0); - v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1); - v_float32x4 _mag2 = (_dx2 * _dx2) + (_dy2 * _dy2); + v_float32x4 _mag0 = v_add(v_mul(_dx0, _dx0), v_mul(_dy0, _dy0)); + v_float32x4 _mag1 = v_add(v_mul(_dx1, _dx1), v_mul(_dy1, _dy1)); + v_float32x4 _mag2 = v_add(v_mul(_dx2, _dx2), v_mul(_dy2, _dy2)); - v_float32x4 mask = v_reinterpret_as_f32(_mag2 > _mag1); + v_float32x4 mask = v_reinterpret_as_f32(v_gt(_mag2, _mag1)); _dx2 = v_select(mask, _dx2, _dx1); _dy2 = v_select(mask, _dy2, _dy1); - mask = v_reinterpret_as_f32(v_max(_mag2, _mag1) > _mag0); + mask = v_reinterpret_as_f32(v_gt(v_max(_mag2, _mag1), _mag0)); _dx2 = v_select(mask, _dx2, _dx0); _dy2 = v_select(mask, _dy2, _dy0); @@ -537,25 +537,25 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp int x2 = x << 1; v_float32x4 _mag = v_load(dbuf + x + (width << 1)); v_float32x4 _angle = v_load(dbuf + x + width * 3); - _angle = (_angleScale * _angle) - fhalf; + _angle = v_sub(v_mul(_angleScale, _angle), fhalf); v_int32x4 _hidx = v_floor(_angle); - _angle -= v_cvt_f32(_hidx); + _angle = v_sub(_angle, v_cvt_f32(_hidx)); - v_float32x4 ft0 = _mag * (fone - _angle); - v_float32x4 ft1 = _mag * _angle; + v_float32x4 ft0 = v_mul(_mag, v_sub(fone, _angle)); + v_float32x4 ft1 = v_mul(_mag, _angle); v_store_interleave(gradPtr + x2, ft0, ft1); - v_int32x4 mask0 = _hidx >> 31; - v_int32x4 it0 = mask0 & _nbins; - mask0 = (_hidx >= _nbins); - v_int32x4 it1 = mask0 & _nbins; - _hidx += (it0 - it1); + v_int32x4 mask0 = v_shr<31>(_hidx); + v_int32x4 it0 = v_and(mask0, _nbins); + mask0 = (v_ge(_hidx, _nbins)); + v_int32x4 it1 = v_and(mask0, _nbins); + _hidx = v_add(_hidx, v_sub(it0, it1)); it0 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero))); - _hidx += ione; - _hidx &= (_hidx < _nbins); + _hidx = v_add(_hidx, ione); + _hidx = v_and(_hidx, v_lt(_hidx, _nbins)); it1 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero))); v_uint8x16 it2, it3; v_zip(v_reinterpret_as_u8(it0), v_reinterpret_as_u8(it1), it2, it3); @@ -707,9 +707,9 @@ void HOGCache::init(const HOGDescriptor* _descriptor, for (; i <= blockSize.height - 4; i += 4) { - v_float32x4 t = idx - _bh; - t *= t; - idx += ifour; + v_float32x4 t = v_sub(idx, _bh); + t = v_mul(t, t); + idx = v_add(idx, ifour); v_store(_di + i, t); } #endif @@ -725,9 +725,9 @@ void HOGCache::init(const HOGDescriptor* _descriptor, for (; j <= blockSize.height - 4; j += 4) { - v_float32x4 t = idx - _bw; - t *= t; - idx += ifour; + v_float32x4 t = v_sub(idx, _bw); 
+ t = v_mul(t, t); + idx = v_add(idx, ifour); v_store(_dj + j, t); } #endif @@ -936,8 +936,8 @@ const float* HOGCache::getBlock(Point pt, float* buf) int h0 = h[0], h1 = h[1]; v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]); - v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights); - v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w; + v_float32x4 w = v_mul(v_setall_f32(pk.gradWeight), v_load(pk.histWeights)); + v_float32x4 _t0 = v_mul(_a0, w), _t1 = v_mul(_a1, w); v_store(hist0, _t0); v_store(hist1, _t1); @@ -984,8 +984,8 @@ const float* HOGCache::getBlock(Point pt, float* buf) int h0 = h[0], h1 = h[1]; v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]); - v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights); - v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w; + v_float32x4 w = v_mul(v_setall_f32(pk.gradWeight), v_load(pk.histWeights)); + v_float32x4 _t0 = v_mul(_a0, w), _t1 = v_mul(_a1, w); v_store(hist0, _t0); v_store(hist1, _t1); @@ -1057,12 +1057,12 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const #if CV_SIMD128 v_float32x4 p0 = v_load(hist); - v_float32x4 s = p0 * p0; + v_float32x4 s = v_mul(p0, p0); for (i = 4; i <= sz - 4; i += 4) { p0 = v_load(hist + i); - s += p0 * p0; + s = v_add(s, v_mul(p0, p0)); } v_store(partSum, s); #else @@ -1091,17 +1091,17 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const v_float32x4 _scale = v_setall_f32(scale); static v_float32x4 _threshold = v_setall_f32(thresh); - v_float32x4 p = _scale * v_load(hist); + v_float32x4 p = v_mul(_scale, v_load(hist)); p = v_min(p, _threshold); - s = p * p; + s = v_mul(p, p); v_store(hist, p); for(i = 4 ; i <= sz - 4; i += 4) { p = v_load(hist + i); - p *= _scale; + p = v_mul(p, _scale); p = v_min(p, _threshold); - s += p * p; + s = v_add(s, v_mul(p, p)); v_store(hist + i, p); } @@ -1137,7 +1137,7 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const v_float32x4 _scale2 = v_setall_f32(scale); for ( ; i <= sz - 4; i += 4) { - v_float32x4 t = _scale2 * v_load(hist + i); + v_float32x4 t = v_mul(_scale2, v_load(hist + i)); v_store(hist + i, t); } #endif @@ -1593,14 +1593,14 @@ void HOGDescriptor::detect(InputArray _img, #if CV_SIMD128 v_float32x4 _vec = v_load(vec); v_float32x4 _svmVec = v_load(svmVec); - v_float32x4 sum = _svmVec * _vec; + v_float32x4 sum = v_mul(_svmVec, _vec); for( k = 4; k <= blockHistogramSize - 4; k += 4 ) { _vec = v_load(vec + k); _svmVec = v_load(svmVec + k); - sum += _vec * _svmVec; + sum = v_add(sum, v_mul(_vec, _svmVec)); } v_store(partSum, sum); @@ -3392,14 +3392,14 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector &loc #if CV_SIMD128 v_float32x4 _vec = v_load(vec); v_float32x4 _svmVec = v_load(svmVec); - v_float32x4 sum = _svmVec * _vec; + v_float32x4 sum = v_mul(_svmVec, _vec); for( k = 4; k <= blockHistogramSize - 4; k += 4 ) { _vec = v_load(vec + k); _svmVec = v_load(svmVec + k); - sum += _vec * _svmVec; + sum = v_add(sum, v_mul(_vec, _svmVec)); } v_store(partSum, sum); diff --git a/modules/video/src/dis_flow.cpp b/modules/video/src/dis_flow.cpp index a260b8726b..40ac4517a4 100644 --- a/modules/video/src/dis_flow.cpp +++ b/modules/video/src/dis_flow.cpp @@ -520,16 +520,16 @@ DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOp v_expand(I0_row_8, I0_row_4_left, I0_row_4_right); \ \ /* Compute diffs between I0 and bilinearly interpolated I1: */ \ - I_diff_left = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left)) + \ - w01v * 
v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left)) + \ - w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left)) + \ - w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)) - \ - v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left)); \ - I_diff_right = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right)) + \ - w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right)) + \ - w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right)) + \ - w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)) - \ - v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right)); + I_diff_left = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left))), \ + v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left))), \ + v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left))), \ + v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)))), \ + v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left))); \ + I_diff_right = v_sub(v_add(v_mul(w00v, v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right))), \ + v_mul(w01v, v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right))), \ + v_mul(w10v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right))), \ + v_mul(w11v, v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)))), \ + v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right))); #define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW \ I0_ptr += I0_stride; \ @@ -572,9 +572,9 @@ inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar * v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right); /* Update the sums: */ - Ux_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right); - Uy_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right); - SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right; + Ux_vec = v_add(Ux_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right)))); + Uy_vec = v_add(Uy_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right)))); + SSD_vec = v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right))); I0x_ptr += I0_stride; I0y_ptr += I0_stride; @@ -640,10 +640,10 @@ inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right); /* Update the sums: */ - sum_I0x_mul_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right); - sum_I0y_mul_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right); - sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right; - sum_diff_vec += I_diff_left + I_diff_right; + sum_I0x_mul_vec = v_add(sum_I0x_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0x_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0x_row_4_right)))); + sum_I0y_mul_vec = v_add(sum_I0y_mul_vec, v_add(v_mul(I_diff_left, v_cvt_f32(I0y_row_4_left)), v_mul(I_diff_right, v_cvt_f32(I0y_row_4_right)))); + sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right))); + sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right)); I0x_ptr += I0_stride; I0y_ptr += I0_stride; @@ -692,7 +692,7 @@ inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stri for (int row = 0; row < 8; row++) { HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION; - SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right; + SSD_vec = 
v_add(SSD_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right))); HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW; } SSD = v_reduce_sum(SSD_vec); @@ -728,8 +728,8 @@ inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int for (int row = 0; row < 8; row++) { HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION; - sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right; - sum_diff_vec += I_diff_left + I_diff_right; + sum_diff_sq_vec = v_add(sum_diff_sq_vec, v_add(v_mul(I_diff_left, I_diff_left), v_mul(I_diff_right, I_diff_right))); + sum_diff_vec = v_add(sum_diff_vec, v_add(I_diff_left, I_diff_right)); HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW; } sum_diff = v_reduce_sum(sum_diff_vec); diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp index 8467035dbf..6d51c0cf1a 100644 --- a/modules/video/src/lkpyramid.cpp +++ b/modules/video/src/lkpyramid.cpp @@ -97,8 +97,8 @@ void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); - v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + v_int16x8 t1 = v_sub(s2, s0); + v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10)); v_store(trow0 + x, t0); v_store(trow1 + x, t1); @@ -134,8 +134,8 @@ void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const v_int16x8 s3 = v_load(trow1 + x); v_int16x8 s4 = v_load(trow1 + x + cn); - v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + v_int16x8 t0 = v_sub(s1, s0); + v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10)); v_store_interleave((drow + x*2), t0, t1); } @@ -293,10 +293,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1); - t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1); - t0 = t0 >> (W_BITS1-5); - t1 = t1 >> (W_BITS1-5); + t0 = v_add(v_dotprod(t00, qw0, qdelta), v_dotprod(t10, qw1)); + t1 = v_add(v_dotprod(t01, qw0, qdelta), v_dotprod(t11, qw1)); + t0 = v_shr(t0); + t1 = v_shr(t1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -307,10 +307,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1); - t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1); - t0 = t0 >> W_BITS1; - t1 = t1 >> W_BITS1; + t0 = v_add(v_dotprod(t00, qw0, qdelta_d), v_dotprod(t10, qw1)); + t1 = v_add(v_dotprod(t01, qw0, qdelta_d), v_dotprod(t11, qw1)); + t0 = v_shr(t0); + t1 = v_shr(t1); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... v_store(dIptr, v00); @@ -332,10 +332,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1); - t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1); - t0 = t0 >> W_BITS1; - t1 = t1 >> W_BITS1; + t0 = v_add(v_dotprod(t00, qw0, qdelta_d), v_dotprod(t10, qw1)); + t1 = v_add(v_dotprod(t01, qw0, qdelta_d), v_dotprod(t11, qw1)); + t0 = v_shr(t0); + t1 = v_shr(t1); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
v_store(dIptr + 4*2, v00); @@ -548,18 +548,18 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1); - t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1); - t0 = t0 >> (W_BITS1-5); - t1 = t1 >> (W_BITS1-5); - diff0 = v_pack(t0, t1) - diff0; + t0 = v_add(v_dotprod(t00, qw0, qdelta), v_dotprod(t10, qw1)); + t1 = v_add(v_dotprod(t01, qw0, qdelta), v_dotprod(t11, qw1)); + t0 = v_shr(t0); + t1 = v_shr(t1); + diff0 = v_sub(v_pack(t0, t1), diff0); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v00 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v01 = v_reinterpret_as_s16(v_load(dIptr + 8)); v_zip(v00, v01, v10, v11); v_zip(diff2, diff1, v00, v01); - qb0 += v_cvt_f32(v_dotprod(v00, v10)); - qb1 += v_cvt_f32(v_dotprod(v01, v11)); + qb0 = v_add(qb0, v_cvt_f32(v_dotprod(v00, v10))); + qb1 = v_add(qb1, v_cvt_f32(v_dotprod(v01, v11))); } #endif @@ -647,7 +647,7 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const #if CV_SIMD128 && !CV_NEON v_float32x4 qf0, qf1; - v_recombine(v_interleave_pairs(qb0 + qb1), v_setzero_f32(), qf0, qf1); + v_recombine(v_interleave_pairs(v_add(qb0, qb1)), v_setzero_f32(), qf0, qf1); ib1 += v_reduce_sum(qf0); ib2 += v_reduce_sum(qf1); #endif diff --git a/modules/video/src/optflowgf.cpp b/modules/video/src/optflowgf.cpp index 2b164b62d3..02e878a577 100644 --- a/modules/video/src/optflowgf.cpp +++ b/modules/video/src/optflowgf.cpp @@ -463,22 +463,22 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1, const float *sptr0 = srow[m], *sptr1; v_float32x4 g4 = v_load(simd_kernel); v_float32x4 s0, s1, s2, s3; - s0 = v_load(sptr0 + x) * g4; - s1 = v_load(sptr0 + x + 4) * g4; - s2 = v_load(sptr0 + x + 8) * g4; - s3 = v_load(sptr0 + x + 12) * g4; + s0 = v_mul(v_load(sptr0 + x), g4); + s1 = v_mul(v_load(sptr0 + x + 4), g4); + s2 = v_mul(v_load(sptr0 + x + 8), g4); + s3 = v_mul(v_load(sptr0 + x + 12), g4); for( i = 1; i <= m; i++ ) { v_float32x4 x0, x1; sptr0 = srow[m+i], sptr1 = srow[m-i]; g4 = v_load(simd_kernel + i*4); - x0 = v_load(sptr0 + x) + v_load(sptr1 + x); - x1 = v_load(sptr0 + x + 4) + v_load(sptr1 + x + 4); + x0 = v_add(v_load(sptr0 + x), v_load(sptr1 + x)); + x1 = v_add(v_load(sptr0 + x + 4), v_load(sptr1 + x + 4)); s0 = v_muladd(x0, g4, s0); s1 = v_muladd(x1, g4, s1); - x0 = v_load(sptr0 + x + 8) + v_load(sptr1 + x + 8); - x1 = v_load(sptr0 + x + 12) + v_load(sptr1 + x + 12); + x0 = v_add(v_load(sptr0 + x + 8), v_load(sptr1 + x + 8)); + x1 = v_add(v_load(sptr0 + x + 12), v_load(sptr1 + x + 12)); s2 = v_muladd(x0, g4, s2); s3 = v_muladd(x1, g4, s3); } @@ -493,13 +493,13 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1, { const float *sptr0 = srow[m], *sptr1; v_float32x4 g4 = v_load(simd_kernel); - v_float32x4 s0 = v_load(sptr0 + x) * g4; + v_float32x4 s0 = v_mul(v_load(sptr0 + x), g4); for( i = 1; i <= m; i++ ) { sptr0 = srow[m+i], sptr1 = srow[m-i]; g4 = v_load(simd_kernel + i*4); - v_float32x4 x0 = v_load(sptr0 + x) + v_load(sptr1 + x); + v_float32x4 x0 = v_add(v_load(sptr0 + x), v_load(sptr1 + x)); s0 = v_muladd(x0, g4, s0); } v_store(vsum + x, s0); @@ -528,14 +528,14 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1, for( ; x <= width*5 - 8; x += 8 ) { v_float32x4 g4 = v_load(simd_kernel); - v_float32x4 s0 = v_load(vsum + x) * g4; - v_float32x4 s1 = v_load(vsum + x + 4) * g4; + v_float32x4 s0 = v_mul(v_load(vsum + x), g4); + v_float32x4 s1 
= v_mul(v_load(vsum + x + 4), g4); for( i = 1; i <= m; i++ ) { g4 = v_load(simd_kernel + i*4); - v_float32x4 x0 = v_load(vsum + x - i*5) + v_load(vsum + x+ i*5); - v_float32x4 x1 = v_load(vsum + x - i*5 + 4) + v_load(vsum + x+ i*5 + 4); + v_float32x4 x0 = v_add(v_load(vsum + x - i * 5), v_load(vsum + x + i * 5)); + v_float32x4 x1 = v_add(v_load(vsum + x - i * 5 + 4), v_load(vsum + x + i * 5 + 4)); s0 = v_muladd(x0, g4, s0); s1 = v_muladd(x1, g4, s1); } diff --git a/modules/video/src/variational_refinement.cpp b/modules/video/src/variational_refinement.cpp index cca30f1ce7..968bce6717 100644 --- a/modules/video/src/variational_refinement.cpp +++ b/modules/video/src/variational_refinement.cpp @@ -651,15 +651,15 @@ void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range pdU_vec = v_load(pdU + j); pdV_vec = v_load(pdV + j); - derivNorm_vec = pIx_vec * pIx_vec + pIy_vec * pIy_vec + zeta_vec; - Ik1z_vec = pIz_vec + pIx_vec * pdU_vec + pIy_vec * pdV_vec; - weight_vec = (delta_vec / v_sqrt(Ik1z_vec * Ik1z_vec / derivNorm_vec + eps_vec)) / derivNorm_vec; + derivNorm_vec = v_add(v_add(v_mul(pIx_vec, pIx_vec), v_mul(pIy_vec, pIy_vec)), zeta_vec); + Ik1z_vec = v_add(v_add(pIz_vec, v_mul(pIx_vec, pdU_vec)), v_mul(pIy_vec, pdV_vec)); + weight_vec = v_div(v_div(delta_vec, v_sqrt(v_add(v_div(v_mul(Ik1z_vec, Ik1z_vec), derivNorm_vec), eps_vec))), derivNorm_vec); - pa11_vec = weight_vec * (pIx_vec * pIx_vec) + zeta_vec; - pa12_vec = weight_vec * (pIx_vec * pIy_vec); - pa22_vec = weight_vec * (pIy_vec * pIy_vec) + zeta_vec; - pb1_vec = zero_vec - weight_vec * (pIz_vec * pIx_vec); - pb2_vec = zero_vec - weight_vec * (pIz_vec * pIy_vec); + pa11_vec = v_add(v_mul(weight_vec, v_mul(pIx_vec, pIx_vec)), zeta_vec); + pa12_vec = v_mul(weight_vec, v_mul(pIx_vec, pIy_vec)); + pa22_vec = v_add(v_mul(weight_vec, v_mul(pIy_vec, pIy_vec)), zeta_vec); + pb1_vec = v_sub(zero_vec, v_mul(weight_vec, v_mul(pIz_vec, pIx_vec))); + pb2_vec = v_sub(zero_vec, v_mul(weight_vec, v_mul(pIz_vec, pIy_vec))); pIxx_vec = v_load(pIxx + j); pIxy_vec = v_load(pIxy + j); @@ -667,18 +667,17 @@ void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range pIxz_vec = v_load(pIxz + j); pIyz_vec = v_load(pIyz + j); - derivNorm_vec = pIxx_vec * pIxx_vec + pIxy_vec * pIxy_vec + zeta_vec; - derivNorm2_vec = pIyy_vec * pIyy_vec + pIxy_vec * pIxy_vec + zeta_vec; - Ik1zx_vec = pIxz_vec + pIxx_vec * pdU_vec + pIxy_vec * pdV_vec; - Ik1zy_vec = pIyz_vec + pIxy_vec * pdU_vec + pIyy_vec * pdV_vec; - weight_vec = gamma_vec / v_sqrt(Ik1zx_vec * Ik1zx_vec / derivNorm_vec + - Ik1zy_vec * Ik1zy_vec / derivNorm2_vec + eps_vec); + derivNorm_vec = v_add(v_add(v_mul(pIxx_vec, pIxx_vec), v_mul(pIxy_vec, pIxy_vec)), zeta_vec); + derivNorm2_vec = v_add(v_add(v_mul(pIyy_vec, pIyy_vec), v_mul(pIxy_vec, pIxy_vec)), zeta_vec); + Ik1zx_vec = v_add(v_add(pIxz_vec, v_mul(pIxx_vec, pdU_vec)), v_mul(pIxy_vec, pdV_vec)); + Ik1zy_vec = v_add(v_add(pIyz_vec, v_mul(pIxy_vec, pdU_vec)), v_mul(pIyy_vec, pdV_vec)); + weight_vec = v_div(gamma_vec, v_sqrt(v_add(v_add(v_div(v_mul(Ik1zx_vec, Ik1zx_vec), derivNorm_vec), v_div(v_mul(Ik1zy_vec, Ik1zy_vec), derivNorm2_vec)), eps_vec))); - pa11_vec += weight_vec * (pIxx_vec * pIxx_vec / derivNorm_vec + pIxy_vec * pIxy_vec / derivNorm2_vec); - pa12_vec += weight_vec * (pIxx_vec * pIxy_vec / derivNorm_vec + pIxy_vec * pIyy_vec / derivNorm2_vec); - pa22_vec += weight_vec * (pIxy_vec * pIxy_vec / derivNorm_vec + pIyy_vec * pIyy_vec / derivNorm2_vec); - pb1_vec -= weight_vec * (pIxx_vec * pIxz_vec / 
derivNorm_vec + pIxy_vec * pIyz_vec / derivNorm2_vec); - pb2_vec -= weight_vec * (pIxy_vec * pIxz_vec / derivNorm_vec + pIyy_vec * pIyz_vec / derivNorm2_vec); + pa11_vec = v_add(pa11_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxx_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIxy_vec), derivNorm2_vec)))); + pa12_vec = v_add(pa12_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxy_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIyy_vec), derivNorm2_vec)))); + pa22_vec = v_add(pa22_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxy_vec, pIxy_vec), derivNorm_vec), v_div(v_mul(pIyy_vec, pIyy_vec), derivNorm2_vec)))); + pb1_vec = v_sub(pb1_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxx_vec, pIxz_vec), derivNorm_vec), v_div(v_mul(pIxy_vec, pIyz_vec), derivNorm2_vec)))); + pb2_vec = v_sub(pb2_vec, v_mul(weight_vec, v_add(v_div(v_mul(pIxy_vec, pIxz_vec), derivNorm_vec), v_div(v_mul(pIyy_vec, pIyz_vec), derivNorm2_vec)))); v_store(pa11 + j, pa11_vec); v_store(pa12 + j, pa12_vec); @@ -850,26 +849,26 @@ void VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::operator() cW_u_vec = v_load(cW_u + j); cW_v_vec = v_load(cW_v + j); - ux_vec = v_load(cW_u_next + j) - cW_u_vec; - vx_vec = v_load(cW_v_next + j) - cW_v_vec; - uy_vec = v_load(cW_u_next_row + j) - cW_u_vec; - vy_vec = v_load(cW_v_next_row + j) - cW_v_vec; + ux_vec = v_sub(v_load(cW_u_next + j), cW_u_vec); + vx_vec = v_sub(v_load(cW_v_next + j), cW_v_vec); + uy_vec = v_sub(v_load(cW_u_next_row + j), cW_u_vec); + vy_vec = v_sub(v_load(cW_v_next_row + j), cW_v_vec); pWeight_vec = - alpha2_vec / v_sqrt(ux_vec * ux_vec + vx_vec * vx_vec + uy_vec * uy_vec + vy_vec * vy_vec + eps_vec); + v_div(alpha2_vec, v_sqrt(v_add(v_add(v_add(v_add(v_mul(ux_vec, ux_vec), v_mul(vx_vec, vx_vec)), v_mul(uy_vec, uy_vec)), v_mul(vy_vec, vy_vec)), eps_vec))); v_store(pWeight + j, pWeight_vec); - ux_vec = pWeight_vec * (v_load(pW_u_next + j) - v_load(pW_u + j)); - vx_vec = pWeight_vec * (v_load(pW_v_next + j) - v_load(pW_v + j)); + ux_vec = v_mul(pWeight_vec, v_sub(v_load(pW_u_next + j), v_load(pW_u + j))); + vx_vec = v_mul(pWeight_vec, v_sub(v_load(pW_v_next + j), v_load(pW_v + j))); - v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec); - v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec); - v_store(pB_u + j, v_load(pB_u + j) + ux_vec); - v_store(pB_v + j, v_load(pB_v + j) + vx_vec); + v_store(pA_u + j, v_add(v_load(pA_u + j), pWeight_vec)); + v_store(pA_v + j, v_add(v_load(pA_v + j), pWeight_vec)); + v_store(pB_u + j, v_add(v_load(pB_u + j), ux_vec)); + v_store(pB_v + j, v_add(v_load(pB_v + j), vx_vec)); - v_store(pA_u_next + j, v_load(pA_u_next + j) + pWeight_vec); - v_store(pA_v_next + j, v_load(pA_v_next + j) + pWeight_vec); - v_store(pB_u_next + j, v_load(pB_u_next + j) - ux_vec); - v_store(pB_v_next + j, v_load(pB_v_next + j) - vx_vec); + v_store(pA_u_next + j, v_add(v_load(pA_u_next + j), pWeight_vec)); + v_store(pA_v_next + j, v_add(v_load(pA_v_next + j), pWeight_vec)); + v_store(pB_u_next + j, v_sub(v_load(pB_u_next + j), ux_vec)); + v_store(pB_v_next + j, v_sub(v_load(pB_v_next + j), vx_vec)); } #endif for (; j < len - 1; j++) @@ -956,18 +955,18 @@ void VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::operator( for (; j < len - 3; j += 4) { pWeight_vec = v_load(pWeight + j); - uy_vec = pWeight_vec * (v_load(pW_u_next_row + j) - v_load(pW_u + j)); - vy_vec = pWeight_vec * (v_load(pW_v_next_row + j) - v_load(pW_v + j)); - - v_store(pA_u + j, v_load(pA_u + j) + pWeight_vec); - v_store(pA_v + j, v_load(pA_v + j) + pWeight_vec); - 
v_store(pB_u + j, v_load(pB_u + j) + uy_vec); - v_store(pB_v + j, v_load(pB_v + j) + vy_vec); - - v_store(pA_u_next_row + j, v_load(pA_u_next_row + j) + pWeight_vec); - v_store(pA_v_next_row + j, v_load(pA_v_next_row + j) + pWeight_vec); - v_store(pB_u_next_row + j, v_load(pB_u_next_row + j) - uy_vec); - v_store(pB_v_next_row + j, v_load(pB_v_next_row + j) - vy_vec); + uy_vec = v_mul(pWeight_vec, v_sub(v_load(pW_u_next_row + j), v_load(pW_u + j))); + vy_vec = v_mul(pWeight_vec, v_sub(v_load(pW_v_next_row + j), v_load(pW_v + j))); + + v_store(pA_u + j, v_add(v_load(pA_u + j), pWeight_vec)); + v_store(pA_v + j, v_add(v_load(pA_v + j), pWeight_vec)); + v_store(pB_u + j, v_add(v_load(pB_u + j), uy_vec)); + v_store(pB_v + j, v_add(v_load(pB_v + j), vy_vec)); + + v_store(pA_u_next_row + j, v_add(v_load(pA_u_next_row + j), pWeight_vec)); + v_store(pA_v_next_row + j, v_add(v_load(pA_v_next_row + j), pWeight_vec)); + v_store(pB_u_next_row + j, v_sub(v_load(pB_u_next_row + j), uy_vec)); + v_store(pB_v_next_row + j, v_sub(v_load(pB_v_next_row + j), vy_vec)); } #endif for (; j < len; j++) @@ -1084,15 +1083,13 @@ void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &ran pdv_shifted_vec = v_reinterpret_as_f32( v_extract<3>(v_reinterpret_as_s32(pdv_prev_vec), v_reinterpret_as_s32(pdv_next_vec))); - sigmaU_vec = pW_shifted_vec * pdu_shifted_vec + pW_vec * pdu_next_vec + pW_prev_row_vec * pdu_prev_row_vec + - pW_vec * pdu_next_row_vec; - sigmaV_vec = pW_shifted_vec * pdv_shifted_vec + pW_vec * pdv_next_vec + pW_prev_row_vec * pdv_prev_row_vec + - pW_vec * pdv_next_row_vec; + sigmaU_vec = v_add(v_add(v_add(v_mul(pW_shifted_vec, pdu_shifted_vec), v_mul(pW_vec, pdu_next_vec)), v_mul(pW_prev_row_vec, pdu_prev_row_vec)), v_mul(pW_vec, pdu_next_row_vec)); + sigmaV_vec = v_add(v_add(v_add(v_mul(pW_shifted_vec, pdv_shifted_vec), v_mul(pW_vec, pdv_next_vec)), v_mul(pW_prev_row_vec, pdv_prev_row_vec)), v_mul(pW_vec, pdv_next_row_vec)); pdu_vec = v_load(pdu + j); pdv_vec = v_load(pdv + j); - pdu_vec += omega_vec * ((sigmaU_vec + v_load(pb1 + j) - pdv_vec * pa12_vec) / v_load(pa11 + j) - pdu_vec); - pdv_vec += omega_vec * ((sigmaV_vec + v_load(pb2 + j) - pdu_vec * pa12_vec) / v_load(pa22 + j) - pdv_vec); + pdu_vec = v_add(pdu_vec, v_mul(omega_vec, v_sub(v_div(v_sub(v_add(sigmaU_vec, v_load(pb1 + j)), v_mul(pdv_vec, pa12_vec)), v_load(pa11 + j)), pdu_vec))); + pdv_vec = v_add(pdv_vec, v_mul(omega_vec, v_sub(v_div(v_sub(v_add(sigmaV_vec, v_load(pb2 + j)), v_mul(pdu_vec, pa12_vec)), v_load(pa22 + j)), pdv_vec))); v_store(pdu + j, pdu_vec); v_store(pdv + j, pdv_vec); diff --git a/samples/cpp/simd_basic.cpp b/samples/cpp/simd_basic.cpp index 9af4d91cef..ef78c39a45 100644 --- a/samples/cpp/simd_basic.cpp +++ b/samples/cpp/simd_basic.cpp @@ -38,8 +38,8 @@ int main(int /*argc*/, char** /*argv*/) printf("================== arithm check =================\n"); v_uint8 a = vx_setall_u8(10); - v_uint8 c = a + vx_setall_u8(45); - printf("(vx_setall_u8(10) + vx_setall_u8(45)).get0() => %d\n", (int)c.get0()); + v_uint8 c = v_add(a, vx_setall_u8(45)); + printf("v_get0(vx_setall_u8(10) + vx_setall_u8(45)) => %d\n", (int)v_get0(c)); #else printf("\nSIMD intrinsics are not available. 
Check compilation target and passed build options.\n"); #endif diff --git a/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp b/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp index 9be4170d7b..52018461c3 100644 --- a/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp +++ b/samples/cpp/tutorial_code/core/univ_intrin/univ_intrin.cpp @@ -85,7 +85,7 @@ void conv1dsimd(Mat src, Mat kernel, float *ans, int row = 0, int rowk = 0, int //! [convolution-1D-main] //! [convolution-1D-main-h1] - int step = v_float32().nlanes; + int step = VTraits::vlanes(); float *sptr = src_32.ptr(row), *kptr = kernel.ptr(rowk); for (int k = 0; k < ksize; k++) { @@ -96,7 +96,7 @@ void conv1dsimd(Mat src, Mat kernel, float *ans, int row = 0, int rowk = 0, int for (i = 0; i + step < len; i += step) { v_float32 window = vx_load(sptr + i + k); - v_float32 sum = vx_load(ans + i) + kernel_wide * window; + v_float32 sum = v_add(vx_load(ans + i), v_mul(kernel_wide, window)); v_store(ans + i, sum); } //! [convolution-1D-main-h2] @@ -122,7 +122,7 @@ void convolute_simd(Mat src, Mat &dst, Mat kernel) copyMakeBorder(src, src, sz, sz, 0, 0, BORDER_REPLICATE); - int step = v_float32().nlanes; + int step = VTraits::vlanes(); //! [convolution-2D-init] //! [convolution-2D-main] @@ -135,7 +135,7 @@ void convolute_simd(Mat src, Mat &dst, Mat kernel) int j; for (j = 0; j + step < cols; j += step) { - v_float32 sum = vx_load(&dst.ptr(i)[j]) + vx_load(&ans[j]); + v_float32 sum = v_add(vx_load(&dst.ptr(i)[j]), vx_load(&ans[j])); v_store(&dst.ptr(i)[j], sum); }
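
The substitution applied across these hunks is mechanical: overloaded operators on universal-intrinsic vectors (+, -, *, /, >>, &, comparisons) become named wrappers (v_add, v_sub, v_mul, v_div, v_shr<N>, v_and, v_lt/v_gt/v_ge), and the compile-time v_xxx::nlanes becomes VTraits<v_xxx>::vlanes() (or VTraits<v_xxx>::max_nlanes for sizing stack buffers), so the same code also builds against scalable backends where the lane count is not a compile-time constant. Below is a minimal sketch of the before/after shape, assuming only the public intrin.hpp API; the helper name muladd_scalar and its signature are illustrative, not taken from this patch:

    // Minimal sketch of the operator -> named-wrapper migration (illustrative only).
    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    // dst[i] = a[i] * b[i] + c
    static void muladd_scalar(const float* a, const float* b, float c, float* dst, int n)
    {
        int i = 0;
    #if (CV_SIMD || CV_SIMD_SCALABLE)
        const int step = VTraits<v_float32>::vlanes();   // was: v_float32::nlanes
        v_float32 vc = vx_setall_f32(c);
        for (; i + step <= n; i += step)
        {
            v_float32 va = vx_load(a + i), vb = vx_load(b + i);
            // was: v_store(dst + i, va * vb + vc);
            v_store(dst + i, v_add(v_mul(va, vb), vc));
        }
    #endif
        for (; i < n; i++)        // scalar tail for the remaining elements
            dst[i] = a[i] * b[i] + c;
    }

The same reasoning covers the last-lane helpers used in the sumpixels hunks: v_extract_n<nlanes-1>() and v_broadcast_element<nlanes-1>() are spelled v_extract_highest() and v_broadcast_highest(), which avoids referencing nlanes as a template argument on backends where it is not a constant.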