diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index db13a0027b..630c64c949 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -2,6 +2,7 @@ set(the_description "The Core Functionality") ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2) ocv_add_dispatched_file(stat SSE4_2 AVX2) +ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2) # dispatching for accuracy tests ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 0cf36cf174..78bd14e4d8 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -661,7 +661,7 @@ inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) { v_uint16x16 c, d; v_mul_expand(a, b, c, d); - return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d)); + return v_pack(c, d); } inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) { @@ -1291,6 +1291,16 @@ inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) { return v_abs(a - b); } +/** Saturating absolute difference **/ +inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b) +{ + v_int8x32 d = a - b; + v_int8x32 m = a < b; + return (d ^ m) - m; +} +inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) +{ return v_max(a, b) - v_min(a, b); } + ////////// Conversions ///////// /** Rounding **/ @@ -1300,6 +1310,12 @@ inline v_int32x8 v_round(const v_float32x8& a) inline v_int32x8 v_round(const v_float64x4& a) { return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); } +inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b) +{ + __m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val); + return v_int32x8(_v256_combine(ai, bi)); +} + inline v_int32x8 v_trunc(const v_float32x8& a) { return v_int32x8(_mm256_cvttps_epi32(a.val)); } @@ -1689,6 +1705,40 @@ void v_rshr_pack_store(int* ptr, const v_int64x4& a) v_pack_store(ptr, (a + delta) >> n); } +// pack boolean +inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i ab = _mm256_packs_epi16(a.val, b.val); + return v_uint8x32(_v256_shuffle_odd_64(ab)); +} + +inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b, + const v_uint32x8& c, const v_uint32x8& d) +{ + __m256i ab = _mm256_packs_epi32(a.val, b.val); + __m256i cd = _mm256_packs_epi32(c.val, d.val); + + __m256i abcd = _v256_shuffle_odd_64(_mm256_packs_epi16(ab, cd)); + return v_uint8x32(_mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0))); +} + +inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c, + const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f, + const v_uint64x4& g, const v_uint64x4& h) +{ + __m256i ab = _mm256_packs_epi32(a.val, b.val); + __m256i cd = _mm256_packs_epi32(c.val, d.val); + __m256i ef = _mm256_packs_epi32(e.val, f.val); + __m256i gh = _mm256_packs_epi32(g.val, h.val); + + __m256i abcd = _mm256_packs_epi32(ab, cd); + __m256i efgh = _mm256_packs_epi32(ef, gh); + __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh)); + + __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8); + return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev)); +} + /* Recombine */ // its up there with load and store operations diff --git 
a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 38a39172d0..5712f167a8 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -109,7 +109,7 @@ These operations allow to reorder or recombine elements in one or multiple vecto - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high -- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u, +- Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u, @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high - Extract: @ref v_extract @@ -159,7 +159,7 @@ Most of these operations return only one value. ### Other math - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude -- Absolute values: @ref v_abs, @ref v_absdiff +- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs ### Conversions @@ -199,10 +199,12 @@ Regular integers: |logical | x | x | x | x | x | x | |min, max | x | x | x | x | x | x | |absdiff | x | x | x | x | x | x | +|absdiffs | | x | | x | | | |reduce | | | | | x | x | |mask | x | x | x | x | x | x | |pack | x | x | x | x | x | x | |pack_u | x | | x | | | | +|pack_b | x | | | | | | |unpack | x | x | x | x | x | x | |extract | x | x | x | x | x | x | |rotate (lanes) | x | x | x | x | x | x | @@ -762,6 +764,19 @@ inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) return c; } +/** @brief Saturating absolute difference + +Returns \f$ saturate(|a - b|) \f$ . +For 8-, 16-bit signed integer source types. */ +template +inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++) + c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i])); + return c; +} + /** @brief Inversed square root Returns \f$ 1/sqrt(a) \f$ @@ -1613,6 +1628,18 @@ template inline v_reg v_round(const v_reg& a) return c; } +/** @overload */ +template inline v_reg v_round(const v_reg& a, const v_reg& b) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvRound(a.s[i]); + c.s[i+n] = cvRound(b.s[i]); + } + return c; +} + /** @brief Floor Floor each value. Input type is float vector ==> output type is int vector.*/ @@ -2059,6 +2086,103 @@ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, s OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) //! @} +//! @cond IGNORED +template +inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + for (int i = 0; i < n; ++i) + { + mptr[i] = (_Tpm)a.s[i]; + mptr[i + n] = (_Tpm)b.s[i]; + } +} +//! @endcond + +//! @name Pack boolean values +//! @{ +//! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector +//! +//! @note Must provide valid boolean values to guarantee same result for all architectures. + +/** @brief +//! 
For 16-bit boolean values + +Scheme: +@code +a {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0} +b {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF} +=============== +{ + 0xFF 0 0 0xFF 0 0xFF 0xFF 0 + 0xFF 0 0xFF 0 0 0xFF 0 0xFF +} +@endcode */ + +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint8x16 mask; + _pack_b(mask.s, a, b); + return mask; +} + +/** @overload +For 32-bit boolean values + +Scheme: +@code +a {0xFFFF.. 0 0 0xFFFF..} +b {0 0xFFFF.. 0xFFFF.. 0} +c {0xFFFF.. 0 0xFFFF.. 0} +d {0 0xFFFF.. 0 0xFFFF..} +=============== +{ + 0xFF 0 0 0xFF 0 0xFF 0xFF 0 + 0xFF 0 0xFF 0 0 0xFF 0 0xFF +} +@endcode */ + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + v_uint8x16 mask; + _pack_b(mask.s, a, b); + _pack_b(mask.s + 8, c, d); + return mask; +} + +/** @overload +For 64-bit boolean values + +Scheme: +@code +a {0xFFFF.. 0} +b {0 0xFFFF..} +c {0xFFFF.. 0} +d {0 0xFFFF..} + +e {0xFFFF.. 0} +f {0xFFFF.. 0} +g {0 0xFFFF..} +h {0 0xFFFF..} +=============== +{ + 0xFF 0 0 0xFF 0xFF 0 0 0xFF + 0xFF 0 0xFF 0 0 0xFF 0 0xFF +} +@endcode */ +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + v_uint8x16 mask; + _pack_b(mask.s, a, b); + _pack_b(mask.s + 4, c, d); + _pack_b(mask.s + 8, e, f); + _pack_b(mask.s + 12, g, h); + return mask; +} +//! @} + /** @brief Matrix multiplication Scheme: diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 8c13ad52db..50c9b154ee 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -394,6 +394,35 @@ OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16) OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32) +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val)); + return v_uint8x16(ab); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val)); + uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val)); + return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd))); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val)); + uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val)); + uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val)); + uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val)); + + uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd)); + uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh)); + return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh))); +} + inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) @@ -748,7 +777,6 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8) 
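// [Editor's illustrative sketch, not part of this patch.]
// The new v_pack_b intrinsic collapses per-lane boolean masks (all-ones /
// all-zeros lanes, e.g. comparison results) from wider lanes into a single
// unsigned 8-bit vector, and the new v_absdiffs computes |a - b| with
// saturation for signed types. A minimal, hypothetical helper built only on
// universal intrinsics (the function name and parameters are made up for
// illustration; only v_load, v_setall_s16, v_absdiffs, v_pack_b,
// v_reinterpret_as_u16 and v_store from the API are assumed):
static inline void example_absdiff_mask16(const short* a, const short* b,
                                          short thresh, uchar* mask)
{
    // saturating |a - b| on two consecutive blocks of eight 16-bit lanes
    v_int16x8 d0 = v_absdiffs(v_load(a), v_load(b));
    v_int16x8 d1 = v_absdiffs(v_load(a + 8), v_load(b + 8));
    v_int16x8 t  = v_setall_s16(thresh);
    // each comparison yields a valid 16-bit boolean mask;
    // v_pack_b narrows the two masks into one v_uint8x16
    v_uint8x16 m = v_pack_b(v_reinterpret_as_u16(d0 > t),
                            v_reinterpret_as_u16(d1 > t));
    v_store(mask, m);
}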
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16) -// TODO: absdiff for signed integers OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32) @@ -757,6 +785,12 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64) #endif +/** Saturating absolute difference **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); } +inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) +{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); } + #define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \ inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1242,6 +1276,11 @@ inline v_int32x4 v_round(const v_float64x2& a) return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero)); } +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ + return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val)))); +} + inline v_int32x4 v_floor(const v_float64x2& a) { static const int32x2_t zero = vdup_n_s32(0); diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index d4740b72fe..c49d0de377 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -634,6 +634,35 @@ void v_rshr_pack_store(int* ptr, const v_int64x2& a) _mm_storel_epi64((__m128i*)ptr, a2); } +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i ab = _mm_packs_epi16(a.val, b.val); + return v_uint8x16(ab); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + __m128i ab = _mm_packs_epi32(a.val, b.val); + __m128i cd = _mm_packs_epi32(c.val, d.val); + return v_uint8x16(_mm_packs_epi16(ab, cd)); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + __m128i ab = _mm_packs_epi32(a.val, b.val); + __m128i cd = _mm_packs_epi32(c.val, d.val); + __m128i ef = _mm_packs_epi32(e.val, f.val); + __m128i gh = _mm_packs_epi32(g.val, h.val); + + __m128i abcd = _mm_packs_epi32(ab, cd); + __m128i efgh = _mm_packs_epi32(ef, gh); + return v_uint8x16(_mm_packs_epi16(abcd, efgh)); +} + inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) @@ -706,19 +735,11 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ { a = a * b; return a; } +OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4) -inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b) -{ - v_uint16x8 c, d; - v_mul_expand(a, b, c, d); - return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d)); -} -inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b) -{ a = a * b; return a; } - // Multiply and expand inline void 
v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, v_uint16x8& c, v_uint16x8& d) @@ -1045,34 +1066,43 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); } -#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ -inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ -{ \ - return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \ -} \ -inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ -{ \ - __m128i smask = _mm_set1_epi32(smask32); \ - __m128i a1 = _mm_xor_si128(a.val, smask); \ - __m128i b1 = _mm_xor_si128(b.val, smask); \ - return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \ -} - -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) +/** Absolute difference **/ +inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) +{ return v_add_wrap(a - b, b - a); } inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) { - return v_max(a, b) - v_min(a, b); + v_int8x16 d = v_sub_wrap(a, b); + v_int8x16 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} +inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) +{ + return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } - inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { - __m128i d = _mm_sub_epi32(a.val, b.val); - __m128i m = _mm_cmpgt_epi32(b.val, a.val); - return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); + v_int32x4 d = a - b; + v_int32x4 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); } +/** Saturating absolute difference **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ + v_int8x16 d = a - b; + v_int8x16 m = a < b; + return (d ^ m) - m; + } +inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) +{ return v_max(a, b) - v_min(a, b); } + + inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { return a * b + c; @@ -1623,6 +1653,12 @@ inline v_int32x4 v_trunc(const v_float32x4& a) inline v_int32x4 v_round(const v_float64x2& a) { return v_int32x4(_mm_cvtpd_epi32(a.val)); } +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ + __m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val); + return v_int32x4(_mm_unpacklo_epi64(ai, bi)); +} + inline v_int32x4 v_floor(const v_float64x2& a) { __m128i a1 = _mm_cvtpd_epi32(a.val); diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 27efd2ad9c..b23e19950e 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -383,6 +383,35 @@ OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int, //OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long, // vec_sra, vec_packsu, vec_add, pack_u) +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + vec_uchar16 ab = vec_pack(a.val, b.val); + return v_uint8x16(ab); +} + +inline v_uint8x16 
v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + vec_ushort8 ab = vec_pack(a.val, b.val); + vec_ushort8 cd = vec_pack(c.val, d.val); + return v_uint8x16(vec_pack(ab, cd)); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + vec_uint4 ab = vec_pack(a.val, b.val); + vec_uint4 cd = vec_pack(c.val, d.val); + vec_uint4 ef = vec_pack(e.val, f.val); + vec_uint4 gh = vec_pack(g.val, h.val); + + vec_ushort8 abcd = vec_pack(ab, cd); + vec_ushort8 efgh = vec_pack(ef, gh); + return v_uint8x16(vec_pack(abcd, efgh)); +} + /* Recombine */ template inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) @@ -834,16 +863,27 @@ inline v_float32x4 v_abs(const v_float32x4& x) inline v_float64x2 v_abs(const v_float64x2& x) { return v_float64x2(vec_abs(x.val)); } +/** Absolute difference **/ +// unsigned OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd) -#define OPENCV_HAL_IMPL_VSX_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \ -inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec2(cast(intrin(a.val, b.val))); } +inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) +{ return v_reinterpret_as_u8(v_sub_wrap(v_max(a, b), v_min(a, b))); } +inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } +inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) +{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); } -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int8x16, v_uint8x16, vec_uchar16_c, v_absdiff, vec_absd) -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int16x8, v_uint16x8, vec_ushort8_c, v_absdiff, vec_absd) -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int32x4, v_uint32x4, vec_uint4_c, v_absdiff, vec_absd) -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, vec_absd) +inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) +{ return v_abs(a - b); } +inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) +{ return v_abs(a - b); } + +/** Absolute difference for signed integers **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ return v_int8x16(vec_abss(vec_subs(a.val, b.val))); } +inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) +{ return v_int16x8(vec_abss(vec_subs(a.val, b.val))); } ////////// Conversions ///////// @@ -854,6 +894,9 @@ inline v_int32x4 v_round(const v_float32x4& a) inline v_int32x4 v_round(const v_float64x2& a) { return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_int4_z)); } +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_ctso(vec_rint(b.val)))); } + inline v_int32x4 v_floor(const v_float32x4& a) { return v_int32x4(vec_cts(vec_floor(a.val))); } diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 0626607e2f..197b4f0dcf 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -2134,1014 +2134,4 @@ cvMaxS( const void* srcarr1, double value, void* dstarr ) cv::max( src1, value, dst ); } - - -namespace cv { namespace hal { - -//======================================= - -#if (ARITHM_USE_IPP == 1) -static inline void fixSteps(int width, int height, size_t elemSize, size_t& step1, size_t& step2, size_t& step) -{ - if( height == 1 ) - step1 
= step2 = step = width*elemSize; -} -#define CALL_IPP_BIN_E_12(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0)) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#define CALL_IPP_BIN_E_21(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0)) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#define CALL_IPP_BIN_12(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height))) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#define CALL_IPP_BIN_21(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height))) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#else -#define CALL_IPP_BIN_E_12(fun) -#define CALL_IPP_BIN_E_21(fun) -#define CALL_IPP_BIN_12(fun) -#define CALL_IPP_BIN_21(fun) -#endif - - -//======================================= -// Add -//======================================= - -void add8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add8u, cv_hal_add8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_12(ippiAdd_8u_C1RSfs) - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add8s, cv_hal_add8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); -} - -void add16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add16u, cv_hal_add16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_12(ippiAdd_16u_C1RSfs) - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add16s, cv_hal_add16s, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_12(ippiAdd_16s_C1RSfs) - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add32s, cv_hal_add32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); -} - -void add32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add32f, cv_hal_add32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAdd_32f_C1R) - (vBinOp32, 
IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add64f, cv_hal_add64f, src1, step1, src2, step2, dst, step, width, height) - vBinOp64, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// Subtract -//======================================= - -void sub8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub8u, cv_hal_sub8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_21(ippiSub_8u_C1RSfs) - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub8s, cv_hal_sub8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); -} - -void sub16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub16u, cv_hal_sub16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_21(ippiSub_16u_C1RSfs) - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub16s, cv_hal_sub16s, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_21(ippiSub_16s_C1RSfs) - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub32s, cv_hal_sub32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); -} - -void sub32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub32f, cv_hal_sub32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_21(ippiSub_32f_C1R) - (vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub64f, cv_hal_sub64f, src1, step1, src2, step2, dst, step, width, height) - vBinOp64, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= - -#if (ARITHM_USE_IPP == 1) -#define CALL_IPP_MIN_MAX(fun, type) \ - CV_IPP_CHECK() \ - { \ - type* s1 = (type*)src1; \ - type* s2 = (type*)src2; \ - type* d = dst; \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - int i = 0; \ - for(; i < height; i++) \ - { \ - if (0 > CV_INSTRUMENT_FUN_IPP(fun, s1, s2, d, width)) \ - break; \ - s1 = (type*)((uchar*)s1 + step1); \ - s2 = (type*)((uchar*)s2 + step2); \ - d = (type*)((uchar*)d + step); \ - } \ - if (i == height) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } -#else -#define CALL_IPP_MIN_MAX(fun, type) -#endif - -//======================================= -// 
Max -//======================================= - -void max8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max8u, cv_hal_max8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_8u, uchar) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max8s, cv_hal_max8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max16u, cv_hal_max16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_16u, ushort) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max16s, cv_hal_max16s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max32s, cv_hal_max32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max32f, cv_hal_max32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_32f, float) - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max64f, cv_hal_max64f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_64f, double) - vBinOp64, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// Min -//======================================= - -void min8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min8u, cv_hal_min8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_8u, uchar) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min8s, cv_hal_min8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min16u, cv_hal_min16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_16u, ushort) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min16s( const short* 
src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min16s, cv_hal_min16s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min32s, cv_hal_min32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min32f, cv_hal_min32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_32f, float) - vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min64f, cv_hal_min64f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_64f, double) - vBinOp64, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// AbsDiff -//======================================= - -void absdiff8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff8u, cv_hal_absdiff8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAbsDiff_8u_C1R) - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void absdiff8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff8s, cv_hal_absdiff8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -void absdiff16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff16u, cv_hal_absdiff16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAbsDiff_16u_C1R) - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void absdiff16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff16s, cv_hal_absdiff16s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -void absdiff32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff32s, cv_hal_absdiff32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -void absdiff32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff32f, cv_hal_absdiff32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAbsDiff_32f_C1R) - (vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void absdiff64f( const double* src1, size_t step1, - 
const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff64f, cv_hal_absdiff64f, src1, step1, src2, step2, dst, step, width, height) - vBinOp64, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// Logical -//======================================= - -#if (ARITHM_USE_IPP == 1) -#define CALL_IPP_UN(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); CV_UNUSED(src2); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, dst, (int)step, ippiSize(width, height))) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } -#else -#define CALL_IPP_UN(fun) -#endif - -void and8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(and8u, cv_hal_and8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAnd_8u_C1R) - (vBinOp, IF_SIMD(VAnd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void or8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(or8u, cv_hal_or8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiOr_8u_C1R) - (vBinOp, IF_SIMD(VOr)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void xor8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(xor8u, cv_hal_xor8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiXor_8u_C1R) - (vBinOp, IF_SIMD(VXor)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void not8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(not8u, cv_hal_not8u, src1, step1, dst, step, width, height) - CALL_IPP_UN(ippiNot_8u_C1R) - (vBinOp, IF_SIMD(VNot)>(src1, step1, src2, step2, dst, step, width, height)); -} - -//======================================= - -#if ARITHM_USE_IPP -inline static IppCmpOp convert_cmp(int _cmpop) -{ - return _cmpop == CMP_EQ ? ippCmpEq : - _cmpop == CMP_GT ? ippCmpGreater : - _cmpop == CMP_GE ? ippCmpGreaterEq : - _cmpop == CMP_LT ? ippCmpLess : - _cmpop == CMP_LE ? 
ippCmpLessEq : - (IppCmpOp)-1; -} -#define CALL_IPP_CMP(fun) \ - CV_IPP_CHECK() \ - { \ - IppCmpOp op = convert_cmp(*(int *)_cmpop); \ - if( op >= 0 ) \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), op)) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } \ - } -#else -#define CALL_IPP_CMP(fun) -#endif - -//======================================= -// Compare -//======================================= - -void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp8u, cv_hal_cmp8u, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_8u_C1R) - //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x =0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - - for( ; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes ) - { - v_store(dst + x, (v_load(src1 + x) > v_load(src2 + x)) ^ mask); - } - } -#endif - - for( ; x < width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - - for( ; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes ) - { - v_store(dst+x, (v_load(src1+x) == v_load(src2+x)) ^ mask); - } - } -#endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp8s, cv_hal_cmp8s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp16u, cv_hal_cmp16u, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_16u_C1R) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp16s, cv_hal_cmp16s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_16s_C1R) - //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); - - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 
0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x =0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - const int dWidth = v_uint8x16::nlanes; - - for( ; x <= width - dWidth; x += dWidth ) - { - v_int16x8 in1 = v_load(src1 + x); - v_int16x8 in2 = v_load(src2 + x); - v_uint16x8 t1 = v_reinterpret_as_u16(in1 > in2); - - in1 = v_load(src1 + x + v_uint16x8::nlanes); - in2 = v_load(src2 + x + v_uint16x8::nlanes); - v_uint16x8 t2 = v_reinterpret_as_u16(in1 > in2); - - v_store(dst+x, (v_pack(t1, t2)) ^ mask); - } - } -#endif - for( ; x < width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - const int dWidth = v_uint8x16::nlanes; - - for( ; x <= width - dWidth; x += dWidth ) - { - v_int16x8 in1 = v_load(src1 + x); - v_int16x8 in2 = v_load(src2 + x); - v_uint16x8 t1 = v_reinterpret_as_u16(in1 == in2); - - in1 = v_load(src1 + x + 8); - in2 = v_load(src2 + x + 8); - v_uint16x8 t2 = v_reinterpret_as_u16(in1 == in2); - - v_store(dst+x, (v_pack(t1, t2)^ mask)); - } - } -#endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp32s, cv_hal_cmp32s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp32f, cv_hal_cmp32f, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_32f_C1R) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp64f, cv_hal_cmp64f, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -//======================================= - -#if defined HAVE_IPP -#define CALL_IPP_MUL(fun) \ - CV_IPP_CHECK() \ - { \ - if (std::fabs(fscale - 1) <= FLT_EPSILON) \ - { \ - if (CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0) >= 0) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } \ - } - -#define CALL_IPP_MUL_2(fun) \ - CV_IPP_CHECK() \ - { \ - if (std::fabs(fscale - 1) <= FLT_EPSILON) \ - { \ - if (CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)) >= 0) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } \ - } - -#else -#define CALL_IPP_MUL(fun) -#define CALL_IPP_MUL_2(fun) -#endif - -//======================================= -// Multilpy -//======================================= - -void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul8u, cv_hal_mul8u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - float 
fscale = (float)*(const double*)scale; - CALL_IPP_MUL(ippiMul_8u_C1RSfs) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul8s, cv_hal_mul8s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - mul_(src1, step1, src2, step2, dst, step, width, height, (float)*(const double*)scale); -} - -void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul16u, cv_hal_mul16u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - float fscale = (float)*(const double*)scale; - CALL_IPP_MUL(ippiMul_16u_C1RSfs) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul16s, cv_hal_mul16s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - float fscale = (float)*(const double*)scale; - CALL_IPP_MUL(ippiMul_16s_C1RSfs) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul32s, cv_hal_mul32s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul32f, cv_hal_mul32f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - float fscale = (float)*(const double*)scale; - CALL_IPP_MUL_2(ippiMul_32f_C1R) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul64f, cv_hal_mul64f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -//======================================= -// Divide -//======================================= - -void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div8u, cv_hal_div8u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - if( src1 ) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); - else - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div8s, cv_hal_div8s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div16u, cv_hal_div16u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, 
step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div16s, cv_hal_div16s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div32s, cv_hal_div32s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div32f, cv_hal_div32f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div64f, cv_hal_div64f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -//======================================= -// Reciprocial -//======================================= - -void recip8u( const uchar*, size_t, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip8u, cv_hal_recip8u, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip8s( const schar*, size_t, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip8s, cv_hal_recip8s, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip16u( const ushort*, size_t, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip16u, cv_hal_recip16u, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip16s( const short*, size_t, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip16s, cv_hal_recip16s, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip32s( const int*, size_t, const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip32s, cv_hal_recip32s, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip32f( const float*, size_t, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip32f, cv_hal_recip32f, src2, step2, dst, step, width, height, *(const double*)scale) - recip_f(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip64f( const double*, size_t, const double* src2, size_t step2, - double* dst, size_t step, int width, int 
height, void* scale) -{ - CALL_HAL(recip64f, cv_hal_recip64f, src2, step2, dst, step, width, height, *(const double*)scale) - recip_f(src2, step2, dst, step, width, height, *(const double*)scale); -} - -//======================================= -// Add weighted -//======================================= - -void -addWeighted8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, - void* scalars ) -{ - CALL_HAL(addWeighted8u, cv_hal_addWeighted8u, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - const double* scalars_ = (const double*)scalars; - float alpha = (float)scalars_[0], beta = (float)scalars_[1], gamma = (float)scalars_[2]; - - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_float32x4 g = v_setall_f32(gamma); - v_float32x4 a = v_setall_f32(alpha); - v_float32x4 b = v_setall_f32(beta); - - for( ; x <= width - v_uint16x8::nlanes; x += v_uint16x8::nlanes ) - { - v_uint16x8 in1_16 = v_load_expand(src1 + x); - v_int32x4 in1_32_l, in1_32_h; - v_expand(v_reinterpret_as_s16(in1_16), in1_32_l, in1_32_h); - v_float32x4 in1_f_l = v_cvt_f32(in1_32_l); - v_float32x4 in1_f_h = v_cvt_f32(in1_32_h); - - v_uint16x8 in2_16 = v_load_expand(src2 + x); - v_int32x4 in2_32_l, in2_32_h; - v_expand(v_reinterpret_as_s16(in2_16), in2_32_l, in2_32_h); - v_float32x4 in2_f_l = v_cvt_f32(in2_32_l); - v_float32x4 in2_f_h = v_cvt_f32(in2_32_h); - - v_int32x4 out_l = v_round(in1_f_l * a + in2_f_l * b + g); - v_int32x4 out_h = v_round(in1_f_h * a + in2_f_h * b + g); - - v_int16x8 out_16 = v_pack(out_l, out_h); - v_pack_u_store(dst + x, out_16); - } - } -#endif - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - float t0, t1; - t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma; - - dst[x] = saturate_cast(t0); - dst[x+1] = saturate_cast(t1); - - t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma; - t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma; - - dst[x+2] = saturate_cast(t0); - dst[x+3] = saturate_cast(t1); - } - #endif - - for( ; x < width; x++ ) - { - float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - dst[x] = saturate_cast(t0); - } - } -} - -void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted8s, cv_hal_addWeighted8s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted16u, cv_hal_addWeighted16u, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted16s, cv_hal_addWeighted16s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t 
step2, - int* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted32s, cv_hal_addWeighted32s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted32f, cv_hal_addWeighted32f, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted64f, cv_hal_addWeighted64f, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -}} // cv::hal:: - -/* End of file. */ +/* End of file. */ \ No newline at end of file diff --git a/modules/core/src/arithm.dispatch.cpp b/modules/core/src/arithm.dispatch.cpp new file mode 100644 index 0000000000..1cbceaee29 --- /dev/null +++ b/modules/core/src/arithm.dispatch.cpp @@ -0,0 +1,11 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "precomp.hpp" +#include "arithm_ipp.hpp" +#include "arithm.simd.hpp" +#include "arithm.simd_declarations.hpp" + +#define ARITHM_DISPATCHING_ONLY +#include "arithm.simd.hpp" \ No newline at end of file diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp new file mode 100644 index 0000000000..0c1b33af18 --- /dev/null +++ b/modules/core/src/arithm.simd.hpp @@ -0,0 +1,1937 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "opencv2/core/hal/intrin.hpp" + +//========================================= +// Declare & Define & Dispatch in one step +//========================================= + +// ARITHM_DISPATCHING_ONLY defined by arithm dispatch file + +#undef ARITHM_DECLARATIONS_ONLY +#ifdef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + #define ARITHM_DECLARATIONS_ONLY +#endif + +#undef ARITHM_DEFINITIONS_ONLY +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && !defined(ARITHM_DISPATCHING_ONLY) + #define ARITHM_DEFINITIONS_ONLY +#endif + +#ifdef ARITHM_DECLARATIONS_ONLY + #undef DEFINE_SIMD + #define DEFINE_SIMD(fun_name, c_type, ...) \ + DECLARE_SIMD_FUN(fun_name, c_type) +#endif // ARITHM_DECLARATIONS_ONLY + +#ifdef ARITHM_DEFINITIONS_ONLY + #undef DEFINE_SIMD + #define DEFINE_SIMD(fun_name, c_type, v_type, ...) \ + DECLARE_SIMD_FUN(fun_name, c_type) \ + DEFINE_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__) +#endif // ARITHM_DEFINITIONS_ONLY + +#ifdef ARITHM_DISPATCHING_ONLY + #undef DEFINE_SIMD + #define DEFINE_SIMD(fun_name, c_type, v_type, ...) \ + DISPATCH_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__) +#endif // ARITHM_DISPATCHING_ONLY + +// workaround when neon miss support of double precision +#undef DEFINE_NOSIMD +#ifdef ARITHM_DEFINITIONS_ONLY + #define DEFINE_NOSIMD(fun_name, c_type, ...) 
\ + DECLARE_SIMD_FUN(fun_name, c_type) \ + DEFINE_NOSIMD_FUN(fun_name, c_type, __VA_ARGS__) +#else + #define DEFINE_NOSIMD DEFINE_SIMD +#endif // ARITHM_DEFINITIONS_ONLY + +#ifndef SIMD_GUARD + +#define DEFINE_SIMD_U8(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 8u), uchar, v_uint8, __VA_ARGS__) + +#define DEFINE_SIMD_S8(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 8s), schar, v_int8, __VA_ARGS__) + +#define DEFINE_SIMD_U16(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 16u), ushort, v_uint16, __VA_ARGS__) + +#define DEFINE_SIMD_S16(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 16s), short, v_int16, __VA_ARGS__) + +#define DEFINE_SIMD_S32(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 32s), int, v_int32, __VA_ARGS__) + +#define DEFINE_SIMD_F32(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__) + +#if CV_SIMD_64F + #define DEFINE_SIMD_F64(fun, ...) \ + DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__) +#else + #define DEFINE_SIMD_F64(fun, ...) \ + DEFINE_NOSIMD(__CV_CAT(fun, 64f), double, __VA_ARGS__) +#endif + +#define DEFINE_SIMD_SAT(fun, ...) \ + DEFINE_SIMD_U8(fun, __VA_ARGS__) \ + DEFINE_SIMD_S8(fun, __VA_ARGS__) \ + DEFINE_SIMD_U16(fun, __VA_ARGS__) \ + DEFINE_SIMD_S16(fun, __VA_ARGS__) + +#define DEFINE_SIMD_NSAT(fun, ...) \ + DEFINE_SIMD_S32(fun, __VA_ARGS__) \ + DEFINE_SIMD_F32(fun, __VA_ARGS__) \ + DEFINE_SIMD_F64(fun, __VA_ARGS__) + +#define DEFINE_SIMD_ALL(fun, ...) \ + DEFINE_SIMD_SAT(fun, __VA_ARGS__) \ + DEFINE_SIMD_NSAT(fun, __VA_ARGS__) + +#endif // SIMD_GUARD + +/////////////////////////////////////////////////////////////////////////// + +namespace cv { namespace hal { + +#ifndef ARITHM_DISPATCHING_ONLY + CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +#endif + +#ifdef ARITHM_DEFINITIONS_ONLY + +#if !CV_SIMD_64F +typedef int v_float64; // dummy +#endif + +//======================================= +// Utility +//======================================= + +/** add **/ +template +static inline T c_add(T a, T b) +{ return saturate_cast(a + b); } +template<> +inline uchar c_add(uchar a, uchar b) +{ return CV_FAST_CAST_8U(a + b); } +// scale +template +static inline T1 c_add(T1 a, T1 b, T2 scalar) +{ return saturate_cast((T2)a * scalar + b); } +template<> +inline uchar c_add(uchar a, uchar b, float scalar) +{ return saturate_cast(CV_8TO32F(a) * scalar + b); } +// weight +template +static inline T1 c_add(T1 a, T1 b, T2 alpha, T2 beta, T2 gamma) +{ return saturate_cast(a * alpha + b * beta + gamma); } +template<> +inline uchar c_add(uchar a, uchar b, float alpha, float beta, float gamma) +{ return saturate_cast(CV_8TO32F(a) * alpha + CV_8TO32F(b) * beta + gamma); } + +/** sub **/ +template +static inline T c_sub(T a, T b) +{ return saturate_cast(a - b); } +template<> +inline uchar c_sub(uchar a, uchar b) +{ return CV_FAST_CAST_8U(a - b); } + +/** max **/ +template +static inline T c_max(T a, T b) +{ return std::max(a, b); } +template<> +inline uchar c_max(uchar a, uchar b) +{ return CV_MAX_8U(a, b); } + +/** min **/ +template +static inline T c_min(T a, T b) +{ return std::min(a, b); } +template<> +inline uchar c_min(uchar a, uchar b) +{ return CV_MIN_8U(a, b); } + +/** absdiff **/ +template +static inline T c_absdiff(T a, T b) +{ return a > b ? 
a - b : b - a; } +template<> +inline schar c_absdiff(schar a, schar b) +{ return saturate_cast(std::abs(a - b)); } +template<> +inline short c_absdiff(short a, short b) +{ return saturate_cast(std::abs(a - b)); } +// specializations to prevent "-0" results +template<> +inline float c_absdiff(float a, float b) +{ return std::abs(a - b); } +template<> +inline double c_absdiff(double a, double b) +{ return std::abs(a - b); } + +/** multiply **/ +template +static inline T c_mul(T a, T b) +{ return saturate_cast(a * b); } +template<> +inline uchar c_mul(uchar a, uchar b) +{ return CV_FAST_CAST_8U(a * b); } +// scale +template +static inline T1 c_mul(T1 a, T1 b, T2 scalar) +{ return saturate_cast(scalar * (T2)a * b); } +template<> +inline uchar c_mul(uchar a, uchar b, float scalar) +{ return saturate_cast(scalar * CV_8TO32F(a) * CV_8TO32F(b)); } + +/** divide & reciprocal **/ +template +static inline T2 c_div(T1 a, T2 b) +{ return saturate_cast(a / b); } +// recip +template<> +inline uchar c_div(float a, uchar b) +{ return saturate_cast(a / CV_8TO32F(b)); } +// scale +template +static inline T1 c_div(T1 a, T1 b, T2 scalar) +{ return saturate_cast(scalar * (T2)a / b); } +template<> +inline uchar c_div(uchar a, uchar b, float scalar) +{ return saturate_cast(scalar * CV_8TO32F(a) / CV_8TO32F(b)); } + +//======================================= +// Arithmetic and logical operations +// +, -, *, /, &, |, ^, ~, abs ... +//======================================= + +///////////////////////////// Operations ////////////////////////////////// + +// Add +template +struct op_add +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a + b; } + static inline T1 r(T1 a, T1 b) + { return c_add(a, b); } +}; + +// Subtract +template +struct op_sub +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a - b; } + static inline T1 r(T1 a, T1 b) + { return c_sub(a, b); } +}; + +// Max & Min +template +struct op_max +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return v_max(a, b); } + static inline T1 r(T1 a, T1 b) + { return c_max(a, b); } +}; + +template +struct op_min +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return v_min(a, b); } + static inline T1 r(T1 a, T1 b) + { return c_min(a, b); } +}; + +// Absolute difference +template +struct op_absdiff +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return v_absdiff(a, b); } + static inline T1 r(T1 a, T1 b) + { return c_absdiff(a, b); } +}; +// Signed absolute difference, 's' +template<> +struct op_absdiff +{ + static inline v_int8 r(const v_int8& a, const v_int8& b) + { return v_absdiffs(a, b); } + static inline schar r(schar a, schar b) + { return c_absdiff(a, b); } +}; +template<> +struct op_absdiff +{ + static inline v_int16 r(const v_int16& a, const v_int16& b) + { return v_absdiffs(a, b); } + static inline short r(short a, short b) + { return c_absdiff(a, b); } +}; +template<> +struct op_absdiff +{ + static inline v_int32 r(const v_int32& a, const v_int32& b) + { return v_reinterpret_as_s32(v_absdiff(a, b)); } + static inline int r(int a, int b) + { return c_absdiff(a, b); } +}; + +// Logical +template +struct op_or +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a | b; } + static inline T1 r(T1 a, T1 b) + { return a | b; } +}; +template +struct op_xor +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a ^ b; } + static inline T1 r(T1 a, T1 b) + { return a ^ b; } +}; +template +struct op_and +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a & 
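// Why c_absdiff() above is specialized for schar and short: the generic
// "a > b ? a - b : b - a" wraps around at the signed extremes, while the
// saturating form (and v_absdiffs on the vector side) clamps. Standalone check
// with a hypothetical helper name (needs <cstdlib> for std::abs):
static void absdiff_saturation_check()
{
    schar a = 127, b = -128;
    int wide = std::abs((int)a - (int)b);        // 255: does not fit in schar
    schar sat = cv::saturate_cast<schar>(wide);  // 127, matching c_absdiff(a, b)
    CV_Assert(sat == 127);
}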
b; } + static inline T1 r(T1 a, T1 b) + { return a & b; } +}; +template +struct op_not +{ + // ignored b from loader level + static inline Tvec r(const Tvec& a) + { return ~a; } + static inline T1 r(T1 a, T1) + { return ~a; } +}; + +//////////////////////////// Loaders ///////////////////////////////// + +#if CV_SIMD + +template< template class OP, typename T1, typename Tvec> +struct bin_loader +{ + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, T1* dst) + { + Tvec a = vx_load(src1); + Tvec b = vx_load(src2); + v_store(dst, op::r(a, b)); + } + + static inline void la(const T1* src1, const T1* src2, T1* dst) + { + Tvec a = vx_load_aligned(src1); + Tvec b = vx_load_aligned(src2); + v_store_aligned(dst, op::r(a, b)); // todo: try write without cache + } + + static inline void l64(const T1* src1, const T1* src2, T1* dst) + { + Tvec a = vx_load_low(src1), b = vx_load_low(src2); + v_store_low(dst, op::r(a, b)); + } +}; + +// void src2 for operation "not" +template +struct bin_loader +{ + typedef op_not op; + + static inline void l(const T1* src1, const T1*, T1* dst) + { + Tvec a = vx_load(src1); + v_store(dst, op::r(a)); + } + + static inline void la(const T1* src1, const T1*, T1* dst) + { + Tvec a = vx_load_aligned(src1); + v_store_aligned(dst, op::r(a)); + } + + static inline void l64(const T1* src1, const T1*, T1* dst) + { + Tvec a = vx_load_low(src1); + v_store_low(dst, op::r(a)); + } +}; + +#endif // CV_SIMD + +//////////////////////////// Loops ///////////////////////////////// + +template +static inline bool is_aligned(const T1* src1, const T1* src2, const T2* dst) +{ return (((size_t)src1|(size_t)src2|(size_t)dst) & (CV_SIMD_WIDTH - 1)) == 0; } + +template class OP, typename T1, typename Tvec> +static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height) +{ + typedef OP op; +#if CV_SIMD + typedef bin_loader ldr; + enum {wide_step = Tvec::nlanes}; + #if !CV_NEON && CV_SIMD_WIDTH == 16 + enum {wide_step_l = wide_step * 2}; + #else + enum {wide_step_l = wide_step}; + #endif +#endif // CV_SIMD + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + #if CV_SIMD + #if !CV_NEON + if (is_aligned(src1, src2, dst)) + { + for (; x <= width - wide_step_l; x += wide_step_l) + { + ldr::la(src1 + x, src2 + x, dst + x); + #if !CV_NEON && CV_SIMD_WIDTH == 16 + ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step); + #endif + } + } + else + #endif + for (; x <= width - wide_step_l; x += wide_step_l) + { + ldr::l(src1 + x, src2 + x, dst + x); + #if !CV_NEON && CV_SIMD_WIDTH == 16 + ldr::l(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step); + #endif + } + + #if CV_SIMD_WIDTH == 16 + for (; x <= width - 8/(int)sizeof(T1); x += 8/(int)sizeof(T1)) + { + ldr::l64(src1 + x, src2 + x, dst + x); + } + #endif + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], src2[x]); + T1 t1 = op::r(src1[x + 1], src2[x + 1]); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2]); + t1 = op::r(src1[x + 3], src2[x + 3]); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; x++) + dst[x] = op::r(src1[x], src2[x]); + } + + vx_cleanup(); +} + +#if !CV_SIMD_64F +template class OP, typename T1, typename Tvec> +static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* 
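// Condensed illustration (not part of the patch) of the loop shape bin_loop
// uses above: full-width vectors, the 64-bit low-half path on 128-bit builds,
// then a scalar tail. The aligned fast path and the extra 2x unroll are left
// out; min_row_8u is a hypothetical helper (same intrinsics header as the
// earlier sketch, plus <algorithm>):
static void min_row_8u(const uchar* a, const uchar* b, uchar* dst, int width)
{
    int x = 0;
#if CV_SIMD
    for (; x <= width - cv::v_uint8::nlanes; x += cv::v_uint8::nlanes)
        cv::v_store(dst + x, cv::v_min(cv::vx_load(a + x), cv::vx_load(b + x)));
#if CV_SIMD_WIDTH == 16
    for (; x <= width - 8; x += 8)   // same role as bin_loader::l64
        cv::v_store_low(dst + x, cv::v_min(cv::vx_load_low(a + x), cv::vx_load_low(b + x)));
#endif
    cv::vx_cleanup();
#endif
    for (; x < width; x++)
        dst[x] = std::min(a[x], b[x]);
}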
dst, size_t step, int width, int height) +{ + typedef OP op; + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], src2[x]); + T1 t1 = op::r(src1[x + 1], src2[x + 1]); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2]); + t1 = op::r(src1[x + 3], src2[x + 3]); + dst[x + 2] = t0; dst[x + 3] = t1; + } + + for (; x < width; x++) + dst[x] = op::r(src1[x], src2[x]); + } +} +#define BIN_LOOP64F bin_loop_nosimd +#else +#define BIN_LOOP64F bin_loop +#endif //!CV_SIMD_64F + +#endif // ARITHM_DEFINITIONS_ONLY + +//////////////////////////////////////////////////////////////////////////////////// + +#ifndef SIMD_GUARD +#define BIN_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \ + _T1* dst, size_t step, int width, int height + +#define BIN_ARGS_PASS src1, step1, src2, step2, dst, step, width, height +#endif // SIMD_GUARD + +#undef DECLARE_SIMD_FUN +#define DECLARE_SIMD_FUN(fun, _T1) void fun(BIN_ARGS(_T1)); + +#undef DISPATCH_SIMD_FUN +#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, _OP) \ + void fun(BIN_ARGS(_T1), void*) \ + { \ + CV_INSTRUMENT_REGION(); \ + CALL_HAL(fun, __CV_CAT(cv_hal_, fun), BIN_ARGS_PASS) \ + ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), BIN_ARGS_PASS) \ + CV_CPU_DISPATCH(fun, (BIN_ARGS_PASS), CV_CPU_DISPATCH_MODES_ALL); \ + } + +#undef DEFINE_SIMD_FUN +#define DEFINE_SIMD_FUN(fun, _T1, _Tvec, _OP) \ + void fun(BIN_ARGS(_T1)) \ + { \ + CV_INSTRUMENT_REGION(); \ + bin_loop<_OP, _T1, _Tvec>(BIN_ARGS_PASS); \ + } + +#undef DEFINE_NOSIMD_FUN +#define DEFINE_NOSIMD_FUN(fun, _T1, _OP) \ + void fun(BIN_ARGS(_T1)) \ + { \ + CV_INSTRUMENT_REGION(); \ + bin_loop_nosimd<_OP, _T1, v_float64>(BIN_ARGS_PASS); \ + } + +DEFINE_SIMD_ALL(add, op_add) +DEFINE_SIMD_ALL(sub, op_sub) + +DEFINE_SIMD_ALL(min, op_min) +DEFINE_SIMD_ALL(max, op_max) + +DEFINE_SIMD_ALL(absdiff, op_absdiff) + +DEFINE_SIMD_U8(or, op_or) +DEFINE_SIMD_U8(xor, op_xor) +DEFINE_SIMD_U8(and, op_and) + +// One source!, an exception for operation "not" +// we could use macros here but it's better to implement it +// with that way to give more clarification +// about how macroS "DEFINE_SIMD_*" are works + +#if defined(ARITHM_DECLARATIONS_ONLY) || defined(ARITHM_DEFINITIONS_ONLY) +void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); +#endif +#ifdef ARITHM_DEFINITIONS_ONLY +void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + CV_INSTRUMENT_REGION(); + bin_loop(src1, step1, src2, step2, dst, step, width, height); +} +#endif +#ifdef ARITHM_DISPATCHING_ONLY +void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void*) +{ + CV_INSTRUMENT_REGION(); + CALL_HAL(not8u, cv_hal_not8u, src1, step1, dst, step, width, height) + ARITHM_CALL_IPP(arithm_ipp_not8u, src1, step1, dst, step, width, height) + CV_CPU_DISPATCH(not8u, (src1, step1, src2, step2, dst, step, width, height), CV_CPU_DISPATCH_MODES_ALL); +} +#endif + +//======================================= +// Compare +//======================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +///////////////////////////// Operations ////////////////////////////////// + +template +struct op_cmplt +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a < b; } + 
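// What DEFINE_SIMD_ALL(add, op_add) above produces for the 8u case, written
// out by hand from the macros (abbreviated; instrumentation omitted):
//
//  declaration pass:
//    void add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int);
//
//  definition pass (compiled once per enabled ISA):
//    void add8u(BIN_ARGS(uchar)) { bin_loop<op_add, uchar, v_uint8>(BIN_ARGS_PASS); }
//
//  dispatching pass (arithm.dispatch.cpp only):
//    void add8u(BIN_ARGS(uchar), void*)
//    {
//        CALL_HAL(add8u, cv_hal_add8u, BIN_ARGS_PASS)
//        ARITHM_CALL_IPP(arithm_ipp_add8u, BIN_ARGS_PASS)
//        CV_CPU_DISPATCH(add8u, (BIN_ARGS_PASS), CV_CPU_DISPATCH_MODES_ALL);
//    }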
static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a < b); } +}; + +template +struct op_cmple +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a <= b; } + static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a <= b); } +}; + +template +struct op_cmpeq +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a == b; } + static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a == b); } +}; + +template +struct op_cmpne +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a != b; } + static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a != b); } +}; + +//////////////////////////// Loaders ///////////////////////////////// + +#if CV_SIMD +// todo: add support for RW alignment & stream +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + void l(const T1* src1, const T1* src2, uchar* dst); +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + Tvec a = vx_load(src1); + Tvec b = vx_load(src2); + v_store(dst, v_reinterpret_as_u8(op::r(a, b))); + } +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + enum {step = Tvec::nlanes}; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + Tvec c0 = op::r(vx_load(src1), vx_load(src2)); + Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step)); + v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1))); + } +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + enum {step = Tvec::nlanes}; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2))); + v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step))); + v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2))); + v_uint32 c3 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3))); + v_store(dst, v_pack_b(c0, c1, c2, c3)); + } +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + enum {step = Tvec::nlanes}; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2))); + v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step))); + v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2))); + v_uint64 c3 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3))); + + v_uint64 c4 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 4), vx_load(src2 + step * 4))); + v_uint64 c5 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 5), vx_load(src2 + step * 5))); + v_uint64 c6 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 6), vx_load(src2 + step * 6))); + v_uint64 c7 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 7), vx_load(src2 + step * 7))); + v_store(dst, v_pack_b(c0, c1, c2, c3, c4, c5, c6, c7)); + } +}; + +#endif // CV_SIMD + +//////////////////////////// Loops ///////////////////////////////// + +template class OP, typename T1, typename Tvec> +static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + typedef OP op; +#if CV_SIMD + typedef cmp_loader_n ldr; + enum {wide_step = Tvec::nlanes * sizeof(T1)}; +#endif // 
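// The wider comparison loaders above can use v_pack_b because comparison
// results are all-ones/all-zeros lanes, so truncating them to bytes loses
// nothing. Minimal standalone sketch for the 16-bit case (hypothetical helper
// name, assumes CV_SIMD and the same intrinsics header); it consumes two
// source vectors and emits one full uchar vector of 0x00/0xFF:
static void cmpeq_block_16u(const ushort* a, const ushort* b, uchar* dst)
{
    cv::v_uint16 c0 = cv::vx_load(a) == cv::vx_load(b);
    cv::v_uint16 c1 = cv::vx_load(a + cv::v_uint16::nlanes) == cv::vx_load(b + cv::v_uint16::nlanes);
    cv::v_store(dst, cv::v_pack_b(c0, c1));   // 0xFFFF/0x0000 lanes -> 0xFF/0x00 bytes
}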
CV_SIMD + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + #if CV_SIMD + for (; x <= width - wide_step; x += wide_step) + { + ldr::l(src1 + x, src2 + x, dst + x); + } + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + uchar t0 = op::r(src1[x], src2[x]); + uchar t1 = op::r(src1[x + 1], src2[x + 1]); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2]); + t1 = op::r(src1[x + 3], src2[x + 3]); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; x++) + dst[x] = op::r(src1[x], src2[x]); + } + + vx_cleanup(); +} + +template +static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + switch(cmpop) + { + case CMP_LT: + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GT: + cmp_loop(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_LE: + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GE: + cmp_loop(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_EQ: + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + default: + CV_Assert(cmpop == CMP_NE); + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + } +} + +#if !CV_SIMD_64F +template< template class OP, typename T1> +static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + typedef OP op; + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + for (; x <= width - 4; x += 4) + { + uchar t0 = op::r(src1[x], src2[x]); + uchar t1 = op::r(src1[x + 1], src2[x + 1]); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2]); + t1 = op::r(src1[x + 3], src2[x + 3]); + dst[x + 2] = t0; dst[x + 3] = t1; + } + + for (; x < width; x++) + dst[x] = op::r(src1[x], src2[x]); + } +} +static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + switch(cmpop) + { + case CMP_LT: + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GT: + cmp_loop_nosimd(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_LE: + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GE: + cmp_loop_nosimd(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_EQ: + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + default: + CV_Assert(cmpop == CMP_NE); + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + } +} +#endif // !CV_SIMD_64F + +#endif // ARITHM_DEFINITIONS_ONLY + +///////////////////////////////////////////////////////////////////////////////////////////// + +#ifndef SIMD_GUARD +#define CMP_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \ + uchar* dst, size_t step, int width, int height + +#define CMP_ARGS_PASS src1, step1, src2, step2, dst, step, width, height +#endif // SIMD_GUARD + +#undef DECLARE_SIMD_FUN +#define DECLARE_SIMD_FUN(fun, _T1) void fun(CMP_ARGS(_T1), int cmpop); + +#undef DISPATCH_SIMD_FUN +#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) 
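// Two conventions used by the comparison code above: the scalar comparators
// turn a bool into the 0/255 mask by negation, and '>' / '>=' are never
// implemented directly -- the dispatcher swaps the sources instead
// (a > b <=> b < a, a >= b <=> b <= a). Standalone check, hypothetical name:
static void cmp_mask_check()
{
    uchar t = (uchar)-(int)(5 > 3);   // true  -> -1 -> 0xFF == 255
    uchar f = (uchar)-(int)(3 > 5);   // false ->  0 -> 0x00
    CV_Assert(t == 255 && f == 0);
}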
\ + void fun(CMP_ARGS(_T1), void* _cmpop) \ + { \ + CV_INSTRUMENT_REGION(); \ + CALL_HAL(fun, __CV_CAT(cv_hal_, fun), CMP_ARGS_PASS, *(int*)_cmpop) \ + ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), CMP_ARGS_PASS, *(int*)_cmpop) \ + CV_CPU_DISPATCH(fun, (CMP_ARGS_PASS, *(int*)_cmpop), CV_CPU_DISPATCH_MODES_ALL); \ + } + +#undef DEFINE_SIMD_FUN +#define DEFINE_SIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(CMP_ARGS(_T1), int cmpop) \ + { \ + CV_INSTRUMENT_REGION(); \ + cmp_loop<_T1, _Tvec>(CMP_ARGS_PASS, cmpop); \ + } + +#undef DEFINE_NOSIMD_FUN +#define DEFINE_NOSIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(CMP_ARGS(_T1), int cmpop) \ + { \ + CV_INSTRUMENT_REGION(); \ + cmp_loop_nosimd(CMP_ARGS_PASS, cmpop); \ + } + +// todo: try to avoid define dispatcher functions using macros with these such cases +DEFINE_SIMD_ALL(cmp) + +//========================================================================= +// scaling helpers for single and dual source +// +// Dual: Multiply, Div, AddWeighted +// +// Single: Reciprocal +// +//========================================================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +//////////////////////////// Loaders /////////////////////////////// + +#if CV_SIMD +// todo: add support for RW alignment & stream +template class OP, typename T1, typename T2, typename Tvec> +struct scalar_loader_n +{ + void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst); + // single source + void l(const T1* src1, const T2* scalar, T1* dst); +}; + +template class OP, typename T1, typename T2, typename Tvec> +struct scalar_loader_n +{ + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst) + { + v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1)); + v_int16 v_src2 = v_reinterpret_as_s16(vx_load_expand(src2)); + + v_int32 t0, t1, t2, t3; + v_expand(v_src1, t0, t2); + v_expand(v_src2, t1, t3); + + v_float32 f0, f1, f2, f3; + f0 = v_cvt_f32(t0); + f1 = v_cvt_f32(t1); + f2 = v_cvt_f32(t2); + f3 = v_cvt_f32(t3); + + f0 = op::r(f0, f1, scalar); + f2 = op::r(f2, f3, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f2); + + store(dst, v_src2, r0, r1); + } + + static inline void l(const T1* src1, const T2* scalar, T1* dst) + { + v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1)); + + v_int32 t0, t1; + v_expand(v_src1, t0, t1); + + v_float32 f0, f1; + f0 = v_cvt_f32(t0); + f1 = v_cvt_f32(t1); + + f0 = op::r(f0, scalar); + f1 = op::r(f1, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f1); + + store(dst, v_src1, r0, r1); + } + + static inline void store(uchar* dst, const v_int16& src, const v_int32& a, const v_int32& b) + { + v_pack_u_store(dst, op::pre(src, v_pack(a, b))); + } + static inline void store(schar* dst, const v_int16& src, const v_int32& a, const v_int32& b) + { + v_pack_store(dst, op::pre(src, v_pack(a, b))); + } +}; + +template class OP, typename T1, typename T2, typename Tvec> +struct scalar_loader_n +{ + typedef typename V_RegTraits::w_reg Twvec; + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst) + { + Tvec v_src1 = vx_load(src1); + Tvec v_src2 = vx_load(src2); + + Twvec t0, t1, t2, t3; + v_expand(v_src1, t0, t2); + v_expand(v_src2, t1, t3); + + v_float32 f0, f1, f2, f3; + f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = op::r(f0, f1, scalar); + f2 = op::r(f2, f3, scalar); + + v_int32 
r0 = v_round(f0); + v_int32 r1 = v_round(f2); + + store(dst, v_src2, r0, r1); + } + + static inline void l(const T1* src1, const T2* scalar, T1* dst) + { + Tvec v_src1 = vx_load(src1); + + Twvec t0, t1; + v_expand(v_src1, t0, t1); + + v_float32 f0, f1; + f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + f0 = op::r(f0, scalar); + f1 = op::r(f1, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f1); + + store(dst, v_src1, r0, r1); + } + + static inline void store(ushort* dst, const Tvec& src, const v_int32& a, const v_int32& b) + { + v_store(dst, op::pre(src, v_pack_u(a, b))); + } + static inline void store(short* dst, const Tvec& src, const v_int32& a, const v_int32& b) + { + v_store(dst, op::pre(src, v_pack(a, b))); + } +}; + +template class OP, typename T2> +struct scalar_loader_n +{ + typedef OP op; + enum {step = v_int32::nlanes}; + + static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst) + { + v_int32 v_src1 = vx_load(src1); + v_int32 v_src2 = vx_load(src2); + v_int32 v_src1s = vx_load(src1 + step); + v_int32 v_src2s = vx_load(src2 + step); + + v_float32 f0, f1, f2, f3; + f0 = v_cvt_f32(v_reinterpret_as_s32(v_src1)); + f1 = v_cvt_f32(v_reinterpret_as_s32(v_src2)); + f2 = v_cvt_f32(v_reinterpret_as_s32(v_src1s)); + f3 = v_cvt_f32(v_reinterpret_as_s32(v_src2s)); + + f0 = op::r(f0, f1, scalar); + f2 = op::r(f2, f3, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f2); + + r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline void l(const int* src1, const T2* scalar, int* dst) + { + v_int32 v_src1 = vx_load(src1); + v_int32 v_src1s = vx_load(src1 + step); + + v_float32 f0, f1; + f0 = v_cvt_f32(v_src1); + f1 = v_cvt_f32(v_src1s); + + f0 = op::r(f0, scalar); + f1 = op::r(f1, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f1); + + r0 = op::pre(v_src1, r0); + r1 = op::pre(v_src1s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } +}; + +template class OP, typename T2> +struct scalar_loader_n +{ + typedef OP op; + enum {step = v_float32::nlanes}; + + static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src2 = vx_load(src2); + v_float32 v_src1s = vx_load(src1 + step); + v_float32 v_src2s = vx_load(src2 + step); + + v_float32 r0 = op::r(v_src1, v_src2, scalar); + v_float32 r1 = op::r(v_src1s, v_src2s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline void l(const float* src1, const T2* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src1s = vx_load(src1 + step); + + v_float32 r0 = op::r(v_src1, scalar); + v_float32 r1 = op::r(v_src1s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src1, r0); + r1 = op::pre(v_src1s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } +}; +#endif // CV_SIMD + +#if CV_SIMD_64F +template class OP> +struct scalar_loader_n +{ + typedef OP op; + typedef OP op64; + enum {step = v_int32::nlanes}; + + static inline void l(const int* src1, const int* src2, const double* scalar, int* dst) + { + v_int32 v_src1 = vx_load(src1); + v_int32 v_src2 = vx_load(src2); + v_int32 v_src1s = vx_load(src1 + step); + v_int32 v_src2s = vx_load(src2 + step); + + v_int32 r0 = r(v_src1, v_src2, scalar); + v_int32 r1 = r(v_src1s, v_src2s, scalar); + 
+ r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } + static inline void l(const int* src1, const double* scalar, int* dst) + { + v_int32 v_src1 = vx_load(src1); + v_int32 v_src1s = vx_load(src1 + step); + + v_int32 r0 = r(v_src1, scalar); + v_int32 r1 = r(v_src1s, scalar); + + r0 = op::pre(v_src1, r0); + r1 = op::pre(v_src1s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline v_int32 r(const v_int32& a, const v_int32& b, const double* scalar) + { + v_float64 f0, f1, f2, f3; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + f2 = v_cvt_f64(b); + f3 = v_cvt_f64_high(b); + + v_float64 r0 = op64::r(f0, f2, scalar); + v_float64 r1 = op64::r(f1, f3, scalar); + + return v_round(r0, r1); + } + static inline v_int32 r(const v_int32& a, const double* scalar) + { + v_float64 f0, f1; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + + v_float64 r0 = op64::r(f0, scalar); + v_float64 r1 = op64::r(f1, scalar); + + return v_round(r0, r1); + } +}; + +template class OP> +struct scalar_loader_n +{ + typedef OP op; + typedef OP op64; + enum {step = v_float32::nlanes}; + + static inline void l(const float* src1, const float* src2, const double* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src2 = vx_load(src2); + v_float32 v_src1s = vx_load(src1 + step); + v_float32 v_src2s = vx_load(src2 + step); + + v_float32 r0 = r(v_src1, v_src2, scalar); + v_float32 r1 = r(v_src1s, v_src2s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } + static inline void l(const float* src1, const double* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src1s = vx_load(src1 + step); + + v_float32 r0 = r(v_src1, scalar); + v_float32 r1 = r(v_src1s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src1, r0); + r1 = op::pre(v_src1s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline v_float32 r(const v_float32& a, const v_float32& b, const double* scalar) + { + v_float64 f0, f1, f2, f3; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + f2 = v_cvt_f64(b); + f3 = v_cvt_f64_high(b); + + v_float64 r0 = op64::r(f0, f2, scalar); + v_float64 r1 = op64::r(f1, f3, scalar); + + return v_cvt_f32(r0, r1); + } + static inline v_float32 r(const v_float32& a, const double* scalar) + { + v_float64 f0, f1; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + + v_float64 r0 = op64::r(f0, scalar); + v_float64 r1 = op64::r(f1, scalar); + + return v_cvt_f32(r0, r1); + } +}; + +template class OP> +struct scalar_loader_n +{ + typedef OP op; + enum {step = v_float64::nlanes}; + + static inline void l(const double* src1, const double* src2, const double* scalar, double* dst) + { + v_float64 v_src1 = vx_load(src1); + v_float64 v_src2 = vx_load(src2); + v_float64 v_src1s = vx_load(src1 + step); + v_float64 v_src2s = vx_load(src2 + step); + + v_float64 r0 = op::r(v_src1, v_src2, scalar); + v_float64 r1 = op::r(v_src1s, v_src2s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } + static inline void l(const double* src1, const double* scalar, double* dst) + { + v_float64 v_src1 = vx_load(src1); + v_float64 v_src1s = vx_load(src1 + step); + + v_float64 r0 = op::r(v_src1, scalar); + v_float64 r1 = op::r(v_src1s, scalar); + + #if CV_VERSION_MAJOR == 3 + r0 = op::pre(v_src1, r0); + r1 = 
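// The CV_SIMD_64F specializations above keep the scale in full double
// precision: a v_int32 is split into two v_float64 halves with
// v_cvt_f64 / v_cvt_f64_high, scaled, and re-fused by the two-argument
// v_round() overload this patch adds. Condensed sketch (hypothetical helper,
// assumes CV_SIMD_64F and the same intrinsics header):
static cv::v_int32 scale_s32_f64(const cv::v_int32& a, double scale)
{
    cv::v_float64 s  = cv::vx_setall_f64(scale);
    cv::v_float64 lo = cv::v_cvt_f64(a) * s;       // lower lanes as double
    cv::v_float64 hi = cv::v_cvt_f64_high(a) * s;  // upper lanes as double
    return cv::v_round(lo, hi);                    // round both halves into one v_int32
}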
op::pre(v_src1s, r1); + #endif + + v_store(dst, r0); + v_store(dst + step, r1); + } +}; +#endif // CV_SIMD_64F + +//////////////////////////// Loops ///////////////////////////////// + +// dual source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; +#if CV_SIMD + typedef scalar_loader_n ldr; + const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 : + sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes; +#endif // CV_SIMD + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + #if CV_SIMD + for (; x <= width - wide_step; x += wide_step) + { + ldr::l(src1 + x, src2 + x, scalar, dst + x); + } + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], src2[x], scalar); + T1 t1 = op::r(src1[x + 1], src2[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2], scalar); + t1 = op::r(src1[x + 3], src2[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; ++x) + dst[x] = op::r(src1[x], src2[x], scalar); + } + + vx_cleanup(); +} + +// single source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; +#if CV_SIMD + typedef scalar_loader_n ldr; + const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 : + sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes; +#endif // CV_SIMD + + step1 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, dst += step) + { + int x = 0; + + #if CV_SIMD + for (; x <= width - wide_step; x += wide_step) + { + ldr::l(src1 + x, scalar, dst + x); + } + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], scalar); + T1 t1 = op::r(src1[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], scalar); + t1 = op::r(src1[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; ++x) + dst[x] = op::r(src1[x], scalar); + } + + vx_cleanup(); +} + +#if !CV_SIMD_64F +// dual source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], src2[x], scalar); + T1 t1 = op::r(src1[x + 1], src2[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2], scalar); + t1 = op::r(src1[x + 3], src2[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + + for (; x < width; ++x) + dst[x] = op::r(src1[x], src2[x], scalar); + } +} + +// single source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; + + step1 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, 
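// Note on the wide_step expression in scalar_loop above: it counts elements
// consumed per loader call, not lanes. With 128-bit registers:
//   8-bit  types: Tvec::nlanes / 2 = 8  (vx_load_expand reads half a register)
//   16-bit types: Tvec::nlanes     = 8  (one full register)
//   32/64-bit   : Tvec::nlanes * 2      (the loaders process two registers per call)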
dst += step) + { + int x = 0; + + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], scalar); + T1 t1 = op::r(src1[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], scalar); + t1 = op::r(src1[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + + for (; x < width; ++x) + dst[x] = op::r(src1[x], scalar); + } +} + +#define SCALAR_LOOP64F scalar_loop_nosimd +#else +#define SCALAR_LOOP64F scalar_loop +#endif // !CV_SIMD_64F + +#endif // ARITHM_DEFINITIONS_ONLY + +//========================================================================= +// Multiply +//========================================================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +///////////////////////////// Operations ////////////////////////////////// + +template +struct op_mul +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a * b; } + static inline T1 r(T1 a, T1 b) + { return saturate_cast(a * b); } +}; + +template +struct op_mul_scale +{ + static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) + { + const v_float32 v_scalar = vx_setall_f32(*scalar); + return v_scalar * a * b; + } + static inline T1 r(T1 a, T1 b, const T2* scalar) + { return c_mul(a, b, *scalar); } + static inline Tvec pre(const Tvec&, const Tvec& res) + { return res; } +}; + +template<> +struct op_mul_scale +{ +#if CV_SIMD_64F + static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) + { + const v_float64 v_scalar = vx_setall_f64(*scalar); + return v_scalar * a * b; + } +#endif + static inline double r(double a, double b, const double* scalar) + { return c_mul(a, b, *scalar); } + static inline v_float64 pre(const v_float64&, const v_float64& res) + { return res; } +}; + +//////////////////////////// Loops ///////////////////////////////// + +template +static void mul_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const double* scalar) +{ + float fscalar = (float)*scalar; + if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON) + { + bin_loop(src1, step1, src2, step2, dst, step, width, height); + } + else + { + scalar_loop(src1, step1, src2, step2, + dst, step, width, height, &fscalar); + } +} + +template +static void mul_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const double* scalar) +{ + if (std::fabs(*scalar - 1.0) <= FLT_EPSILON) + { + bin_loop(src1, step1, src2, step2, dst, step, width, height); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalar); + } +} + +template<> +void mul_loop_d(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, const double* scalar) +{ + if (*scalar == 1.0) + { + BIN_LOOP64F(src1, step1, src2, step2, dst, step, width, height); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalar); + } +} + +#endif // ARITHM_DEFINITIONS_ONLY + +////////////////////////////////////////////////////////////////////////// + +#undef SCALAR_ARGS +#define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \ + _T1* dst, size_t step, int width, int height + +#undef SCALAR_ARGS_PASS +#define SCALAR_ARGS_PASS src1, step1, src2, step2, dst, step, width, height + +#undef DECLARE_SIMD_FUN +#define DECLARE_SIMD_FUN(fun, _T1) void fun(SCALAR_ARGS(_T1), const double* scalar); + +#undef DISPATCH_SIMD_FUN +#define 
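// op_mul_scale above reduces to saturate_cast<T1>(scale * a * b) per element;
// for the 8/16-bit types the double scale is narrowed to float once per call,
// and mul_loop short-circuits scale == 1.0 to the plain bin_loop. Worked check
// (hypothetical helper name):
static void mul_scale_check()
{
    uchar r1 = cv::saturate_cast<uchar>(0.5f * 200 * 3);  // 300 -> clamps to 255
    uchar r2 = cv::saturate_cast<uchar>(0.1f * 200 * 3);  // ~60 -> rounds to 60
    CV_Assert(r1 == 255 && r2 == 60);
}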
DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(SCALAR_ARGS(_T1), void* scalar) \ + { \ + CV_INSTRUMENT_REGION(); \ + CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \ + SCALAR_ARGS_PASS, *(const double*)scalar) \ + ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \ + SCALAR_ARGS_PASS, *(const double*)scalar) \ + CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \ + CV_CPU_DISPATCH_MODES_ALL); \ + } + +#undef DEFINE_SIMD_FUN +#define DEFINE_SIMD_FUN(fun, _T1, _Tvec, op) \ + void fun(SCALAR_ARGS(_T1), const double* scalar) \ + { \ + CV_INSTRUMENT_REGION(); \ + op<_T1, _Tvec>(SCALAR_ARGS_PASS, scalar); \ + } + +#undef DEFINE_NOSIMD_FUN +#define DEFINE_NOSIMD_FUN(fun, _T1, _OP) \ + DEFINE_SIMD_FUN(fun, _T1, v_float64, _OP) + +DEFINE_SIMD_SAT(mul, mul_loop) +DEFINE_SIMD_F32(mul, mul_loop_d) +DEFINE_SIMD_S32(mul, mul_loop_d) +DEFINE_SIMD_F64(mul, mul_loop_d) + +//========================================================================= +// Div +//========================================================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +///////////////////////////// Operations ////////////////////////////////// + +#if CV_VERSION_MAJOR == 3 +template +struct op_div_f +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { + const Tvec v_zero = Tvec(); + return v_select(b == v_zero, v_zero, a / b); + } + static inline T1 r(T1 a, T1 b) + { return b != (T1)0 ? a / b : (T1)0; } +}; +#else +template +struct op_div_f +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a / b; } + static inline T1 r(T1 a, T1 b) + { return a / b; } +}; +#endif + +template +struct op_div_scale +{ + static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) + { + const v_float32 v_scalar = vx_setall_f32(*scalar); + return a * v_scalar / b; + } + static inline Tvec pre(const Tvec& denom, const Tvec& res) + { + const Tvec v_zero = Tvec(); + return v_select(denom == v_zero, v_zero, res); + } + static inline T1 r(T1 a, T1 denom, const T2* scalar) + { return denom != (T1)0 ? c_div(a, denom, *scalar) : (T1)0; } +}; + +template<> +struct op_div_scale +{ +#if CV_SIMD_64F + static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) + { + const v_float64 v_scalar = vx_setall_f64(*scalar); + return a * v_scalar / b; + } + static inline v_float64 pre(const v_float64& denom, const v_float64& res) + { + const v_float64 v_zero = vx_setzero_f64(); + return v_select(denom == v_zero, v_zero, res); + } +#endif + static inline double r(double a, double denom, const double* scalar) + { return denom != 0.0 ? 
c_div(a, denom, *scalar) : 0.0; } +}; + +//////////////////////////// Loops ///////////////////////////////// + +template +static void div_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const double* scalar) +{ + float fscalar = (float)*scalar; + // todo: add new intrinsics for integer divide + scalar_loop(src1, step1, src2, step2, + dst, step, width, height, &fscalar); +} + +template<> +void div_loop(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, const double* scalar) +{ + float fscalar = (float)*scalar; + if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON) + { + bin_loop(src1, step1, src2, step2, dst, step, width, height); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, &fscalar); + } +} + +template<> +void div_loop(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, const double* scalar) +{ + if (*scalar == 1.0) + { + BIN_LOOP64F(src1, step1, src2, step2, dst, step, width, height); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalar); + } +} + +#endif // ARITHM_DEFINITIONS_ONLY + +////////////////////////////////////////////////////////////////////////// + +DEFINE_SIMD_ALL(div, div_loop) + +//========================================================================= +// AddWeighted +//========================================================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +///////////////////////////// Operations ////////////////////////////////// + +///// Add scale +template +struct op_add_scale +{ + static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) + { + const v_float32 v_alpha = vx_setall_f32(*scalar); + return v_fma(a, v_alpha, b); + } + static inline T1 r(T1 a, T1 b, const T2* scalar) + { return c_add(a, b, *scalar); } + static inline Tvec pre(const Tvec&, const Tvec& res) + { return res; } +}; + +template<> +struct op_add_scale +{ +#if CV_SIMD_64F + static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar) + { + const v_float64 v_alpha = vx_setall_f64(*scalar); + return v_fma(a, v_alpha, b); + } +#endif + static inline double r(double a, double b, const double* scalar) + { return c_add(a, b, *scalar); } + static inline v_float64 pre(const v_float64&, const v_float64& res) + { return res; } +}; + +///// Weighted sum +template +struct op_add_weighted +{ + static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars) + { + const v_float32 v_alpha = vx_setall_f32(scalars[0]); + const v_float32 v_beta = vx_setall_f32(scalars[1]); + const v_float32 v_gamma = vx_setall_f32(scalars[2]); + return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma)); + } + static inline T1 r(T1 a, T1 b, const T2* scalars) + { return c_add(a, b, scalars[0], scalars[1], scalars[2]); } + static inline Tvec pre(const Tvec&, const Tvec& res) + { return res; } +}; + +template<> +struct op_add_weighted +{ +#if CV_SIMD_64F + static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars) + { + const v_float64 v_alpha = vx_setall_f64(scalars[0]); + const v_float64 v_beta = vx_setall_f64(scalars[1]); + const v_float64 v_gamma = vx_setall_f64(scalars[2]); + return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma)); + } +#endif + static inline double r(double a, double b, const double* scalars) + { return c_add(a, b, scalars[0], 
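// The divide-by-zero convention encoded in op_div_scale above: a zero
// denominator yields 0, never inf/NaN. The vector path divides unconditionally
// and then masks the result with v_select(denom == 0, 0, res); the scalar path
// tests first. Scalar equivalent (hypothetical helper name):
static double div_scaled(double num, double den, double scale)
{
    return den != 0.0 ? scale * num / den : 0.0;   // e.g. div_scaled(7, 0, 2) == 0
}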
scalars[1], scalars[2]); } + static inline v_float64 pre(const v_float64&, const v_float64& res) + { return res; } +}; + +//////////////////////////// Loops ///////////////////////////////// + +template +static void add_weighted_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const double* scalars) +{ + float fscalars[] = {(float)scalars[0], (float)scalars[1], (float)scalars[2]}; + if (fscalars[1] == 1.0f && fscalars[2] == 0.0f) + { + scalar_loop(src1, step1, src2, step2, + dst, step, width, height, fscalars); + } + else + { + scalar_loop(src1, step1, src2, step2, + dst, step, width, height, fscalars); + } +} + +template +static void add_weighted_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const double* scalars) +{ + if (scalars[1] == 1.0 && scalars[2] == 0.0) + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalars); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalars); + } +} + +template<> +void add_weighted_loop_d(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, const double* scalars) +{ + if (scalars[1] == 1.0 && scalars[2] == 0.0) + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalars); + } + else + { + SCALAR_LOOP64F(src1, step1, src2, step2, + dst, step, width, height, scalars); + } +} + +#endif // ARITHM_DEFINITIONS_ONLY + +////////////////////////////////////////////////////////////////////////// + +#undef DISPATCH_SIMD_FUN +#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(SCALAR_ARGS(_T1), void* scalar) \ + { \ + CV_INSTRUMENT_REGION(); \ + CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \ + SCALAR_ARGS_PASS, (const double*)scalar) \ + ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \ + SCALAR_ARGS_PASS, (const double*)scalar) \ + CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \ + CV_CPU_DISPATCH_MODES_ALL); \ + } + +DEFINE_SIMD_SAT(addWeighted, add_weighted_loop) +DEFINE_SIMD_S32(addWeighted, add_weighted_loop_d) +DEFINE_SIMD_F32(addWeighted, add_weighted_loop_d) +DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d) + +//======================================= +// Reciprocal +//======================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +///////////////////////////// Operations ////////////////////////////////// + +template +struct op_recip +{ + static inline v_float32 r(const v_float32& a, const T2* scalar) + { + const v_float32 v_scalar = vx_setall_f32(*scalar); + return v_scalar / a; + } + static inline Tvec pre(const Tvec& denom, const Tvec& res) + { + const Tvec v_zero = Tvec(); + return v_select(denom == v_zero, v_zero, res); + } + static inline T1 r(T1 denom, const T2* scalar) + { return denom != (T1)0 ? c_div(*scalar, denom) : (T1)0; } +}; + +template<> +struct op_recip +{ +#if CV_SIMD_64F + static inline v_float64 r(const v_float64& a, const double* scalar) + { + const v_float64 v_scalar = vx_setall_f64(*scalar); + return v_scalar / a; + } + static inline v_float64 pre(const v_float64& denom, const v_float64& res) + { + const v_float64 v_zero = vx_setzero_f64(); + return v_select(denom == v_zero, v_zero, res); + } +#endif + static inline double r(double denom, const double* scalar) + { return denom != 0.0 ? 
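// op_add_weighted above computes alpha*a + beta*b + gamma as two chained fused
// multiply-adds, v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma)); the dispatcher
// above therefore forwards the whole scalars array (alpha, beta, gamma) rather
// than a single double. Scalar check with std::fma from <cmath>, using exactly
// representable values (hypothetical helper name):
static void add_weighted_check()
{
    float r = std::fma(100.f, 0.75f, std::fma(200.f, 0.25f, 8.f));  // 75 + 50 + 8
    CV_Assert(r == 133.f);
}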
c_div(*scalar, denom) : 0.0; } +}; + +//////////////////////////// Loops ///////////////////////////////// + +template +static void recip_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const double* scalar) +{ + float fscalar = (float)*scalar; + scalar_loop(src1, step1, dst, step, width, height, &fscalar); +} + +template<> +void recip_loop(const double* src1, size_t step1, double* dst, size_t step, int width, int height, const double* scalar) +{ + SCALAR_LOOP64F(src1, step1, dst, step, width, height, scalar); +} + +#endif // ARITHM_DEFINITIONS_ONLY + +////////////////////////////////////////////////////////////////////////// + +#undef SCALAR_ARGS +#define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, _T1* dst, size_t step, int width, int height + +#undef SCALAR_ARGS_PASS +#define SCALAR_ARGS_PASS src1, step1, dst, step, width, height + +#undef DISPATCH_SIMD_FUN +#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(const _T1*, size_t, SCALAR_ARGS(_T1), void* scalar) \ + { \ + CV_INSTRUMENT_REGION(); \ + CALL_HAL(fun, __CV_CAT(cv_hal_, fun), \ + SCALAR_ARGS_PASS, *(const double*)scalar) \ + ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), \ + SCALAR_ARGS_PASS, *(const double*)scalar) \ + CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar), \ + CV_CPU_DISPATCH_MODES_ALL); \ + } + +DEFINE_SIMD_ALL(recip, recip_loop) + +#ifndef ARITHM_DISPATCHING_ONLY + CV_CPU_OPTIMIZATION_NAMESPACE_END +#endif + +#ifndef SIMD_GUARD + #define SIMD_GUARD +#endif + +}} // cv::hal:: \ No newline at end of file diff --git a/modules/core/src/arithm_core.hpp b/modules/core/src/arithm_core.hpp deleted file mode 100644 index 99b564cf74..0000000000 --- a/modules/core/src/arithm_core.hpp +++ /dev/null @@ -1,623 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Copyright (C) 2015, Itseez Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_ARITHM_CORE_HPP__ -#define __OPENCV_ARITHM_CORE_HPP__ - -#include "arithm_simd.hpp" - -namespace cv { - -template struct OpAdd -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a + b); } -}; - -template struct OpSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a - b); } -}; - -template struct OpRSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(b - a); } -}; - -template struct OpMin -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::min(a, b); } -}; - -template struct OpMax -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::max(a, b); } -}; - -template struct OpAbsDiff -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()(T a, T b) const { return a > b ? a - b : b - a; } -}; - -// specializations to prevent "-0" results -template<> struct OpAbsDiff -{ - typedef float type1; - typedef float type2; - typedef float rtype; - float operator()(float a, float b) const { return std::abs(a - b); } -}; -template<> struct OpAbsDiff -{ - typedef double type1; - typedef double type2; - typedef double rtype; - double operator()(double a, double b) const { return std::abs(a - b); } -}; - -template struct OpAnd -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a & b; } -}; - -template struct OpOr -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a | b; } -}; - -template struct OpXor -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a ^ b; } -}; - -template struct OpNot -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T ) const { return ~a; } -}; - -//============================================================================= - -template -void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height) -{ -#if CV_SSE2 || CV_NEON - VOp vop; -#endif - Op op; - - for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = vop(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); 
- typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 16/sizeof(T)); - r0 = vop(r0, VLoadStore128::load(src2 + x )); - r1 = vop(r1, VLoadStore128::load(src2 + x + 16/sizeof(T))); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 16/sizeof(T), r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 - -#if CV_AVX2 - // nothing -#elif CV_SSE2 - if( USE_SSE2 ) - { - for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) ) - { - typename VLoadStore64::reg_type r = VLoadStore64::load(src1 + x); - r = vop(r, VLoadStore64::load(src2 + x)); - VLoadStore64::store(dst + x, r); - } - } -#endif - -#if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif - - for( ; x < width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - -template -void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height) -{ -#if CV_SSE2 || CV_NEON - Op32 op32; -#endif - Op op; - - for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op32(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op32(r1, VLoadStore128Aligned::load(src2 + x + 4)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 4, r1); - } - } - } -#endif // CV_AVX2 - -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = op32(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); - typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128::load(src2 + x )); - r1 = op32(r1, VLoadStore128::load(src2 + x + 4)); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 4, r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 - -#if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif - - for( ; x < width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - - -template -void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height) -{ -#if CV_SSE2 - Op64 op64; -#endif - Op op; - - for( ; 
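// The deleted vBinOp* templates above pair a scalar functor with hand-written
// VLoadStore64/128/256 wrappers per ISA; the new arithm.simd.hpp states each
// kernel once with universal intrinsics and leaves ISA selection to
// CV_CPU_DISPATCH. The same saturating u8 add in both styles (illustrative
// helpers; the SSE2 form needs <emmintrin.h>):
static void add_u8x16_sse2(const uchar* a, const uchar* b, uchar* dst)
{
    __m128i r = _mm_adds_epu8(_mm_loadu_si128((const __m128i*)a),
                              _mm_loadu_si128((const __m128i*)b));
    _mm_storeu_si128((__m128i*)dst, r);
}
static void add_u8_universal(const uchar* a, const uchar* b, uchar* dst)
{
    cv::v_store(dst, cv::vx_load(a) + cv::vx_load(b));  // '+' saturates for 8-bit lanes
}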
height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= width - 4; x += 4 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op64(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= width - 4; x += 4 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 2); - r0 = op64(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op64(r1, VLoadStore128Aligned::load(src2 + x + 2)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 2, r1); - } - } - } -#endif - - for( ; x <= width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } - - for( ; x < width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - -template static void -cmp_(const T* src1, size_t step1, const T* src2, size_t step2, - uchar* dst, size_t step, int width, int height, int code) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - Cmp_SIMD vop(code); - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = vop(src1, src2, dst, width); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] > src2[x]) ^ m; - t1 = -(src1[x+1] > src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] > src2[x+2]) ^ m; - t1 = -(src1[x+3] > src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] == src2[x]) ^ m; - t1 = -(src1[x+1] == src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] == src2[x+2]) ^ m; - t1 = -(src1[x+3] == src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -template static void -mul_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, WT scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Mul_SIMD vop; - - if( scale == (WT)1. 
) - { - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= width - 4; i += 4 ) - { - T t0; - T t1; - t0 = saturate_cast(src1[i ] * src2[i ]); - t1 = saturate_cast(src1[i+1] * src2[i+1]); - dst[i ] = t0; - dst[i+1] = t1; - - t0 = saturate_cast(src1[i+2] * src2[i+2]); - t1 = saturate_cast(src1[i+3] * src2[i+3]); - dst[i+2] = t0; - dst[i+3] = t1; - } - #endif - for( ; i < width; i++ ) - dst[i] = saturate_cast(src1[i] * src2[i]); - } - } - else - { - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= width - 4; i += 4 ) - { - T t0 = saturate_cast(scale*(WT)src1[i]*src2[i]); - T t1 = saturate_cast(scale*(WT)src1[i+1]*src2[i+1]); - dst[i] = t0; dst[i+1] = t1; - - t0 = saturate_cast(scale*(WT)src1[i+2]*src2[i+2]); - t1 = saturate_cast(scale*(WT)src1[i+3]*src2[i+3]); - dst[i+2] = t0; dst[i+3] = t1; - } - #endif - for( ; i < width; i++ ) - dst[i] = saturate_cast(scale*(WT)src1[i]*src2[i]); - } - } -} - - -template static void -div_i( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - float scale_f = (float)scale; - - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - for( ; i < width; i++ ) - { - T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; - } - } -} - -template static void -div_f( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - T scale_f = (T)scale; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - for( ; i < width; i++ ) - { - T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; - } - } -} - -template static void -recip_i( const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - float scale_f = (float)scale; - - for( ; height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, width, scale); - for( ; i < width; i++ ) - { - T denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; - } - } -} - -template static void -recip_f( const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - T scale_f = (T)scale; - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - - for( ; height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, width, scale); - for( ; i < width; i++ ) - { - T denom = src2[i]; - dst[i] = denom != 0 ? 
saturate_cast(scale_f/denom) : (T)0; - } - } -} - -template static void -addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, void* _scalars ) -{ - const double* scalars = (const double*)_scalars; - WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2]; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - AddWeighted_SIMD vop; - - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = vop(src1, src2, dst, width, alpha, beta, gamma); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - T t0 = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - T t1 = saturate_cast(src1[x+1]*alpha + src2[x+1]*beta + gamma); - dst[x] = t0; dst[x+1] = t1; - - t0 = saturate_cast(src1[x+2]*alpha + src2[x+2]*beta + gamma); - t1 = saturate_cast(src1[x+3]*alpha + src2[x+3]*beta + gamma); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - for( ; x < width; x++ ) - dst[x] = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - } -} - -} // cv:: - - -#endif // __OPENCV_ARITHM_CORE_HPP__ diff --git a/modules/core/src/arithm_ipp.hpp b/modules/core/src/arithm_ipp.hpp new file mode 100644 index 0000000000..4aa7d006e4 --- /dev/null +++ b/modules/core/src/arithm_ipp.hpp @@ -0,0 +1,417 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html +#if ARITHM_USE_IPP + +namespace cv { namespace hal { + +//======================================= +// Arithmetic and logical operations +// +, -, *, /, &, |, ^, ~, abs ... +//======================================= + +#define ARITHM_IPP_BIN(fun, ...) \ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + if (height == 1) \ + step1 = step2 = step = width * sizeof(dst[0]); \ + if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +//======================================= +// Addition +//======================================= + +inline int arithm_ipp_add8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_8u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_add16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_16u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_add16s(const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_16s_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_add32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_add8s(...) 0 +#define arithm_ipp_add32s(...) 0 +#define arithm_ipp_add64f(...) 
0 + +//======================================= +// Subtract +//======================================= + +inline int arithm_ipp_sub8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_8u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_sub16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_16u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_sub16s(const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_16s_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_sub32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_32f_C1R, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_sub8s(...) 0 +#define arithm_ipp_sub32s(...) 0 +#define arithm_ipp_sub64f(...) 0 + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#define ARITHM_IPP_MIN_MAX(fun, type) \ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + type* s1 = (type*)src1; \ + type* s2 = (type*)src2; \ + type* d = dst; \ + if (height == 1) \ + step1 = step2 = step = width * sizeof(dst[0]); \ + int i = 0; \ + for(; i < height; i++) \ + { \ + if (0 > CV_INSTRUMENT_FUN_IPP(fun, s1, s2, d, width)) \ + break; \ + s1 = (type*)((uchar*)s1 + step1); \ + s2 = (type*)((uchar*)s2 + step2); \ + d = (type*)((uchar*)d + step); \ + } \ + if (i == height) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +//======================================= +// Max +//======================================= + +inline int arithm_ipp_max8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_8u, uchar); +} + +inline int arithm_ipp_max16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_16u, ushort); +} + +inline int arithm_ipp_max32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_32f, float); +} + +inline int arithm_ipp_max64f(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_64f, double); +} + +#define arithm_ipp_max8s(...) 0 +#define arithm_ipp_max16s(...) 0 +#define arithm_ipp_max32s(...) 
0 + +//======================================= +// Min +//======================================= + +inline int arithm_ipp_min8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_8u, uchar); +} + +inline int arithm_ipp_min16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_16u, ushort); +} + +inline int arithm_ipp_min32f(const float* src1, size_t step1, const float* src2,size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_32f, float); +} + +inline int arithm_ipp_min64f(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_64f, double); +} + +#define arithm_ipp_min8s(...) 0 +#define arithm_ipp_min16s(...) 0 +#define arithm_ipp_min32s(...) 0 + +//======================================= +// AbsDiff +//======================================= + +inline int arithm_ipp_absdiff8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAbsDiff_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_absdiff16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAbsDiff_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_absdiff32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAbsDiff_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} +#define arithm_ipp_absdiff8s(...) 0 +#define arithm_ipp_absdiff16s(...) 0 +#define arithm_ipp_absdiff32s(...) 0 +#define arithm_ipp_absdiff64f(...) 0 + +//======================================= +// Logical +//======================================= + +inline int arithm_ipp_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAnd_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiOr_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiXor_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_not8u(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height) +{ + if (!CV_IPP_CHECK_COND) + return 0; + if (height == 1) + step1 = step = width * sizeof(dst[0]); + if (0 <= CV_INSTRUMENT_FUN_IPP(ippiNot_8u_C1R, src1, (int)step1, dst, (int)step, ippiSize(width, height))) + { + CV_IMPL_ADD(CV_IMPL_IPP); + return 1; + } + setIppErrorStatus(); + return 0; +} + +//======================================= +// Compare +//======================================= + +#define ARITHM_IPP_CMP(fun, ...) 
\ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + IppCmpOp op = arithm_ipp_convert_cmp(cmpop); \ + if (op < 0) \ + return 0; \ + if (height == 1) \ + step1 = step2 = step = width * sizeof(dst[0]); \ + if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__, op)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +inline IppCmpOp arithm_ipp_convert_cmp(int cmpop) +{ + switch(cmpop) + { + case CMP_EQ: return ippCmpEq; + case CMP_GT: return ippCmpGreater; + case CMP_GE: return ippCmpGreaterEq; + case CMP_LT: return ippCmpLess; + case CMP_LE: return ippCmpLessEq; + default: return (IppCmpOp)-1; + } +} + +inline int arithm_ipp_cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_16s_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_cmp8s(...) 0 +#define arithm_ipp_cmp32s(...) 0 +#define arithm_ipp_cmp64f(...) 0 + +//======================================= +// Multiply +//======================================= + +#define ARITHM_IPP_MUL(fun, ...) \ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + float fscale = (float)scale; \ + if (std::fabs(fscale - 1) > FLT_EPSILON) \ + return 0; \ + if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +inline int arithm_ipp_mul8u(const uchar *src1, size_t step1, const uchar *src2, size_t step2, + uchar *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_8u_C1RSfs, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height), 0); +} +inline int arithm_ipp_mul16u(const ushort *src1, size_t step1, const ushort *src2, size_t step2, + ushort *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_16u_C1RSfs, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_mul16s(const short *src1, size_t step1, const short *src2, size_t step2, + short *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_16s_C1RSfs, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_mul32f(const float *src1, size_t step1, const float *src2, size_t step2, + float *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_32f_C1R, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_mul8s(...) 0 +#define arithm_ipp_mul32s(...) 
0 +#define arithm_ipp_mul64f(...) 0 + +//======================================= +// Div +//======================================= + +#define arithm_ipp_div8u(...) 0 +#define arithm_ipp_div8s(...) 0 +#define arithm_ipp_div16u(...) 0 +#define arithm_ipp_div16s(...) 0 +#define arithm_ipp_div32s(...) 0 +#define arithm_ipp_div32f(...) 0 +#define arithm_ipp_div64f(...) 0 + +//======================================= +// AddWeighted +//======================================= + +#define arithm_ipp_addWeighted8u(...) 0 +#define arithm_ipp_addWeighted8s(...) 0 +#define arithm_ipp_addWeighted16u(...) 0 +#define arithm_ipp_addWeighted16s(...) 0 +#define arithm_ipp_addWeighted32s(...) 0 +#define arithm_ipp_addWeighted32f(...) 0 +#define arithm_ipp_addWeighted64f(...) 0 + +//======================================= +// Reciprocal +//======================================= + +#define arithm_ipp_recip8u(...) 0 +#define arithm_ipp_recip8s(...) 0 +#define arithm_ipp_recip16u(...) 0 +#define arithm_ipp_recip16s(...) 0 +#define arithm_ipp_recip32s(...) 0 +#define arithm_ipp_recip32f(...) 0 +#define arithm_ipp_recip64f(...) 0 + +/** empty block in case you have "fun" +#define arithm_ipp_8u(...) 0 +#define arithm_ipp_8s(...) 0 +#define arithm_ipp_16u(...) 0 +#define arithm_ipp_16s(...) 0 +#define arithm_ipp_32s(...) 0 +#define arithm_ipp_32f(...) 0 +#define arithm_ipp_64f(...) 0 +**/ + +}} // cv::hal:: + +#define ARITHM_CALL_IPP(fun, ...) \ +{ \ + if (__CV_EXPAND(fun(__VA_ARGS__))) \ + return; \ +} + +#endif // ARITHM_USE_IPP + + +#if !ARITHM_USE_IPP +#define ARITHM_CALL_IPP(...) +#endif \ No newline at end of file diff --git a/modules/core/src/arithm_simd.hpp b/modules/core/src/arithm_simd.hpp deleted file mode 100644 index 5a37b4c200..0000000000 --- a/modules/core/src/arithm_simd.hpp +++ /dev/null @@ -1,2025 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Copyright (C) 2015, Itseez Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_ARITHM_SIMD_HPP__ -#define __OPENCV_ARITHM_SIMD_HPP__ - -namespace cv { - -struct NOP {}; - -#if CV_SSE2 || CV_NEON -#define IF_SIMD(op) op -#else -#define IF_SIMD(op) NOP -#endif - - -#if CV_SSE2 || CV_NEON - -#define FUNCTOR_TEMPLATE(name) \ - template struct name {} - -FUNCTOR_TEMPLATE(VLoadStore128); -#if CV_SSE2 -FUNCTOR_TEMPLATE(VLoadStore64); -FUNCTOR_TEMPLATE(VLoadStore128Aligned); -#if CV_AVX2 -FUNCTOR_TEMPLATE(VLoadStore256); -FUNCTOR_TEMPLATE(VLoadStore256Aligned); -#endif -#endif - -#endif - -#if CV_AVX2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ - } - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & b) const \ - { \ - body; \ - } \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & ) const \ - { \ - body; \ - } \ - } - -FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return 
_mm256_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); - - -static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, - 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, - 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m256i d = _mm256_subs_epi8(a, b); - __m256i m = _mm256_cmpgt_epi8(b, a); - return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m256i M = _mm256_max_epi16(a, b); - __m256i m = _mm256_min_epi16(a, b); - return _mm256_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m256i d = _mm256_sub_epi32(a, b); - __m256i m = _mm256_cmpgt_epi32(b, a); - return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); - -#elif CV_SSE2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static 
reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ - } - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const VLoadStore128::reg_type & b) const \ - { \ - body; \ - } \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const VLoadStore128::reg_type & ) const \ - { \ - body; \ - } \ - } - -FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); - -FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, - __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); -FUNCTOR_CLOSURE_2arg(VMin, short, return 
_mm_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, - __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b)); - - -static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m128i d = _mm_subs_epi8(a, b); - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_subs_epi8(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m128i M = _mm_max_epi16(a, b); - __m128i m = _mm_min_epi16(a, b); - return _mm_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m128i d = _mm_sub_epi32(a, b); - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_sub_epi32(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); -#endif - -#if CV_NEON - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p);}; \ - static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type b) const \ - { \ - return body; \ - }; \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type ) const \ - { \ - return body; \ - }; \ - } - -FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); -FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); -FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16); -FUNCTOR_LOADSTORE(VLoadStore128, 
short, int16x8_t, vld1q_s16, vst1q_s16); -FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32); -FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b)); - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b))); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b))); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a )); -#endif - - -template -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int) - { - } - - int operator () (const T *, const T *, uchar *, int) const - { - return 0; - } -}; - -#if CV_NEON - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code == CMP_NE); - - v_mask = vdupq_n_u8(255); - } - - int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_LE) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_EQ) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_NE) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); - - return x; - } - - int code; - uint8x16_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code 
== CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const int * src1, const int * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); - vst1_u8(dst + x, veor_u8(v_dst, v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const float * src1, const float * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == 
CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); - vst1_u8(dst + x, veor_u8(v_dst, v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -#elif CV_SSE2 - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code == CMP_NE); - - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - - v_mask = _mm_set1_epi8(-1); - } - - int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const - { - int x = 0; - - if (!haveSSE) - return x; - - if (code == CMP_GT) - for ( ; x <= width - 16; x += 16) - _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x)))); - else if (code == CMP_LE) - for ( ; x <= width - 16; x += 16) - { - __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 16; x += 16) - _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x)))); - else if (code == CMP_NE) - for ( ; x <= width - 16; x += 16) - { - __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); - } - - return x; - } - - int code; - __m128i v_mask; - bool haveSSE; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - // CV_Assert(code == CMP_GT || code == CMP_LE || - // code == CMP_EQ || code == CMP_NE); - - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - - v_mask = _mm_set1_epi32(0xffffffff); - } - - int operator () (const int * src1, const int * src2, uchar * dst, int width) const - { - int x = 0; - - if (!haveSSE) - return x; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = 
_mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); - } - - return x; - } - - int code; - __m128i v_mask; - bool haveSSE; -}; - -#endif - - -template -struct Mul_SIMD -{ - int operator() (const T *, const T *, T *, int, WT) const - { - return 0; - } -}; - -#if CV_NEON - -template <> -struct Mul_SIMD -{ - int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); - uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); - uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); - int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); - int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - 
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - float32x4_t v_dst2 = 
vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1q_f32(dst + x, v_dst1); - vst1q_f32(dst + x + 4, v_dst2); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - v_dst1 = vmulq_f32(v_dst1, v_scale); - - float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - vst1q_f32(dst + x, v_dst1); - vst1q_f32(dst + x + 4, v_dst2); - } - } - - return x; - } -}; - -#elif CV_SSE2 - -#if CV_SSE4_1 - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); - } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale != 1.0f ) - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), - _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), - _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storeu_si128((__m128i *)(dst + x), v_dsti); - } - } - - return x; - } - - bool haveSSE; -}; - -#endif - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); - - v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); - } - else - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); - - v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - 
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); - } - } - - return x; - } - - bool haveSSE; -}; - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale != 1.0f ) - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storeu_si128((__m128i *)(dst + x), v_dsti); - } - } - - return x; - } - - bool haveSSE; -}; - -#endif - -template -struct Div_SIMD -{ - int operator() (const T *, const T *, T *, int, double) const - { - return 0; - } -}; - -template -struct Recip_SIMD -{ - int operator() (const T *, T *, int, double) const - { - return 0; - } -}; - - -#if CV_SIMD128 - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src1 = v_load_expand(src1 + x); - v_uint16x8 v_src2 = v_load_expand(src2 + x); - - v_uint32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); - v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src1 = v_load_expand(src1 + x); - v_int16x8 v_src2 = v_load_expand(src2 + x); - - v_int32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = 
v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src1 = v_load(src1 + x); - v_uint16x8 v_src2 = v_load(src2 + x); - - v_uint32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); - v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src1 = v_load(src1 + x); - v_int16x8 v_src2 = v_load(src2 + x); - - v_int32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int32x4 v_zero = v_setzero_s32(); - - for ( ; x <= width - 8; x += 8) - { - v_int32x4 t0 = v_load(src1 + x); - v_int32x4 t1 = v_load(src1 + x + 4); - v_int32x4 t2 = v_load(src2 + x); - v_int32x4 t3 = v_load(src2 + x + 4); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 res0 = v_round(f0), res1 = v_round(f1); - - res0 = v_select(t2 == v_zero, v_zero, res0); - res1 = v_select(t3 == v_zero, v_zero, res1); - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); - - for ( ; x <= width - 8; x += 8) - { - v_float32x4 f0 = v_load(src1 + x); - v_float32x4 f1 = v_load(src1 + x + 4); - v_float32x4 f2 = v_load(src2 + 
x); - v_float32x4 f3 = v_load(src2 + x + 4); - - v_float32x4 res0 = f0 * v_scale / f2; - v_float32x4 res1 = f1 * v_scale / f3; - - res0 = v_select(f2 == v_zero, v_zero, res0); - res1 = v_select(f3 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -///////////////////////// RECIPROCAL ////////////////////// - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const uchar * src2, uchar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src2 = v_load_expand(src2 + x); - - v_uint32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const schar * src2, schar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src2 = v_load_expand(src2 + x); - - v_int32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const ushort * src2, ushort * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src2 = v_load(src2 + x); - - v_uint32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const short * src2, short * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src2 = v_load(src2 + x); - - v_int32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool 
haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const int * src2, int * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int32x4 v_zero = v_setzero_s32(); - - for ( ; x <= width - 8; x += 8) - { - v_int32x4 t0 = v_load(src2 + x); - v_int32x4 t1 = v_load(src2 + x + 4); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 res0 = v_round(f0), res1 = v_round(f1); - - res0 = v_select(t0 == v_zero, v_zero, res0); - res1 = v_select(t1 == v_zero, v_zero, res1); - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const float * src2, float * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); - - for ( ; x <= width - 8; x += 8) - { - v_float32x4 f0 = v_load(src2 + x); - v_float32x4 f1 = v_load(src2 + x + 4); - - v_float32x4 res0 = v_scale / f0; - v_float32x4 res1 = v_scale / f1; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - -#if CV_SIMD128_64F - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); - - for ( ; x <= width - 4; x += 4) - { - v_float64x2 f0 = v_load(src1 + x); - v_float64x2 f1 = v_load(src1 + x + 2); - v_float64x2 f2 = v_load(src2 + x); - v_float64x2 f3 = v_load(src2 + x + 2); - - v_float64x2 res0 = f0 * v_scale / f2; - v_float64x2 res1 = f1 * v_scale / f3; - - res0 = v_select(f2 == v_zero, v_zero, res0); - res1 = v_select(f3 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 2, res1); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = hasSIMD128(); } - - int operator() (const double * src2, double * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); - - for ( ; x <= width - 4; x += 4) - { - v_float64x2 f0 = v_load(src2 + x); - v_float64x2 f1 = v_load(src2 + x + 2); - - v_float64x2 res0 = v_scale / f0; - v_float64x2 res1 = v_scale / f1; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 2, res1); - } - - return x; - } -}; - -#endif - -#endif - - -template -struct AddWeighted_SIMD -{ - int operator() (const T *, const T *, T *, int, WT, WT, WT) const - { - return 0; - } -}; - -#if CV_SSE2 - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x 
<= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - - __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta)); - - __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1)); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero)); - } - - return x; - } - - bool haveSSE2; -}; - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta)); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1))); - } - - return x; - } - - bool haveSSE2; -}; - -#if CV_SSE4_1 - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); - } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE4_1) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta)); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1))); - 
} - - return x; - } - - bool haveSSE4_1; -}; - -#endif - -#elif CV_NEON - -template <> -struct AddWeighted_SIMD -{ - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32 (gamma); - - for( ; x <= width - 8; x += 8 ) - { - int8x8_t in1 = vld1_s8(src1 + x); - int16x8_t in1_16 = vmovl_s8(in1); - float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); - float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); - - int8x8_t in2 = vld1_s8(src2+x); - int16x8_t in2_16 = vmovl_s8(in2); - float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); - float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); - - float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); - float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); - out_f_l = vaddq_f32(out_f_l, g); - out_f_h = vaddq_f32(out_f_h, g); - - int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); - int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); - - int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); - int8x8_t out = vqmovn_s16(out_16); - - vst1_s8(dst + x, out); - } - - return x; - } -}; - -template <> -struct AddWeighted_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32(gamma); - - for( ; x <= width - 8; x += 8 ) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); - float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); - uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); - v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); - uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); - } - - return x; - } -}; - -template <> -struct AddWeighted_SIMD -{ - int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32(gamma); - - for( ; x <= width - 8; x += 8 ) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); - float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); - int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); - v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); - int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); - } - - return x; - } -}; - -#endif - -} - -#endif // __OPENCV_ARITHM_SIMD_HPP__ diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 6b3b23cddb..0d7035f1bd 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -86,7 +86,6 @@ #include "opencv2/core/sse_utils.hpp" #include "opencv2/core/neon_utils.hpp" #include "opencv2/core/vsx_utils.hpp" -#include 
"arithm_core.hpp" #include "hal_replacement.hpp" #ifdef HAVE_TEGRA_OPTIMIZATION @@ -110,6 +109,102 @@ extern const uchar g_Saturate8u[]; #define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) #define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a))) +template struct OpAdd +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a + b); } +}; + +template struct OpSub +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a - b); } +}; + +template struct OpRSub +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(b - a); } +}; + +template struct OpMin +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::min(a, b); } +}; + +template struct OpMax +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::max(a, b); } +}; + +template struct OpAbsDiff +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()(T a, T b) const { return a > b ? a - b : b - a; } +}; + +// specializations to prevent "-0" results +template<> struct OpAbsDiff +{ + typedef float type1; + typedef float type2; + typedef float rtype; + float operator()(float a, float b) const { return std::abs(a - b); } +}; +template<> struct OpAbsDiff +{ + typedef double type1; + typedef double type2; + typedef double rtype; + double operator()(double a, double b) const { return std::abs(a - b); } +}; + +template struct OpAnd +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a & b; } +}; + +template struct OpOr +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a | b; } +}; + +template struct OpXor +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a ^ b; } +}; + +template struct OpNot +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T ) const { return ~a; } +}; + template<> inline uchar OpAdd::operator ()(uchar a, uchar b) const { return CV_FAST_CAST_8U(a + b); } diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 40d282b1c2..b28929c582 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -119,11 +119,15 @@ template struct Data d[i] += (LaneType)m; return *this; } - void fill(LaneType val) + void fill(LaneType val, int s, int c = R::nlanes) { - for (int i = 0; i < R::nlanes; ++i) + for (int i = s; i < c; ++i) d[i] = val; } + void fill(LaneType val) + { + fill(val, 0); + } void reverse() { for (int i = 0; i < R::nlanes / 2; ++i) @@ -739,6 +743,23 @@ template struct TheTest return *this; } + TheTest & test_absdiffs() + { + Data dataA(std::numeric_limits::max()), + dataB(std::numeric_limits::min()); + dataA[0] = (LaneType)-1; + dataB[0] = 1; + dataA[1] = 2; + dataB[1] = (LaneType)-2; + R a = dataA, b = dataB; + Data resC = v_absdiffs(a, b); + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(saturate_cast(std::abs(dataA[i] - dataB[i])), resC[i]); + } + return *this; + } + TheTest & test_reduce() { Data dataA; @@ -874,6 +895,81 @@ template struct TheTest return *this; } + // v_uint8 only + TheTest & test_pack_b() + { + // 16-bit + Data dataA, dataB; + dataB.fill(0, R::nlanes / 2); + + R a = 
dataA, b = dataB; + Data maskA = a == b, maskB = a != b; + + a = maskA; b = maskB; + Data res = v_pack_b(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); + for (int i = 0; i < v_uint16::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(maskA[i * 2], res[i]); + EXPECT_EQ(maskB[i * 2], res[i + v_uint16::nlanes]); + } + + // 32-bit + Data dataC, dataD; + dataD.fill(0, R::nlanes / 2); + + R c = dataC, d = dataD; + Data maskC = c == d, maskD = c != d; + + c = maskC; d = maskD; + res = v_pack_b + ( + v_reinterpret_as_u32(a), v_reinterpret_as_u32(b), + v_reinterpret_as_u32(c), v_reinterpret_as_u32(d) + ); + + for (int i = 0; i < v_uint32::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(maskA[i * 4], res[i]); + EXPECT_EQ(maskB[i * 4], res[i + v_uint32::nlanes]); + EXPECT_EQ(maskC[i * 4], res[i + v_uint32::nlanes * 2]); + EXPECT_EQ(maskD[i * 4], res[i + v_uint32::nlanes * 3]); + } + + // 64-bit + Data dataE, dataF, dataG(0), dataH(0xFF); + dataF.fill(0, R::nlanes / 2); + + R e = dataE, f = dataF, g = dataG, h = dataH; + Data maskE = e == f, maskF = e != f; + + e = maskE; f = maskF; + res = v_pack_b + ( + v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), + v_reinterpret_as_u64(c), v_reinterpret_as_u64(d), + v_reinterpret_as_u64(e), v_reinterpret_as_u64(f), + v_reinterpret_as_u64(g), v_reinterpret_as_u64(h) + ); + + for (int i = 0; i < v_uint64::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(maskA[i * 8], res[i]); + EXPECT_EQ(maskB[i * 8], res[i + v_uint64::nlanes]); + EXPECT_EQ(maskC[i * 8], res[i + v_uint64::nlanes * 2]); + EXPECT_EQ(maskD[i * 8], res[i + v_uint64::nlanes * 3]); + + EXPECT_EQ(maskE[i * 8], res[i + v_uint64::nlanes * 4]); + EXPECT_EQ(maskF[i * 8], res[i + v_uint64::nlanes * 5]); + EXPECT_EQ(dataG[i * 8], res[i + v_uint64::nlanes * 6]); + EXPECT_EQ(dataH[i * 8], res[i + v_uint64::nlanes * 7]); + } + + return *this; + } + TheTest & test_unpack() { Data dataA, dataB; @@ -1228,6 +1324,7 @@ void test_hal_intrin_uint8() .test_popcount() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>() + .test_pack_b() .test_unpack() .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() @@ -1259,6 +1356,7 @@ void test_hal_intrin_int8() .test_logic() .test_min_max() .test_absdiff() + .test_absdiffs() .test_abs() .test_mask() .test_popcount() @@ -1317,6 +1415,7 @@ void test_hal_intrin_int16() .test_logic() .test_min_max() .test_absdiff() + .test_absdiffs() .test_abs() .test_reduce() .test_mask()
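
The Div_SIMD and Recip_SIMD specializations removed earlier in this diff all repeat one pattern: widen the inputs, convert to float, apply the scale, divide, force zero wherever the divisor is zero, then round and narrow back. Below is a minimal sketch of that pattern written against the 128-bit universal intrinsics used elsewhere in this patch; div_scale_s16 is a hypothetical helper name (not an OpenCV API), and handling of the scalar tail is assumed to live in the caller.

// Hypothetical sketch (not part of the patch): zero-guarded scaled division
// for 16-bit signed data, mirroring the removed Div_SIMD<short> code path.
#include "opencv2/core/hal/intrin.hpp"

static int div_scale_s16(const short* src1, const short* src2, short* dst,
                         int width, double scale)
{
    int x = 0;
#if CV_SIMD128
    v_float32x4 v_scale = v_setall_f32((float)scale);
    v_int16x8 v_zero = v_setzero_s16();

    for( ; x <= width - 8; x += 8 )
    {
        v_int16x8 a = v_load(src1 + x), b = v_load(src2 + x);

        // widen the 16-bit lanes to 32-bit and convert to float
        v_int32x4 a0, a1, b0, b1;
        v_expand(a, a0, a1);
        v_expand(b, b0, b1);

        v_float32x4 f0 = v_cvt_f32(a0) * v_scale / v_cvt_f32(b0);
        v_float32x4 f1 = v_cvt_f32(a1) * v_scale / v_cvt_f32(b1);

        // round, narrow with saturation, and return 0 wherever the divisor is 0
        v_int16x8 res = v_pack(v_round(f0), v_round(f1));
        res = v_select(b == v_zero, v_zero, res);
        v_store(dst + x, res);
    }
#endif
    return x;  // the caller finishes the remaining width - x elements with scalar code
}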