mirror of https://github.com/opencv/opencv.git
- initialize arithmetic dispatcher - add new universal intrinsic v_absdiffs - add new universal intrinsic v_pack_b - add accumulate version of universal intrinsic v_round - fix sse/avx2:uint8 multiplication overflow - reimplement arithmetic, logic and comparison operations into wide universal intrinsics with full support for all types - reimplement IPP arithmetic, logic and comparison operations in a separate file arithm_ipp.hpp - avoid scalar multiplication if scaling factor eq 1 and use integer multiplication - move C arithmetic operations to precomp.hpp and delete [arithm_simd|arithm_core].hpp - add compatibility with new opencv4 divide policy (pull/12064/head)
parent
d61ad04f11
commit
93ffebc273
14 changed files with 2896 additions and 3702 deletions
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,11 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "arithm_ipp.hpp" |
||||
#include "arithm.simd.hpp" |
||||
#include "arithm.simd_declarations.hpp" |
||||
|
||||
#define ARITHM_DISPATCHING_ONLY |
||||
#include "arithm.simd.hpp" |
File diff suppressed because it is too large
Load Diff
@ -1,623 +0,0 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_ARITHM_CORE_HPP__ |
||||
#define __OPENCV_ARITHM_CORE_HPP__ |
||||
|
||||
#include "arithm_simd.hpp" |
||||
|
||||
namespace cv { |
||||
|
||||
// Saturated element-wise addition functor: dst = saturate_cast<T3>(a + b).
// type1/type2/rtype typedefs are part of the functor contract used by the
// dispatch templates below.
template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
};

// Saturated element-wise subtraction functor: dst = saturate_cast<T3>(a - b).
template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
};

// Reversed saturated subtraction functor: dst = saturate_cast<T3>(b - a).
// Used to implement "subtract from scalar"-style operations without a
// separate code path.
template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
};
||||
|
||||
// Element-wise minimum functor: returns the smaller of the two operands.
template<typename T> struct OpMin
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator ()(const T x, const T y) const
    {
        return std::min(x, y);
    }
};

// Element-wise maximum functor: returns the larger of the two operands.
template<typename T> struct OpMax
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator ()(const T x, const T y) const
    {
        return std::max(x, y);
    }
};
||||
|
||||
// Absolute-difference functor: |a - b|. The branch keeps the subtraction
// in "larger minus smaller" order, so unsigned types never wrap.
template<typename T> struct OpAbsDiff
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()(T x, T y) const
    {
        return x < y ? y - x : x - y;
    }
};

// Floating-point specializations use std::abs so the result can never
// come out as "-0".
template<> struct OpAbsDiff<float>
{
    typedef float type1;
    typedef float type2;
    typedef float rtype;
    float operator()(float x, float y) const
    {
        return std::abs(x - y);
    }
};

template<> struct OpAbsDiff<double>
{
    typedef double type1;
    typedef double type2;
    typedef double rtype;
    double operator()(double x, double y) const
    {
        return std::abs(x - y);
    }
};
||||
|
||||
// Bitwise AND functor.
template<typename T> struct OpAnd
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T x, T y ) const
    {
        return x & y;
    }
};

// Bitwise OR functor.
template<typename T> struct OpOr
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T x, T y ) const
    {
        return x | y;
    }
};

// Bitwise XOR functor.
template<typename T> struct OpXor
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T x, T y ) const
    {
        return x ^ y;
    }
};

// Bitwise NOT functor. The second operand is ignored; it exists only so
// the functor fits the common binary-operation signature used by the
// dispatch templates.
template<typename T> struct OpNot
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T x, T ) const
    {
        return ~x;
    }
};
||||
|
||||
//=============================================================================
|
||||
|
||||
// Generic element-wise binary operation over two 2D arrays.
// Op is the scalar functor (e.g. OpAdd<T>); VOp is its SIMD counterpart.
// step1/step2/step are row strides in BYTES; width is in elements.
// Each row goes through up to four passes: a 32-byte vector loop
// (one AVX2 register or a pair of SSE2/NEON registers), a 64-bit SSE2
// tail loop, a 4x-unrolled scalar loop, and a final scalar remainder.
template<typename T, class Op, class VOp>
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height)
{
#if CV_SSE2 || CV_NEON
    VOp vop;
#endif
    Op op;

    // Strides are in bytes, so rows are advanced via uchar* arithmetic.
    for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
                     src2 = (const T *)((const uchar *)src2 + step2),
                     dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_NEON || CV_SSE2
#if CV_AVX2
        if( USE_AVX2 )
        {
            // One 256-bit register (32 bytes) per iteration.
            for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
            {
                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
                r0 = vop(r0, VLoadStore256<T>::load(src2 + x));
                VLoadStore256<T>::store(dst + x, r0);
            }
        }
#else
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif // CV_SSE2
        // Two 128-bit registers (32 bytes total) per iteration.
        for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
        {
            typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
            typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
            r0 = vop(r0, VLoadStore128<T>::load(src2 + x ));
            r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
            VLoadStore128<T>::store(dst + x , r0);
            VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
        }
#if CV_SSE2
        }
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2

#if CV_AVX2
        // nothing
#elif CV_SSE2
        // 64-bit vector tail for elements the 32-byte loop left over.
        if( USE_SSE2 )
        {
            for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) )
            {
                typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
                r = vop(r, VLoadStore64<T>::load(src2 + x));
                VLoadStore64<T>::store(dst + x, r);
            }
        }
#endif

#if CV_ENABLE_UNROLLED
        // 4x manually unrolled scalar loop.
        for( ; x <= width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        // Scalar remainder.
        for( ; x < width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}
||||
|
||||
// Element-wise binary operation for 32-bit element types (int/float).
// Same layered structure as vBinOp, with one addition: when all three
// pointers are suitably aligned (32-byte for AVX2, 16-byte for SSE2),
// an aligned-load/store fast path runs first.
template<typename T, class Op, class Op32>
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, int width, int height)
{
#if CV_SSE2 || CV_NEON
    Op32 op32;
#endif
    Op op;

    // Strides are in bytes; advance rows via uchar* arithmetic.
    for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
                     src2 = (const T *)((const uchar *)src2 + step2),
                     dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_AVX2
        if( USE_AVX2 )
        {
            // Aligned 256-bit path: 8 x 32-bit lanes per iteration.
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
            {
                for( ; x <= width - 8; x += 8 )
                {
                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
                    r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x));
                    VLoadStore256Aligned<T>::store(dst + x, r0);
                }
            }
        }
#elif CV_SSE2
        if( USE_SSE2 )
        {
            // Aligned 128-bit path: two registers (8 lanes) per iteration.
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; x <= width - 8; x += 8 )
                {
                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
                    r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x ));
                    r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
                    VLoadStore128Aligned<T>::store(dst + x , r0);
                    VLoadStore128Aligned<T>::store(dst + x + 4, r1);
                }
            }
        }
#endif // CV_AVX2

#if CV_NEON || CV_SSE2
#if CV_AVX2
        if( USE_AVX2 )
        {
            // Unaligned 256-bit path (also handles the not-aligned case above).
            for( ; x <= width - 8; x += 8 )
            {
                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
                r0 = op32(r0, VLoadStore256<T>::load(src2 + x));
                VLoadStore256<T>::store(dst + x, r0);
            }
        }
#else
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif // CV_SSE2
        // Unaligned 128-bit path, two registers per iteration.
        for( ; x <= width - 8; x += 8 )
        {
            typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
            typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
            r0 = op32(r0, VLoadStore128<T>::load(src2 + x ));
            r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
            VLoadStore128<T>::store(dst + x , r0);
            VLoadStore128<T>::store(dst + x + 4, r1);
        }
#if CV_SSE2
        }
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2

#if CV_ENABLE_UNROLLED
        // 4x manually unrolled scalar loop.
        for( ; x <= width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        // Scalar remainder.
        for( ; x < width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}
||||
|
||||
|
||||
// Element-wise binary operation for 64-bit element types (double).
// Only the aligned SIMD paths exist here (AVX2 or SSE2); everything
// else falls through to the unrolled and plain scalar loops.
template<typename T, class Op, class Op64>
void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, int width, int height)
{
#if CV_SSE2
    Op64 op64;
#endif
    Op op;

    // Strides are in bytes; advance rows via uchar* arithmetic.
    for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
                     src2 = (const T *)((const uchar *)src2 + step2),
                     dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_AVX2
        if( USE_AVX2 )
        {
            // Aligned 256-bit path: 4 x 64-bit lanes per iteration.
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
            {
                for( ; x <= width - 4; x += 4 )
                {
                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
                    r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x));
                    VLoadStore256Aligned<T>::store(dst + x, r0);
                }
            }
        }
#elif CV_SSE2
        if( USE_SSE2 )
        {
            // Aligned 128-bit path: two registers (4 lanes) per iteration.
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; x <= width - 4; x += 4 )
                {
                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
                    r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x ));
                    r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
                    VLoadStore128Aligned<T>::store(dst + x , r0);
                    VLoadStore128Aligned<T>::store(dst + x + 2, r1);
                }
            }
        }
#endif

        // 4x unrolled scalar loop (unconditional here, unlike vBinOp/vBinOp32).
        for( ; x <= width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }

        // Scalar remainder.
        for( ; x < width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}
||||
|
||||
// Element-wise comparison producing an 8-bit mask (255 where the
// predicate holds, 0 otherwise).
// The four ordering predicates are reduced to two kernels:
//   CMP_GE/CMP_LT are rewritten as CMP_LE/CMP_GT with swapped sources,
//   then GT and LE share one loop (m == 0 keeps the mask, m == 255
//   inverts it); EQ and NE share the other the same way.
// -(cond) yields 0 or -1 (0xFF as uchar), so "^ m" implements the
// optional inversion branch-free.
template<typename T> static void
cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
     uchar* dst, size_t step, int width, int height, int code)
{
    // Convert byte strides to element strides (dst stays in bytes: uchar).
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    if( code == CMP_GE || code == CMP_LT )
    {
        std::swap(src1, src2);
        std::swap(step1, step2);
        code = code == CMP_GE ? CMP_LE : CMP_GT;
    }

    Cmp_SIMD<T> vop(code);

    if( code == CMP_GT || code == CMP_LE )
    {
        int m = code == CMP_GT ? 0 : 255;  // LE = NOT(GT)
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            // vop handles the vectorizable prefix and returns the next index.
            int x = vop(src1, src2, dst, width);
#if CV_ENABLE_UNROLLED
            for( ; x <= width - 4; x += 4 )
            {
                int t0, t1;
                t0 = -(src1[x] > src2[x]) ^ m;
                t1 = -(src1[x+1] > src2[x+1]) ^ m;
                dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
                t0 = -(src1[x+2] > src2[x+2]) ^ m;
                t1 = -(src1[x+3] > src2[x+3]) ^ m;
                dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
            }
#endif
            for( ; x < width; x++ )
                dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
        }
    }
    else if( code == CMP_EQ || code == CMP_NE )
    {
        int m = code == CMP_EQ ? 0 : 255;  // NE = NOT(EQ)
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            // NOTE: no SIMD prefix here; the EQ/NE loop starts at 0.
            int x = 0;
#if CV_ENABLE_UNROLLED
            for( ; x <= width - 4; x += 4 )
            {
                int t0, t1;
                t0 = -(src1[x] == src2[x]) ^ m;
                t1 = -(src1[x+1] == src2[x+1]) ^ m;
                dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
                t0 = -(src1[x+2] == src2[x+2]) ^ m;
                t1 = -(src1[x+3] == src2[x+3]) ^ m;
                dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
            }
#endif
            for( ; x < width; x++ )
                dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
        }
    }
}
||||
|
||||
// Element-wise multiplication with optional scaling:
//   dst = saturate_cast<T>(scale * src1 * src2)
// computed in the wider working type WT. When scale == 1 the scalar
// multiply (and the WT widening of the product expression) is skipped.
// Strides arrive in bytes and are converted to element units here.
template<typename T, typename WT> static void
mul_( const T* src1, size_t step1, const T* src2, size_t step2,
      T* dst, size_t step, int width, int height, WT scale )
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Mul_SIMD<T, WT> vop;

    if( scale == (WT)1. )
    {
        // Fast path: plain saturated product, no scaling.
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            // vop processes the vectorizable prefix, returns the next index.
            int i = vop(src1, src2, dst, width, scale);
#if CV_ENABLE_UNROLLED
            for(; i <= width - 4; i += 4 )
            {
                T t0;
                T t1;
                t0 = saturate_cast<T>(src1[i  ] * src2[i  ]);
                t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
                dst[i  ] = t0;
                dst[i+1] = t1;

                t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
                t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
                dst[i+2] = t0;
                dst[i+3] = t1;
            }
#endif
            for( ; i < width; i++ )
                dst[i] = saturate_cast<T>(src1[i] * src2[i]);
        }
    }
    else
    {
        // General path: widen to WT, scale, then saturate back to T.
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            int i = vop(src1, src2, dst, width, scale);
#if CV_ENABLE_UNROLLED
            for(; i <= width - 4; i += 4 )
            {
                T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
                T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
                dst[i] = t0; dst[i+1] = t1;

                t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
                t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
                dst[i+2] = t0; dst[i+3] = t1;
            }
#endif
            for( ; i < width; i++ )
                dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
        }
    }
}
||||
|
||||
|
||||
// Scaled element-wise division for integer types:
//   dst = saturate_cast<T>(src1 * scale / src2), with 0 where src2 == 0
// (OpenCV's divide-by-zero convention).
// NOTE(review): the SIMD functor receives the double `scale` while the
// scalar tail uses the float `scale_f` — presumably intentional for
// speed, but the two paths may round differently; confirm.
template<typename T> static void
div_i( const T* src1, size_t step1, const T* src2, size_t step2,
       T* dst, size_t step, int width, int height, double scale )
{
    // Convert byte strides to element strides.
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Div_SIMD<T> vop;
    float scale_f = (float)scale;

    for( ; height--; src1 += step1, src2 += step2, dst += step )
    {
        // vop handles the vectorizable prefix and returns the next index.
        int i = vop(src1, src2, dst, width, scale);
        for( ; i < width; i++ )
        {
            T num = src1[i], denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
        }
    }
}
||||
|
||||
// Scaled element-wise division for floating-point types:
//   dst = saturate_cast<T>(src1 * scale / src2), with 0 where src2 == 0.
// The scale is narrowed to T once up front so the inner loop works
// entirely in the element type.
template<typename T> static void
div_f( const T* src1, size_t step1, const T* src2, size_t step2,
       T* dst, size_t step, int width, int height, double scale )
{
    T scale_f = (T)scale;
    // Convert byte strides to element strides.
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Div_SIMD<T> vop;

    for( ; height--; src1 += step1, src2 += step2, dst += step )
    {
        // vop handles the vectorizable prefix and returns the next index.
        int i = vop(src1, src2, dst, width, scale);
        for( ; i < width; i++ )
        {
            T num = src1[i], denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
        }
    }
}
||||
|
||||
// Scaled element-wise reciprocal for integer types:
//   dst = saturate_cast<T>(scale / src2), with 0 where src2 == 0.
// Only one source array is involved. As in div_i, the SIMD functor gets
// the double scale while the scalar tail uses a float copy.
template<typename T> static void
recip_i( const T* src2, size_t step2,
         T* dst, size_t step, int width, int height, double scale )
{
    // Convert byte strides to element strides.
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Recip_SIMD<T> vop;
    float scale_f = (float)scale;

    for( ; height--; src2 += step2, dst += step )
    {
        // vop handles the vectorizable prefix and returns the next index.
        int i = vop(src2, dst, width, scale);
        for( ; i < width; i++ )
        {
            T denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
        }
    }
}
||||
|
||||
// Scaled element-wise reciprocal for floating-point types:
//   dst = saturate_cast<T>(scale / src2), with 0 where src2 == 0.
// The scale is narrowed to the element type T once up front.
template<typename T> static void
recip_f( const T* src2, size_t step2,
         T* dst, size_t step, int width, int height, double scale )
{
    T scale_f = (T)scale;
    // Convert byte strides to element strides.
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Recip_SIMD<T> vop;

    for( ; height--; src2 += step2, dst += step )
    {
        // vop handles the vectorizable prefix and returns the next index.
        int i = vop(src2, dst, width, scale);
        for( ; i < width; i++ )
        {
            T denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
        }
    }
}
||||
|
||||
// Weighted sum: dst = saturate_cast<T>(src1*alpha + src2*beta + gamma),
// accumulated in the wider working type WT.
// _scalars points at three doubles: { alpha, beta, gamma } (passed as
// void* to fit the common binary-op function-pointer signature).
template<typename T, typename WT> static void
addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, int width, int height, void* _scalars )
{
    const double* scalars = (const double*)_scalars;
    WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
    // Convert byte strides to element strides.
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    AddWeighted_SIMD<T, WT> vop;

    for( ; height--; src1 += step1, src2 += step2, dst += step )
    {
        // vop handles the vectorizable prefix and returns the next index.
        int x = vop(src1, src2, dst, width, alpha, beta, gamma);
#if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
            T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
            T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
            dst[x] = t0; dst[x+1] = t1;

            t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
            t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
            dst[x+2] = t0; dst[x+3] = t1;
        }
#endif
        for( ; x < width; x++ )
            dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
    }
}
||||
|
||||
} // cv::
|
||||
|
||||
|
||||
#endif // __OPENCV_ARITHM_CORE_HPP__
|
@ -0,0 +1,417 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
#if ARITHM_USE_IPP |
||||
|
||||
namespace cv { namespace hal { |
||||
|
||||
//=======================================
|
||||
// Arithmetic and logical operations
|
||||
// +, -, *, /, &, |, ^, ~, abs ...
|
||||
//=======================================
|
||||
|
||||
// Common body for IPP-backed binary array wrappers. Expands inside a
// function that has step1/step2/step/width/height/dst in scope and
// returns int: 1 if IPP handled the operation (recording the IPP hit),
// 0 to make the caller fall back to the generic implementation.
// For single-row images the (possibly padded) strides are replaced by
// the tightly packed row size so IPP processes one contiguous run.
// (Comments must stay outside the macro body: a '//' on a continuation
// line would swallow the trailing backslash.)
#define ARITHM_IPP_BIN(fun, ...) \
do { \
    if (!CV_IPP_CHECK_COND) \
        return 0; \
    if (height == 1) \
        step1 = step2 = step = width * sizeof(dst[0]); \
    if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \
    { \
        CV_IMPL_ADD(CV_IMPL_IPP); \
        return 1; \
    } \
    setIppErrorStatus(); \
    return 0; \
} while(0)
||||
|
||||
//=======================================
|
||||
// Addition
|
||||
//=======================================
|
||||
|
||||
// IPP-backed saturated addition wrappers. Each returns 1 when IPP
// handled the operation, 0 to request the generic fallback.
// The integer variants use the *_Sfs primitives with scale factor 0,
// i.e. a plain saturated add.
inline int arithm_ipp_add8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAdd_8u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_add16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                             ushort* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAdd_16u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_add16s(const short* src1, size_t step1, const short* src2, size_t step2,
                             short* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAdd_16s_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_add32f(const float* src1, size_t step1, const float* src2, size_t step2,
                             float* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAdd_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

// No IPP primitive for these element types: always report "not handled"
// so the caller uses the generic implementation.
#define arithm_ipp_add8s(...) 0
#define arithm_ipp_add32s(...) 0
#define arithm_ipp_add64f(...) 0
||||
|
||||
//=======================================
|
||||
// Subtract
|
||||
//=======================================
|
||||
|
||||
// IPP-backed saturated subtraction wrappers (dst = src1 - src2).
// The sources are passed to IPP in swapped order: ippiSub computes
// (second operand) - (first operand), so passing (src2, src1)
// presumably yields src1 - src2 — confirm against the IPP reference.
inline int arithm_ipp_sub8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiSub_8u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_sub16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                             ushort* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiSub_16u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_sub16s(const short* src1, size_t step1, const short* src2, size_t step2,
                             short* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiSub_16s_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_sub32f(const float* src1, size_t step1, const float* src2, size_t step2,
                             float* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiSub_32f_C1R, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height));
}

// No IPP primitive for these element types: always fall back.
#define arithm_ipp_sub8s(...) 0
#define arithm_ipp_sub32s(...) 0
#define arithm_ipp_sub64f(...) 0
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Common body for IPP-backed min/max wrappers. IPP only offers 1D
// "every element" primitives (ippsMinEvery/ippsMaxEvery), so the image
// is processed row by row, advancing the pointers by the byte strides.
// If any row fails, the loop stops early, the IPP error status is
// recorded, and 0 is returned so the caller falls back; only a fully
// processed image returns 1.
// (Comments must stay outside the macro body: a '//' on a continuation
// line would swallow the trailing backslash.)
#define ARITHM_IPP_MIN_MAX(fun, type) \
do { \
    if (!CV_IPP_CHECK_COND) \
        return 0; \
    type* s1 = (type*)src1; \
    type* s2 = (type*)src2; \
    type* d = dst; \
    if (height == 1) \
        step1 = step2 = step = width * sizeof(dst[0]); \
    int i = 0; \
    for(; i < height; i++) \
    { \
        if (0 > CV_INSTRUMENT_FUN_IPP(fun, s1, s2, d, width)) \
            break; \
        s1 = (type*)((uchar*)s1 + step1); \
        s2 = (type*)((uchar*)s2 + step2); \
        d = (type*)((uchar*)d + step); \
    } \
    if (i == height) \
    { \
        CV_IMPL_ADD(CV_IMPL_IPP); \
        return 1; \
    } \
    setIppErrorStatus(); \
    return 0; \
} while(0)
||||
|
||||
//=======================================
|
||||
// Max
|
||||
//=======================================
|
||||
|
||||
// IPP-backed element-wise maximum wrappers (row-by-row via
// ippsMaxEvery_*). Return 1 on success, 0 to request the fallback.
inline int arithm_ipp_max8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMaxEvery_8u, uchar);
}

inline int arithm_ipp_max16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                             ushort* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMaxEvery_16u, ushort);
}

inline int arithm_ipp_max32f(const float* src1, size_t step1, const float* src2, size_t step2,
                             float* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMaxEvery_32f, float);
}

inline int arithm_ipp_max64f(const double* src1, size_t step1, const double* src2, size_t step2,
                             double* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMaxEvery_64f, double);
}

// No IPP primitive for the signed integer types: always fall back.
#define arithm_ipp_max8s(...) 0
#define arithm_ipp_max16s(...) 0
#define arithm_ipp_max32s(...) 0
||||
|
||||
//=======================================
|
||||
// Min
|
||||
//=======================================
|
||||
|
||||
// IPP-backed element-wise minimum wrappers (row-by-row via
// ippsMinEvery_*). Return 1 on success, 0 to request the fallback.
inline int arithm_ipp_min8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMinEvery_8u, uchar);
}

inline int arithm_ipp_min16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                             ushort* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMinEvery_16u, ushort);
}

inline int arithm_ipp_min32f(const float* src1, size_t step1, const float* src2,size_t step2,
                             float* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMinEvery_32f, float);
}

inline int arithm_ipp_min64f(const double* src1, size_t step1, const double* src2, size_t step2,
                             double* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMinEvery_64f, double);
}

// No IPP primitive for the signed integer types: always fall back.
#define arithm_ipp_min8s(...) 0
#define arithm_ipp_min16s(...) 0
#define arithm_ipp_min32s(...) 0
||||
|
||||
//=======================================
|
||||
// AbsDiff
|
||||
//=======================================
|
||||
|
||||
// IPP-backed absolute-difference wrappers (dst = |src1 - src2|).
// Return 1 on success, 0 to request the generic fallback.
inline int arithm_ipp_absdiff8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                                uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAbsDiff_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

inline int arithm_ipp_absdiff16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                                 ushort* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAbsDiff_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

inline int arithm_ipp_absdiff32f(const float* src1, size_t step1, const float* src2, size_t step2,
                                 float* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAbsDiff_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

// No IPP primitive for these element types: always fall back.
#define arithm_ipp_absdiff8s(...) 0
#define arithm_ipp_absdiff16s(...) 0
#define arithm_ipp_absdiff32s(...) 0
#define arithm_ipp_absdiff64f(...) 0
||||
|
||||
//=======================================
|
||||
// Logical
|
||||
//=======================================
|
||||
|
||||
// IPP-backed bitwise logical wrappers (8-bit only; wider types are
// handled by reinterpreting the data as bytes at the call site —
// presumably, confirm against the dispatcher).
// Return 1 on success, 0 to request the generic fallback.
inline int arithm_ipp_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAnd_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

inline int arithm_ipp_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                           uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiOr_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

inline int arithm_ipp_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiXor_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// IPP-backed bitwise NOT (unary, so it cannot use ARITHM_IPP_BIN; the
// same pattern is written out by hand). Returns 1 when IPP handled the
// operation, 0 to request the generic fallback.
inline int arithm_ipp_not8u(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height)
{
    if (!CV_IPP_CHECK_COND)
        return 0;
    // Single row: collapse possibly padded strides to the packed row size.
    if (height == 1)
        step1 = step = width * sizeof(dst[0]);
    if (0 <= CV_INSTRUMENT_FUN_IPP(ippiNot_8u_C1R, src1, (int)step1, dst, (int)step, ippiSize(width, height)))
    {
        CV_IMPL_ADD(CV_IMPL_IPP);
        return 1;
    }
    setIppErrorStatus();
    return 0;
}
||||
|
||||
//=======================================
|
||||
// Compare
|
||||
//=======================================
|
||||
|
||||
// Common driver for the arithm_ipp_cmp* wrappers below. It is a statement
// macro and expects the caller's scope to provide cmpop, step1, step2, step,
// dst, width and height. Behavior:
//   - returns 0 (generic fallback) when IPP is disabled or the comparison
//     op has no IPP equivalent (arithm_ipp_convert_cmp yields -1),
//   - collapses a single-row image into one contiguous span,
//   - returns 1 once the IPP call succeeds; on IPP failure records the
//     error status and returns 0.
// (Comments cannot appear inside the backslash-continued body, hence this
// header block.)
#define ARITHM_IPP_CMP(fun, ...)                          \
do {                                                      \
    if (!CV_IPP_CHECK_COND)                               \
        return 0;                                         \
    IppCmpOp op = arithm_ipp_convert_cmp(cmpop);          \
    if (op < 0)                                           \
        return 0;                                         \
    if (height == 1)                                      \
        step1 = step2 = step = width * sizeof(dst[0]);    \
    if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__, op)) \
    {                                                     \
        CV_IMPL_ADD(CV_IMPL_IPP);                         \
        return 1;                                         \
    }                                                     \
    setIppErrorStatus();                                  \
    return 0;                                             \
} while(0)
||||
|
||||
// Translate an OpenCV comparison code (CMP_*) into the matching IPP
// IppCmpOp value. Codes without an IPP counterpart (e.g. CMP_NE) map to
// (IppCmpOp)-1, which ARITHM_IPP_CMP treats as "use the generic path".
inline IppCmpOp arithm_ipp_convert_cmp(int cmpop)
{
    switch (cmpop)
    {
    case CMP_EQ:
        return ippCmpEq;
    case CMP_GT:
        return ippCmpGreater;
    case CMP_GE:
        return ippCmpGreaterEq;
    case CMP_LT:
        return ippCmpLess;
    case CMP_LE:
        return ippCmpLessEq;
    }
    return (IppCmpOp)-1; // unsupported comparison
}
||||
|
||||
// Element-wise comparison of two 8-bit images via IPP (ippiCompare_8u_C1R);
// cmpop is consumed by ARITHM_IPP_CMP, which appends the converted IPP op.
// Returns 1 when IPP handled the operation, 0 for generic fallback.
inline int arithm_ipp_cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height, int cmpop)
{
    ARITHM_IPP_CMP(ippiCompare_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// Element-wise comparison of two 16-bit unsigned images via IPP
// (ippiCompare_16u_C1R); cmpop is consumed by ARITHM_IPP_CMP.
// Returns 1 when IPP handled the operation, 0 for generic fallback.
inline int arithm_ipp_cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                             uchar* dst, size_t step, int width, int height, int cmpop)
{
    ARITHM_IPP_CMP(ippiCompare_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// Element-wise comparison of two 16-bit signed images via IPP
// (ippiCompare_16s_C1R); cmpop is consumed by ARITHM_IPP_CMP.
// Returns 1 when IPP handled the operation, 0 for generic fallback.
inline int arithm_ipp_cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
                             uchar* dst, size_t step, int width, int height, int cmpop)
{
    ARITHM_IPP_CMP(ippiCompare_16s_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// Element-wise comparison of two 32-bit float images via IPP
// (ippiCompare_32f_C1R); cmpop is consumed by ARITHM_IPP_CMP.
// Returns 1 when IPP handled the operation, 0 for generic fallback.
inline int arithm_ipp_cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
                             uchar* dst, size_t step, int width, int height, int cmpop)
{
    ARITHM_IPP_CMP(ippiCompare_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// No IPP path for compare on these types: always 0 => generic fallback.
#define arithm_ipp_cmp8s(...) 0
#define arithm_ipp_cmp32s(...) 0
#define arithm_ipp_cmp64f(...) 0
||||
|
||||
//=======================================
|
||||
// Multiply
|
||||
//=======================================
|
||||
|
||||
// Common driver for the arithm_ipp_mul* wrappers below; `scale` comes from
// the caller's scope. The IPP path is used only for plain (unscaled)
// multiplication: any scale factor measurably different from 1 returns 0
// so the generic scaled implementation runs instead. On IPP failure the
// error status is recorded and 0 is returned; on success returns 1.
// (Comments cannot appear inside the backslash-continued body, hence this
// header block.)
#define ARITHM_IPP_MUL(fun, ...)                    \
do {                                                \
    if (!CV_IPP_CHECK_COND)                         \
        return 0;                                   \
    float fscale = (float)scale;                    \
    if (std::fabs(fscale - 1) > FLT_EPSILON)        \
        return 0;                                   \
    if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \
    {                                               \
        CV_IMPL_ADD(CV_IMPL_IPP);                   \
        return 1;                                   \
    }                                               \
    setIppErrorStatus();                            \
    return 0;                                       \
} while(0)
||||
|
||||
// Element-wise 8-bit multiplication via IPP (ippiMul_8u_C1RSfs).
// `scale` is consumed by ARITHM_IPP_MUL (only scale ~= 1 uses IPP); the
// trailing 0 is the IPP scaleFactor argument of the *Sfs variant, i.e. no
// additional result downscaling. Returns 1 if IPP handled it, 0 otherwise.
inline int arithm_ipp_mul8u(const uchar *src1, size_t step1, const uchar *src2, size_t step2,
                            uchar *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_8u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}
||||
// Element-wise 16-bit unsigned multiplication via IPP (ippiMul_16u_C1RSfs).
// `scale` is consumed by ARITHM_IPP_MUL; trailing 0 is the IPP scaleFactor
// (no downscaling). Returns 1 if IPP handled it, 0 otherwise.
inline int arithm_ipp_mul16u(const ushort *src1, size_t step1, const ushort *src2, size_t step2,
                             ushort *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_16u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}
||||
|
||||
// Element-wise 16-bit signed multiplication via IPP (ippiMul_16s_C1RSfs).
// `scale` is consumed by ARITHM_IPP_MUL; trailing 0 is the IPP scaleFactor
// (no downscaling). Returns 1 if IPP handled it, 0 otherwise.
inline int arithm_ipp_mul16s(const short *src1, size_t step1, const short *src2, size_t step2,
                             short *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_16s_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}
||||
|
||||
// Element-wise 32-bit float multiplication via IPP (ippiMul_32f_C1R).
// `scale` is consumed by ARITHM_IPP_MUL; the float variant has no
// scaleFactor argument. Returns 1 if IPP handled it, 0 otherwise.
inline int arithm_ipp_mul32f(const float *src1, size_t step1, const float *src2, size_t step2,
                             float *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// No IPP path for multiply on these types: always 0 => generic fallback.
#define arithm_ipp_mul8s(...) 0
#define arithm_ipp_mul32s(...) 0
#define arithm_ipp_mul64f(...) 0
||||
|
||||
//=======================================
|
||||
// Div
|
||||
//=======================================
|
||||
|
||||
// No IPP path is provided for divide on any type: every macro expands to 0,
// so the generic implementation always runs.
#define arithm_ipp_div8u(...) 0
#define arithm_ipp_div8s(...) 0
#define arithm_ipp_div16u(...) 0
#define arithm_ipp_div16s(...) 0
#define arithm_ipp_div32s(...) 0
#define arithm_ipp_div32f(...) 0
#define arithm_ipp_div64f(...) 0
||||
|
||||
//=======================================
|
||||
// AddWeighted
|
||||
//=======================================
|
||||
|
||||
// No IPP path is provided for addWeighted on any type: every macro expands
// to 0, so the generic implementation always runs.
#define arithm_ipp_addWeighted8u(...) 0
#define arithm_ipp_addWeighted8s(...) 0
#define arithm_ipp_addWeighted16u(...) 0
#define arithm_ipp_addWeighted16s(...) 0
#define arithm_ipp_addWeighted32s(...) 0
#define arithm_ipp_addWeighted32f(...) 0
#define arithm_ipp_addWeighted64f(...) 0
||||
|
||||
//=======================================
|
||||
// Reciprocal
|
||||
//=======================================
|
||||
|
||||
// No IPP path is provided for reciprocal on any type: every macro expands
// to 0, so the generic implementation always runs.
#define arithm_ipp_recip8u(...) 0
#define arithm_ipp_recip8s(...) 0
#define arithm_ipp_recip16u(...) 0
#define arithm_ipp_recip16s(...) 0
#define arithm_ipp_recip32s(...) 0
#define arithm_ipp_recip32f(...) 0
#define arithm_ipp_recip64f(...) 0
||||
|
||||
/** Stub template: copy this block when adding a new operation "fun" that has no IPP path yet.
|
||||
#define arithm_ipp_8u(...) 0 |
||||
#define arithm_ipp_8s(...) 0 |
||||
#define arithm_ipp_16u(...) 0 |
||||
#define arithm_ipp_16s(...) 0 |
||||
#define arithm_ipp_32s(...) 0 |
||||
#define arithm_ipp_32f(...) 0 |
||||
#define arithm_ipp_64f(...) 0 |
||||
**/ |
||||
|
||||
}} // cv::hal::
|
||||
|
||||
// Try the IPP implementation first: if `fun` reports success (non-zero),
// return from the ENCLOSING (void) function immediately; otherwise fall
// through to the generic code that follows the macro.
// NOTE(review): this is a bare { } block rather than do { } while(0) —
// presumably because call sites invoke it without a trailing semicolon
// (the no-IPP fallback below expands to nothing); confirm against usage in
// arithm.simd.hpp before changing the form.
#define ARITHM_CALL_IPP(fun, ...)       \
{                                       \
    if (__CV_EXPAND(fun(__VA_ARGS__)))  \
        return;                         \
}
||||
|
||||
#endif // ARITHM_USE_IPP



// When IPP support is compiled out, ARITHM_CALL_IPP expands to nothing so
// the generic implementation always runs.
#if !ARITHM_USE_IPP
#define ARITHM_CALL_IPP(...)
#endif
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue