diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 9dcfc5623a..031f8f3d02 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -60,255 +60,72 @@ // access from within opencv code more accessible namespace cv { -#ifndef CV_DOXYGEN - -#ifdef CV_CPU_DISPATCH_MODE -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) { -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } -#else -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline { -#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } -#endif - - -CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN -CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END -using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; -CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN -#endif - -//! @addtogroup core_hal_intrin -//! @{ - -//! @cond IGNORED template struct V_TypeTraits { - typedef _Tp int_type; - typedef _Tp uint_type; - typedef _Tp abs_type; - typedef _Tp sum_type; - - enum { delta = 0, shift = 0 }; - - static int_type reinterpret_int(_Tp x) { return x; } - static uint_type reinterpet_uint(_Tp x) { return x; } - static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; } -}; - -template<> struct V_TypeTraits -{ - typedef uchar value_type; - typedef schar int_type; - typedef uchar uint_type; - typedef uchar abs_type; - typedef int sum_type; - - typedef ushort w_type; - typedef unsigned q_type; - - enum { delta = 128, shift = 8 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef schar value_type; - typedef schar int_type; - typedef uchar uint_type; - typedef uchar abs_type; - typedef int sum_type; - - typedef short w_type; - typedef int q_type; - - enum { delta = 128, shift = 8 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef ushort value_type; - typedef short int_type; - typedef ushort uint_type; - typedef ushort abs_type; - typedef int sum_type; - - typedef unsigned w_type; - typedef uchar nu_type; - - enum { delta = 32768, shift = 16 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef short value_type; - typedef short int_type; - typedef ushort uint_type; - typedef ushort abs_type; - typedef int sum_type; - - typedef int w_type; - typedef uchar nu_type; - typedef schar n_type; - - enum { delta = 128, shift = 8 }; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef unsigned value_type; - typedef int int_type; - typedef unsigned uint_type; - typedef unsigned abs_type; - 
typedef unsigned sum_type; - - typedef uint64 w_type; - typedef ushort nu_type; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef int value_type; - typedef int int_type; - typedef unsigned uint_type; - typedef unsigned abs_type; - typedef int sum_type; - - typedef int64 w_type; - typedef short n_type; - typedef ushort nu_type; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef uint64 value_type; - typedef int64 int_type; - typedef uint64 uint_type; - typedef uint64 abs_type; - typedef uint64 sum_type; - - typedef unsigned nu_type; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - -template<> struct V_TypeTraits -{ - typedef int64 value_type; - typedef int64 int_type; - typedef uint64 uint_type; - typedef uint64 abs_type; - typedef int64 sum_type; - - typedef int nu_type; - - static int_type reinterpret_int(value_type x) { return (int_type)x; } - static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } - static value_type reinterpret_from_int(int_type x) { return (value_type)x; } -}; - - -template<> struct V_TypeTraits -{ - typedef float value_type; - typedef int int_type; - typedef unsigned uint_type; - typedef float abs_type; - typedef float sum_type; - - typedef double w_type; - - static int_type reinterpret_int(value_type x) - { - Cv32suf u; - u.f = x; - return u.i; - } - static uint_type reinterpet_uint(value_type x) - { - Cv32suf u; - u.f = x; - return u.u; - } - static value_type reinterpret_from_int(int_type x) - { - Cv32suf u; - u.i = x; - return u.f; - } }; -template<> struct V_TypeTraits -{ - typedef double value_type; - typedef int64 int_type; - typedef uint64 uint_type; - typedef double abs_type; - typedef double sum_type; - static int_type reinterpret_int(value_type x) - { - Cv64suf u; - u.f = x; - return u.i; - } - static uint_type reinterpet_uint(value_type x) - { - Cv64suf u; - u.f = x; - return u.u; +#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_, nlanes128_) \ + template<> struct V_TypeTraits \ + { \ + typedef type value_type; \ + typedef int_type_ int_type; \ + typedef abs_type_ abs_type; \ + typedef uint_type_ uint_type; \ + typedef w_type_ w_type; \ + typedef q_type_ q_type; \ + typedef sum_type_ sum_type; \ + enum { nlanes128 = nlanes128_ }; \ + \ + static inline int_type reinterpret_int(type x) \ + { \ + union { type l; int_type i; } v; \ + v.l = x; \ + return v.i; \ + } \ + \ + static inline type reinterpret_from_int(int_type x) \ + { \ + union { type l; int_type i; } v; \ + v.i = x; \ + return v.l; \ + } \ } - static value_type reinterpret_from_int(int_type x) - { - Cv64suf u; - u.i = x; - return u.f; - } -}; -template struct V_SIMD128Traits -{ - enum { nlanes = 16 / sizeof(T) }; -}; +CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16); +CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16); 
+CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8); +CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8); +CV_INTRIN_DEF_TYPE_TRAITS(unsigned, int, unsigned, unsigned, uint64, void, unsigned, 4); +CV_INTRIN_DEF_TYPE_TRAITS(int, int, unsigned, unsigned, int64, void, int, 4); +CV_INTRIN_DEF_TYPE_TRAITS(float, int, unsigned, float, double, void, float, 4); +CV_INTRIN_DEF_TYPE_TRAITS(uint64, int64, uint64, uint64, void, void, uint64, 2); +CV_INTRIN_DEF_TYPE_TRAITS(int64, int64, uint64, uint64, void, void, int64, 2); +CV_INTRIN_DEF_TYPE_TRAITS(double, int64, uint64, double, void, void, double, 2); -//! @endcond +#ifndef CV_DOXYGEN -//! @} +#ifdef CV_CPU_DISPATCH_MODE + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) { + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } +#else + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline { + #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } +#endif -#ifndef CV_DOXYGEN +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END +using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #endif } #ifdef CV_DOXYGEN +# undef CV_AVX2 # undef CV_SSE2 # undef CV_NEON # undef CV_VSX +# undef CV_FP16 #endif #if CV_SSE2 @@ -325,27 +142,25 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END #else +#define CV_SIMD128_CPP 1 #include "opencv2/core/hal/intrin_cpp.hpp" #endif -//! @addtogroup core_hal_intrin -//! @{ +// AVX2 can be used together with SSE2, so +// we define those two sets of intrinsics at once. +// Most of the intrinsics do not conflict (the proper overloaded variant is +// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2), +// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load(). +// Correspondingly, the wide intrinsics (which are mapped to the "widest" +// available instruction set) will get vx_ prefix +// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v245_load()) +#if CV_AVX2 -#ifndef CV_SIMD128 -//! Set to 1 if current compiler supports vector extensions (NEON or SSE is enabled) -#define CV_SIMD128 0 -#endif +#include "opencv2/core/hal/intrin_avx.hpp" -#ifndef CV_SIMD128_64F -//! Set to 1 if current intrinsics implementation supports 64-bit float vectors -#define CV_SIMD128_64F 0 #endif -//! @} - -//================================================================================================== - //! 
@cond IGNORED namespace cv { @@ -354,88 +169,175 @@ namespace cv { CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #endif -template struct V_RegTrait128; +#ifndef CV_SIMD128 +#define CV_SIMD128 0 +#endif -template <> struct V_RegTrait128 { - typedef v_uint8x16 reg; - typedef v_uint16x8 w_reg; - typedef v_uint32x4 q_reg; - typedef v_uint8x16 u_reg; - static v_uint8x16 zero() { return v_setzero_u8(); } - static v_uint8x16 all(uchar val) { return v_setall_u8(val); } -}; +#ifndef CV_SIMD128_64F +#define CV_SIMD128_64F 0 +#endif -template <> struct V_RegTrait128 { - typedef v_int8x16 reg; - typedef v_int16x8 w_reg; - typedef v_int32x4 q_reg; - typedef v_uint8x16 u_reg; - static v_int8x16 zero() { return v_setzero_s8(); } - static v_int8x16 all(schar val) { return v_setall_s8(val); } -}; +#ifndef CV_SIMD256 +#define CV_SIMD256 0 +#endif -template <> struct V_RegTrait128 { - typedef v_uint16x8 reg; - typedef v_uint32x4 w_reg; - typedef v_int16x8 int_reg; - typedef v_uint16x8 u_reg; - static v_uint16x8 zero() { return v_setzero_u16(); } - static v_uint16x8 all(ushort val) { return v_setall_u16(val); } -}; +#ifndef CV_SIMD256_64F +#define CV_SIMD256_64F 0 +#endif -template <> struct V_RegTrait128 { - typedef v_int16x8 reg; - typedef v_int32x4 w_reg; - typedef v_uint16x8 u_reg; - static v_int16x8 zero() { return v_setzero_s16(); } - static v_int16x8 all(short val) { return v_setall_s16(val); } -}; +#ifndef CV_SIMD512 +#define CV_SIMD512 0 +#endif -template <> struct V_RegTrait128 { - typedef v_uint32x4 reg; - typedef v_uint64x2 w_reg; - typedef v_int32x4 int_reg; - typedef v_uint32x4 u_reg; - static v_uint32x4 zero() { return v_setzero_u32(); } - static v_uint32x4 all(unsigned val) { return v_setall_u32(val); } -}; +#ifndef CV_SIMD512_64F +#define CV_SIMD512_64F 0 +#endif -template <> struct V_RegTrait128 { - typedef v_int32x4 reg; - typedef v_int64x2 w_reg; - typedef v_uint32x4 u_reg; - static v_int32x4 zero() { return v_setzero_s32(); } - static v_int32x4 all(int val) { return v_setall_s32(val); } -}; +#if CV_SIMD512 + #define CV_SIMD 1 + #define CV_SIMD_64F CV_SIMD512_64F + #define CV_SIMD_WIDTH 64 +#elif CV_SIMD256 + #define CV_SIMD 1 + #define CV_SIMD_64F CV_SIMD256_64F + #define CV_SIMD_WIDTH 32 +#else + #define CV_SIMD CV_SIMD128 + #define CV_SIMD_64F CV_SIMD128_64F + #define CV_SIMD_WIDTH 16 +#endif -template <> struct V_RegTrait128 { - typedef v_uint64x2 reg; - static v_uint64x2 zero() { return v_setzero_u64(); } - static v_uint64x2 all(uint64 val) { return v_setall_u64(val); } -}; +//================================================================================================== -template <> struct V_RegTrait128 { - typedef v_int64x2 reg; - static v_int64x2 zero() { return v_setzero_s64(); } - static v_int64x2 all(int64 val) { return v_setall_s64(val); } +#define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ + inline vtyp vx_setall_##short_typ(typ v) { return prefix##_setall_##short_typ(v); } \ + inline vtyp vx_setzero_##short_typ() { return prefix##_setzero_##short_typ(); } \ + inline vtyp vx_##loadsfx(const typ* ptr) { return prefix##_##loadsfx(ptr); } \ + inline vtyp vx_##loadsfx##_aligned(const typ* ptr) { return prefix##_##loadsfx##_aligned(ptr); } \ + inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \ + inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } + +#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ +inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); } + 
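
As a usage sketch (not part of the patch itself): the vx_* wrappers produced by the CV_INTRIN_DEFINE_WIDE_INTRIN machinery defined here (and instantiated further down for each lane type) are what let a single kernel body compile against the widest enabled instruction set, 128-bit SSE2/NEON (v_ prefix) or 256-bit AVX2 (v256_ prefix). A minimal illustration, assuming this header is included and CV_SIMD is non-zero; the function name add_f32 is hypothetical and not part of the patch:

#include "opencv2/core/hal/intrin.hpp"

#if CV_SIMD
// Adds two float arrays using the width-agnostic wide universal intrinsics.
static void add_f32(const float* a, const float* b, float* dst, int n)
{
    int i = 0;
    const int step = v_float32::nlanes;     // 4 with CV_SIMD128, 8 with CV_SIMD256
    for (; i <= n - step; i += step)
    {
        v_float32 va = vx_load(a + i);      // resolves to v_load() or v256_load()
        v_float32 vb = vx_load(b + i);
        vx_store(dst + i, va + vb);         // stores with the same register width
    }
    for (; i < n; i++)                      // scalar tail for the leftover elements
        dst[i] = a[i] + b[i];
    vx_cleanup();                           // maps to v_cleanup() / v256_cleanup()
}
#endif

With CV_SIMD256 enabled the loop consumes 8 floats per iteration, with only CV_SIMD128 the very same source consumes 4; selecting that width at compile time is the purpose of the vx_ / CV_SIMD_WIDTH layer introduced above.
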
+#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) \ +inline qtyp vx_load_expand_q(const typ* ptr) { return prefix##_load_expand_q(ptr); } + +#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix) + +#define CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \ + CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) + +template struct V_RegTraits +{ }; -template <> struct V_RegTrait128 { - typedef v_float32x4 reg; - typedef v_int32x4 int_reg; - typedef v_float32x4 u_reg; - static v_float32x4 zero() { return v_setzero_f32(); } - static v_float32x4 all(float val) { return v_setall_f32(val); } -}; +#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \ + template<> struct V_RegTraits<_reg> \ + { \ + typedef _reg reg; \ + typedef _u_reg u_reg; \ + typedef _w_reg w_reg; \ + typedef _q_reg q_reg; \ + typedef _int_reg int_reg; \ + typedef _round_reg round_reg; \ + } +#if CV_SIMD128 || CV_SIMD128_CPP + CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void); + CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void); + CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void); + CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void); + CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void); + CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void); #if CV_SIMD128_64F -template <> struct V_RegTrait128 { - typedef v_float64x2 reg; - typedef v_int32x4 int_reg; - typedef v_float64x2 u_reg; - static v_float64x2 zero() { return v_setzero_f64(); } - static v_float64x2 all(double val) { return v_setall_f64(val); } -}; + CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4); +#else + CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4); +#endif + CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void); + CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void); +#if CV_SIMD128_64F + CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4); +#endif +#if CV_FP16 + CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float32x4, void, void, v_int16x8, v_int16x8); 
+#endif +#endif + +#if CV_SIMD256 + CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void); + CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void); + CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void); + CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void); + CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void); + CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void); + CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8); + CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void); + CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void); + CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8); +#if CV_FP16 + CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float32x8, void, void, v_int16x16, void); +#endif +#endif + +#if CV_SIMD256 + typedef v_uint8x32 v_uint8; + typedef v_int8x32 v_int8; + typedef v_uint16x16 v_uint16; + typedef v_int16x16 v_int16; + typedef v_uint32x8 v_uint32; + typedef v_int32x8 v_int32; + typedef v_uint64x4 v_uint64; + typedef v_int64x4 v_int64; + typedef v_float32x8 v_float32; + #if CV_SIMD256_64F + typedef v_float64x4 v_float64; + #endif + #if CV_FP16 + typedef v_float16x16 v_float16; + CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16) + #endif + CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256) + CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load) + inline void vx_cleanup() { v256_cleanup(); } +#elif CV_SIMD128 + typedef v_uint8x16 v_uint8; + typedef v_int8x16 v_int8; + typedef v_uint16x8 v_uint16; + typedef v_int16x8 v_int16; + typedef v_uint32x4 v_uint32; + typedef v_int32x4 v_int32; + typedef v_uint64x2 v_uint64; + typedef v_int64x2 v_int64; + typedef v_float32x4 v_float32; + #if CV_SIMD128_64F + typedef v_float64x2 v_float64; + #endif + #if CV_FP16 + typedef v_float16x8 v_float16; + CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16) + #endif + CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v) + #if CV_SIMD128_64F + CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load) + #endif + inline void vx_cleanup() { v_cleanup(); } #endif inline unsigned int trailingZeros32(unsigned int value) { diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp new file mode 100644 index 0000000000..7e983fd24f --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -0,0 +1,2016 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_AVX_HPP +#define OPENCV_HAL_INTRIN_AVX_HPP + +#define CV_SIMD256 1 +#define CV_SIMD256_64F 1 + +namespace cv +{ + +//! 
@cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +///////// Utils //////////// + +inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi) +{ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); } + +inline __m256 _v256_combine(const __m128& lo, const __m128& hi) +{ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); } + +inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi) +{ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); } + +inline int _v_cvtsi256_si32(const __m256i& a) +{ return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); } + +inline __m256i _v256_shuffle_odd_64(const __m256i& v) +{ return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); } + +inline __m256d _v256_shuffle_odd_64(const __m256d& v) +{ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); } + +template +inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b) +{ return _mm256_permute2x128_si256(a, b, imm); } + +template +inline __m256 _v256_permute2x128(const __m256& a, const __m256& b) +{ return _mm256_permute2f128_ps(a, b, imm); } + +template +inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b) +{ return _mm256_permute2f128_pd(a, b, imm); } + +template +inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_v256_permute2x128(a.val, b.val)); } + +template +inline __m256i _v256_permute4x64(const __m256i& a) +{ return _mm256_permute4x64_epi64(a, imm); } + +template +inline __m256d _v256_permute4x64(const __m256d& a) +{ return _mm256_permute4x64_pd(a, imm); } + +template +inline _Tpvec v256_permute4x64(const _Tpvec& a) +{ return _Tpvec(_v256_permute4x64(a.val)); } + +inline __m128i _v256_extract_high(const __m256i& v) +{ return _mm256_extracti128_si256(v, 1); } + +inline __m128 _v256_extract_high(const __m256& v) +{ return _mm256_extractf128_ps(v, 1); } + +inline __m128d _v256_extract_high(const __m256d& v) +{ return _mm256_extractf128_pd(v, 1); } + +inline __m128i _v256_extract_low(const __m256i& v) +{ return _mm256_castsi256_si128(v); } + +inline __m128 _v256_extract_low(const __m256& v) +{ return _mm256_castps256_ps128(v); } + +inline __m128d _v256_extract_low(const __m256d& v) +{ return _mm256_castpd256_pd128(v); } + +///////// Types //////////// + +struct v_uint8x32 +{ + typedef uchar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_uint8x32(__m256i v) : val(v) {} + v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3, + uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, + uchar v12, uchar v13, uchar v14, uchar v15, + uchar v16, uchar v17, uchar v18, uchar v19, + uchar v20, uchar v21, uchar v22, uchar v23, + uchar v24, uchar v25, uchar v26, uchar v27, + uchar v28, uchar v29, uchar v30, uchar v31) + { + val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3, + (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9, + (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15, + (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21, + (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27, + (char)v28, (char)v29, (char)v30, (char)v31); + } + v_uint8x32() : val(_mm256_setzero_si256()) {} + uchar get0() const { return (uchar)_v_cvtsi256_si32(val); } +}; + +struct v_int8x32 +{ + typedef schar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_int8x32(__m256i v) : val(v) {} + v_int8x32(schar v0, schar v1, schar v2, schar v3, + schar v4, schar v5, schar v6, schar v7, + schar v8, schar 
v9, schar v10, schar v11, + schar v12, schar v13, schar v14, schar v15, + schar v16, schar v17, schar v18, schar v19, + schar v20, schar v21, schar v22, schar v23, + schar v24, schar v25, schar v26, schar v27, + schar v28, schar v29, schar v30, schar v31) + { + val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, + v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + } + v_int8x32() : val(_mm256_setzero_si256()) {} + schar get0() const { return (schar)_v_cvtsi256_si32(val); } +}; + +struct v_uint16x16 +{ + typedef ushort lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_uint16x16(__m256i v) : val(v) {} + v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3, + ushort v4, ushort v5, ushort v6, ushort v7, + ushort v8, ushort v9, ushort v10, ushort v11, + ushort v12, ushort v13, ushort v14, ushort v15) + { + val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, + (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9, + (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15); + } + v_uint16x16() : val(_mm256_setzero_si256()) {} + ushort get0() const { return (ushort)_v_cvtsi256_si32(val); } +}; + +struct v_int16x16 +{ + typedef short lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_int16x16(__m256i v) : val(v) {} + v_int16x16(short v0, short v1, short v2, short v3, + short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, + short v12, short v13, short v14, short v15) + { + val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } + v_int16x16() : val(_mm256_setzero_si256()) {} + short get0() const { return (short)_v_cvtsi256_si32(val); } +}; + +struct v_uint32x8 +{ + typedef unsigned lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_uint32x8(__m256i v) : val(v) {} + v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3, + unsigned v4, unsigned v5, unsigned v6, unsigned v7) + { + val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2, + (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7); + } + v_uint32x8() : val(_mm256_setzero_si256()) {} + unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); } +}; + +struct v_int32x8 +{ + typedef int lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_int32x8(__m256i v) : val(v) {} + v_int32x8(int v0, int v1, int v2, int v3, + int v4, int v5, int v6, int v7) + { + val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); + } + v_int32x8() : val(_mm256_setzero_si256()) {} + int get0() const { return _v_cvtsi256_si32(val); } +}; + +struct v_float32x8 +{ + typedef float lane_type; + enum { nlanes = 8 }; + __m256 val; + + explicit v_float32x8(__m256 v) : val(v) {} + v_float32x8(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7); + } + v_float32x8() : val(_mm256_setzero_ps()) {} + float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); } +}; + +struct v_uint64x4 +{ + typedef uint64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_uint64x4(__m256i v) : val(v) {} + v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3) + { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); } + v_uint64x4() : val(_mm256_setzero_si256()) {} + uint64 get0() const + { return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); } +}; + +struct v_int64x4 +{ + 
typedef int64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_int64x4(__m256i v) : val(v) {} + v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3) + { val = _mm256_setr_epi64x(v0, v1, v2, v3); } + v_int64x4() : val(_mm256_setzero_si256()) {} + int64 get0() const { return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); } +}; + +struct v_float64x4 +{ + typedef double lane_type; + enum { nlanes = 4 }; + __m256d val; + + explicit v_float64x4(__m256d v) : val(v) {} + v_float64x4(double v0, double v1, double v2, double v3) + { val = _mm256_setr_pd(v0, v1, v2, v3); } + v_float64x4() : val(_mm256_setzero_pd()) {} + double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); } +}; + +struct v_float16x16 +{ + typedef short lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_float16x16(__m256i v) : val(v) {} + v_float16x16(short v0, short v1, short v2, short v3, + short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, + short v12, short v13, short v14, short v15) + { + val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + v_float16x16() : val(_mm256_setzero_si256()) {} + short get0() const { return (short)_v_cvtsi256_si32(val); } +}; +inline v_float16x16 v256_setzero_f16() { return v_float16x16(_mm256_setzero_si256()); } +inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1_epi16(val)); } + +//////////////// Load and store operations /////////////// + +#define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + __m128i v128 = _mm_loadu_si128((const __m128i*)ptr); \ + return _Tpvec(_mm256_castsi128_si256(v128)); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0); \ + __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_si256((__m256i*)ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm256_store_si256((__m256i*)ptr, a.val); } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32, uchar) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32, schar) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16, short) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8, unsigned) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8, int) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4, uint64) +OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64) + +#define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { return _Tpvec(_mm256_loadu_##suffix(ptr)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm256_load_##suffix(ptr)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + return _Tpvec(_mm256_cast##suffix##128_##suffix##256 \ + (_mm_loadu_##suffix(ptr))); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + 
halfreg vlo = _mm_loadu_##suffix(ptr0); \ + halfreg vhi = _mm_loadu_##suffix(ptr1); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_##suffix(ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm256_store_##suffix(ptr, a.val); } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float, ps, __m128) +OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d) + +#define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(_mm256_setzero_si256()); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256) + +OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32, uchar, u8, epi8, char) +OPENCV_HAL_IMPL_AVX_INIT(v_int8x32, schar, s8, epi8, char) +OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort, u16, epi16, short) +OPENCV_HAL_IMPL_AVX_INIT(v_int16x16, short, s16, epi16, short) +OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8, unsigned, u32, epi32, int) +OPENCV_HAL_IMPL_AVX_INIT(v_int32x8, int, s32, epi32, int) +OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4, uint64, u64, epi64x, int64) +OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64) + +#define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(_mm256_setzero_##zsuffix()); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm256_set1_##zsuffix(v)); } \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, cast) \ + OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, cast) + +OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float, f32, ps, _mm256_castsi256_ps) +OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd) + +inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a) +{ return a; } +inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a) +{ return v_float32x8(_mm256_castpd_ps(a.val)); } + +inline v_float64x4 v_reinterpret_as_f64(const 
v_float64x4& a) +{ return a; } +inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a) +{ return v_float64x4(_mm256_castps_pd(a.val)); } + +inline v_float16x16 v256_load_f16(const short* ptr) +{ return v_float16x16(_mm256_loadu_si256((const __m256i*)ptr)); } +inline v_float16x16 v256_load_f16_aligned(const short* ptr) +{ return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); } + +inline void v_store(short* ptr, const v_float16x16& a) +{ _mm256_storeu_si256((__m256i*)ptr, a.val); } +inline void v_store_aligned(short* ptr, const v_float16x16& a) +{ _mm256_store_si256((__m256i*)ptr, a.val); } + +/* Recombine */ +/*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(perm(a.val, b.val, 0x20)); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(perm(a.val, b.val, 0x31)); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { c = v_combine_low(a, b); d = v_combine_high(a, b); } + +#define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256) \ + inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, \ + _Tpvec& b0, _Tpvec& b1) \ + { \ + __m256i v0 = _v256_shuffle_odd_64(a0.val); \ + __m256i v1 = _v256_shuffle_odd_64(a1.val); \ + b0.val = _mm256_unpacklo_##suffix(v0, v1); \ + b1.val = _mm256_unpackhi_##suffix(v0, v1); \ + } + +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4, epi64) +OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4, epi64) +OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps) +OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd) + +inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1) +{ + __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val); + __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val); + v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1); +} + +inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1) +{ + __m256d v0 = _v_shuffle_odd_64(a0.val); + __m256d v1 = _v_shuffle_odd_64(a1.val); + b0.val = _mm256_unpacklo_pd(v0, v1); + b1.val = _mm256_unpackhi_pd(v0, v1); +}*/ + +//////////////// Variant Value reordering /////////////// + +// unpacks +#define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix) \ + inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); } \ + inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32, epi8) +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16, epi16) +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8, epi32) +OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4, epi64) +OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4, epi64) +OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd) + +// blend +#define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix) \ + template \ + inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_blend_##suffix(a.val, 
b.val, m)); } + +OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16) +OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16, epi16) +OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8, epi32) +OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8, epi32) +OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd) + +template +inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b) +{ + enum {M0 = m}; + enum {M1 = (M0 | (M0 << 2)) & 0x33}; + enum {M2 = (M1 | (M1 << 1)) & 0x55}; + enum {MM = M2 | (M2 << 1)}; + return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM)); +} +template +inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b) +{ return v_int64x4(v256_blend(v_uint64x4(a.val), v_uint64x4(b.val)).val); } + +// shuffle +// todo: emluate 64bit +#define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin) \ + template \ + inline _Tpvec v256_shuffle(const _Tpvec& a) \ + { return _Tpvec(_mm256_##intrin(a.val, m)); } + +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8, shuffle_epi32) +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8, shuffle_epi32) +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps) +OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd) + +template +inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1) +{ + ab0 = v256_unpacklo(a, b); + ab1 = v256_unpackhi(a, b); +} + +template +inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0b11110000)); } + +inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b) +{ return v256_blend<0b11110000>(a, b); } + +inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b) +{ return v256_blend<0b1100>(a, b); } + +template +inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b) +{ return v256_permute2x128<0x21>(a, b); } + +template +inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); } +inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b) +{ return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); } +// todo: emulate float32 + +template +inline _Tpvec v256_swap_halves(const _Tpvec& a) +{ return v256_permute2x128<1>(a, a); } + +template +inline _Tpvec v256_reverse_64(const _Tpvec& a) +{ return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); } + +// ZIP +#define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x20>(a, b); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x31>(a, b); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + _Tpvec a1b0 = v256_alignr_128(a, b); \ + c = v256_combine_diagonal(a, a1b0); \ + d = v256_combine_diagonal(a1b0, b); \ + } \ + inline void v_zip(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& ab0, _Tpvec& ab1) \ + { \ + _Tpvec ab0ab2, ab1ab3; \ + v256_zip(a, b, ab0ab2, ab1ab3); \ + v_recombine(ab0ab2, ab1ab3, ab0, ab1); \ + } + +OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32) +OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32) +OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16) +OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16) +OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8) +OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8) +OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4) +OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4) +OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8) +OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/* Element-wise binary and unary operations 
*/ + +/** Arithmetics **/ +#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { a.val = intrin(a.val, b.val); return a; } + +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64) + +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd) + +inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b, + v_int32x8& c, v_int32x8& d) +{ + v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val)); + + v_int16x16 v0, v1; + v_zip(a * b, vhi, v0, v1); + + c = v_reinterpret_as_s32(v0); + d = v_reinterpret_as_s32(v1); +} + +inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b, + v_uint32x8& c, v_uint32x8& d) +{ + v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val)); + + v_uint16x16 v0, v1; + v_zip(a * b, vhi, v0, v1); + + c = v_reinterpret_as_u32(v0); + d = v_reinterpret_as_u32(v1); +} + +inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b, + v_uint64x4& c, v_uint64x4& d) +{ + __m256i v0 = _mm256_mul_epu32(a.val, b.val); + __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32)); + v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d); +} + + +/** Non-saturating arithmetics **/ +#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8) 
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16) + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ + inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \ + inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + { return _Tpsvec(srai(a.val, imm)); } \ + template \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(srai(a.val, imm)); } + +OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16) +OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8, v_int32x8, epi32, _mm256_srai_epi32) + +inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm) +{ + __m256i d = _mm256_set1_epi64x((int64)1 << 63); + __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm); + return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm)); +} +OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx) + + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); } + +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8, si256, _mm256_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4, si256, _mm256_set1_epi64x(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4, si256, _mm256_set1_epi64x(-1)) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps, _mm256_castsi256_ps(_mm256_set1_epi32(-1))) +OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd, _mm256_castsi256_pd(_mm256_set1_epi32(-1))) + +/** Select **/ +#define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); } + +OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8, epi8) +OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd) + +/** Comparison **/ +#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a == b); } \ + inline _Tpvec operator < (const 
_Tpvec& a, const _Tpvec& b) \ + { return b > a; } \ + inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a < b); } \ + inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ + { return b >= a; } + +#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \ + inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ + { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ + inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + { \ + __m256i smask = _mm256_set1_##suffix(sbit); \ + return _Tpuvec(_mm256_cmpgt_##suffix( \ + _mm256_xor_si256(a.val, smask), \ + _mm256_xor_si256(b.val, smask))); \ + } \ + inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ + inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \ + OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \ + OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec) + +OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32, v_int8x32, epi8, (char)-128) +OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768) +OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000) + +#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \ + inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \ + inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ + { return ~(a == b); } + +OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4) +OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4) + +#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); } + +#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix) + +OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd) + +/** min/max **/ +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32, _mm256_min_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32, _mm256_max_epi8) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16, _mm256_min_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16, _mm256_max_epi16) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8, _mm256_min_epu32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8, _mm256_max_epu32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8, _mm256_min_epi32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8, _mm256_max_epi32) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd) +OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd) + +/** Rotate **/ +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i swap = 
_mm256_permute2x128_si256(a.val, b.val, 0x03); + + switch(imm) + { + case 0: return a; + case 32: return b; + case 16: return v_uint8x32(swap); + } + + if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, 16 - imm)); + if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(swap, b.val, 32 - imm)); + + return v_uint8x32(); +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21); + + switch(imm) + { + case 0: return a; + case 32: return b; + case 16: return v_uint8x32(swap); + } + + if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm)); + if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(b.val, swap, imm - 16)); + + return v_uint8x32(); +} + +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a) +{ + v_uint8x32 res; + // ESAC control[3] ? [127:0] = 0 + __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0)); + + if (imm == 0) + return a; + if (imm == 16) + res.val = swapz; + else if (imm < 16) + res.val = _mm256_alignr_epi8(a.val, swapz, 16 - imm); + else if (imm < 32) + res.val = _mm256_slli_si256(swapz, imm - 16); + else + return v_uint8x32(); + return res; +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a) +{ + v_uint8x32 res; + // ESAC control[3] ? [127:0] = 0 + __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1)); + + if (imm == 0) + return a; + if (imm == 16) + res.val = swapz; + else if (imm < 16) + res.val = _mm256_alignr_epi8(swapz, a.val, imm); + else if (imm < 32) + res.val = _mm256_srli_si256(swapz, imm - 16); + else + return v_uint8x32(); + return res; +} + +#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \ + template \ + inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \ + { \ + const int w = sizeof(typename _Tpvec::lane_type); \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a), \ + v_reinterpret_as_u8(b)); \ + return _Tpvec(cast(ret.val)); \ + } \ + template \ + inline _Tpvec intrin(const _Tpvec& a) \ + { \ + const int w = sizeof(typename _Tpvec::lane_type); \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a)); \ + return _Tpvec(cast(ret.val)); \ + } + +#define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec) \ + OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP) + +OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32) +OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16) +OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16) +OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8) +OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8) +OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4) +OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4) + +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float32x8, _mm256_castsi256_ps) +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps) +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd) +OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd) + +////////// Reduce and mask ///////// + +/** Reduce **/ +#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, _mm_srli_si128(v0, 8)); \ + v0 = intrin(v0, _mm_srli_si128(v0, 4)); \ + v0 = intrin(v0, _mm_srli_si128(v0, 2)); \ + return (sctype) _mm_cvtsi128_si32(v0); \ + } + +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, 
ushort, min, _mm_min_epu16) +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, min, _mm_min_epi16) +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16) +OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, max, _mm_max_epi16) + +#define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, _mm_srli_si128(v0, 8)); \ + v0 = intrin(v0, _mm_srli_si128(v0, 4)); \ + return (sctype) _mm_cvtsi128_si32(v0); \ + } + +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32) +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, min, _mm_min_epi32) +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32) +OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32) + +#define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin) \ + inline float v_reduce_##func(const v_float32x8& a) \ + { \ + __m128 v0 = _v256_extract_low(a.val); \ + __m128 v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \ + v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 3))); \ + return _mm_cvtss_f32(v0); \ + } + +OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps) +OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps) + +inline ushort v_reduce_sum(const v_uint16x16& a) +{ + __m128i a0 = _v256_extract_low(a.val); + __m128i a1 = _v256_extract_high(a.val); + + __m128i s0 = _mm_adds_epu16(a0, a1); + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2)); + + return (ushort)_mm_cvtsi128_si32(s0); +} + +inline short v_reduce_sum(const v_int16x16& a) +{ + __m256i s0 = _mm256_hadds_epi16(a.val, a.val); + s0 = _mm256_hadds_epi16(s0, s0); + s0 = _mm256_hadds_epi16(s0, s0); + + __m128i s1 = _v256_extract_high(s0); + s1 = _mm_adds_epi16(_v256_extract_low(s0), s1); + + return (short)_mm_cvtsi128_si32(s1); +} + +inline int v_reduce_sum(const v_int32x8& a) +{ + __m256i s0 = _mm256_hadd_epi32(a.val, a.val); + s0 = _mm256_hadd_epi32(s0, s0); + + __m128i s1 = _v256_extract_high(s0); + s1 = _mm_add_epi32(_v256_extract_low(s0), s1); + + return _mm_cvtsi128_si32(s1); +} + +inline unsigned v_reduce_sum(const v_uint32x8& a) +{ return v_reduce_sum(v_reinterpret_as_s32(a)); } + +inline float v_reduce_sum(const v_float32x8& a) +{ + __m256 s0 = _mm256_hadd_ps(a.val, a.val); + s0 = _mm256_hadd_ps(s0, s0); + + __m128 s1 = _v256_extract_high(s0); + s1 = _mm_add_ps(_v256_extract_low(s0), s1); + + return _mm_cvtss_f32(s1); +} + +inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b, + const v_float32x8& c, const v_float32x8& d) +{ + __m256 ab = _mm256_hadd_ps(a.val, b.val); + __m256 cd = _mm256_hadd_ps(c.val, d.val); + return v_float32x8(_mm256_hadd_ps(ab, cd)); +} + +/** Popcount **/ +#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \ + inline v_uint32x8 v_popcount(const _Tpvec& a) \ + { \ + const v_uint32x8 m1 = v256_setall_u32(0x55555555); \ + const v_uint32x8 m2 = v256_setall_u32(0x33333333); \ + const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \ + v_uint32x8 p = v_reinterpret_as_u32(a); \ + p = ((p >> 1) & m1) + (p & m1); \ + p = ((p >> 2) & m2) + (p & m2); \ + p = ((p >> 4) & m4) + (p & m4); \ + p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \ + return p; \ + } + +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32) 
+OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8) +OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8) + +/** Mask **/ +inline int v_signmask(const v_int8x32& a) +{ return _mm256_movemask_epi8(a.val); } +inline int v_signmask(const v_uint8x32& a) +{ return v_signmask(v_reinterpret_as_s8(a)); } + +inline int v_signmask(const v_int16x16& a) +{ + v_int8x32 v = v_int8x32(_mm256_packs_epi16(a.val, a.val)); + return v_signmask(v) & 255; +} +inline int v_signmask(const v_uint16x16& a) +{ return v_signmask(v_reinterpret_as_s16(a)); } + +inline int v_signmask(const v_int32x8& a) +{ + __m256i a16 = _mm256_packs_epi32(a.val, a.val); + v_int8x32 v = v_int8x32(_mm256_packs_epi16(a16, a16)); + return v_signmask(v) & 15; +} +inline int v_signmask(const v_uint32x8& a) +{ return v_signmask(v_reinterpret_as_s32(a)); } + +inline int v_signmask(const v_float32x8& a) +{ return _mm256_movemask_ps(a.val); } +inline int v_signmask(const v_float64x4& a) +{ return _mm256_movemask_pd(a.val); } + +/** Checks **/ +#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask) \ + inline bool v_check_all(const _Tpvec& a) \ + { \ + int mask = v_signmask(v_reinterpret_as_s8(a)); \ + return and_op(mask, allmask) == allmask; \ + } \ + inline bool v_check_any(const _Tpvec& a) \ + { \ + int mask = v_signmask(v_reinterpret_as_s8(a)); \ + return and_op(mask, allmask) != 0; \ + } + +OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, OPENCV_HAL_1ST, -1) +OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, OPENCV_HAL_1ST, -1) +OPENCV_HAL_IMPL_AVX_CHECK(v_uint16x16, OPENCV_HAL_AND, (int)0xaaaa) +OPENCV_HAL_IMPL_AVX_CHECK(v_int16x16, OPENCV_HAL_AND, (int)0xaaaa) +OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, OPENCV_HAL_AND, (int)0x8888) +OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, OPENCV_HAL_AND, (int)0x8888) + +#define OPENCV_HAL_IMPL_AVX_CHECK_FLT(_Tpvec, allmask) \ + inline bool v_check_all(const _Tpvec& a) \ + { \ + int mask = v_signmask(a); \ + return mask == allmask; \ + } \ + inline bool v_check_any(const _Tpvec& a) \ + { \ + int mask = v_signmask(a); \ + return mask != 0; \ + } + +OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float32x8, 255) +OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15) + + +////////// Other math ///////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, b * b); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, b*b)); } + +OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps) +OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd) + +inline v_float32x8 v_invsqrt(const v_float32x8& x) +{ + v_float32x8 half = x * v256_setall_f32(0.5); + v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val)); + // todo: _mm256_fnmsub_ps + t *= v256_setall_f32(1.5) - ((t * t) * half); + return t; +} + +inline v_float64x4 v_invsqrt(const v_float64x4& x) +{ + return v256_setall_f64(1.) 
/ v_sqrt(x); +} + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) \ + inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \ + { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); } + +OPENCV_HAL_IMPL_AVX_ABS(int8x32, epi8) +OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16) +OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32) + +inline v_float32x8 v_abs(const v_float32x8& x) +{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); } +inline v_float64x4 v_abs(const v_float64x4& x) +{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); } + +/** Absolute difference **/ +inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b) +{ + v_int8x32 d = v_sub_wrap(a, b); + v_int8x32 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} + +inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } + +inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 d = a - b; + v_int32x8 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); +} + +inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) +{ return v_abs(a - b); } + +inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) +{ return v_abs(a - b); } + +////////// Conversions ///////// + +/** Rounding **/ +inline v_int32x8 v_round(const v_float32x8& a) +{ return v_int32x8(_mm256_cvtps_epi32(a.val)); } + +inline v_int32x8 v_round(const v_float64x4& a) +{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); } + +inline v_int32x8 v_trunc(const v_float32x8& a) +{ return v_int32x8(_mm256_cvttps_epi32(a.val)); } + +inline v_int32x8 v_trunc(const v_float64x4& a) +{ return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); } + +inline v_int32x8 v_floor(const v_float32x8& a) +{ return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); } + +inline v_int32x8 v_floor(const v_float64x4& a) +{ return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); } + +inline v_int32x8 v_ceil(const v_float32x8& a) +{ return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); } + +inline v_int32x8 v_ceil(const v_float64x4& a) +{ return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); } + +/** To float **/ +inline v_float32x8 v_cvt_f32(const v_int32x8& a) +{ return v_float32x8(_mm256_cvtepi32_ps(a.val)); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a) +{ return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b) +{ + __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val); + return v_float32x8(_mm256_insertf128_ps(_mm256_castps128_ps256(af), bf, 1)); +} + +inline v_float64x4 v_cvt_f64(const v_int32x8& a) +{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); } + +inline v_float64x4 v_cvt_f64_high(const v_int32x8& a) +{ return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); } + +inline v_float64x4 v_cvt_f64(const v_float32x8& a) +{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); } + +inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) +{ return 
v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); } + +#if CV_FP16 +inline v_float32x8 v_cvt_f32(const v_float16x16& a) +{ return v_float32x8(_mm256_cvtph_ps(_v256_extract_low(a.val))); } + +inline v_float32x8 v_cvt_f32_high(const v_float16x16& a) +{ return v_float32x8(_mm256_cvtph_ps(_v256_extract_high(a.val))); } + +inline v_float16x16 v_cvt_f16(const v_float32x8& a, const v_float32x8& b) +{ + __m128i ah = _mm256_cvtps_ph(a.val, 0), bh = _mm256_cvtps_ph(b.val, 0); + return v_float16x16(_mm256_inserti128_si256(_mm256_castsi128_si256(ah), bh, 1)); +} +#endif + +////////////// Lookup table access //////////////////// + +inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + return v_int32x8(_mm256_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +} + +inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + return v_float32x8(_mm256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +} + +inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + return v_float64x4(_mm256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y) +{ + int CV_DECL_ALIGNED(32) idx[8]; + v_store_aligned(idx, idxvec); + __m128 z = _mm_setzero_ps(); + __m128 xy01, xy45, xy23, xy67; + xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0])); + xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1])); + xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4])); + xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5])); + __m256 xy0145 = _v256_combine(xy01, xy45); + xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2])); + xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3])); + xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6])); + xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7])); + __m256 xy2367 = _v256_combine(xy23, xy67); + + __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367); + __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367); + + x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367)); + y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_low(idx, idxvec); + __m128d xy0 = _mm_loadu_pd(tab + idx[0]); + __m128d xy2 = _mm_loadu_pd(tab + idx[2]); + __m128d xy1 = _mm_loadu_pd(tab + idx[1]); + __m128d xy3 = _mm_loadu_pd(tab + idx[3]); + __m256d xy02 = _v256_combine(xy0, xy2); + __m256d xy13 = _v256_combine(xy1, xy3); + + x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13)); + y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13)); +} + +////////// Matrix operations ///////// + +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) +{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); } + +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) +{ return v_dotprod(a, b) + c; } + +#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \ + v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) + +inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const 
v_float32x8& m3) +{ + v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0); + v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1); + v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2); + v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); +} + +inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const v_float32x8& a) +{ + v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0); + v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1); + v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a))); +} + +#define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val)); \ + __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val)); \ + __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val)); \ + __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val)); \ + b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1)); \ + b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1)); \ + b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3)); \ + b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3)); \ + } + +OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps) + +//////////////// Value reordering /////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(_v256_extract_low(a.val)); \ + b1.val = intrin(_v256_extract_high(a.val)); \ + } \ + inline _Tpwvec v256_load_expand(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadu_si128((const __m128i*)ptr); \ + return _Tpwvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32, v_uint16x16, uchar, _mm256_cvtepu8_epi16) +OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32, v_int16x16, schar, _mm256_cvtepi8_epi16) +OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8, ushort, _mm256_cvtepu16_epi32) +OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16, v_int32x8, short, _mm256_cvtepi16_epi32) +OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64) +OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64) + +#define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v256_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \ + return _Tpvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32) +OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8, schar, _mm256_cvtepi8_epi32) + +/* pack */ +// 16 +inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b) +{ return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); } + +inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b) +{ return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); } + +inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b) +{ return v_pack(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); } + +inline void v_pack_store(schar* ptr, const v_int16x16& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline 
void v_pack_store(uchar* ptr, const v_uint16x16& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_u_store(uchar* ptr, const v_int16x16& a) +{ v_store_low(ptr, v_pack_u(a, a)); } + +template inline +v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b) +{ + // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. + v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); + return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), + v_reinterpret_as_s16((b + delta) >> n)); +} + +template inline +void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a) +{ + v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); + v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); +} + +template inline +v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + return v_pack_u((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + v_pack_u_store(ptr, (a + delta) >> n); +} + +template inline +v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(schar* ptr, const v_int16x16& a) +{ + v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); + v_pack_store(ptr, (a + delta) >> n); +} + +// 32 +inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b) +{ return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); } + +inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b) +{ return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); } + +inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b) +{ return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); } + +inline void v_pack_store(short* ptr, const v_int32x8& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_store(ushort* ptr, const v_uint32x8& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_u_store(ushort* ptr, const v_int32x8& a) +{ v_store_low(ptr, v_pack_u(a, a)); } + + +template inline +v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b) +{ + // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers. 
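    // In other words, each output lane is saturate_cast<ushort>((x + (1 << (n-1))) >> n):
    // a right shift by n that rounds to nearest, followed by the saturating pack below.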
+ v_uint32x8 delta = v256_setall_u32(1 << (n-1)); + return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), + v_reinterpret_as_s32((b + delta) >> n)); +} + +template inline +void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a) +{ + v_uint32x8 delta = v256_setall_u32(1 << (n-1)); + v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); +} + +template inline +v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + return v_pack_u((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + v_pack_u_store(ptr, (a + delta) >> n); +} + +template inline +v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(short* ptr, const v_int32x8& a) +{ + v_int32x8 delta = v256_setall_s32(1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +// 64 +// Non-saturating pack +inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b) +{ + __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0)); + __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0)); + __m256i ab = _mm256_unpacklo_epi64(a0, b0); // a0, a1, b0, b1, a2, a3, b2, b3 + return v_uint32x8(_v256_shuffle_odd_64(ab)); +} + +inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b) +{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); } + +inline void v_pack_store(unsigned* ptr, const v_uint64x4& a) +{ + __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0)); + v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0))); +} + +inline void v_pack_store(int* ptr, const v_int64x4& b) +{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); } + +template inline +v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b) +{ + v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a) +{ + v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +template inline +v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) +{ + v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); + return v_pack((a + delta) >> n, (b + delta) >> n); +} + +template inline +void v_rshr_pack_store(int* ptr, const v_int64x4& a) +{ + v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); + v_pack_store(ptr, (a + delta) >> n); +} + +/* Recombine */ +// its up there with load and store operations + +/* Extract */ +#define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec) \ + template \ + inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ + { return v_rotate_right(a, b); } + +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8) +OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4) + + +/** Reinterpret **/ +// its up there with load and store operations + +/* de&interleave */ +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \ + inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& 
a, _Tpvec& b) \ + { return v256_load_deinterleave_##suffix(ptr, a, b); } \ + inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \ + { return v256_store_interleave_2ch(ptr, a, b); } + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \ + inline void v_load_deinterleave \ + (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) \ + { return v256_load_deinterleave_##suffix(ptr, a, b, c); } \ + inline void v_store_interleave \ + (_Tp* ptr, const _Tpvec& a,const _Tpvec& b, const _Tpvec& c) \ + { return v256_store_interleave_##suffix(ptr, a, b, c); } + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) \ + inline void v_load_deinterleave \ + (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) \ + { return v256_load_deinterleave_##suffix(ptr, a, b, c, d); } \ + inline void v_store_interleave \ + (_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) \ + { return v256_store_interleave_##suffix(ptr, a, b, c, d); } + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) + +#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \ + OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) + +/* **** */ +// +template +inline void v256_store_interleave_2ch(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) +{ + _Tpvec ab0, ab1; + v_zip(a, b, ab0, ab1); + v_store(ptr, ab0); + v_store(ptr + _Tpvec::nlanes, ab1); +} + +template +inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + _Tpvec ab0 = v256_load(ptr); + _Tpvec ab1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec ab00, ab11; + v_recombine(ab0, ab1, ab00, ab11); + v256_zip(ab00, ab11, a, b); +} + +/// +template +inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) +{ + _Tpvec abc0 = v256_load(ptr); + _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec abc2 = v256_load(ptr + _Tpvec::nlanes * 2); + + _Tpvec ab0 = v256_combine_diagonal(abc0, abc1); + _Tpvec bc1 = v256_combine_diagonal(abc1, abc2); + _Tpvec ac1 = v256_reverse_64(v256_combine_diagonal(abc2, abc0)); + + a = v256_unpacklo(ab0, ac1); + c = v256_unpackhi(ac1, bc1); + b = v256_alignr_64(bc1, ab0); +} + + +template +inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) +{ + _Tpvec ab0 = v256_unpacklo(a, b); + _Tpvec bc1 = v256_unpackhi(b, c); + _Tpvec ca10 = v256_swap_halves(v256_blend<0b1010>(c, a)); + + v_store(ptr, v256_combine_diagonal(ab0, ca10)); + v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0)); + v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ca10, bc1)); +} + +//// +template +inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + _Tpvec abcd0 = v256_load(ptr); + _Tpvec abcd1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec abcd2 = v256_load(ptr + _Tpvec::nlanes * 2); + _Tpvec abcd3 = v256_load(ptr + _Tpvec::nlanes * 3); + + _Tpvec cd0ab0 = v256_alignr_128(abcd0, abcd2); + _Tpvec cd1ab1 = v256_alignr_128(abcd1, abcd3); + + _Tpvec ab0 = v256_combine_diagonal(abcd0, cd0ab0); + _Tpvec ab1 = v256_combine_diagonal(abcd1, cd1ab1); + _Tpvec cd0 = v256_combine_diagonal(cd0ab0, abcd2); + _Tpvec cd1 = v256_combine_diagonal(cd1ab1, abcd3); + + v256_zip(ab0, ab1, a, b); + v256_zip(cd0, cd1, c, d); +} + +template +inline void v256_store_interleave_l4(_Tp* 
ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ + _Tpvec ab0, ab1, cd0, cd1; + v256_zip(a, b, ab0, ab1); + v256_zip(c, d, cd0, cd1); + + _Tpvec ab0cd0 = v256_alignr_128(ab0, cd0); + _Tpvec ab1cd1 = v256_alignr_128(ab1, cd1); + + v_store(ptr, v256_combine_diagonal(ab0, ab0cd0)); + v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(ab1, ab1cd1)); + v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ab0cd0, cd0)); + v_store(ptr + _Tpvec::nlanes * 3, v256_combine_diagonal(ab1cd1, cd1)); +} + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint64x4, uint64, l4) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int64x4, int64, l4) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float64x4, double, l4) + +/* **** **** */ +// +inline void v256_load_deinterleave_l8(const float* ptr, v_float32x8& a, v_float32x8& b) +{ + v_float32x8 ab0 = v256_load(ptr); + v_float32x8 ab1 = v256_load(ptr + 8); + + v_float32x8 ab0ab2, ab1ab3; + v_recombine(ab0, ab1, ab0ab2, ab1ab3); + + a.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(2, 0, 2, 0)); + b.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(3, 1, 3, 1)); +} + +template +inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + v_float32x8 fa, fb; + v256_load_deinterleave_l8((float*)ptr, fa, fb); + a.val = v_reinterpret_as_u32(fa).val; + b.val = v_reinterpret_as_u32(fb).val; +} +/// +template +inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) +{ + _Tpvec ab0, ab1, bc0, bc1; + v256_zip(a, b, ab0, ab1); + v256_zip(b, c, bc0, bc1); + + _Tpvec cazg = v256_blend<0b10101010>(c, a); + _Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val)); + _Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val)); + _Tpvec abc2abc0 = v256_reverse_64(v256_blend<0b11001100>(ab1, bc0)); + + _Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0); + _Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1); + _Tpvec abc2 = v256_combine_diagonal(abc2abc0, abc1abc2); + + v_store(ptr, abc0); + v_store(ptr + _Tpvec::nlanes, abc1); + v_store(ptr + _Tpvec::nlanes * 2, abc2); +} + +inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_float32x8& b, const v_float32x8& c) +{ + v_float32x8 ab0, ab1, bc0, bc1; + v256_zip(a, b, ab0, ab1); + v256_zip(b, c, bc0, bc1); + + v_float32x8 cazg = v256_blend<0b10101010>(c, a); + v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0))); + v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2))); + + v_float32x8 abc0abc2(_mm256_shuffle_ps(bc0.val, ab1.val, _MM_SHUFFLE(1, 0, 3, 2))); + v_float32x8 abc2abc0 = v256_swap_halves(abc0abc2); + + v_float32x8 abc0 = v256_combine_diagonal(abc0abc1, abc2abc0); + v_float32x8 abc1 = v256_combine_diagonal(abc1abc2, abc0abc1); + v_float32x8 abc2 = v256_combine_diagonal(abc2abc0, abc1abc2); + + v_store(ptr, abc0); + v_store(ptr + 8, abc1); + v_store(ptr + 16, abc2); +} + +template +inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) +{ + _Tpvec abc02 = v256_load(ptr); + _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes); + _Tpvec abc20 = v256_load(ptr + _Tpvec::nlanes * 2); + + _Tpvec abc2 = v256_alignr_128(abc02, abc20); + _Tpvec abc0 = v256_combine_diagonal(abc02, abc20); + + a = v256_blend<0b10010010>(abc0, abc1); + a = v256_blend<0b01000100>(a, abc2); + + b = v256_blend<0b00100100>(abc0, abc1); + b = v256_blend<0b10011001>(b, abc2); + + c = v256_blend<0b01001001>(abc0, abc1); + c = v256_blend<0b00100010>(c, abc2); + 
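    // The blends above gather each channel's elements from the three loads; within each
    // 128-bit lane they come out rotated, and the per-lane shuffles below restore the
    // ascending element order.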
+ a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a); + b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b); + c = v256_shuffle<_MM_SHUFFLE(3, 0, 1, 2)>(c); +} +///// +template +inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + _Tpvec ab0, ab1, cd0, cd1; + v256_load_deinterleave_l4(ptr, ab0, cd0, ab1, cd1); + v256_zip(ab0, ab1, a, b); + v256_zip(cd0, cd1, c, d); +} + +template +inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ + _Tpvec ac0, ac1, bd0, bd1; + v256_zip(a, c, ac0, ac1); + v256_zip(b, d, bd0, bd1); + + _Tpvec abcd0, abcd1, abcd2, abcd3; + v256_zip(ac0, bd0, abcd0, abcd1); + v256_zip(ac1, bd1, abcd2, abcd3); + + _Tpvec abcd01, abcd23, abcd45, abcd67; + v_recombine(abcd0, abcd1, abcd01, abcd45); + v_recombine(abcd2, abcd3, abcd23, abcd67); + + v_store(ptr, abcd01); + v_store(ptr + _Tpvec::nlanes, abcd23); + v_store(ptr + _Tpvec::nlanes * 2, abcd45); + v_store(ptr + _Tpvec::nlanes * 3, abcd67); +} + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint32x8, unsigned, l8) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int32x8, int, l8) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float32x8, float, l8) + +/* ******** ******** */ +// +template +inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + const __m256i sep = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + ); + + _Tpvec ab0, ab1; + v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1); + + __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep); + __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep); + + a.val = _mm256_unpacklo_epi64(a0b0, a1b1); + b.val = _mm256_unpackhi_epi64(a0b0, a1b1); +} +/// +template +inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) +{ + v_uint32x8 ab0 = v_reinterpret_as_u32(v256_unpacklo(a, b)); + v_uint32x8 ab1 = v_reinterpret_as_u32(v256_unpackhi(a, b)); + v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c)); + v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c)); + + v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0b10101010>(c, a)); + cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg); + + v_uint32x8 ac1ab1 = v256_blend<0b10101010>(ab1, bc1); + ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1); + + v_uint32x8 abc001 = v256_blend<0b10101010>(ab0, cazg); + v_uint32x8 cabc0 = v256_blend<0b10101010>(cazg, bc0); + + v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1); + v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001); + + v_uint64x4 abc01 = v256_unpacklo(v_reinterpret_as_u64(abc001), v_reinterpret_as_u64(bcab0)); + v_uint64x4 abc21 = v256_unpackhi(v_reinterpret_as_u64(cabc0), v_reinterpret_as_u64(bcab0)); + abc21 = v256_swap_halves(abc21); + v_uint64x4 abc12 = v_reinterpret_as_u64(v256_alignr_64(cabc1, ac1ab1)); + + v_uint64x4 abc0 = v256_combine_diagonal(abc01, abc21); + v_uint64x4 abc1 = v256_combine_diagonal(abc12, abc01); + v_uint64x4 abc2 = v256_combine_diagonal(abc21, abc12); + + v_store(ptr, _Tpvec(abc0.val)); + v_store(ptr + _Tpvec::nlanes, _Tpvec(abc1.val)); + v_store(ptr + _Tpvec::nlanes * 2, _Tpvec(abc2.val)); +} +// todo: +template +inline void v256_load_deinterleave_l16(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&) +{} +//// +template +inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + _Tpvec ab0, ab1, cd0, cd1; + v256_load_deinterleave_l8(ptr, ab0, cd0, ab1, cd1); + v256_zip(ab0, ab1, 
a, b); + v256_zip(cd0, cd1, c, d); +} + +template +inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ v256_store_interleave_l8(ptr, a, b, c, d); } + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint16x16, ushort, l16) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int16x16, short, l16) + +/* **************** **************** */ +// +template +inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +{ + const __m256i sep = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + + _Tpvec ab0, ab1; + v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1); + + __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep); + __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep); + + a.val = _mm256_unpacklo_epi64(a0b0, a1b1); + b.val = _mm256_unpackhi_epi64(a0b0, a1b1); +} + +/// todo +template +inline void v256_store_interleave_l32(_Tp*, const _Tpvec&, const _Tpvec&, const _Tpvec&) +{} +template +inline void v256_load_deinterleave_l32(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&) +{} +//// +template +inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +{ + const __m256i sep = _mm256_setr_epi8( + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + ); + + _Tpvec abcd0, abcd1, abcd2, abcd3; + v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes * 2), abcd0, abcd1); + v_recombine(v256_load(ptr + _Tpvec::nlanes), v256_load(ptr + _Tpvec::nlanes * 3), abcd2, abcd3); + + __m256i ab0cd0 = _mm256_shuffle_epi8(abcd0.val, sep); + __m256i ab1cd1 = _mm256_shuffle_epi8(abcd1.val, sep); + __m256i ab2cd2 = _mm256_shuffle_epi8(abcd2.val, sep); + __m256i ab3cd3 = _mm256_shuffle_epi8(abcd3.val, sep); + + __m256i ab0 = _mm256_unpacklo_epi32(ab0cd0, ab1cd1); + __m256i ab1 = _mm256_unpacklo_epi32(ab2cd2, ab3cd3); + __m256i cd0 = _mm256_unpackhi_epi32(ab0cd0, ab1cd1); + __m256i cd1 = _mm256_unpackhi_epi32(ab2cd2, ab3cd3); + + a.val = _mm256_unpacklo_epi64(ab0, ab1); + b.val = _mm256_unpackhi_epi64(ab0, ab1); + c.val = _mm256_unpacklo_epi64(cd0, cd1); + d.val = _mm256_unpackhi_epi64(cd0, cd1); +} + +template +inline void v256_store_interleave_l32(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +{ v256_store_interleave_l8(ptr, a, b, c, d); } + +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint8x32, uchar, l32) +OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int8x32, schar, l32) + +inline void v256_cleanup() { _mm256_zeroupper(); } + +//! @name Check SIMD256 support +//! @{ +//! @brief Check CPU capability of SIMD operation +static inline bool hasSIMD256() +{ + return (CV_CPU_HAS_SUPPORT_AVX2) ? true : false; +} +//! @} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_AVX_HPP diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index c7cbb578db..1f5f53100a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -247,8 +247,6 @@ template struct v_reg { //! 
@cond IGNORED typedef _Tp lane_type; - typedef v_reg::int_type, n> int_vec; - typedef v_reg::abs_type, n> abs_vec; enum { nlanes = n }; // !@endcond @@ -797,11 +795,11 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> /** @brief Multiply and add -Returns \f$ a*b + c \f$ -For floating point types and signed 32bit int only. */ + Returns \f$ a*b + c \f$ + For floating point types and signed 32bit int only. */ template -inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, - const v_reg<_Tp, n>& c) +inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<_Tp, n>& c) { v_reg<_Tp, n> d; for( int i = 0; i < n; i++ ) @@ -809,6 +807,14 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, return d; } +/** @brief A synonym for v_fma */ +template +inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<_Tp, n>& c) +{ + return v_fma(a, b, c); +} + /** @brief Dot product of elements Multiply values in two registers and sum adjacent result pairs. @@ -1141,9 +1147,9 @@ template inline void v_zip( const v_reg<_Tp, n>& a0, const @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc. */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr) { - return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr); + return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); } /** @brief Load register contents from memory (aligned) @@ -1151,9 +1157,9 @@ inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr) similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary) */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr) { - return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr); + return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); } /** @brief Load 64-bits of data to lower part (high part is undefined). @@ -1166,9 +1172,9 @@ v_int32x4 r = v_load_low(lo); @endcode */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_low(const _Tp* ptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr) { - v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c; + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; for( int i = 0; i < c.nlanes/2; i++ ) { c.s[i] = ptr[i]; @@ -1187,9 +1193,9 @@ v_int32x4 r = v_load_halves(lo, hi); @endcode */ template -inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr) +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr) { - v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c; + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; for( int i = 0; i < c.nlanes/2; i++ ) { c.s[i] = loptr[i]; @@ -1208,11 +1214,11 @@ v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32 @endcode For 8-, 16-, 32-bit integer source types. 
*/ template -inline v_reg::w_type, V_SIMD128Traits<_Tp>::nlanes / 2> +inline v_reg::w_type, V_TypeTraits<_Tp>::nlanes128 / 2> v_load_expand(const _Tp* ptr) { typedef typename V_TypeTraits<_Tp>::w_type w_type; - v_reg::nlanes> c; + v_reg::nlanes128> c; for( int i = 0; i < c.nlanes; i++ ) { c.s[i] = ptr[i]; @@ -1229,11 +1235,11 @@ v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32 @endcode For 8-bit integer source types. */ template -inline v_reg::q_type, V_SIMD128Traits<_Tp>::nlanes / 4> +inline v_reg::q_type, V_TypeTraits<_Tp>::nlanes128 / 4> v_load_expand_q(const _Tp* ptr) { typedef typename V_TypeTraits<_Tp>::q_type q_type; - v_reg::nlanes> c; + v_reg::nlanes128> c; for( int i = 0; i < c.nlanes; i++ ) { c.s[i] = ptr[i]; @@ -1622,6 +1628,17 @@ template inline v_reg v_cvt_f32(const v_reg& a) return c; } +template inline v_reg v_cvt_f32(const v_reg& a, const v_reg& b) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = (float)a.s[i]; + c.s[i+n] = (float)b.s[i]; + } + return c; +} + /** @brief Convert to double Supported input type is cv::v_int32x4. */ @@ -1644,6 +1661,52 @@ template inline v_reg v_cvt_f64(const v_reg& a) return c; } +template inline v_reg v_lut(const int* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline v_reg v_lut(const float* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline v_reg v_lut(const double* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline void v_lut_deinterleave(const float* tab, const v_reg& idx, + v_reg& x, v_reg& y) +{ + for( int i = 0; i < n; i++ ) + { + int j = idx.s[i]; + x.s[i] = tab[j]; + y.s[i] = tab[j+1]; + } +} + +template inline void v_lut_deinterleave(const double* tab, const v_reg& idx, + v_reg& x, v_reg& y) +{ + for( int i = 0; i < n; i++ ) + { + int j = idx.s[i]; + x.s[i] = tab[j]; + y.s[i] = tab[j+1]; + } +} + /** @brief Transpose 4x4 matrix Scheme: @@ -1968,6 +2031,8 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]); } +inline void v_cleanup() {} + //! @} //! 
@name Check SIMD support diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 9dadab57ea..fdb3ec09cb 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -280,11 +280,29 @@ struct v_float64x2 #if CV_FP16 // Workaround for old compilers -template static inline int16x4_t vreinterpret_s16_f16(T a) -{ return (int16x4_t)a; } -template static inline float16x4_t vreinterpret_f16_s16(T a) -{ return (float16x4_t)a; } -template static inline float16x4_t cv_vld1_f16(const T* ptr) +static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; } +static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; } +static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; } +static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; } + +static inline float16x8_t cv_vld1q_f16(const void* ptr) +{ +#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro + return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr)); +#else + return vld1q_f16((const __fp16*)ptr); +#endif +} +static inline void cv_vst1q_f16(void* ptr, float16x8_t a) +{ +#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro + vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a)); +#else + vst1q_f16((__fp16*)ptr, a); +#endif +} + +static inline float16x4_t cv_vld1_f16(const void* ptr) { #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro return vreinterpret_f16_s16(vld1_s16((const short*)ptr)); @@ -292,7 +310,7 @@ template static inline float16x4_t cv_vld1_f16(const T* ptr) return vld1_f16((const __fp16*)ptr); #endif } -template static inline void cv_vst1_f16(T* ptr, float16x4_t a) +static inline void cv_vst1_f16(void* ptr, float16x4_t a) { #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro vst1_s16((short*)ptr, vreinterpret_s16_f16(a)); @@ -301,24 +319,28 @@ template static inline void cv_vst1_f16(T* ptr, float16x4_t a) #endif } -struct v_float16x4 + +struct v_float16x8 { typedef short lane_type; - enum { nlanes = 4 }; + enum { nlanes = 8 }; - v_float16x4() {} - explicit v_float16x4(float16x4_t v) : val(v) {} - v_float16x4(short v0, short v1, short v2, short v3) + v_float16x8() {} + explicit v_float16x8(float16x8_t v) : val(v) {} + v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) { - short v[] = {v0, v1, v2, v3}; - val = cv_vld1_f16(v); + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = cv_vld1q_f16(v); } short get0() const { - return vget_lane_s16(vreinterpret_s16_f16(val), 0); + return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0); } - float16x4_t val; + float16x8_t val; }; + +inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); } +inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); } #endif #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ @@ -731,16 +753,32 @@ inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); } -inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) { +#if CV_SIMD128_64F + // ARMv8, which adds support for 64-bit floating-point (so CV_SIMD128_64F is defined), + // also adds FMA 
support both for single- and double-precision floating-point vectors + return v_float32x4(vfmaq_f32(c.val, a.val, b.val)); +#else return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); +#endif } -inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { return v_int32x4(vmlaq_s32(c.val, a.val, b.val)); } +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + #if CV_SIMD128_64F inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) { @@ -753,9 +791,14 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) return v_float64x2(vaddq_f64(vmulq_f64(a.val, a.val), vmulq_f64(b.val, b.val))); } +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_float64x2(vfmaq_f64(c.val, a.val, b.val)); +} + inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) { - return v_float64x2(vaddq_f64(c.val, vmulq_f64(a.val, b.val))); + return v_fma(a, b, c); } #endif @@ -841,10 +884,15 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64) #if CV_FP16 // Workaround for old comiplers -inline v_float16x4 v_load_f16(const short* ptr) -{ return v_float16x4(cv_vld1_f16(ptr)); } -inline void v_store_f16(short* ptr, v_float16x4& a) -{ cv_vst1_f16(ptr, a.val); } +inline v_float16x8 v_load_f16(const short* ptr) +{ return v_float16x8(cv_vld1q_f16(ptr)); } +inline v_float16x8 v_load_f16_aligned(const short* ptr) +{ return v_float16x8(cv_vld1q_f16(ptr)); } + +inline void v_store(short* ptr, const v_float16x8& a) +{ cv_vst1q_f16(ptr, a.val); } +inline void v_store_aligned(short* ptr, const v_float16x8& a) +{ cv_vst1q_f16(ptr, a.val); } #endif #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ @@ -1293,6 +1341,11 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a) return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), zero)); } +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val))); +} + inline v_float64x2 v_cvt_f64(const v_int32x4& a) { return v_float64x2(vcvt_f64_f32(vcvt_f32_s32(vget_low_s32(a.val)))); @@ -1315,17 +1368,88 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) #endif #if CV_FP16 -inline v_float32x4 v_cvt_f32(const v_float16x4& a) +inline v_float32x4 v_cvt_f32(const v_float16x8& a) +{ + return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val))); +} +inline v_float32x4 v_cvt_f32_high(const v_float16x8& a) { - return v_float32x4(vcvt_f32_f16(a.val)); + return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val))); } -inline v_float16x4 v_cvt_f16(const v_float32x4& a) +inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b) { - return v_float16x4(vcvt_f16_f32(a.val)); + return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val))); } #endif +////////////// Lookup table access //////////////////// + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[vgetq_lane_s32(idxvec.val, 0)], + tab[vgetq_lane_s32(idxvec.val, 1)], + tab[vgetq_lane_s32(idxvec.val, 2)], + tab[vgetq_lane_s32(idxvec.val, 3)] + }; + return v_int32x4(vld1q_s32(elems)); +} + +inline 
v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[vgetq_lane_s32(idxvec.val, 0)], + tab[vgetq_lane_s32(idxvec.val, 1)], + tab[vgetq_lane_s32(idxvec.val, 2)], + tab[vgetq_lane_s32(idxvec.val, 3)] + }; + return v_float32x4(vld1q_f32(elems)); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + /*int CV_DECL_ALIGNED(32) idx[4]; + v_store(idx, idxvec); + + float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2])); + float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3])); + + float32x4x2_t xxyy = vuzpq_f32(xy02, xy13); + x = v_float32x4(xxyy.val[0]); + y = v_float32x4(xxyy.val[1]);*/ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); + y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]); +} + +#if CV_SIMD128_64F +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + tab[vgetq_lane_s32(idxvec.val, 0)], + tab[vgetq_lane_s32(idxvec.val, 1)], + }; + return v_float64x2(vld1q_f64(elems)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + x = v_float64x2(tab[idx[0]], tab[idx[1]]); + y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); +} +#endif + +inline void v_cleanup() {} + //! @name Check SIMD support //! @{ //! @brief Check CPU capability of SIMD operation diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 8c61f44f4a..b79ea16a4d 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -58,6 +58,17 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN +struct v_uint8x16; +struct v_int8x16; +struct v_uint16x8; +struct v_int16x8; +struct v_uint32x4; +struct v_int32x4; +struct v_float32x4; +struct v_uint64x2; +struct v_int64x2; +struct v_float64x2; + struct v_uint8x16 { typedef uchar lane_type; @@ -144,6 +155,7 @@ struct v_int16x8 { return (short)_mm_cvtsi128_si32(val); } + __m128i val; }; @@ -163,6 +175,7 @@ struct v_uint32x4 { return (unsigned)_mm_cvtsi128_si32(val); } + __m128i val; }; @@ -182,6 +195,7 @@ struct v_int32x4 { return _mm_cvtsi128_si32(val); } + __m128i val; }; @@ -201,6 +215,7 @@ struct v_float32x4 { return _mm_cvtss_f32(val); } + __m128 val; }; @@ -222,6 +237,7 @@ struct v_uint64x2 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); return (unsigned)a | ((uint64)(unsigned)b << 32); } + __m128i val; }; @@ -243,6 +259,7 @@ struct v_int64x2 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); } + __m128i val; }; @@ -262,29 +279,31 @@ struct v_float64x2 { return _mm_cvtsd_f64(val); } + __m128d val; }; -#if CV_FP16 -struct v_float16x4 +struct v_float16x8 { typedef short lane_type; typedef __m128i vector_type; - enum { nlanes = 4 }; + enum { nlanes = 8 }; - v_float16x4() : val(_mm_setzero_si128()) {} - explicit v_float16x4(__m128i v) : val(v) {} - v_float16x4(short v0, short v1, short v2, short v3) + v_float16x8() : val(_mm_setzero_si128()) {} + explicit v_float16x8(__m128i v) : val(v) {} + v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) { - val = _mm_setr_epi16(v0, v1, 
v2, v3, 0, 0, 0, 0); + val = _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); } short get0() const { return (short)_mm_cvtsi128_si32(val); } + __m128i val; }; -#endif +inline v_float16x8 v_setzero_f16() { return v_float16x8(_mm_setzero_si128()); } +inline v_float16x8 v_setall_f16(short val) { return v_float16x8(_mm_set1_epi16(val)); } namespace hal_sse_internal { @@ -697,11 +716,15 @@ inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b) } inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b) { +#if CV_SSE4_1 + return v_int32x4(_mm_mullo_epi32(a.val, b.val)); +#else __m128i c0 = _mm_mul_epu32(a.val, b.val); __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); __m128i d0 = _mm_unpacklo_epi32(c0, c1); __m128i d1 = _mm_unpackhi_epi32(c0, c1); return v_int32x4(_mm_unpacklo_epi64(d0, d1)); +#endif } inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b) { @@ -1027,11 +1050,35 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) __m128i m = _mm_cmpgt_epi32(b.val, a.val); return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); } -inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) + +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { return a * b + c; } +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ +#if CV_FMA3 + return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val)); +#else + return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val)); +#endif +} + +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ +#if CV_FMA3 + return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val)); +#else + return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val)); +#endif +} + #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1040,17 +1087,16 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ - return _Tpvec(_mm_sqrt_##suffix(res)); \ + _Tpvec res = v_fma(a, a, b*b); \ + return _Tpvec(_mm_sqrt_##suffix(res.val)); \ } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \ - return _Tpvec(res); \ + return v_fma(a, a, b*b); \ } \ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ { \ - return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \ + return v_fma(a, b, c); \ } OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff)) @@ -1268,12 +1314,15 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps) OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd) -#if CV_FP16 -inline v_float16x4 v_load_f16(const short* ptr) -{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); } -inline void v_store_f16(short* ptr, v_float16x4& a) -{ _mm_storel_epi64((__m128i*)ptr, a.val); } -#endif +inline v_float16x8 v_load_f16(const short* ptr) +{ return v_float16x8(_mm_loadu_si128((const __m128i*)ptr)); } +inline v_float16x8 
v_load_f16_aligned(const short* ptr) +{ return v_float16x8(_mm_load_si128((const __m128i*)ptr)); } + +inline void v_store(short* ptr, const v_float16x8& a) +{ _mm_storeu_si128((__m128i*)ptr, a.val); } +inline void v_store_aligned(short* ptr, const v_float16x8& a) +{ _mm_store_si128((__m128i*)ptr, a.val); } #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \ inline scalartype v_reduce_##func(const v_##_Tpvec& a) \ @@ -2183,6 +2232,11 @@ inline v_float32x4 v_cvt_f32(const v_float64x2& a) return v_float32x4(_mm_cvtpd_ps(a.val)); } +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val))); +} + inline v_float64x2 v_cvt_f64(const v_int32x4& a) { return v_float64x2(_mm_cvtepi32_pd(a.val)); @@ -2200,21 +2254,82 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a) inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { - return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8)))); + return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val))); } #if CV_FP16 -inline v_float32x4 v_cvt_f32(const v_float16x4& a) +inline v_float32x4 v_cvt_f32(const v_float16x8& a) { return v_float32x4(_mm_cvtph_ps(a.val)); } -inline v_float16x4 v_cvt_f16(const v_float32x4& a) +inline v_float32x4 v_cvt_f32_high(const v_float16x8& a) +{ + return v_float32x4(_mm_cvtph_ps(_mm_unpackhi_epi64(a.val, a.val))); +} + +inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b) { - return v_float16x4(_mm_cvtps_ph(a.val, 0)); + return v_float16x8(_mm_unpacklo_epi64(_mm_cvtps_ph(a.val, 0), _mm_cvtps_ph(b.val, 0))); } #endif +////////////// Lookup table access //////////////////// + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + int idx[2]; + v_store_low(idx, idxvec); + return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]])); +} + +// loads pairs from the table and deinterleaves them, e.g. returns: +// x = (tab[idxvec[0], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]), +// y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1]) +// note that the indices are float's indices, not the float-pair indices. +// in theory, this function can be used to implement bilinear interpolation, +// when idxvec are the offsets within the image. 
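// A scalar reference for the contract described above (sketch only; idx, x and y stand for the
// stored index vector and the two output vectors):
//
//   for (int i = 0; i < 4; i++)
//   {
//       x[i] = tab[idx[i]];      // even element of each pair
//       y[i] = tab[idx[i] + 1];  // odd element of each pair
//   }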
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + __m128 z = _mm_setzero_ps(); + __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0])); + __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2])); + xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1])); + xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3])); + __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23); + __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23); + x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13)); + y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int idx[2]; + v_store_low(idx, idxvec); + __m128d xy0 = _mm_loadu_pd(tab + idx[0]); + __m128d xy1 = _mm_loadu_pd(tab + idx[1]); + x = v_float64x2(_mm_unpacklo_pd(xy0, xy1)); + y = v_float64x2(_mm_unpackhi_pd(xy0, xy1)); +} + +inline void v_cleanup() {} + //! @name Check SIMD support //! @{ //! @brief Check CPU capability of SIMD operation diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 8b76dd8487..069e9578eb 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -764,6 +764,8 @@ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \ +inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ +{ return _Tpvec(vec_madd(a.val, b.val, c.val)); } \ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ { return _Tpvec(vec_madd(a.val, b.val, c.val)); } @@ -836,6 +838,9 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a) inline v_float32x4 v_cvt_f32(const v_float64x2& a) { return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); } +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); } + inline v_float64x2 v_cvt_f64(const v_int32x4& a) { return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); } @@ -848,6 +853,48 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a) inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); } +////////////// Lookup table access //////////////////// + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + return v_float64x2(tab[idx[0]], tab[idx[1]]); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); + y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]); +} + 
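// A usage sketch (illustrative only, not part of the patch): the bilinear interpolation hinted
// at in the intrin_sse.hpp comment above can be built from v_lut_deinterleave and v_muladd
// roughly as follows. Here 'img' is a single-channel float image, 'step' its row stride in
// floats, 'ofs' holds the offsets of each top-left sample, and wx/wy are the fractional
// weights; all of these names are hypothetical.
static inline v_float32x4 bilinear_sketch(const float* img, int step, const v_int32x4& ofs,
                                          const v_float32x4& wx, const v_float32x4& wy)
{
    v_float32x4 p00, p01, p10, p11;
    v_lut_deinterleave(img,        ofs, p00, p01);  // top pixel and its right neighbour
    v_lut_deinterleave(img + step, ofs, p10, p11);  // the same pair one row below
    v_float32x4 top = v_muladd(p01 - p00, wx, p00); // lerp along x, top row
    v_float32x4 bot = v_muladd(p11 - p10, wx, p10); // lerp along x, bottom row
    return v_muladd(bot - top, wy, top);            // lerp along y
}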
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + x = v_float64x2(tab[idx[0]], tab[idx[1]]); + y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); +} + +inline void v_cleanup() {} + + /** Reinterpret **/ /** its up there with load and store operations **/ diff --git a/modules/core/src/convert.fp16.cpp b/modules/core/src/convert.fp16.cpp index 6c71093e57..7168e8d643 100644 --- a/modules/core/src/convert.fp16.cpp +++ b/modules/core/src/convert.fp16.cpp @@ -81,10 +81,9 @@ void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth) { float32x4_t v_src = vld1q_f32(src + x); - float16x4_t v_dst = vcvt_f16_f32(v_src); - cv_vst1_f16((__fp16*)dst + x, v_dst); + cv_vst1_f16(dst + x, v_dst); } for ( ; x < size.width; x++ ) diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp index b10bab6d63..354cc00421 100644 --- a/modules/core/src/mathfuncs_core.simd.hpp +++ b/modules/core/src/mathfuncs_core.simd.hpp @@ -22,7 +22,6 @@ void log32f(const float *src, float *dst, int n); void log64f(const double *src, double *dst, int n); float fastAtan2(float y, float x); - #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY using namespace std; @@ -36,162 +35,140 @@ static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI); using namespace cv; -#if CV_SIMD128 - -template -struct v_atan +static inline float atan_f32(float y, float x) { - typedef V_RegTrait128 Trait; - typedef typename Trait::reg VT; // vector type - enum { WorkWidth = VT::nlanes * 2 }; - - v_atan(const T & scale) - : s(Trait::all(scale)) + float ax = std::abs(x), ay = std::abs(y); + float a, c, c2; + if( ax >= ay ) { - eps = Trait::all(DBL_EPSILON); - z = Trait::zero(); - p7 = Trait::all(atan2_p7); - p5 = Trait::all(atan2_p5); - p3 = Trait::all(atan2_p3); - p1 = Trait::all(atan2_p1); - val90 = Trait::all(90.f); - val180 = Trait::all(180.f); - val360 = Trait::all(360.f); + c = ay/(ax + (float)DBL_EPSILON); + c2 = c*c; + a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; } + else + { + c = ax/(ay + (float)DBL_EPSILON); + c2 = c*c; + a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; + } + if( x < 0 ) + a = 180.f - a; + if( y < 0 ) + a = 360.f - a; + return a; +} - inline int operator()(int len, const T * Y, const T * X, T * angle) +#if CV_SIMD + +struct v_atan_f32 +{ + explicit v_atan_f32(const float& scale) { - int i = 0; - const int c = VT::nlanes; - for ( ; i <= len - c * 2; i += c * 2) - { - VT x1 = v_load(X + i); - VT x2 = v_load(X + i + c); - VT y1 = v_load(Y + i); - VT y2 = v_load(Y + i + c); - v_store(&angle[i], s * one(x1, y1)); - v_store(&angle[i + c], s * one(x2, y2)); - } - return i; + eps = vx_setall_f32((float)DBL_EPSILON); + z = vx_setzero_f32(); + p7 = vx_setall_f32(atan2_p7); + p5 = vx_setall_f32(atan2_p5); + p3 = vx_setall_f32(atan2_p3); + p1 = vx_setall_f32(atan2_p1); + val90 = vx_setall_f32(90.f); + val180 = vx_setall_f32(180.f); + val360 = vx_setall_f32(360.f); + s = vx_setall_f32(scale); } -private: - inline VT one(VT & x, VT & y) + v_float32 compute(const v_float32& y, const v_float32& x) { - VT ax = v_abs(x); - VT ay = v_abs(y); - VT c = v_min(ax, ay) / (v_max(ax, ay) + eps); - VT cc = c * c; - VT a = (((p7 * cc + p5) * cc + p3) * cc + p1) * c; + v_float32 ax = v_abs(x); + v_float32 ay = v_abs(y); + v_float32 c = v_min(ax, ay) / (v_max(ax, ay) + 
eps); + v_float32 cc = c * c; + v_float32 a = v_fma(v_fma(v_fma(cc, p7, p5), cc, p3), cc, p1)*c; a = v_select(ax >= ay, a, val90 - a); a = v_select(x < z, val180 - a, a); a = v_select(y < z, val360 - a, a); - return a; + return a * s; } -private: - VT eps; - VT z; - VT p7; - VT p5; - VT p3; - VT p1; - VT val90; - VT val180; - VT val360; - VT s; + v_float32 eps; + v_float32 z; + v_float32 p7; + v_float32 p5; + v_float32 p3; + v_float32 p1; + v_float32 val90; + v_float32 val180; + v_float32 val360; + v_float32 s; }; -#if !CV_SIMD128_64F +#endif + +} // anonymous:: + +///////////////////////////////////// ATAN2 //////////////////////////////////// -// emulation -template <> -struct v_atan +static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) { - v_atan(double scale) : impl(static_cast(scale)) {} - inline int operator()(int len, const double * Y, const double * X, double * angle) + float scale = angleInDegrees ? 1.f : (float)(CV_PI/180); + int i = 0; +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + v_atan_f32 v(scale); + + for( ; i < len; i += VECSZ*2 ) { - int i = 0; - const int c = v_atan::WorkWidth; - float bufY[c]; - float bufX[c]; - float bufA[c]; - for ( ; i <= len - c ; i += c) + if( i + VECSZ*2 > len ) { - for (int j = 0; j < c; ++j) - { - bufY[j] = static_cast(Y[i + j]); - bufX[j] = static_cast(X[i + j]); - } - impl(c, bufY, bufX, bufA); - for (int j = 0; j < c; ++j) - { - angle[i + j] = bufA[j]; - } + // if it's inplace operation, we cannot repeatedly process + // the tail for the second time, so we have to use the + // scalar code + if( i == 0 || angle == X || angle == Y ) + break; + i = len - VECSZ*2; } - return i; - } -private: - v_atan impl; -}; -#endif - -#endif -template -static inline T atanImpl(T y, T x) -{ - T ax = std::abs(x), ay = std::abs(y); - T a, c, c2; - if( ax >= ay ) - { - c = ay/(ax + static_cast(DBL_EPSILON)); - c2 = c*c; - a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - else - { - c = ax/(ay + static_cast(DBL_EPSILON)); - c2 = c*c; - a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - if( x < 0 ) - a = 180.f - a; - if( y < 0 ) - a = 360.f - a; - return a; -} + v_float32 y0 = vx_load(Y + i); + v_float32 x0 = vx_load(X + i); + v_float32 y1 = vx_load(Y + i + VECSZ); + v_float32 x1 = vx_load(X + i + VECSZ); -template -static inline void atanImpl(const T *Y, const T *X, T *angle, int len, bool angleInDegrees) -{ - int i = 0; - T scale = angleInDegrees ? 
1 : static_cast(CV_PI/180); + v_float32 r0 = v.compute(y0, x0); + v_float32 r1 = v.compute(y1, x1); -#if CV_SIMD128 - i = v_atan(scale)(len, Y, X, angle); + v_store(angle + i, r0); + v_store(angle + i + VECSZ, r1); + } + vx_cleanup(); #endif for( ; i < len; i++ ) - { - angle[i] = atanImpl(Y[i], X[i]) * scale; - } + angle[i] = atan_f32(Y[i], X[i])*scale; } -} // anonymous:: - -///////////////////////////////////// ATAN2 //////////////////////////////////// - void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) { CV_INSTRUMENT_REGION() - atanImpl(Y, X, angle, len, angleInDegrees); + fastAtan32f_(Y, X, angle, len, angleInDegrees ); } void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees) { CV_INSTRUMENT_REGION() - atanImpl(Y, X, angle, len, angleInDegrees); + + const int BLKSZ = 128; + float ybuf[BLKSZ], xbuf[BLKSZ], abuf[BLKSZ]; + for( int i = 0; i < len; i += BLKSZ ) + { + int j, blksz = std::min(BLKSZ, len - i); + for( j = 0; j < blksz; j++ ) + { + ybuf[j] = (float)Y[i + j]; + xbuf[j] = (float)X[i + j]; + } + fastAtan32f_(ybuf, xbuf, abuf, blksz, angleInDegrees); + for( j = 0; j < blksz; j++ ) + angle[i + j] = abuf[j]; + } } // deprecated @@ -207,16 +184,24 @@ void magnitude32f(const float* x, const float* y, float* mag, int len) int i = 0; -#if CV_SIMD128 - for( ; i <= len - 8; i += 8 ) +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float32x4 x0 = v_load(x + i), x1 = v_load(x + i + 4); - v_float32x4 y0 = v_load(y + i), y1 = v_load(y + i + 4); + if( i + VECSZ*2 > len ) + { + if( i == 0 || mag == x || mag == y ) + break; + i = len - VECSZ*2; + } + v_float32 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ); + v_float32 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ); x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); v_store(mag + i, x0); - v_store(mag + i + 4, x1); + v_store(mag + i + VECSZ, x1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -232,16 +217,24 @@ void magnitude64f(const double* x, const double* y, double* mag, int len) int i = 0; -#if CV_SIMD128_64F - for( ; i <= len - 4; i += 4 ) +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float64x2 x0 = v_load(x + i), x1 = v_load(x + i + 2); - v_float64x2 y0 = v_load(y + i), y1 = v_load(y + i + 2); + if( i + VECSZ*2 > len ) + { + if( i == 0 || mag == x || mag == y ) + break; + i = len - VECSZ*2; + } + v_float64 x0 = vx_load(x + i), x1 = vx_load(x + i + VECSZ); + v_float64 y0 = vx_load(y + i), y1 = vx_load(y + i + VECSZ); x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); v_store(mag + i, x0); - v_store(mag + i + 2, x1); + v_store(mag + i + VECSZ, x1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -258,14 +251,22 @@ void invSqrt32f(const float* src, float* dst, int len) int i = 0; -#if CV_SIMD128 - for( ; i <= len - 8; i += 8 ) +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); t0 = v_invsqrt(t0); t1 = v_invsqrt(t1); - v_store(dst + i, t0); v_store(dst + i + 4, t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -276,13 +277,23 @@ void invSqrt32f(const float* src, float* dst, int len) void 
invSqrt64f(const double* src, double* dst, int len) { CV_INSTRUMENT_REGION() - int i = 0; -#if CV_SSE2 - __m128d v_1 = _mm_set1_pd(1.0); - for ( ; i <= len - 2; i += 2) - _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i)))); +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + for ( ; i < len; i += VECSZ*2) + { + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); + t0 = v_invsqrt(t0); + t1 = v_invsqrt(t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); + } #endif for( ; i < len; i++ ) @@ -296,14 +307,22 @@ void sqrt32f(const float* src, float* dst, int len) int i = 0; -#if CV_SIMD128 - for( ; i <= len - 8; i += 8 ) +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float32 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); t0 = v_sqrt(t0); t1 = v_sqrt(t1); - v_store(dst + i, t0); v_store(dst + i + 4, t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -317,14 +336,22 @@ void sqrt64f(const double* src, double* dst, int len) int i = 0; -#if CV_SIMD128_64F - for( ; i <= len - 4; i += 4 ) +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + for( ; i < len; i += VECSZ*2 ) { - v_float64x2 t0 = v_load(src + i), t1 = v_load(src + i + 2); + if( i + VECSZ*2 > len ) + { + if( i == 0 || src == dst ) + break; + i = len - VECSZ*2; + } + v_float64 t0 = vx_load(src + i), t1 = vx_load(src + i + VECSZ); t0 = v_sqrt(t0); t1 = v_sqrt(t1); - v_store(dst + i, t0); v_store(dst + i + 2, t1); + v_store(dst + i, t0); v_store(dst + i + VECSZ, t1); } + vx_cleanup(); #endif for( ; i < len; i++ ) @@ -377,21 +404,6 @@ void log64f(const double *src, double *dst, int n) ////////////////////////////////////// EXP ///////////////////////////////////// -typedef union -{ - struct { -#if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ ) - int hi; - int lo; -#else - int lo; - int hi; -#endif - } i; - double d; -} -DBLINT; - #define EXPTAB_SCALE 6 #define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1) @@ -464,6 +476,8 @@ static const double expTab[] = { 1.9784560263879509682582499181312 * EXPPOLY_32F_A0, }; +static float expTab_f[EXPTAB_MASK+1]; +static volatile bool extTab_f_initialized = false; // the code below uses _mm_cast* intrinsics, which are not avialable on VS2005 #if (defined _MSC_VER && _MSC_VER < 1500) || \ @@ -480,283 +494,117 @@ void exp32f( const float *_x, float *y, int n ) { CV_INSTRUMENT_REGION() + if( !extTab_f_initialized ) + { + for( int j = 0; j <= EXPTAB_MASK; j++ ) + expTab_f[j] = (float)expTab[j]; + extTab_f_initialized = true; + } + static const float A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0), A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0), A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0), A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0); -#undef EXPPOLY -#define EXPPOLY(x) \ -(((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4) - int i = 0; const Cv32suf* x = (const Cv32suf*)_x; - Cv32suf buf[4]; - -#if CV_AVX2 - if( n >= 8 ) + float minval = (float)(-exp_max_val/exp_prescale); + float maxval = (float)(exp_max_val/exp_prescale); + float postscale = (float)exp_postscale; + 
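All of the vectorized loops touched by this patch (the magnitude/invSqrt/sqrt loops above and the exp/log loops below) share the same tail strategy: when fewer than one full batch of elements remains, the loop steps back and reprocesses the last full batch, overlapping the previous one, instead of falling through to scalar code. As the comment in fastAtan32f_ explains, this is only safe for out-of-place calls (otherwise already-written outputs would be read back) and for arrays at least one batch long; in those cases the loop breaks and the scalar tail runs. A toy out-of-place kernel showing just this pattern, with a made-up doubling operation and hypothetical names:

#include "opencv2/core/hal/intrin.hpp"

static void scale2_f32(const float* src, float* dst, int n)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    for( ; i < n; i += VECSZ )
    {
        if( i + VECSZ > n )
        {
            if( i == 0 || src == dst )
                break;         // too short, or in-place: fall back to the scalar tail
            i = n - VECSZ;     // redo the last full batch, overlapping the previous one
        }
        v_float32 t = vx_load(src + i);
        v_store(dst + i, t + t);
    }
    vx_cleanup();
#endif
    for( ; i < n; i++ )
        dst[i] = src[i] + src[i];
}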
+#if CV_SIMD + const int VECSZ = v_float32::nlanes; + static const v_float32 vprescale = vx_setall_f32((float)exp_prescale); + static const v_float32 vpostscale = vx_setall_f32((float)exp_postscale); + static const v_float32 vminval = vx_setall_f32(minval); + static const v_float32 vmaxval = vx_setall_f32(maxval); + + static const v_float32 vA1 = vx_setall_f32((float)A1); + static const v_float32 vA2 = vx_setall_f32((float)A2); + static const v_float32 vA3 = vx_setall_f32((float)A3); + static const v_float32 vA4 = vx_setall_f32((float)A4); + + static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); + bool y_aligned = (size_t)(void*)y % 32 == 0; + + for( ; i < n; i += VECSZ*2 ) { - static const __m256d prescale4 = _mm256_set1_pd(exp_prescale); - static const __m256 postscale8 = _mm256_set1_ps((float)exp_postscale); - static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale)); - static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale)); - - static const __m256 mA1 = _mm256_set1_ps(A1); - static const __m256 mA2 = _mm256_set1_ps(A2); - static const __m256 mA3 = _mm256_set1_ps(A3); - static const __m256 mA4 = _mm256_set1_ps(A4); - bool y_aligned = (size_t)(void*)y % 32 == 0; - - ushort CV_DECL_ALIGNED(32) tab_idx[16]; - - for( ; i <= n - 8; i += 8 ) + if( i + VECSZ*2 > n ) { - __m128i xi0, xi1; - - __m256d xd0 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i].f), minval4), maxval4)); - __m256d xd1 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i+4].f), minval4), maxval4)); - - xd0 = _mm256_mul_pd(xd0, prescale4); - xd1 = _mm256_mul_pd(xd1, prescale4); + if( i == 0 || _x == y ) + break; + i = n - VECSZ*2; + y_aligned = false; + } - xi0 = _mm256_cvtpd_epi32(xd0); - xi1 = _mm256_cvtpd_epi32(xd1); + v_float32 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f); - xd0 = _mm256_sub_pd(xd0, _mm256_cvtepi32_pd(xi0)); - xd1 = _mm256_sub_pd(xd1, _mm256_cvtepi32_pd(xi1)); + xf0 = v_min(v_max(xf0, vminval), vmaxval); + xf1 = v_min(v_max(xf1, vminval), vmaxval); - // gcc does not support _mm256_set_m128 - //xf = _mm256_set_m128(_mm256_cvtpd_ps(xd1), _mm256_cvtpd_ps(xd0)); - __m256 xf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(xd0)), _mm256_cvtpd_ps(xd1), 1); + xf0 *= vprescale; + xf1 *= vprescale; - xf = _mm256_mul_ps(xf, postscale8); + v_int32 xi0 = v_round(xf0); + v_int32 xi1 = v_round(xf1); + xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale; + xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale; - xi0 = _mm_packs_epi32(xi0, xi1); + v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask); + v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask); - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK))); + v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255); + xi0 = v_min(v_max(v_shr(xi0) + v127, v0), v255); + xi1 = v_min(v_max(v_shr(xi1) + v127, v0), v255); - xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127)); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255)); - xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); + yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0)); + yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1)); - __m256d yd0 = _mm256_set_pd(expTab[tab_idx[3]], expTab[tab_idx[2]], expTab[tab_idx[1]], expTab[tab_idx[0]]); - __m256d yd1 = _mm256_set_pd(expTab[tab_idx[7]], expTab[tab_idx[6]], expTab[tab_idx[5]], expTab[tab_idx[4]]); + v_float32 zf0 = xf0 + vA1; + v_float32 zf1 = xf1 + 
vA1; - // gcc does not support _mm256_set_m128 - //__m256 yf = _mm256_set_m128(_mm256_cvtpd_ps(yd1), _mm256_cvtpd_ps(yd0)); - __m256 yf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(yd0)), _mm256_cvtpd_ps(yd1), 1); + zf0 = v_fma(zf0, xf0, vA2); + zf1 = v_fma(zf1, xf1, vA2); - //_mm256_set_m128i(xi1, xi0) - __m256i temp = _mm256_castps_si256(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(xi0)), _mm_castsi128_ps(xi1), 1)); + zf0 = v_fma(zf0, xf0, vA3); + zf1 = v_fma(zf1, xf1, vA3); - yf = _mm256_mul_ps(yf, _mm256_castsi256_ps(_mm256_slli_epi32(temp, 23))); + zf0 = v_fma(zf0, xf0, vA4); + zf1 = v_fma(zf1, xf1, vA4); - __m256 zf = _mm256_add_ps(xf, mA1); + zf0 *= yf0; + zf1 *= yf1; -#if CV_FMA3 - zf = _mm256_fmadd_ps(zf, xf, mA2); - zf = _mm256_fmadd_ps(zf, xf, mA3); - zf = _mm256_fmadd_ps(zf, xf, mA4); -#else - zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA2); - zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA3); - zf = _mm256_add_ps(_mm256_mul_ps(zf, xf), mA4); -#endif - zf = _mm256_mul_ps(zf, yf); - - if( y_aligned ) - { - _mm256_store_ps(y + i, zf); - } - else - { - _mm256_storeu_ps(y + i, zf); - } + if( y_aligned ) + { + v_store_aligned(y + i, zf0); + v_store_aligned(y + i + VECSZ, zf1); } - } -#elif CV_SSE2 - if( n >= 8 ) - { - static const __m128d prescale2 = _mm_set1_pd(exp_prescale); - static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale); - static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale)); - static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale)); - - static const __m128 mA1 = _mm_set1_ps(A1); - static const __m128 mA2 = _mm_set1_ps(A2); - static const __m128 mA3 = _mm_set1_ps(A3); - static const __m128 mA4 = _mm_set1_ps(A4); - bool y_aligned = (size_t)(void*)y % 16 == 0; - - ushort CV_DECL_ALIGNED(16) tab_idx[8]; - - for( ; i <= n - 8; i += 8 ) + else { - __m128 xf0, xf1; - xf0 = _mm_loadu_ps(&x[i].f); - xf1 = _mm_loadu_ps(&x[i+4].f); - __m128i xi0, xi1, xi2, xi3; - - xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4); - xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4); - - __m128d xd0 = _mm_cvtps_pd(xf0); - __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0)); - __m128d xd1 = _mm_cvtps_pd(xf1); - __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1)); - - xd0 = _mm_mul_pd(xd0, prescale2); - xd2 = _mm_mul_pd(xd2, prescale2); - xd1 = _mm_mul_pd(xd1, prescale2); - xd3 = _mm_mul_pd(xd3, prescale2); - - xi0 = _mm_cvtpd_epi32(xd0); - xi2 = _mm_cvtpd_epi32(xd2); - - xi1 = _mm_cvtpd_epi32(xd1); - xi3 = _mm_cvtpd_epi32(xd3); - - xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0)); - xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2)); - xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1)); - xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3)); - - xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2)); - xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3)); - - xf0 = _mm_mul_ps(xf0, postscale4); - xf1 = _mm_mul_ps(xf1, postscale4); - - xi0 = _mm_unpacklo_epi64(xi0, xi2); - xi1 = _mm_unpacklo_epi64(xi1, xi3); - xi0 = _mm_packs_epi32(xi0, xi1); - - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK))); - - xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127)); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255)); - xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); - - __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); - __m128d yd1 = 
_mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); - __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5])); - __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7])); - - __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); - __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3)); - - yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23))); - yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23))); - - __m128 zf0 = _mm_add_ps(xf0, mA1); - __m128 zf1 = _mm_add_ps(xf1, mA1); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4); - - zf0 = _mm_mul_ps(zf0, yf0); - zf1 = _mm_mul_ps(zf1, yf1); - - if( y_aligned ) - { - _mm_store_ps(y + i, zf0); - _mm_store_ps(y + i + 4, zf1); - } - else - { - _mm_storeu_ps(y + i, zf0); - _mm_storeu_ps(y + i + 4, zf1); - } + v_store(y + i, zf0); + v_store(y + i + VECSZ, zf1); } } - else + vx_cleanup(); #endif - for( ; i <= n - 4; i += 4 ) - { - double x0 = x[i].f * exp_prescale; - double x1 = x[i + 1].f * exp_prescale; - double x2 = x[i + 2].f * exp_prescale; - double x3 = x[i + 3].f * exp_prescale; - int val0, val1, val2, val3, t; - - if( ((x[i].i >> 23) & 255) > 127 + 10 ) - x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+1].i >> 23) & 255) > 127 + 10 ) - x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+2].i >> 23) & 255) > 127 + 10 ) - x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+3].i >> 23) & 255) > 127 + 10 ) - x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val; - - val0 = cvRound(x0); - val1 = cvRound(x1); - val2 = cvRound(x2); - val3 = cvRound(x3); - - x0 = (x0 - val0)*exp_postscale; - x1 = (x1 - val1)*exp_postscale; - x2 = (x2 - val2)*exp_postscale; - x3 = (x3 - val3)*exp_postscale; - - t = (val0 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[0].i = t << 23; - - t = (val1 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[1].i = t << 23; - - t = (val2 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[2].i = t << 23; - - t = (val3 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[3].i = t << 23; - - x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); - - y[i] = (float)x0; - y[i + 1] = (float)x1; - - x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); - x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); - - y[i + 2] = (float)x2; - y[i + 3] = (float)x3; - } for( ; i < n; i++ ) { - double x0 = x[i].f * exp_prescale; - int val0, t; + float x0 = x[i].f; + x0 = std::min(std::max(x0, minval), maxval); + x0 *= (float)exp_prescale; + Cv32suf buf; - if( ((x[i].i >> 23) & 255) > 127 + 10 ) - x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; + int xi = saturate_cast(x0); + x0 = (x0 - xi)*postscale; - val0 = cvRound(x0); - t = (val0 >> EXPTAB_SCALE) + 127; + int t = (xi >> EXPTAB_SCALE) + 127; t = !(t & ~255) ? t : t < 0 ? 
0 : 255; + buf.i = t << 23; - buf[0].i = t << 23; - x0 = (x0 - val0)*exp_postscale; - - y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0)); + y[i] = buf.f * expTab_f[xi & EXPTAB_MASK] * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4); } } @@ -772,162 +620,111 @@ void exp64f( const double *_x, double *y, int n ) A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0, A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0; -#undef EXPPOLY -#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5) - int i = 0; - Cv64suf buf[4]; const Cv64suf* x = (const Cv64suf*)_x; - -#if CV_SSE2 - static const __m128d prescale2 = _mm_set1_pd(exp_prescale); - static const __m128d postscale2 = _mm_set1_pd(exp_postscale); - static const __m128d maxval2 = _mm_set1_pd(exp_max_val); - static const __m128d minval2 = _mm_set1_pd(-exp_max_val); - - static const __m128d mA0 = _mm_set1_pd(A0); - static const __m128d mA1 = _mm_set1_pd(A1); - static const __m128d mA2 = _mm_set1_pd(A2); - static const __m128d mA3 = _mm_set1_pd(A3); - static const __m128d mA4 = _mm_set1_pd(A4); - static const __m128d mA5 = _mm_set1_pd(A5); - - int CV_DECL_ALIGNED(16) tab_idx[4]; - - for( ; i <= n - 4; i += 4 ) + double minval = (-exp_max_val/exp_prescale); + double maxval = (exp_max_val/exp_prescale); + +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + static const v_float64 vprescale = vx_setall_f64(exp_prescale); + static const v_float64 vpostscale = vx_setall_f64(exp_postscale); + static const v_float64 vminval = vx_setall_f64(minval); + static const v_float64 vmaxval = vx_setall_f64(maxval); + + static const v_float64 vA1 = vx_setall_f64(A1); + static const v_float64 vA2 = vx_setall_f64(A2); + static const v_float64 vA3 = vx_setall_f64(A3); + static const v_float64 vA4 = vx_setall_f64(A4); + static const v_float64 vA5 = vx_setall_f64(A5); + + static const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); + bool y_aligned = (size_t)(void*)y % 32 == 0; + + for( ; i < n; i += VECSZ*2 ) { - __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f); - __m128i xi0, xi1; - xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2); - xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2); - xf0 = _mm_mul_pd(xf0, prescale2); - xf1 = _mm_mul_pd(xf1, prescale2); - - xi0 = _mm_cvtpd_epi32(xf0); - xi1 = _mm_cvtpd_epi32(xf1); - xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2); - xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2); - - xi0 = _mm_unpacklo_epi64(xi0, xi1); - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK))); - - xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023)); - xi0 = _mm_packs_epi32(xi0, xi0); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047)); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); - xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128()); - - __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); - __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); - yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52))); - yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52))); - - __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1); - __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2); - - zf0 = 
_mm_add_pd(_mm_mul_pd(zf0, xf0), mA3); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5); - - zf0 = _mm_mul_pd(zf0, yf0); - zf1 = _mm_mul_pd(zf1, yf1); - - _mm_storeu_pd(y + i, zf0); - _mm_storeu_pd(y + i + 2, zf1); - } -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0 = x[i].f * exp_prescale; - double x1 = x[i + 1].f * exp_prescale; - double x2 = x[i + 2].f * exp_prescale; - double x3 = x[i + 3].f * exp_prescale; + if( i + VECSZ*2 > n ) + { + if( i == 0 || _x == y ) + break; + i = n - VECSZ*2; + y_aligned = false; + } - double y0, y1, y2, y3; - int val0, val1, val2, val3, t; + v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f); - t = (int)(x[i].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x0 = t < 0 ? -exp_max_val : exp_max_val; + xf0 = v_min(v_max(xf0, vminval), vmaxval); + xf1 = v_min(v_max(xf1, vminval), vmaxval); - t = (int)(x[i+1].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x1 = t < 0 ? -exp_max_val : exp_max_val; + xf0 *= vprescale; + xf1 *= vprescale; - t = (int)(x[i+2].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x2 = t < 0 ? -exp_max_val : exp_max_val; + v_int32 xi0 = v_round(xf0); + v_int32 xi1 = v_round(xf1); + xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale; + xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale; - t = (int)(x[i+3].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x3 = t < 0 ? -exp_max_val : exp_max_val; + v_float64 yf0 = v_lut(expTab, xi0 & vidxmask); + v_float64 yf1 = v_lut(expTab, xi1 & vidxmask); - val0 = cvRound(x0); - val1 = cvRound(x1); - val2 = cvRound(x2); - val3 = cvRound(x3); + v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047); + xi0 = v_min(v_max(v_shr(xi0) + v1023, v0), v2047); + xi1 = v_min(v_max(v_shr(xi1) + v1023, v0), v2047); - x0 = (x0 - val0)*exp_postscale; - x1 = (x1 - val1)*exp_postscale; - x2 = (x2 - val2)*exp_postscale; - x3 = (x3 - val3)*exp_postscale; + v_int64 xq0, xq1, dummy; + v_expand(xi0, xq0, dummy); + v_expand(xi1, xq1, dummy); - t = (val0 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[0].i = (int64)t << 52; + yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0)); + yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1)); - t = (val1 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[1].i = (int64)t << 52; + v_float64 zf0 = xf0 + vA1; + v_float64 zf1 = xf1 + vA1; - t = (val2 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[2].i = (int64)t << 52; + zf0 = v_fma(zf0, xf0, vA2); + zf1 = v_fma(zf1, xf1, vA2); - t = (val3 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 
0 : 2047; - buf[3].i = (int64)t << 52; + zf0 = v_fma(zf0, xf0, vA3); + zf1 = v_fma(zf1, xf1, vA3); - y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); + zf0 = v_fma(zf0, xf0, vA4); + zf1 = v_fma(zf1, xf1, vA4); - y[i] = y0; - y[i + 1] = y1; + zf0 = v_fma(zf0, xf0, vA5); + zf1 = v_fma(zf1, xf1, vA5); - y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); - y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); + zf0 *= yf0; + zf1 *= yf1; - y[i + 2] = y2; - y[i + 3] = y3; + if( y_aligned ) + { + v_store_aligned(y + i, zf0); + v_store_aligned(y + i + VECSZ, zf1); + } + else + { + v_store(y + i, zf0); + v_store(y + i + VECSZ, zf1); + } } + vx_cleanup(); +#endif for( ; i < n; i++ ) { - double x0 = x[i].f * exp_prescale; - int val0, t; + double x0 = x[i].f; + x0 = std::min(std::max(x0, minval), maxval); + x0 *= exp_prescale; + Cv64suf buf; - t = (int)(x[i].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x0 = t < 0 ? -exp_max_val : exp_max_val; + int xi = saturate_cast(x0); + x0 = (x0 - xi)*exp_postscale; - val0 = cvRound(x0); - t = (val0 >> EXPTAB_SCALE) + 1023; + int t = (xi >> EXPTAB_SCALE) + 1023; t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + buf.i = (int64)t << 52; - buf[0].i = (int64)t << 52; - x0 = (x0 - val0)*exp_postscale; - - y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); + y[i] = buf.f * expTab[xi & EXPTAB_MASK] * (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5); } } @@ -937,12 +734,10 @@ void exp64f( const double *_x, double *y, int n ) /////////////////////////////////////////// LOG /////////////////////////////////////// -#define LOGTAB_SCALE 8 +#define LOGTAB_SCALE 8 #define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1) -#define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1) -#define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1) -static const double CV_DECL_ALIGNED(16) icvLogTab[] = { +static const double CV_DECL_ALIGNED(16) logTab[] = { 0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000, .00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420, .00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760, @@ -1201,154 +996,85 @@ static const double CV_DECL_ALIGNED(16) icvLogTab[] = { .69314718055994530941723212145818, 5.0e-01, }; +static float logTab_f[(LOGTAB_MASK+1)*2]; +static volatile bool logTab_f_initialized = false; - -#define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1]) +#define LOGTAB_TRANSLATE(tab, x, h) (((x) - 1.f)*tab[(h)+1]) static const double ln_2 = 0.69314718055994530941723212145818; void log32f( const float *_x, float *y, int n ) { CV_INSTRUMENT_REGION() - static const float shift[] = { 0, -1.f/512 }; + if( !logTab_f_initialized ) + { + for( int j = 0; j < (LOGTAB_MASK+1)*2; j++ ) + logTab_f[j] = (float)logTab[j]; + logTab_f_initialized = true; + } + + static const int LOGTAB_MASK2_32F = (1 << (23 - LOGTAB_SCALE)) - 1; static const float A0 = 0.3333333333333333333333333f, A1 = -0.5f, A2 = 1.f; -#undef LOGPOLY -#define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x)) - int i = 0; - Cv32suf buf[4]; const int* x = (const int*)_x; -#if CV_SSE2 - static const __m128d ln2_2 = _mm_set1_pd(ln_2); - static const __m128 _1_4 = _mm_set1_ps(1.f); - static const __m128 shift4 = _mm_set1_ps(-1.f/512); - - static const __m128 mA0 = _mm_set1_ps(A0); - static const __m128 mA1 = _mm_set1_ps(A1); - static const __m128 mA2 = _mm_set1_ps(A2); +#if CV_SIMD + const int VECSZ = v_float32::nlanes; + static const 
v_float32 vln2 = vx_setall_f32((float)ln_2); + static const v_float32 v1 = vx_setall_f32(1.f); + static const v_float32 vshift = vx_setall_f32(-1.f/512); - int CV_DECL_ALIGNED(16) idx[4]; + static const v_float32 vA0 = vx_setall_f32(A0); + static const v_float32 vA1 = vx_setall_f32(A1); + static const v_float32 vA2 = vx_setall_f32(A2); - for( ; i <= n - 4; i += 4 ) + for( ; i < n; i += VECSZ ) { - __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); - __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127)); - __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); - __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2); - - __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23)); - - h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2)); - _mm_store_si128((__m128i*)idx, h0); - h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); - - __m128d t0, t1, t2, t3, t4; - t0 = _mm_load_pd(icvLogTab + idx[0]); - t2 = _mm_load_pd(icvLogTab + idx[1]); - t1 = _mm_unpackhi_pd(t0, t2); - t0 = _mm_unpacklo_pd(t0, t2); - t2 = _mm_load_pd(icvLogTab + idx[2]); - t4 = _mm_load_pd(icvLogTab + idx[3]); - t3 = _mm_unpackhi_pd(t2, t4); - t2 = _mm_unpacklo_pd(t2, t4); - - yd0 = _mm_add_pd(yd0, t0); - yd1 = _mm_add_pd(yd1, t2); - - __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); - - __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4); - xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3))); - xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4)); - - __m128 zf0 = _mm_mul_ps(xf0, mA0); - zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0); - zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0); - yf0 = _mm_add_ps(yf0, zf0); - - _mm_storeu_ps(y + i, yf0); - } -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0, x1, x2, x3; - double y0, y1, y2, y3; - int h0, h1, h2, h3; - - h0 = x[i]; - h1 = x[i+1]; - buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); - buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23); - - y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; - y1 = (((h1 >> 23) & 0xff) - 127) * ln_2; - - h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - y1 += icvLogTab[h1]; - - h2 = x[i+2]; - h3 = x[i+3]; - - x0 = LOGTAB_TRANSLATE( buf[0].f, h0 ); - x1 = LOGTAB_TRANSLATE( buf[1].f, h1 ); - - buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23); - buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23); - - y2 = (((h2 >> 23) & 0xff) - 127) * ln_2; - y3 = (((h3 >> 23) & 0xff) - 127) * ln_2; + if( i + VECSZ > n ) + { + if( i == 0 || _x == y ) + break; + i = n - VECSZ; + } - h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + v_int32 h0 = vx_load(x + i); + v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127); + v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23); - y2 += icvLogTab[h2]; - y3 += icvLogTab[h3]; + h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2); + v_float32 yf0, xf0; - x2 = LOGTAB_TRANSLATE( buf[2].f, h2 ); - x3 = LOGTAB_TRANSLATE( buf[3].f, h3 ); + v_lut_deinterleave(logTab_f, h0, yf0, xf0); - x0 += shift[h0 == 510]; - x1 += shift[h1 == 510]; - y0 += LOGPOLY( x0 ); - y1 += LOGPOLY( x1 ); + yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0); - y[i] = (float) y0; - y[i + 1] = (float) y1; + v_float32 delta = v_reinterpret_as_f32(h0 == 
vx_setall_s32(510)) & vshift; + xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta); - x2 += shift[h2 == 510]; - x3 += shift[h3 == 510]; - y2 += LOGPOLY( x2 ); - y3 += LOGPOLY( x3 ); + v_float32 zf0 = v_fma(xf0, vA0, vA1); + zf0 = v_fma(zf0, xf0, vA2); + zf0 = v_fma(zf0, xf0, yf0); - y[i + 2] = (float) y2; - y[i + 3] = (float) y3; + v_store(y + i, zf0); } + vx_cleanup(); +#endif for( ; i < n; i++ ) { - int h0 = x[i]; - double y0; - float x0; - - y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; + Cv32suf buf; + int i0 = x[i]; - buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); - h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + buf.i = (i0 & LOGTAB_MASK2_32F) | (127 << 23); + int idx = (i0 >> (23 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2); - y0 += icvLogTab[h0]; - x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 ); - x0 += shift[h0 == 510]; - y0 += LOGPOLY( x0 ); - - y[i] = (float)y0; + float y0 = (((i0 >> 23) & 0xff) - 127) * (float)ln_2 + logTab_f[idx]; + float x0 = (buf.f - 1.f)*logTab_f[idx + 1] + (idx == 510 ? -1.f/512 : 0.f); + y[i] = ((A0*x0 + A1)*x0 + A2)*x0 + y0; } } @@ -1356,7 +1082,7 @@ void log64f( const double *x, double *y, int n ) { CV_INSTRUMENT_REGION() - static const double shift[] = { 0, -1./512 }; + static const int64 LOGTAB_MASK2_64F = ((int64)1 << (52 - LOGTAB_SCALE)) - 1; static const double A7 = 1.0, A6 = -0.5, @@ -1367,175 +1093,69 @@ void log64f( const double *x, double *y, int n ) A1 = 0.1428571428571428769682682968777953647077083587646484375, A0 = -0.125; -#undef LOGPOLY -#define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\ -(((A0*xq + A2)*xq + A4)*xq + A6)*xq + \ -(((A1*xq + A3)*xq + A5)*xq + A7)*(x)) - int i = 0; - DBLINT buf[4]; - DBLINT *X = (DBLINT *) x; -#if CV_SSE2 - static const __m128d ln2_2 = _mm_set1_pd(ln_2); - static const __m128d _1_2 = _mm_set1_pd(1.); - static const __m128d shift2 = _mm_set1_pd(-1./512); +#if CV_SIMD_64F + const int VECSZ = v_float64::nlanes; + static const v_float64 vln2 = vx_setall_f64(ln_2); - static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff); - static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0); + static const v_float64 + vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1), + vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3), + vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5), + vA6 = vx_setall_f64(A6), vA7 = vx_setall_f64(A7); - static const __m128d mA0 = _mm_set1_pd(A0); - static const __m128d mA1 = _mm_set1_pd(A1); - static const __m128d mA2 = _mm_set1_pd(A2); - static const __m128d mA3 = _mm_set1_pd(A3); - static const __m128d mA4 = _mm_set1_pd(A4); - static const __m128d mA5 = _mm_set1_pd(A5); - static const __m128d mA6 = _mm_set1_pd(A6); - static const __m128d mA7 = _mm_set1_pd(A7); - - int CV_DECL_ALIGNED(16) idx[4]; - - for( ; i <= n - 4; i += 4 ) + for( ; i < n; i += VECSZ ) { - __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); - __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2)); - - __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2)); - __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2)); - - h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1)); - - __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20), - _mm_set1_epi32(2047)), _mm_set1_epi32(1023)); - __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); - __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2); - - h0 = 
_mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2)); - _mm_store_si128((__m128i*)idx, h0); - h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); - - __m128d t0, t1, t2, t3, t4; - t0 = _mm_load_pd(icvLogTab + idx[0]); - t2 = _mm_load_pd(icvLogTab + idx[1]); - t1 = _mm_unpackhi_pd(t0, t2); - t0 = _mm_unpacklo_pd(t0, t2); - t2 = _mm_load_pd(icvLogTab + idx[2]); - t4 = _mm_load_pd(icvLogTab + idx[3]); - t3 = _mm_unpackhi_pd(t2, t4); - t2 = _mm_unpacklo_pd(t2, t4); - - yd0 = _mm_add_pd(yd0, t0); - yd1 = _mm_add_pd(yd1, t2); - - xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1); - xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3); - - xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2)); - xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2)); - - __m128d zd0 = _mm_mul_pd(xd0, mA0); - __m128d zd1 = _mm_mul_pd(xd1, mA0); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1); - - yd0 = _mm_add_pd(yd0, zd0); - yd1 = _mm_add_pd(yd1, zd1); - - _mm_storeu_pd(y + i, yd0); - _mm_storeu_pd(y + i + 2, yd1); - } -#endif - for( ; i <= n - 4; i += 4 ) - { - double xq; - double x0, x1, x2, x3; - double y0, y1, y2, y3; - int h0, h1, h2, h3; - - h0 = X[i].i.lo; - h1 = X[i + 1].i.lo; - buf[0].i.lo = h0; - buf[1].i.lo = h1; - - h0 = X[i].i.hi; - h1 = X[i + 1].i.hi; - buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); - buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20); - - y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; - y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2; - - h2 = X[i + 2].i.lo; - h3 = X[i + 3].i.lo; - buf[2].i.lo = h2; - buf[3].i.lo = h3; - - h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - y1 += icvLogTab[h1]; - - h2 = X[i + 2].i.hi; - h3 = X[i + 3].i.hi; - - x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); - x1 = LOGTAB_TRANSLATE( buf[1].d, h1 ); - - buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20); - buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20); - - y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2; - y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2; - - h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + if( i + VECSZ > n ) + { + if( i == 0 || x == y ) + break; + i = n - VECSZ; + } - y2 += icvLogTab[h2]; - y3 += icvLogTab[h3]; + v_int64 h0 = vx_load((const int64*)x + i); + v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64()); + yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023); - x2 = LOGTAB_TRANSLATE( buf[2].d, h2 ); - x3 = LOGTAB_TRANSLATE( buf[3].d, h3 ); + v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52); + h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0); + v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2); - y0 += LOGPOLY( x0, h0 == 510 ); - y1 += LOGPOLY( x1, h1 == 510 ); + v_float64 xf0, yf0; + v_lut_deinterleave(logTab, idx, yf0, xf0); - 
y[i] = y0; - y[i + 1] = y1; + yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0); + v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512); + xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta); - y2 += LOGPOLY( x2, h2 == 510 ); - y3 += LOGPOLY( x3, h3 == 510 ); + v_float64 xq = xf0*xf0; + v_float64 zf0 = v_fma(xq, vA0, vA2); + v_float64 zf1 = v_fma(xq, vA1, vA3); + zf0 = v_fma(zf0, xq, vA4); + zf1 = v_fma(zf1, xq, vA5); + zf0 = v_fma(zf0, xq, vA6); + zf1 = v_fma(zf1, xq, vA7); + zf1 = v_fma(zf1, xf0, yf0); + zf0 = v_fma(zf0, xq, zf1); - y[i + 2] = y2; - y[i + 3] = y3; + v_store(y + i, zf0); } +#endif for( ; i < n; i++ ) { - int h0 = X[i].i.hi; - double xq; - double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; - - buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); - buf[0].i.lo = X[i].i.lo; - h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); - y0 += LOGPOLY( x0, h0 == 510 ); - y[i] = y0; + Cv64suf buf; + int64 i0 = ((const int64*)x)[i]; + + buf.i = (i0 & LOGTAB_MASK2_64F) | ((int64)1023 << 52); + int idx = (int)(i0 >> (52 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2); + + double y0 = (((int)(i0 >> 52) & 0x7ff) - 1023) * ln_2 + logTab[idx]; + double x0 = (buf.f - 1.)*logTab[idx + 1] + (idx == 510 ? -1./512 : 0.); + + double xq = x0*x0; + y[i] = (((A0*xq + A2)*xq + A4)*xq + A6)*xq + (((A1*xq + A3)*xq + A5)*xq + A7)*x0 + y0; } } @@ -1543,7 +1163,7 @@ void log64f( const double *x, double *y, int n ) float fastAtan2( float y, float x ) { - return atanImpl(y, x); + return atan_f32(y, x); } #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index 4171babc03..9a1130fe96 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -241,9 +241,9 @@ TEST(hal_intrin, float64x2) { } #endif -TEST(hal_intrin,float16x4) +TEST(hal_intrin,float16) { - CV_CPU_CALL_FP16_(test_hal_intrin_float16x4, ()); + CV_CPU_CALL_FP16_(test_hal_intrin_float16, ()); throw SkipTestException("Unsupported hardware: FP16 is not available"); } diff --git a/modules/core/test/test_intrin.fp16.cpp b/modules/core/test/test_intrin.fp16.cpp index 7855fda287..893c5f147a 100644 --- a/modules/core/test/test_intrin.fp16.cpp +++ b/modules/core/test/test_intrin.fp16.cpp @@ -7,9 +7,9 @@ namespace opencv_test { namespace hal { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void test_hal_intrin_float16x4() +void test_hal_intrin_float16() { - TheTest() + TheTest() .test_loadstore_fp16() .test_float_cvt_fp16() ; diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 7579d9cf05..2f8c1cf0b7 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -6,7 +6,7 @@ namespace opencv_test { namespace hal { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void test_hal_intrin_float16x4(); +void test_hal_intrin_float16(); #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY @@ -50,6 +50,8 @@ template <> struct initializer<2> template struct Data { typedef typename R::lane_type LaneType; + typedef typename V_TypeTraits::int_type int_type; + Data() { for (int i = 0; i < R::nlanes; ++i) @@ -104,6 +106,17 @@ template struct Data CV_Assert(i >= 0 && i < R::nlanes); return d[i]; } + int_type as_int(int i) const + { + CV_Assert(i >= 0 && i < R::nlanes); + union + { + LaneType l; + int_type i; + } v; + v.l = d[i]; + return v.i; + } const LaneType * mid() const { return d + R::nlanes / 2; @@ -247,8 +260,9 
@@ template struct TheTest EXPECT_EQ(d, res); // zero, all - Data resZ = V_RegTrait128::zero(); - Data resV = V_RegTrait128::all(8); + Data resZ, resV; + resZ.fill((LaneType)0); + resV.fill((LaneType)8); for (int i = 0; i < R::nlanes; ++i) { EXPECT_EQ((LaneType)0, resZ[i]); @@ -339,7 +353,7 @@ template struct TheTest // v_expand and v_load_expand TheTest & test_expand() { - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; Data dataA; R a = dataA; @@ -362,7 +376,7 @@ template struct TheTest TheTest & test_expand_q() { - typedef typename V_RegTrait128::q_reg Rx4; + typedef typename V_RegTraits::q_reg Rx4; Data data; Data out = v_load_expand_q(data.d); const int n = Rx4::nlanes; @@ -436,7 +450,7 @@ template struct TheTest TheTest & test_mul_expand() { - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; Data dataA, dataB(2); R a = dataA, b = dataB; Rx2 c, d; @@ -456,7 +470,7 @@ template struct TheTest TheTest & test_abs() { - typedef typename V_RegTrait128::u_reg Ru; + typedef typename V_RegTraits::u_reg Ru; typedef typename Ru::lane_type u_type; Data dataA, dataB(10); R a = dataA, b = dataB; @@ -520,7 +534,7 @@ template struct TheTest TheTest & test_dot_prod() { - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; typedef typename Rx2::lane_type w_type; Data dataA, dataB(2); @@ -608,7 +622,7 @@ template struct TheTest TheTest & test_absdiff() { - typedef typename V_RegTrait128::u_reg Ru; + typedef typename V_RegTraits::u_reg Ru; typedef typename Ru::lane_type u_type; Data dataA(std::numeric_limits::max()), dataB(std::numeric_limits::min()); @@ -657,12 +671,21 @@ template struct TheTest TheTest & test_mask() { - typedef V_TypeTraits Traits; - typedef typename Traits::int_type int_type; + typedef typename V_RegTraits::int_reg int_reg; + typedef typename V_RegTraits::u_reg uint_reg; + typedef typename int_reg::lane_type int_type; + typedef typename uint_reg::lane_type uint_type; Data dataA, dataB(0), dataC, dataD(1), dataE(2); dataA[1] *= (LaneType)-1; - const LaneType mask_one = Traits::reinterpret_from_int(~(typename Traits::uint_type)(0)); + union + { + LaneType l; + uint_type ui; + } + all1s; + all1s.ui = (uint_type)-1; + LaneType mask_one = all1s.l; dataB[1] = mask_one; dataB[R::nlanes / 2] = mask_one; dataB[R::nlanes - 1] = mask_one; @@ -684,10 +707,8 @@ template struct TheTest Data resF = f; for (int i = 0; i < R::nlanes; ++i) { - int_type m2 = Traits::reinterpret_int(dataB[i]); - EXPECT_EQ((Traits::reinterpret_int(dataD[i]) & m2) - | (Traits::reinterpret_int(dataE[i]) & ~m2), - Traits::reinterpret_int(resF[i])); + int_type m2 = dataB.as_int(i); + EXPECT_EQ((dataD.as_int(i) & m2) | (dataE.as_int(i) & ~m2), resF.as_int(i)); } return *this; @@ -697,7 +718,7 @@ template struct TheTest TheTest & test_pack() { SCOPED_TRACE(s); - typedef typename V_RegTrait128::w_reg Rx2; + typedef typename V_RegTraits::w_reg Rx2; typedef typename Rx2::lane_type w_type; Data dataA, dataB; dataA += std::numeric_limits::is_signed ? 
-10 : 10; @@ -734,8 +755,9 @@ template struct TheTest TheTest & test_pack_u() { SCOPED_TRACE(s); - typedef typename V_TypeTraits::w_type LaneType_w; - typedef typename V_RegTrait128::int_reg Ri2; + //typedef typename V_RegTraits::w_type LaneType_w; + typedef typename V_RegTraits::w_reg R2; + typedef typename V_RegTraits::int_reg Ri2; typedef typename Ri2::lane_type w_type; Data dataA, dataB; @@ -864,7 +886,7 @@ template struct TheTest TheTest & test_float_math() { - typedef typename V_RegTrait128::int_reg Ri; + typedef typename V_RegTraits::round_reg Ri; Data data1, data2, data3; data1 *= 1.1; data2 += 10; @@ -1005,31 +1027,28 @@ template struct TheTest TheTest & test_loadstore_fp16() { -#if CV_FP16 && CV_SIMD128 +#if CV_FP16 && CV_SIMD AlignedData data; AlignedData out; - if(1 /* checkHardwareSupport(CV_CPU_FP16) */ ) - { - // check if addresses are aligned and unaligned respectively - EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16); - EXPECT_NE((size_t)0, (size_t)&data.u.d % 16); - EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16); - EXPECT_NE((size_t)0, (size_t)&out.u.d % 16); - - // check some initialization methods - R r1 = data.u; - R r2 = v_load_f16(data.a.d); - R r3(r2); - EXPECT_EQ(data.u[0], r1.get0()); - EXPECT_EQ(data.a[0], r2.get0()); - EXPECT_EQ(data.a[0], r3.get0()); - - // check some store methods - out.a.clear(); - v_store_f16(out.a.d, r1); - EXPECT_EQ(data.a, out.a); - } + // check if addresses are aligned and unaligned respectively + EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16); + EXPECT_NE((size_t)0, (size_t)&data.u.d % 16); + EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16); + EXPECT_NE((size_t)0, (size_t)&out.u.d % 16); + + // check some initialization methods + R r1 = data.u; + R r2 = v_load_f16(data.a.d); + R r3(r2); + EXPECT_EQ(data.u[0], r1.get0()); + EXPECT_EQ(data.a[0], r2.get0()); + EXPECT_EQ(data.a[0], r3.get0()); + + // check some store methods + out.a.clear(); + v_store(out.a.d, r1); + EXPECT_EQ(data.a, out.a); return *this; #endif @@ -1037,18 +1056,15 @@ template struct TheTest TheTest & test_float_cvt_fp16() { -#if CV_FP16 && CV_SIMD128 - AlignedData data; - - if(1 /* checkHardwareSupport(CV_CPU_FP16) */) - { - // check conversion - v_float32x4 r1 = v_load(data.a.d); - v_float16x4 r2 = v_cvt_f16(r1); - v_float32x4 r3 = v_cvt_f32(r2); - EXPECT_EQ(0x3c00, r2.get0()); - EXPECT_EQ(r3.get0(), r1.get0()); - } +#if CV_FP16 && CV_SIMD + AlignedData data; + + // check conversion + v_float32 r1 = vx_load(data.a.d); + v_float16 r2 = v_cvt_f16(r1, vx_setzero_f32()); + v_float32 r3 = v_cvt_f32(r2); + EXPECT_EQ(0x3c00, r2.get0()); + EXPECT_EQ(r3.get0(), r1.get0()); return *this; #endif diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index 44b6ebdbb3..68dfc3c969 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -134,7 +134,9 @@ double Core_PowTest::get_success_error_level( int test_case_idx, int i, int j ) if( depth < CV_32F ) return power == cvRound(power) && power >= 0 ? 0 : 1; else - return Base::get_success_error_level( test_case_idx, i, j ); + { + return depth != CV_64F ? 
Base::get_success_error_level( test_case_idx, i, j ) : DBL_EPSILON*1024*1.1; + } } diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index a639dc2e18..a5366bf6fd 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -2129,7 +2129,7 @@ int cmpEps2( TS* ts, const Mat& a, const Mat& b, double success_err_level, switch( code ) { case CMP_EPS_BIG_DIFF: - sprintf( msg, "%s: Too big difference (=%g)", desc, diff ); + sprintf( msg, "%s: Too big difference (=%g > %g)", desc, diff, success_err_level ); code = TS::FAIL_BAD_ACCURACY; break; case CMP_EPS_INVALID_TEST_DATA: