Merge pull request #12411 from vpisarev:wide_convert

* rewrote Mat::convertTo() and convertScaleAbs() to wide universal intrinsics; added always-available and SIMD-optimized FP16<=>FP32 conversion * fixed compile warnings * fix some more compile errors * slightly relaxed accuracy threshold for int->float conversion (since we now do it using single-precision arithmetics, not double-precision) * fixed compile errors on iOS, Android and in the baseline C++ version (intrin_cpp.hpp) * trying to fix ARM-neon builds * trying to fix ARM-neon builds * trying to fix ARM-neon builds * trying to fix ARM-neon builds
7 years ago · 80b62a41c6
parent 54279523a3
commit 80b62a41c6
18 changed files with 1131 additions and 3494 deletions
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@ -219,15 +219,10 @@ enum CpuFeatures {
 typedef union Cv16suf
 {
    short i;
    ushort u;
 #if CV_FP16_TYPE
    __fp16 h;
 #endif
    struct _fp16Format
    {
        unsigned int significand : 10;
        unsigned int exponent    : 5;
        unsigned int sign        : 1;
    } fmt;
 }
 Cv16suf;
@ -236,12 +231,6 @@ typedef union Cv32suf
    int i;
    unsigned u;
    float f;
    struct _fp32Format
    {
        unsigned int significand : 23;
        unsigned int exponent    : 8;
        unsigned int sign        : 1;
    } fmt;
 }
 Cv32suf;
@ -548,6 +537,115 @@ typedef ::uint64_t uint64_t;
 #include <stdint.h>
 #endif
 #ifdef __cplusplus
 namespace cv
 {
 class float16_t
 {
 public:
 #if CV_FP16_TYPE
    float16_t() {}
    explicit float16_t(float x) { h = (__fp16)x; }
    operator float() const { return (float)h; }
    static float16_t fromBits(ushort w)
    {
        Cv16suf u;
        u.u = w;
        float16_t result;
        result.h = u.h;
        return result;
    }
    static float16_t zero()
    {
        float16_t result;
        result.h = (__fp16)0;
        return result;
    }
    ushort bits() const
    {
        Cv16suf u;
        u.h = h;
        return u.u;
    }
 protected:
    __fp16 h;
 #else
    float16_t() {}
    explicit float16_t(float x)
    {
    #if CV_AVX2
        __m128 v = _mm_load_ss(&x);
        w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0));
    #else
        Cv32suf in;
        in.f = x;
        unsigned sign = in.u & 0x80000000;
        in.u ^= sign;
        if( in.u >= 0x47800000 )
            w = (ushort)(in.u > 0x7f800000 ? 0x7e00 : 0x7c00);
        else
        {
            if (in.u < 0x38800000)
            {
                in.f += 0.5f;
                w = (ushort)(in.u - 0x3f000000);
            }
            else
            {
                unsigned t = in.u + 0xc8000fff;
                w = (ushort)((t + ((in.u >> 13) & 1)) >> 13);
            }
        }
        w = (ushort)(w | (sign >> 16));
    #endif
    }
    operator float() const
    {
    #if CV_AVX2
        float f;
        _mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w)));
        return f;
    #else
        Cv32suf out;
        unsigned t = ((w & 0x7fff) << 13) + 0x38000000;
        unsigned sign = (w & 0x8000) << 16;
        unsigned e = w & 0x7c00;
        out.u = t + (1 << 23);
        out.u = (e >= 0x7c00 ? t + 0x38000000 :
                 e == 0 ? (out.f -= 6.103515625e-05f, out.u) : t) | sign;
        return out.f;
    #endif
    }
    static float16_t fromBits(ushort b)
    {
        float16_t result;
        result.w = b;
        return result;
    }
    static float16_t zero()
    {
        float16_t result;
        result.w = (ushort)0;
        return result;
    }
    ushort bits() const { return w; }
 protected:
    ushort w;
 #endif
 };
 }
 #endif
 //! @}
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@ -252,7 +252,8 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \
    CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \
-    CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load)
+    CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \
    CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix)
 template<typename _Tp> struct V_RegTraits
 {
@ -286,9 +287,6 @@ template<typename _Tp> struct V_RegTraits
 #if CV_SIMD128_64F
    CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
 #endif
 #if CV_SIMD128_FP16
    CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float16x8, void, void, v_int16x8, v_int16x8);
 #endif
 #endif
 #if CV_SIMD256
@ -302,9 +300,6 @@ template<typename _Tp> struct V_RegTraits
    CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
    CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
    CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
 #if CV_SIMD256_FP16
    CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float16x16, void, void, v_int16x16, void);
 #endif
 #endif
 #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
@ -335,14 +330,6 @@ namespace CV__SIMD_NAMESPACE {
    #if CV_SIMD256_64F
    typedef v_float64x4  v_float64;
    #endif
    #if CV_FP16
    #define vx_load_fp16_f32 v256_load_fp16_f32
    #define vx_store_fp16 v_store_fp16
    #endif
    #if CV_SIMD256_FP16
    typedef v_float16x16  v_float16;
    CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16)
    #endif
    CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
    CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
    inline void vx_cleanup() { v256_cleanup(); }
@ -353,7 +340,6 @@ using namespace CV__SIMD_NAMESPACE;
 namespace CV__SIMD_NAMESPACE {
    #define CV_SIMD CV_SIMD128
    #define CV_SIMD_64F CV_SIMD128_64F
    #define CV_SIMD_FP16 CV_SIMD128_FP16
    #define CV_SIMD_WIDTH 16
    typedef v_uint8x16  v_uint8;
    typedef v_int8x16   v_int8;
@ -367,14 +353,6 @@ namespace CV__SIMD_NAMESPACE {
    #if CV_SIMD128_64F
    typedef v_float64x2 v_float64;
    #endif
    #if CV_FP16
    #define vx_load_fp16_f32 v128_load_fp16_f32
    #define vx_store_fp16 v_store_fp16
    #endif
    #if CV_SIMD128_FP16
    typedef v_float16x8  v_float16;
    CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16)
    #endif
    CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v)
    #if CV_SIMD128_64F
    CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@ -1414,10 +1414,17 @@ inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
 { return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }
 inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
-{ return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); }
+{
    __m256i t = _mm256_set1_epi16(255);
    __m256i a1 = _mm256_min_epu16(a.val, t);
    __m256i b1 = _mm256_min_epu16(b.val, t);
    return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a1, b1)));
 }
 inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
-{ return v_pack(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); }
+{
    return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val)));
 }
 inline void v_pack_store(schar* ptr, const v_int16x16& a)
 { v_store_low(ptr, v_pack(a, a)); }
@ -2390,6 +2397,18 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, un
 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
 // FP16
 inline v_float32x8 v256_load_expand(const float16_t* ptr)
 {
    return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
 }
 inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
 {
    __m128i ah = _mm256_cvtps_ph(a.val, 0);
    _mm_storeu_si128((__m128i*)ptr, ah);
 }
 inline void v256_cleanup() { _mm256_zeroupper(); }
 //! @name Check SIMD256 support
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@ -2062,6 +2062,28 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
 }
 ////// FP16 suport ///////
 inline v_reg<float, V_TypeTraits<float>::nlanes128>
 v_load_expand(const float16_t* ptr)
 {
    v_reg<float, V_TypeTraits<float>::nlanes128> v;
    for( int i = 0; i < v.nlanes; i++ )
    {
        v.s[i] = ptr[i];
    }
    return v;
 }
 inline void
 v_pack_store(float16_t* ptr, v_reg<float, V_TypeTraits<float>::nlanes128>& v)
 {
    for( int i = 0; i < v.nlanes; i++ )
    {
        ptr[i] = float16_t(v.s[i]);
    }
 }
 inline void v_cleanup() {}
 //! @}
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@ -62,15 +62,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD128_64F 0
 #endif
 #ifndef CV_SIMD128_FP16
 # if CV_FP16 && (defined(__GNUC__) && __GNUC__ >= 5)  // #12027: float16x8_t is missing in GCC 4.8.2
 #   define CV_SIMD128_FP16 1
 # endif
 #endif
 #ifndef CV_SIMD128_FP16
 # define CV_SIMD128_FP16 0
 #endif
 #if CV_SIMD128_64F
 #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
 template <typename T> static inline \
@ -329,53 +320,6 @@ inline void v_store_fp16(short* ptr, const v_float32x4& a)
 }
 #endif
 #if CV_SIMD128_FP16
 // Workaround for old compilers
 static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; }
 static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; }
 static inline float16x8_t cv_vld1q_f16(const void* ptr)
 {
 #ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro
    return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr));
 #else
    return vld1q_f16((const __fp16*)ptr);
 #endif
 }
 static inline void cv_vst1q_f16(void* ptr, float16x8_t a)
 {
 #ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro
    vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a));
 #else
    vst1q_f16((__fp16*)ptr, a);
 #endif
 }
 struct v_float16x8
 {
    typedef short lane_type;
    enum { nlanes = 8 };
    v_float16x8() {}
    explicit v_float16x8(float16x8_t v) : val(v) {}
    v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = cv_vld1q_f16(v);
    }
    short get0() const
    {
        return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0);
    }
    float16x8_t val;
 };
 inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); }
 inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); }
 #endif // CV_SIMD128_FP16
 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
@ -934,24 +878,6 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
 #endif
 #if CV_SIMD128_FP16
 // Workaround for old comiplers
 inline v_float16x8 v_load_f16(const short* ptr)
 { return v_float16x8(cv_vld1q_f16(ptr)); }
 inline v_float16x8 v_load_f16_aligned(const short* ptr)
 { return v_float16x8(cv_vld1q_f16(ptr)); }
 inline v_float16x8 v_load_f16_low(const short* ptr)
 { return v_float16x8(vcombine_f16(cv_vld1_f16(ptr), vdup_n_f16((float16_t)0))); }
 inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1)
 { return v_float16x8(vcombine_f16(cv_vld1_f16(ptr0), cv_vld1_f16(ptr1))); }
 inline void v_store(short* ptr, const v_float16x8& a)
 { cv_vst1q_f16(ptr, a.val); }
 inline void v_store_aligned(short* ptr, const v_float16x8& a)
 { cv_vst1q_f16(ptr, a.val); }
 #endif
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@ -1507,22 +1433,6 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 }
 #endif
 #if CV_SIMD128_FP16
 inline v_float32x4 v_cvt_f32(const v_float16x8& a)
 {
    return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val)));
 }
 inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
 {
    return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val)));
 }
 inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
 {
    return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val)));
 }
 #endif
 ////////////// Lookup table access ////////////////////
 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
@ -1588,6 +1498,47 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
 }
 #endif
 ////// FP16 suport ///////
 #if CV_FP16
 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
    float16x4_t v =
    #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
        (float16x4_t)vld1_s16((const short*)ptr);
    #else
        vld1_f16((const __fp16*)ptr);
    #endif
    return v_float32x4(vcvt_f32_f16(v));
 }
 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
    float16x4_t hv = vcvt_f16_f32(v.val);
    #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
        vst1_s16((short*)ptr, (int16x4_t)hv);
    #else
        vst1_f16((__fp16*)ptr, hv);
    #endif
 }
 #else
 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
    const int N = 4;
    float buf[N];
    for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
    return v_load(buf);
 }
 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
    const int N = 4;
    float buf[N];
    v_store(buf, v);
    for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
 }
 #endif
 inline void v_cleanup() {}
 //! @name Check SIMD support
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@ -404,7 +404,7 @@ void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
 { return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
-inline void v_pack_store(schar* ptr, v_int16x8& a)
+inline void v_pack_store(schar* ptr, const v_int16x8& a)
 { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
 template<int n> inline
@ -2655,6 +2655,50 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
    y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
 }
 ////////////// FP16 support ///////////////////////////
 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
    const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
    const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
    const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
    __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16
    __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
    __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta
    __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf));
    t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e)));
    __m128i zmask = _mm_cmpeq_epi32(e, z);
    __m128i ft = v_select_si128(zmask, zt, t);
    return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
 }
 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
    const __m128i signmask = _mm_set1_epi32(0x80000000);
    const __m128i rval = _mm_set1_epi32(0x3f000000);
    __m128i t = _mm_castps_si128(v.val);
    __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16);
    t = _mm_andnot_si128(signmask, t);
    __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t);
    __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000));
    __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00));
    __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t);
    __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
    tt = _mm_sub_epi32(tt, rval);
    __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1));
    __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff));
    nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
    t = v_select_si128(tinymask, tt, nt);
    t = v_select_si128(finitemask, t, naninf);
    t = _mm_or_si128(t, sign);
    t = _mm_packs_epi32(t, t);
    _mm_storel_epi64((__m128i*)ptr, t);
 }
 inline void v_cleanup() {}
 //! @name Check SIMD support
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@ -916,6 +916,24 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
 }
 /////// FP16 support ////////
 // [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt)
 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
    return v_float32x4((float)ptr[0], (float)ptr[1], (float)ptr[2], (float)ptr[3]);
 }
 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
    float CV_DECL_ALIGNED(32) f[4];
    v_store_aligned(f, v);
    ptr[0] = float16_t(f[0]);
    ptr[1] = float16_t(f[1]);
    ptr[2] = float16_t(f[2]);
    ptr[3] = float16_t(f[3]);
 }
 inline void v_cleanup() {}
--- a/modules/core/perf/perf_addWeighted.cpp
+++ b/modules/core/perf/perf_addWeighted.cpp
@ -11,6 +11,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED)
 {
    Size size = get<0>(GetParam());
    int type = get<1>(GetParam());
    int depth = CV_MAT_DEPTH(type);
    Mat src1(size, type);
    Mat src2(size, type);
    double alpha = 3.75;
@ -21,7 +22,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED)
    declare.in(src1, src2, dst, WARMUP_RNG).out(dst);
-    if (CV_MAT_DEPTH(type) == CV_32S)
+    if (depth == CV_32S)
    {
        // there might be not enough precision for integers
        src1 /= 2048;
@ -30,7 +31,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED)
    TEST_CYCLE() cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() );
-    SANITY_CHECK(dst, 1);
+    SANITY_CHECK(dst, depth == CV_32S ? 4 : 1);
 }
 } // namespace
--- a/modules/core/perf/perf_convertTo.cpp
+++ b/modules/core/perf/perf_convertTo.cpp
@ -33,7 +33,7 @@ PERF_TEST_P( Size_DepthSrc_DepthDst_Channels_alpha, convertTo,
    int runs = (sz.width <= 640) ? 8 : 1;
    TEST_CYCLE_MULTIRUN(runs) src.convertTo(dst, depthDst, alpha);
-    double eps = depthSrc <= CV_32S ? 1e-12 : (FLT_EPSILON * maxValue);
+    double eps = depthSrc <= CV_32S && (depthDst <= CV_32S || depthDst == CV_64F) ? 1e-12 : (FLT_EPSILON * maxValue);
    eps = eps * std::max(1.0, fabs(alpha));
    SANITY_CHECK(dst, eps);
 }
--- a/modules/core/src/convert.avx2.cpp
+++ b/modules/core/src/convert.avx2.cpp
@ -1,40 +0,0 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html
 #include "precomp.hpp"
 #include "convert.hpp"
 namespace cv
 {
 namespace opt_AVX2
 {
 void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width)
 {
    int x = 0;
    __m256 scale256 = _mm256_set1_ps(scale);
    __m256 shift256 = _mm256_set1_ps(shift);
    const int shuffle = 0xD8;
    for (; x <= width - 16; x += 16)
    {
        __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x));
        v_src = _mm256_permute4x64_epi64(v_src, shuffle);
        __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16);
        __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
        __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256);
        __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256);
        _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
        _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
    }
    for (; x < width; x++)
        dst[x] = saturate_cast<int>(src[x] * scale + shift);
 }
 }
 } // cv::
 /* End of file. */
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
--- a/modules/core/src/convert.fp16.cpp
+++ b/modules/core/src/convert.fp16.cpp
@ -1,126 +0,0 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html
 #include "precomp.hpp"
 #include "convert.hpp"
 namespace cv
 {
 namespace opt_FP16
 {
 #if !defined(CV_NEON) || !CV_NEON
 const static int cVectorWidth = 8;
 void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size )
 {
    CV_INSTRUMENT_REGION()
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);
    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
        for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth )
        {
            __m256 v_src = _mm256_loadu_ps(src + x);
            // round to nearest even
            __m128i v_dst = _mm256_cvtps_ph(v_src, 0);
            _mm_storeu_si128((__m128i*)(dst + x), v_dst);
        }
        for ( ; x < size.width; x++ )
        {
            dst[x] = convertFp16SW(src[x]);
        }
    }
 }
 void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size )
 {
    CV_INSTRUMENT_REGION()
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);
    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
        for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth )
        {
            __m128i v_src = _mm_loadu_si128((__m128i*)(src + x));
            __m256 v_dst = _mm256_cvtph_ps(v_src);
            _mm256_storeu_ps(dst + x, v_dst);
        }
        for ( ; x < size.width; x++ )
        {
            dst[x] = convertFp16SW(src[x]);
        }
    }
 }
 #elif CV_NEON
 const static int cVectorWidth = 4;
 void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size )
 {
    CV_INSTRUMENT_REGION()
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);
    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
        for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth)
        {
            float32x4_t v_src = vld1q_f32(src + x);
            float16x4_t v_dst = vcvt_f16_f32(v_src);
            cv_vst1_f16(dst + x, v_dst);
        }
        for ( ; x < size.width; x++ )
        {
            dst[x] = convertFp16SW(src[x]);
        }
    }
 }
 void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size )
 {
    CV_INSTRUMENT_REGION()
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);
    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
        for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth )
        {
            float16x4_t v_src = cv_vld1_f16((__fp16*)src + x);
            float32x4_t v_dst = vcvt_f32_f16(v_src);
            vst1q_f32(dst + x, v_dst);
        }
        for ( ; x < size.width; x++ )
        {
            dst[x] = convertFp16SW(src[x]);
        }
    }
 }
 #else
 #error "Unsupported build configuration"
 #endif
 }
 } // cv::
--- a/modules/core/src/convert.hpp
+++ b/modules/core/src/convert.hpp
@ -8,192 +8,402 @@
 #include "opencv2/core/types.hpp"
-namespace
+namespace cv
 {
 float convertFp16SW(short fp16);
 short convertFp16SW(float fp32);
-#if !CV_FP16_TYPE
+#if CV_SIMD
-// const numbers for floating points format
+
-const unsigned int kShiftSignificand    = 13;
+static inline void vx_load_as(const uchar* ptr, v_float32& a)
-const unsigned int kMaskFp16Significand = 0x3ff;
+{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); }
-const unsigned int kBiasFp16Exponent    = 15;
+
-const unsigned int kBiasFp32Exponent    = 127;
+static inline void vx_load_as(const schar* ptr, v_float32& a)
-#endif
+{ a = v_cvt_f32(vx_load_expand_q(ptr)); }
 static inline void vx_load_as(const ushort* ptr, v_float32& a)
 { a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); }
 static inline void vx_load_as(const short* ptr, v_float32& a)
 { a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); }
 static inline void vx_load_as(const int* ptr, v_float32& a)
 { a = v_cvt_f32(vx_load(ptr)); }
 static inline void vx_load_as(const float* ptr, v_float32& a)
 { a = vx_load(ptr); }
 static inline void vx_load_as(const float16_t* ptr, v_float32& a)
 { a = vx_load_expand(ptr); }
 static inline void v_store_as(ushort* ptr, const v_float32& a)
 { v_pack_u_store(ptr, v_round(a)); }
-#if CV_FP16_TYPE
+static inline void v_store_as(short* ptr, const v_float32& a)
-inline float convertFp16SW(short fp16)
+{ v_pack_store(ptr, v_round(a)); }
 static inline void v_store_as(int* ptr, const v_float32& a)
 { v_store(ptr, v_round(a)); }
 static inline void v_store_as(float* ptr, const v_float32& a)
 { v_store(ptr, a); }
 static inline void v_store_as(float16_t* ptr, const v_float32& a)
 { v_pack_store(ptr, a); }
 static inline void vx_load_pair_as(const uchar* ptr, v_uint16& a, v_uint16& b)
 { v_expand(vx_load(ptr), a, b); }
 static inline void vx_load_pair_as(const schar* ptr, v_uint16& a, v_uint16& b)
 {
-    // Fp16 -> Fp32
+    const v_int8 z = vx_setzero_s8();
-    Cv16suf a;
+    v_int16 sa, sb;
-    a.i = fp16;
+    v_expand(v_max(vx_load(ptr), z), sa, sb);
-    return (float)a.h;
+    a = v_reinterpret_as_u16(sa);
    b = v_reinterpret_as_u16(sb);
 }
-#else
+
-inline float convertFp16SW(short fp16)
+static inline void vx_load_pair_as(const ushort* ptr, v_uint16& a, v_uint16& b)
-{
+{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
-    // Fp16 -> Fp32
+
-    Cv16suf b;
+static inline void vx_load_pair_as(const uchar* ptr, v_int16& a, v_int16& b)
-    b.i = fp16;
+{
-    int exponent    = b.fmt.exponent - kBiasFp16Exponent;
+    v_uint16 ua, ub;
-    int significand = b.fmt.significand;
+    v_expand(vx_load(ptr), ua, ub);
-
+    a = v_reinterpret_as_s16(ua);
-    Cv32suf a;
+    b = v_reinterpret_as_s16(ub);
    a.i = 0;
    a.fmt.sign = b.fmt.sign; // sign bit
    if( exponent == 16 )
    {
        // Inf or NaN
        a.i = a.i | 0x7F800000;
        if( significand != 0 )
        {
            // NaN
 #if defined(__x86_64__) || defined(_M_X64)
            // 64bit
            a.i = a.i | 0x7FC00000;
 #endif
            a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand);
        }
        return a.f;
    }
    else if ( exponent == -(int)kBiasFp16Exponent )
    {
        // subnormal in Fp16
        if( significand == 0 )
        {
            // zero
            return a.f;
        }
        else
        {
            int shift = -1;
            while( ( significand & 0x400 ) == 0 )
            {
                significand = significand << 1;
                shift++;
            }
            significand = significand & kMaskFp16Significand;
            exponent -= shift;
        }
    }
    a.fmt.exponent = (exponent+kBiasFp32Exponent);
    a.fmt.significand = significand << kShiftSignificand;
    return a.f;
 }
 #endif
 #if CV_FP16_TYPE
 inline short convertFp16SW(float fp32)
 {
    // Fp32 -> Fp16
    Cv16suf a;
    a.h = (__fp16)fp32;
    return a.i;
 }
 #else
 inline short convertFp16SW(float fp32)
 {
    // Fp32 -> Fp16
    Cv32suf a;
    a.f = fp32;
    int exponent    = a.fmt.exponent - kBiasFp32Exponent;
    int significand = a.fmt.significand;
    Cv16suf result;
    result.i = 0;
    unsigned int absolute = a.i & 0x7fffffff;
    if( 0x477ff000 <= absolute )
    {
        // Inf in Fp16
        result.i = result.i | 0x7C00;
        if( exponent == 128 && significand != 0 )
        {
            // NaN
            result.i = (short)( result.i | 0x200 | ( significand >> kShiftSignificand ) );
        }
    }
    else if ( absolute < 0x33000001 )
    {
        // too small for fp16
        result.i = 0;
    }
    else if ( absolute < 0x387fe000 )
    {
        // subnormal in Fp16
        int fp16Significand = significand | 0x800000;
        int bitShift = (-exponent) - 1;
        fp16Significand = fp16Significand >> bitShift;
        // special cases to round up
        bitShift = exponent + 24;
        int threshold = ( ( 0x400000 >> bitShift ) | ( ( ( significand & ( 0x800000 >> bitShift ) ) >> ( 126 - a.fmt.exponent ) ) ^ 1 ) );
        if( absolute == 0x33c00000 )
        {
            result.i = 2;
        }
        else
        {
            if( threshold <= ( significand & ( 0xffffff >> ( exponent + 25 ) ) ) )
            {
                fp16Significand++;
            }
            result.i = (short)fp16Significand;
        }
    }
    else
    {
        // usual situation
        // exponent
        result.fmt.exponent = ( exponent + kBiasFp16Exponent );
        // significand;
        short fp16Significand = (short)(significand >> kShiftSignificand);
        result.fmt.significand = fp16Significand;
        // special cases to round up
        short lsb10bitsFp32 = (significand & 0x1fff);
        short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 0 : 1 );
        if( threshold <= lsb10bitsFp32 )
        {
            result.i++;
        }
        else if ( fp16Significand == kMaskFp16Significand && exponent == -15)
        {
            result.i++;
        }
    }
    // sign bit
    result.fmt.sign = a.fmt.sign;
    return result.i;
 }
 #endif
 static inline void vx_load_pair_as(const schar* ptr, v_int16& a, v_int16& b)
 { v_expand(vx_load(ptr), a, b); }
 static inline void vx_load_pair_as(const short* ptr, v_int16& a, v_int16& b)
 { a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
 static inline void vx_load_pair_as(const uchar* ptr, v_int32& a, v_int32& b)
 {
    v_uint32 ua, ub;
    v_expand(vx_load_expand(ptr), ua, ub);
    a = v_reinterpret_as_s32(ua);
    b = v_reinterpret_as_s32(ub);
 }
-namespace cv
+static inline void vx_load_pair_as(const schar* ptr, v_int32& a, v_int32& b)
 { v_expand(vx_load_expand(ptr), a, b); }
 static inline void vx_load_pair_as(const ushort* ptr, v_int32& a, v_int32& b)
 {
    v_uint32 ua, ub;
    v_expand(vx_load(ptr), ua, ub);
    a = v_reinterpret_as_s32(ua);
    b = v_reinterpret_as_s32(ub);
 }
 static inline void vx_load_pair_as(const short* ptr, v_int32& a, v_int32& b)
 {
    v_expand(vx_load(ptr), a, b);
 }
 static inline void vx_load_pair_as(const int* ptr, v_int32& a, v_int32& b)
 {
    a = vx_load(ptr);
    b = vx_load(ptr + v_int32::nlanes);
 }
 static inline void vx_load_pair_as(const uchar* ptr, v_float32& a, v_float32& b)
 {
    v_uint32 ua, ub;
    v_expand(vx_load_expand(ptr), ua, ub);
    a = v_cvt_f32(v_reinterpret_as_s32(ua));
    b = v_cvt_f32(v_reinterpret_as_s32(ub));
 }
 static inline void vx_load_pair_as(const schar* ptr, v_float32& a, v_float32& b)
 {
    v_int32 ia, ib;
    v_expand(vx_load_expand(ptr), ia, ib);
    a = v_cvt_f32(ia);
    b = v_cvt_f32(ib);
 }
 static inline void vx_load_pair_as(const ushort* ptr, v_float32& a, v_float32& b)
 {
    v_uint32 ua, ub;
    v_expand(vx_load(ptr), ua, ub);
    a = v_cvt_f32(v_reinterpret_as_s32(ua));
    b = v_cvt_f32(v_reinterpret_as_s32(ub));
 }
 static inline void vx_load_pair_as(const short* ptr, v_float32& a, v_float32& b)
 {
    v_int32 ia, ib;
    v_expand(vx_load(ptr), ia, ib);
    a = v_cvt_f32(ia);
    b = v_cvt_f32(ib);
 }
 static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b)
 {
    v_int32 ia = vx_load(ptr), ib = vx_load(ptr + v_int32::nlanes);
    a = v_cvt_f32(ia);
    b = v_cvt_f32(ib);
 }
 static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b)
 { a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); }
 //static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
 //{
 //    a = vx_load_expand(ptr);
 //    b = vx_load_expand(ptr + v_float32::nlanes);
 //}
 static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b)
 {
    v_store(ptr, v_pack(a, b));
 }
 static inline void v_store_pair_as(schar* ptr, const v_uint16& a, const v_uint16& b)
 {
    const v_uint8 maxval = vx_setall_u8((uchar)std::numeric_limits<schar>::max());
    v_uint8 v = v_pack(a, b);
    v_store(ptr, v_reinterpret_as_s8(v_min(v, maxval)));
 }
 static inline void v_store_pair_as(ushort* ptr, const v_uint16& a, const v_uint16& b)
 { v_store(ptr, a); v_store(ptr + v_uint16::nlanes, b); }
 static inline void v_store_pair_as(uchar* ptr, const v_int16& a, const v_int16& b)
 { v_store(ptr, v_pack_u(a, b)); }
 static inline void v_store_pair_as(schar* ptr, const v_int16& a, const v_int16& b)
 { v_store(ptr, v_pack(a, b)); }
 static inline void v_store_pair_as(short* ptr, const v_int16& a, const v_int16& b)
 { v_store(ptr, a); v_store(ptr + v_int16::nlanes, b); }
 static inline void v_store_pair_as(uchar* ptr, const v_int32& a, const v_int32& b)
 { v_pack_u_store(ptr, v_pack(a, b)); }
 static inline void v_store_pair_as(schar* ptr, const v_int32& a, const v_int32& b)
 { v_pack_store(ptr, v_pack(a, b)); }
 static inline void v_store_pair_as(ushort* ptr, const v_int32& a, const v_int32& b)
 { v_store(ptr, v_pack_u(a, b)); }
 static inline void v_store_pair_as(short* ptr, const v_int32& a, const v_int32& b)
 { v_store(ptr, v_pack(a, b)); }
 static inline void v_store_pair_as(int* ptr, const v_int32& a, const v_int32& b)
 {
    v_store(ptr, a);
    v_store(ptr + v_int32::nlanes, b);
 }
 static inline void v_store_pair_as(uchar* ptr, const v_float32& a, const v_float32& b)
 { v_pack_u_store(ptr, v_pack(v_round(a), v_round(b))); }
 static inline void v_store_pair_as(schar* ptr, const v_float32& a, const v_float32& b)
 { v_pack_store(ptr, v_pack(v_round(a), v_round(b))); }
 static inline void v_store_pair_as(ushort* ptr, const v_float32& a, const v_float32& b)
 { v_store(ptr, v_pack_u(v_round(a), v_round(b))); }
 static inline void v_store_pair_as(short* ptr, const v_float32& a, const v_float32& b)
 { v_store(ptr, v_pack(v_round(a), v_round(b))); }
 static inline void v_store_pair_as(int* ptr, const v_float32& a, const v_float32& b)
 {
    v_int32 ia = v_round(a), ib = v_round(b);
    v_store(ptr, ia);
    v_store(ptr + v_int32::nlanes, ib);
 }
 static inline void v_store_pair_as(float* ptr, const v_float32& a, const v_float32& b)
 { v_store(ptr, a); v_store(ptr + v_float32::nlanes, b); }
 #if CV_SIMD_64F
 static inline void vx_load_as(const double* ptr, v_float32& a)
 {
    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
    a = v_cvt_f32(v0, v1);
 }
 static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
 {
    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
    v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
    v_int32 iv0 = v_round(v0), iv1 = v_round(v1);
    v_int32 iv2 = v_round(v2), iv3 = v_round(v3);
    a = v_combine_low(iv0, iv1);
    b = v_combine_low(iv2, iv3);
 }
 static inline void vx_load_pair_as(const double* ptr, v_float32& a, v_float32& b)
 {
    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
    v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
    a = v_cvt_f32(v0, v1);
    b = v_cvt_f32(v2, v3);
 }
 static inline void vx_load_pair_as(const uchar* ptr, v_float64& a, v_float64& b)
 {
    v_int32 v0 = v_reinterpret_as_s32(vx_load_expand_q(ptr));
    a = v_cvt_f64(v0);
    b = v_cvt_f64_high(v0);
 }
 static inline void vx_load_pair_as(const schar* ptr, v_float64& a, v_float64& b)
 {
-namespace opt_FP16
+    v_int32 v0 = vx_load_expand_q(ptr);
    a = v_cvt_f64(v0);
    b = v_cvt_f64_high(v0);
 }
 static inline void vx_load_pair_as(const ushort* ptr, v_float64& a, v_float64& b)
 {
    v_int32 v0 = v_reinterpret_as_s32(vx_load_expand(ptr));
    a = v_cvt_f64(v0);
    b = v_cvt_f64_high(v0);
 }
 static inline void vx_load_pair_as(const short* ptr, v_float64& a, v_float64& b)
 {
    v_int32 v0 = vx_load_expand(ptr);
    a = v_cvt_f64(v0);
    b = v_cvt_f64_high(v0);
 }
 static inline void vx_load_pair_as(const int* ptr, v_float64& a, v_float64& b)
 {
    v_int32 v0 = vx_load(ptr);
    a = v_cvt_f64(v0);
    b = v_cvt_f64_high(v0);
 }
 static inline void vx_load_pair_as(const float* ptr, v_float64& a, v_float64& b)
 {
-void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size );
+    v_float32 v0 = vx_load(ptr);
-void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size );
+    a = v_cvt_f64(v0);
    b = v_cvt_f64_high(v0);
 }
-namespace opt_AVX2
+
 static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b)
 {
-void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width);
+    a = vx_load(ptr);
    b = vx_load(ptr + v_float64::nlanes);
 }
-namespace opt_SSE4_1
+
 //static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
 //{
 //    v_float32 v0 = vx_load_expand(ptr);
 //    a = v_cvt_f64(v0);
 //    b = v_cvt_f64_high(v0);
 //}
 static inline void v_store_as(double* ptr, const v_float32& a)
 {
-    int cvtScale_SIMD_u8u16f32_SSE41(const uchar * src, ushort * dst, int width, float scale, float shift);
+    v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a);
-    int cvtScale_SIMD_s8u16f32_SSE41(const schar * src, ushort * dst, int width, float scale, float shift);
+    v_store(ptr, fa0);
-    int cvtScale_SIMD_u16u16f32_SSE41(const ushort * src, ushort * dst, int width, float scale, float shift);
+    v_store(ptr + v_float64::nlanes, fa1);
-    int cvtScale_SIMD_s16u16f32_SSE41(const short * src, ushort * dst, int width, float scale, float shift);
+}
-    int cvtScale_SIMD_s32u16f32_SSE41(const int * src, ushort * dst, int width, float scale, float shift);
+
-    int cvtScale_SIMD_f32u16f32_SSE41(const float * src, ushort * dst, int width, float scale, float shift);
+static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32& b)
-    int cvtScale_SIMD_f64u16f32_SSE41(const double * src, ushort * dst, int width, float scale, float shift);
+{
-    int Cvt_SIMD_f64u16_SSE41(const double * src, ushort * dst, int width);
+    v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a);
    v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
    v_store(ptr, fa0);
    v_store(ptr + v_float64::nlanes, fa1);
    v_store(ptr + v_float64::nlanes*2, fb0);
    v_store(ptr + v_float64::nlanes*3, fb1);
 }
 static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_float32& b)
 {
    v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a);
    v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
    v_store(ptr, fa0);
    v_store(ptr + v_float64::nlanes, fa1);
    v_store(ptr + v_float64::nlanes*2, fb0);
    v_store(ptr + v_float64::nlanes*3, fb1);
 }
 static inline void v_store_pair_as(double* ptr, const v_float64& a, const v_float64& b)
 {
    v_store(ptr, a);
    v_store(ptr + v_float64::nlanes, b);
 }
 static inline void v_store_pair_as(int* ptr, const v_float64& a, const v_float64& b)
 {
    v_int32 ia = v_round(a), ib = v_round(b);
    v_store(ptr, v_combine_low(ia, ib));
 }
 static inline void v_store_pair_as(float* ptr, const v_float64& a, const v_float64& b)
 {
    v_float32 v = v_cvt_f32(a, b);
    v_store(ptr, v);
 }
 //static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b)
 //{
 //    v_float32 v = v_cvt_f32(a, b);
 //    v_pack_store(ptr, v);
 //}
 #else
 static inline void vx_load_as(const double* ptr, v_float32& a)
 {
    const int VECSZ = v_float32::nlanes;
    float buf[VECSZ*2];
    for( int i = 0; i < VECSZ; i++ )
        buf[i] = saturate_cast<float>(ptr[i]);
    a = vx_load(buf);
 }
 template<typename _Tdvec>
 static inline void vx_load_pair_as(const double* ptr, _Tdvec& a, _Tdvec& b)
 {
    const int VECSZ = _Tdvec::nlanes;
    typename _Tdvec::lane_type buf[VECSZ*2];
    for( int i = 0; i < VECSZ*2; i++ )
        buf[i] = saturate_cast<typename _Tdvec::lane_type>(ptr[i]);
    a = vx_load(buf);
    b = vx_load(buf + VECSZ);
 }
 static inline void v_store_as(double* ptr, const v_float32& a)
 {
    const int VECSZ = v_float32::nlanes;
    float buf[VECSZ];
    v_store(buf, a);
    for( int i = 0; i < VECSZ; i++ )
        ptr[i] = (double)buf[i];
 }
 template<typename _Tsvec>
 static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b)
 {
    const int VECSZ = _Tsvec::nlanes;
    typename _Tsvec::lane_type buf[VECSZ*2];
    v_store(buf, a); v_store(buf + VECSZ, b);
    for( int i = 0; i < VECSZ*2; i++ )
        ptr[i] = (double)buf[i];
 }
 #endif /////////// CV_SIMD_64F
 #endif /////////// CV_SIMD
 }
 #endif // SRC_CONVERT_HPP
--- a/modules/core/src/convert.sse4_1.cpp
+++ b/modules/core/src/convert.sse4_1.cpp
@ -1,203 +0,0 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html
 #include "precomp.hpp"
 #include "convert.hpp"
 namespace cv
 {
 namespace opt_SSE4_1
 {
 int cvtScale_SIMD_u8u16f32_SSE41(const uchar * src, ushort * dst, int width, float scale, float shift)
 {
    int x = 0;
    __m128i v_zero = _mm_setzero_si128();
    __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
    for ( ; x <= width - 8; x += 8)
    {
        __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
        __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
        __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
        v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
        __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
        __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
        _mm_storeu_si128((__m128i *)(dst + x), v_dst);
    }
    return x;
 }
 int cvtScale_SIMD_s8u16f32_SSE41(const schar * src, ushort * dst, int width, float scale, float shift)
 {
    int x = 0;
    __m128i v_zero = _mm_setzero_si128();
    __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
    for ( ; x <= width - 8; x += 8)
    {
        __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
        __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
        __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
        v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
        __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
        __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
        _mm_storeu_si128((__m128i *)(dst + x), v_dst);
    }
    return x;
 }
 int cvtScale_SIMD_u16u16f32_SSE41(const ushort * src, ushort * dst, int width, float scale, float shift)
 {
    int x = 0;
    __m128i v_zero = _mm_setzero_si128();
    __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
    for ( ; x <= width - 8; x += 8)
    {
        __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
        __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
        __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
        v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
        __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
        __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
        _mm_storeu_si128((__m128i *)(dst + x), v_dst);
    }
    return x;
 }
 int cvtScale_SIMD_s16u16f32_SSE41(const short * src, ushort * dst, int width, float scale, float shift)
 {
    int x = 0;
    __m128i v_zero = _mm_setzero_si128();
    __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
    for ( ; x <= width - 8; x += 8)
    {
        __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
        __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
        __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
        v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
        __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
        __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
        _mm_storeu_si128((__m128i *)(dst + x), v_dst);
    }
    return x;
 }
 int cvtScale_SIMD_s32u16f32_SSE41(const int * src, ushort * dst, int width, float scale, float shift)
 {
    int x = 0;
    __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
    for ( ; x <= width - 8; x += 8)
    {
        __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
        __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
        v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
        __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
        __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
        _mm_storeu_si128((__m128i *)(dst + x), v_dst);
    }
    return x;
 }
 int cvtScale_SIMD_f32u16f32_SSE41(const float * src, ushort * dst, int width, float scale, float shift)
 {
    int x = 0;
    __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
    for ( ; x <= width - 8; x += 8)
    {
        __m128 v_src = _mm_loadu_ps(src + x);
        __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
        v_src = _mm_loadu_ps(src + x + 4);
        __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
        __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
        _mm_storeu_si128((__m128i *)(dst + x), v_dst);
    }
    return x;
 }
 int cvtScale_SIMD_f64u16f32_SSE41(const double * src, ushort * dst, int width, float scale, float shift)
 {
    int x = 0;
    __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
    for ( ; x <= width - 8; x += 8)
    {
        __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
                                        _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
        __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
        v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
                                _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
        __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
        __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
                                            _mm_cvtps_epi32(v_dst_1));
        _mm_storeu_si128((__m128i *)(dst + x), v_dst);
    }
    return x;
 }
 int Cvt_SIMD_f64u16_SSE41(const double * src, ushort * dst, int width)
 {
    int x = 0;
    for ( ; x <= width - 8; x += 8)
    {
        __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
        __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
        __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
        __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
        v_src0 = _mm_movelh_ps(v_src0, v_src1);
        v_src1 = _mm_movelh_ps(v_src2, v_src3);
        __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0),
                                            _mm_cvtps_epi32(v_src1));
        _mm_storeu_si128((__m128i *)(dst + x), v_dst);
    }
    return x;
 }
 }
 } // cv::
 /* End of file. */
--- a/modules/core/src/convert_scale.cpp
+++ b/modules/core/src/convert_scale.cpp
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@ -1123,7 +1123,6 @@ template<typename R> struct TheTest
        return *this;
    }
 #if CV_FP16
    TheTest & test_loadstore_fp16_f32()
    {
        printf("test_loadstore_fp16_f32 ...\n");
@ -1133,14 +1132,14 @@ template<typename R> struct TheTest
        AlignedData<v_float32> data_f32; data_f32.a.clear();
        AlignedData<v_uint16> out;
-        R r1 = vx_load_fp16_f32((short*)data.a.d);
+        R r1 = vx_load_expand((const cv::float16_t*)data.a.d);
        R r2(r1);
        EXPECT_EQ(1.0f, r1.get0());
        vx_store(data_f32.a.d, r2);
        EXPECT_EQ(-2.0f, data_f32.a.d[R::nlanes - 1]);
        out.a.clear();
-        vx_store_fp16((short*)out.a.d, r2);
+        v_pack_store((cv::float16_t*)out.a.d, r2);
        for (int i = 0; i < R::nlanes; ++i)
        {
            EXPECT_EQ(data.a[i], out.a[i]) << "i=" << i;
@ -1148,9 +1147,8 @@ template<typename R> struct TheTest
        return *this;
    }
 #endif
-#if CV_SIMD_FP16
+#if 0
    TheTest & test_loadstore_fp16()
    {
        printf("test_loadstore_fp16 ...\n");
@ -1165,7 +1163,7 @@ template<typename R> struct TheTest
        // check some initialization methods
        R r1 = data.u;
-        R r2 = vx_load_f16(data.a.d);
+        R r2 = vx_load_expand((const float16_t*)data.a.d);
        R r3(r2);
        EXPECT_EQ(data.u[0], r1.get0());
        EXPECT_EQ(data.a[0], r2.get0());
--- a/modules/core/test/test_math.cpp
+++ b/modules/core/test/test_math.cpp
@ -3230,6 +3230,22 @@ softdouble naiveExp(softdouble x)
    }
 }
 static float makeFP32(int sign, int exponent, int significand)
 {
    Cv32suf x;
    x.u = (unsigned)(((sign & 1) << 31) | ((exponent&255) << 23) | (significand & 0x7fffff));
    return x.f;
 }
 static float makeRandomFP32(RNG& rng, int sign, int exprange)
 {
    if( sign == -1 )
        sign = rng() % 2;
    int exponent = rng() % exprange;
    int significand = rng() % (1 << 23);
    return makeFP32(sign, exponent, significand);
 }
 TEST(Core_SoftFloat, exp32)
 {
    //special cases
@ -3246,13 +3262,11 @@ TEST(Core_SoftFloat, exp32)
    inputs.push_back(softfloat::min());
    for(int i = 0; i < 50000; i++)
    {
-        Cv32suf x;
+        float x = makeRandomFP32(rng, -1, 10+127 //bigger exponent will produce inf
-        x.fmt.sign = rng() % 2;
+                                 );
-        x.fmt.exponent = rng() % (10 + 127); //bigger exponent will produce inf
+        if(softfloat(x) > ln_max)
-        x.fmt.significand = rng() % (1 << 23);
+            x = rng.uniform(0.0f, (float)ln_max);
-        if(softfloat(x.f) > ln_max)
+        inputs.push_back(softfloat(x));
            x.f = rng.uniform(0.0f, (float)ln_max);
        inputs.push_back(softfloat(x.f));
    }
    for(size_t i = 0; i < inputs.size(); i++)
@ -3323,11 +3337,7 @@ TEST(Core_SoftFloat, log32)
    EXPECT_TRUE(log(softfloat::nan()).isNaN());
    for(int i = 0; i < nValues; i++)
    {
-        Cv32suf x;
+        softfloat x32(makeRandomFP32(rng, 1, 255));
        x.fmt.sign = 1;
        x.fmt.exponent = rng() % 255;
        x.fmt.significand = rng() % (1 << 23);
        softfloat x32(x.f);
        ASSERT_TRUE(log(x32).isNaN());
    }
    EXPECT_TRUE(log(softfloat::zero()).isInf());
@ -3340,11 +3350,7 @@ TEST(Core_SoftFloat, log32)
    inputs.push_back(softfloat::max());
    for(int i = 0; i < nValues; i++)
    {
-        Cv32suf x;
+        inputs.push_back(softfloat(makeRandomFP32(rng, 0, 255)));
        x.fmt.sign = 0;
        x.fmt.exponent = rng() % 255;
        x.fmt.significand = rng() % (1 << 23);
        inputs.push_back(softfloat(x.f));
    }
    for(size_t i = 0; i < inputs.size(); i++)
@ -3426,11 +3432,7 @@ TEST(Core_SoftFloat, cbrt32)
    inputs.push_back(softfloat::min());
    for(int i = 0; i < 50000; i++)
    {
-        Cv32suf x;
+        inputs.push_back(softfloat(makeRandomFP32(rng, -1, 255)));
        x.fmt.sign = rng() % 2;
        x.fmt.exponent = rng() % 255;
        x.fmt.significand = rng() % (1 << 23);
        inputs.push_back(softfloat(x.f));
    }
    for(size_t i = 0; i < inputs.size(); i++)
@ -3522,11 +3524,8 @@ TEST(Core_SoftFloat, pow32)
    // inf ** y == inf, if y > 0
    for(size_t i = 0; i < nValues; i++)
    {
-        Cv32suf x;
+        float x = makeRandomFP32(rng, 0, 255);
-        x.fmt.sign = 0;
+        softfloat x32 = softfloat(x);
        x.fmt.exponent = rng() % 255;
        x.fmt.significand = rng() % (1 << 23);
        softfloat x32 = softfloat(x.f);
        ASSERT_TRUE(pow( inf, x32).isInf());
        ASSERT_TRUE(pow(-inf, x32).isInf());
        ASSERT_EQ(pow( inf, -x32), zero);
@ -3538,17 +3537,9 @@ TEST(Core_SoftFloat, pow32)
    // x ** y == nan, if x < 0 and y is not integer
    for(size_t i = 0; i < nValues; i++)
    {
-        Cv32suf x;
+        softfloat x32(makeRandomFP32(rng, 1, 255));
-        x.fmt.sign = 1;
+        softfloat y32(makeRandomFP32(rng, -1, 23+127 //bigger exponent produces integer numbers only
-        x.fmt.exponent = rng() % 255;
+                                     ));
        x.fmt.significand = rng() % (1 << 23);
        softfloat x32(x.f);
        Cv32suf y;
        y.fmt.sign = rng() % 2;
        //bigger exponent produces integer numbers only
        y.fmt.exponent = rng() % (23 + 127);
        y.fmt.significand = rng() % (1 << 23);
        softfloat y32(y.f);
        int yi = cvRound(y32);
        if(y32 != softfloat(yi))
            ASSERT_TRUE(pow(x32, y32).isNaN());
@ -3565,11 +3556,7 @@ TEST(Core_SoftFloat, pow32)
    // 0 ** y == 0, if y > 0
    for(size_t i = 0; i < nValues; i++)
    {
-        Cv32suf x;
+        softfloat x32(makeRandomFP32(rng, 0, 255));
        x.fmt.sign = 0;
        x.fmt.exponent = rng() % 255;
        x.fmt.significand = rng() % (1 << 23);
        softfloat x32(x.f);
        ASSERT_TRUE(pow(zero, -x32).isInf());
        if(x32 != one)
        {
--- a/platforms/ios/build_framework.py
+++ b/platforms/ios/build_framework.py
@ -183,7 +183,7 @@ class Builder:
        cmakecmd = self.getCMakeArgs(arch, target) + \
            (["-DCMAKE_TOOLCHAIN_FILE=%s" % toolchain] if toolchain is not None else [])
        if target.lower().startswith("iphoneos"):
-            cmakecmd.append("-DENABLE_NEON=ON")
+            cmakecmd.append("-DCPU_BASELINE=NEON;FP16")
        cmakecmd.append(self.opencv)
        cmakecmd.extend(cmakeargs)
        execute(cmakecmd, cwd = builddir)