diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 19153479a2..8f90c3af5c 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -392,8 +392,10 @@ typedef union Cv16suf { short i; ushort u; -#if CV_FP16_TYPE +#if CV_FP16_TYPE && defined __ARM_FP16_FORMAT_IEEE __fp16 h; +#elif CV_FP16_TYPE // Other platforms suggested to use _Float16 + _Float16 h; #endif } Cv16suf; @@ -834,12 +836,16 @@ class hfloat { public: #if CV_FP16_TYPE - - hfloat() : h(0) {} + hfloat() = default; explicit hfloat(float x) { h = (__fp16)x; } operator float() const { return (float)h; } +#if defined __ARM_FP16_FORMAT_IEEE protected: __fp16 h; +#else +protected: + _Float16 h; +#endif #else hfloat() : w(0) {} diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index a58543405b..c8340b7168 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -166,7 +166,7 @@ CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int); CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned); CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int); #if CV_FP16_TYPE -CV_INTRIN_DEF_TYPE_TRAITS(__fp16, short, ushort, __fp16, float, double, float); +CV_INTRIN_DEF_TYPE_TRAITS(hfloat, short, ushort, hfloat, float, double, float); #endif CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned); CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int); @@ -370,7 +370,7 @@ template struct V_RegTraits CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void); CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void); #if CV_SIMD128_FP16 - CV_DEF_REG_TRAITS(v, v_float16x8, __fp16, f16, v_float16x8, v_float32x4, v_float64x2, v_int16x8, v_int16x8); + CV_DEF_REG_TRAITS(v, v_float16x8, hfloat, f16, v_float16x8, v_float32x4, v_float64x2, v_int16x8, v_int16x8); #endif CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void); CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void); @@ -570,7 +570,7 @@ namespace CV__SIMD_NAMESPACE { inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); } inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); } #if CV_SIMD_FP16 - inline v_float16 vx_setall_f16(__fp16 v) { return VXPREFIX(_setall_f16)(v); } + inline v_float16 vx_setall_f16(hfloat v) { return VXPREFIX(_setall_f16)(v); } #endif inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); } inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); } @@ -610,7 +610,7 @@ namespace CV__SIMD_NAMESPACE { inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); } inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); } #if CV_SIMD_FP16 - inline v_float16 vx_load(const __fp16 * ptr) { return VXPREFIX(_load)(ptr); } + inline v_float16 vx_load(const hfloat * ptr) { return VXPREFIX(_load)(ptr); } #endif inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); } inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); } @@ -630,7 +630,7 @@ namespace CV__SIMD_NAMESPACE { inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); } #if CV_SIMD_FP16 - inline v_float16 vx_load_aligned(const __fp16 * ptr) { return VXPREFIX(_load_aligned)(ptr); } + inline v_float16 vx_load_aligned(const hfloat * ptr) { return VXPREFIX(_load_aligned)(ptr); } #endif inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); } @@ -650,7 +650,7 @@ namespace CV__SIMD_NAMESPACE { inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); } #if CV_SIMD_FP16 - inline v_float16 vx_load_low(const __fp16 * ptr) { return VXPREFIX(_load_low)(ptr); } + inline v_float16 vx_load_low(const hfloat * ptr) { return VXPREFIX(_load_low)(ptr); } #endif inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); } @@ -670,7 +670,7 @@ namespace CV__SIMD_NAMESPACE { inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } #if CV_SIMD_FP16 - inline v_float16 vx_load_halves(const __fp16 * ptr0, const __fp16 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } + inline v_float16 vx_load_halves(const hfloat * ptr0, const hfloat * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } #endif inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } @@ -690,7 +690,7 @@ namespace CV__SIMD_NAMESPACE { inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } #if CV_SIMD_FP16 - inline v_float16 vx_lut(const __fp16 * ptr, const int * idx) { return VXPREFIX(_lut)(ptr, idx); } + inline v_float16 vx_lut(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut)(ptr, idx); } #endif inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } @@ -710,7 +710,7 @@ namespace CV__SIMD_NAMESPACE { inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } #if CV_SIMD_FP16 - inline v_float16 vx_lut_pairs(const __fp16 * ptr, const int * idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } + inline v_float16 vx_lut_pairs(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } #endif inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 4fa3120ac7..baaac3e828 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -298,9 +298,9 @@ struct v_float16x8 { v_float16x8() {} explicit v_float16x8(float16x8_t v) : val(v) {} - v_float16x8(__fp16 v0, __fp16 v1, __fp16 v2, __fp16 v3, __fp16 v4, __fp16 v5, __fp16 v6, __fp16 v7) + v_float16x8(hfloat v0, hfloat v1, hfloat v2, hfloat v3, hfloat v4, hfloat v5, hfloat v6, hfloat v7) { - __fp16 v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + __fp16 v[] = {(__fp16)v0, (__fp16)v1, (__fp16)v2, (__fp16)v3, (__fp16)v4, (__fp16)v5, (__fp16)v6, (__fp16)v7}; val = vld1q_f16(v); } float16x8_t val; @@ -308,12 +308,12 @@ struct v_float16x8 private: friend struct VTraits; enum { nlanes = 8 }; - typedef __fp16 lane_type; + typedef hfloat lane_type; friend typename VTraits::lane_type v_get0(const v_float16x8& v); - __fp16 get0() const + hfloat get0() const { - return vgetq_lane_f16(val, 0); + return (hfloat)vgetq_lane_f16(val, 0); } }; #endif @@ -411,9 +411,9 @@ private: }; #endif -#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ -inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \ -inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \ +#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, _TpCast, suffix) \ +inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_TpCast)0)); } \ +inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix((_TpCast)v)); } \ inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \ inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \ inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \ @@ -425,16 +425,16 @@ inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vr inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \ inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); } -OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8) -OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8) -OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16) -OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16) -OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32) -OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64) -OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64) +OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, uchar, u8) +OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, schar, s8) +OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, ushort, u16) +OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, short, s16) +OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, unsigned, u32) +OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, int, s32) +OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, uint64, u64) +OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, int64, s64) #if CV_SIMD128_FP16 -OPENCV_HAL_IMPL_NEON_INIT(float16x8, __fp16, f16); +OPENCV_HAL_IMPL_NEON_INIT(float16x8, hfloat, __fp16, f16); #define OPENCV_HAL_IMPL_NEON_INIT_FP16(_Tpv, suffix) \ inline v_float16x8 v_reinterpret_as_f16(const v_##_Tpv& v) { return v_float16x8(vreinterpretq_f16_##suffix(v.val)); } OPENCV_HAL_IMPL_NEON_INIT_FP16(uint8x16, u8) @@ -450,11 +450,11 @@ OPENCV_HAL_IMPL_NEON_INIT_FP16(float32x4, f32) OPENCV_HAL_IMPL_NEON_INIT_FP16(float64x2, f64) #endif #endif -OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32) +OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, float, f32) #if CV_SIMD128_64F #define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \ inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); } -OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64) +OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, double, f64) OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8) OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8) OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16) @@ -1082,7 +1082,7 @@ inline v_float16x8 v_sqrt(const v_float16x8& x) inline v_float16x8 v_invsqrt(const v_float16x8& x) { - v_float16x8 one = v_setall_f16(1.0f); + v_float16x8 one = v_setall_f16((hfloat)1.0f); return v_div(one, v_sqrt(x)); } #endif @@ -1467,7 +1467,7 @@ OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64) #if defined(__clang__) && defined(__aarch64__) // avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863 -#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \ +#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, _TpCast, suffix) \ inline _Tpvec v_load_low(const _Tp* ptr) \ { \ typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \ @@ -1475,46 +1475,46 @@ uint64 v = *(unaligned_uint64*)ptr; \ return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \ } #else -#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \ +#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, _TpCast, suffix) \ inline _Tpvec v_load_low(const _Tp* ptr) \ -{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); } +{ return _Tpvec(vcombine_##suffix(vld1_##suffix((const _TpCast *)ptr), vdup_n_##suffix((_Tp)0))); } #endif -#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \ +#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, _TpCast, suffix) \ inline _Tpvec v_load(const _Tp* ptr) \ -{ return _Tpvec(vld1q_##suffix(ptr)); } \ +{ return _Tpvec(vld1q_##suffix((const _TpCast *)ptr)); } \ inline _Tpvec v_load_aligned(const _Tp* ptr) \ -{ return _Tpvec(vld1q_##suffix(ptr)); } \ -OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \ +{ return _Tpvec(vld1q_##suffix((const _TpCast *)ptr)); } \ +OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, _TpCast, suffix) \ inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ -{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \ +{ return _Tpvec(vcombine_##suffix(vld1_##suffix((const _TpCast *)ptr0), vld1_##suffix((const _TpCast *)ptr1))); } \ inline void v_store(_Tp* ptr, const _Tpvec& a) \ -{ vst1q_##suffix(ptr, a.val); } \ +{ vst1q_##suffix((_TpCast *)ptr, a.val); } \ inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ -{ vst1q_##suffix(ptr, a.val); } \ +{ vst1q_##suffix((_TpCast *)ptr, a.val); } \ inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ -{ vst1q_##suffix(ptr, a.val); } \ +{ vst1q_##suffix((_TpCast *)ptr, a.val); } \ inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ -{ vst1q_##suffix(ptr, a.val); } \ +{ vst1q_##suffix((_TpCast *)ptr, a.val); } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ -{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \ +{ vst1_##suffix((_TpCast *)ptr, vget_low_##suffix(a.val)); } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ -{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); } - -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64) +{ vst1_##suffix((_TpCast *)ptr, vget_high_##suffix(a.val)); } + +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, uchar, u8) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, schar, s8) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, ushort, u16) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, short, s16) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, unsigned, u32) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, int, s32) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, uint64, u64) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, int64, s64) #if CV_SIMD128_FP16 -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float16x8, __fp16, f16) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float16x8, hfloat, __fp16, f16) #endif -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, float, f32) #if CV_SIMD128_64F -OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, double, f64) #endif inline unsigned v_reduce_sum(const v_uint8x16& a) @@ -1588,7 +1588,7 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8) #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ inline scalartype v_reduce_##func(const _Tpvec& a) \ { \ - return v##vectorfunc##vq_##suffix(a.val); \ + return (scalartype)v##vectorfunc##vq_##suffix(a.val); \ } #else // #if CV_NEON_AARCH64 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \ @@ -1605,8 +1605,8 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16) OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16) OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16) #if CV_SIMD128_FP16 -OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, max, max, f16) -OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, min, min, f16) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, hfloat, max, max, f16) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, hfloat, min, min, f16) #endif #if CV_NEON_AARCH64 @@ -2183,14 +2183,14 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64) #endif #define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \ -template inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); } +template inline _Tp v_extract_n(_Tpvec v) { return (_Tp)vgetq_lane_##suffix(v.val, i); } OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8) OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8) OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16) OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16) #if CV_SIMD128_FP16 -OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float16x8, __fp16, f16) +OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float16x8, hfloat, f16) #endif OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32) OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32) @@ -2209,7 +2209,7 @@ OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8) OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16) OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16) #if CV_SIMD128_FP16 -OPENCV_HAL_IMPL_NEON_BROADCAST(v_float16x8, __fp16, f16) +OPENCV_HAL_IMPL_NEON_BROADCAST(v_float16x8, hfloat, f16) #endif OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32) OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32) @@ -2422,16 +2422,16 @@ inline void v_transpose8x8(const v_float16x8 &a0, const v_float16x8 &a1, } #endif -#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \ +#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, _TpCast, suffix) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ { \ - _Tpvec##x2_t v = vld2q_##suffix(ptr); \ + _Tpvec##x2_t v = vld2q_##suffix((const _TpCast *)ptr); \ a.val = v.val[0]; \ b.val = v.val[1]; \ } \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ { \ - _Tpvec##x3_t v = vld3q_##suffix(ptr); \ + _Tpvec##x3_t v = vld3q_##suffix((const _TpCast *)ptr); \ a.val = v.val[0]; \ b.val = v.val[1]; \ c.val = v.val[2]; \ @@ -2439,7 +2439,7 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ v_##_Tpvec& c, v_##_Tpvec& d) \ { \ - _Tpvec##x4_t v = vld4q_##suffix(ptr); \ + _Tpvec##x4_t v = vld4q_##suffix((const _TpCast *)ptr); \ a.val = v.val[0]; \ b.val = v.val[1]; \ c.val = v.val[2]; \ @@ -2451,7 +2451,7 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& _Tpvec##x2_t v; \ v.val[0] = a.val; \ v.val[1] = b.val; \ - vst2q_##suffix(ptr, v); \ + vst2q_##suffix((_TpCast *)ptr, v); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ @@ -2460,7 +2460,7 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& v.val[0] = a.val; \ v.val[1] = b.val; \ v.val[2] = c.val; \ - vst3q_##suffix(ptr, v); \ + vst3q_##suffix((_TpCast *)ptr, v); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ const v_##_Tpvec& c, const v_##_Tpvec& d, \ @@ -2471,7 +2471,7 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& v.val[1] = b.val; \ v.val[2] = c.val; \ v.val[3] = d.val; \ - vst4q_##suffix(ptr, v); \ + vst4q_##suffix((_TpCast *)ptr, v); \ } #define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \ @@ -2551,18 +2551,18 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \ } -OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8) -OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8) -OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16) -OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, uchar, u8) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, schar, s8) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, ushort, u16) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, short, s16) #if CV_SIMD128_FP16 -OPENCV_HAL_IMPL_NEON_INTERLEAVED(float16x8, __fp16, f16) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(float16x8, hfloat, __fp16, f16) #endif -OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32) -OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, unsigned, u32) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, int, s32) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, float, f32) #if CV_SIMD128_64F -OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, double, f64) #endif OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64) @@ -2748,7 +2748,7 @@ inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_rein inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } #if CV_SIMD128_FP16 -inline v_float16x8 v_lut(const float16_t *tab, const int *idx) +inline v_float16x8 v_lut(const hfloat *tab, const int *idx) { const __fp16 *t = (const __fp16*)tab; __fp16 CV_DECL_ALIGNED(32) elems[8] = @@ -2764,7 +2764,7 @@ inline v_float16x8 v_lut(const float16_t *tab, const int *idx) }; return v_float16x8(vld1q_f16(elems)); } -inline v_float16x8 v_lut_pairs(const float16_t *tab, const int *idx) +inline v_float16x8 v_lut_pairs(const hfloat *tab, const int *idx) { const __fp16 *t = (const __fp16*)tab; __fp16 CV_DECL_ALIGNED(32) elems[8] = @@ -2780,7 +2780,7 @@ inline v_float16x8 v_lut_pairs(const float16_t *tab, const int *idx) }; return v_float16x8(vld1q_f16(elems)); } -inline v_float16x8 v_lut_quads(const float16_t *tab, const int *idx) +inline v_float16x8 v_lut_quads(const hfloat *tab, const int *idx) { const __fp16 *t = (const __fp16*)tab; return v_float16x8(vcombine_f16(vld1_f16(t + idx[0]), vld1_f16(t + idx[1]))); diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 9457ca6fa7..fe841617b6 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -55,13 +55,13 @@ template struct Data template Data & operator*=(T m) { for (int i = 0; i < VTraits::vlanes(); ++i) - d[i] *= (LaneType)m; + d[i] = (LaneType)(d[i] * m); return *this; } template Data & operator+=(T m) { for (int i = 0; i < VTraits::vlanes(); ++i) - d[i] += (LaneType)m; + d[i] = (LaneType)(d[i] + m); return *this; } void fill(LaneType val, int s, int c = VTraits::vlanes()) @@ -113,9 +113,9 @@ template struct Data } LaneType sum(int s, int c) { - LaneType res = 0; + LaneType res = (LaneType)0; for (int i = s; i < s + c; ++i) - res += d[i]; + res = (LaneType)(res + d[i]); return res; } LaneType sum() @@ -131,7 +131,7 @@ template struct Data } void clear() { - fill(0); + fill((LaneType)0); } bool isZero() const { @@ -183,7 +183,7 @@ template<> inline void EXPECT_COMPARE_EQ_(const double a, const double b } #if CV_SIMD_FP16 -template<> inline void EXPECT_COMPARE_EQ_<__fp16>(const __fp16 a, const __fp16 b) +template<> inline void EXPECT_COMPARE_EQ_(const hfloat a, const hfloat b) { EXPECT_LT(std::abs(float(a - b)), 0.126); } @@ -352,9 +352,9 @@ template struct TheTest TheTest & test_interleave() { Data data1, data2, data3, data4; - data2 += 20; - data3 += 40; - data4 += 60; + data2 += (LaneType)20; + data3 += (LaneType)40; + data4 += (LaneType)60; R a = data1, b = data2, c = data3; @@ -366,7 +366,7 @@ template struct TheTest v_store_interleave(buf3, a, b, c); v_store_interleave(buf4, d, e, f, g); - Data z(0); + Data z((LaneType)0); a = b = c = d = e = f = g = z; v_load_deinterleave(buf3, a, b, c); @@ -647,9 +647,9 @@ template struct TheTest TheTest & test_abs_fp16() { typedef typename V_RegTraits::u_reg Ru; // v_float16x8 - typedef typename VTraits::lane_type u_type; // __fp16 - typedef typename VTraits::lane_type R_type; // __fp16 - Data dataA, dataB(10); + typedef typename VTraits::lane_type u_type; // hfloat + typedef typename VTraits::lane_type R_type; // hfloat + Data dataA, dataB((LaneType)10); R a = dataA, b = dataB; a = v_sub(a, b); @@ -659,7 +659,7 @@ template struct TheTest for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - R_type ssub = (dataA[i] - dataB[i]) < R_type_lowest ? R_type_lowest : dataA[i] - dataB[i]; + R_type ssub = (R_type)((dataA[i] - dataB[i]) < R_type_lowest ? R_type_lowest : dataA[i] - dataB[i]); EXPECT_EQ((u_type)std::abs(ssub), resC[i]); } @@ -930,10 +930,10 @@ template struct TheTest { Data dataA(std::numeric_limits::max()), dataB(std::numeric_limits::min()); - dataA[0] = -1; - dataB[0] = 1; - dataA[1] = 2; - dataB[1] = -2; + dataA[0] = (LaneType)-1; + dataB[0] = (LaneType)1; + dataA[1] = (LaneType)2; + dataB[1] = (LaneType)-2; R a = dataA, b = dataB; Data resC = v_absdiff(a, b); for (int i = 0; i < VTraits::vlanes(); ++i) @@ -1008,9 +1008,9 @@ template struct TheTest typedef typename VTraits::lane_type int_type; typedef typename VTraits::lane_type uint_type; - Data dataA, dataB(0), dataC, dataD(1), dataE(2); + Data dataA, dataB((LaneType)0), dataC, dataD((LaneType)1), dataE((LaneType)2); dataA[0] = (LaneType)std::numeric_limits::max(); - dataA[1] *= (LaneType)-1; + dataA[1] = (LaneType)(dataA[1] * (LaneType)-1); union { LaneType l; @@ -1025,7 +1025,7 @@ template struct TheTest dataB[VTraits::vlanes() / 2] = mask_one; dataC *= (LaneType)-1; R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE; - dataC[VTraits::vlanes() - 1] = 0; + dataC[VTraits::vlanes() - 1] = (LaneType)0; R nl = dataC; EXPECT_EQ(2, v_signmask(a)); @@ -1586,14 +1586,15 @@ template struct TheTest int i = 0; for (int j = i; j < i + 8; ++j) { SCOPED_TRACE(cv::format("i=%d j=%d", i, j)); - LaneType val = dataV[i] * data0[j] + + LaneType val = (LaneType)( + dataV[i] * data0[j] + dataV[i + 1] * data1[j] + dataV[i + 2] * data2[j] + dataV[i + 3] * data3[j] + dataV[i + 4] * data4[j] + dataV[i + 5] * data5[j] + dataV[i + 6] * data6[j] + - dataV[i + 7] * data7[j]; + dataV[i + 7] * data7[j]); EXPECT_COMPARE_EQ(val, res[j]); } @@ -1601,14 +1602,15 @@ template struct TheTest i = 0; for (int j = i; j < i + 8; ++j) { SCOPED_TRACE(cv::format("i=%d j=%d", i, j)); - LaneType val = dataV[i] * data0[j] + + LaneType val = (LaneType)( + dataV[i] * data0[j] + dataV[i + 1] * data1[j] + dataV[i + 2] * data2[j] + dataV[i + 3] * data3[j] + dataV[i + 4] * data4[j] + dataV[i + 5] * data5[j] + dataV[i + 6] * data6[j] + - data7[j]; + data7[j]); EXPECT_COMPARE_EQ(val, resAdd[j]); } #else