Merge pull request #25796 from hanliutong:hfloat

Use hfloat instead of __fp16. #25796

Related: #25743

Currently, the type for the half-precision floating point data in the OpenCV source code is `__fp16`, which is a unique(?) type supported by the ARM compiler. Other compilers have very limited support for `__fp16`, so in order to introduce more backends that support FP16 (such as RISC-V), we may need a more general FP16 type.

In this patch, we use `hfloat` instead of `__fp16` in non-ARM code blocks, mainly affected parts are:
- `core/hal/intrin.hpp`: Type Traits, REG Traits and `vx_` interface.
- `core/hal/intrin_neon.hpp`: Universal Intrinsic API for FP16 type.
- `core/test/test_intrin_utils.hpp`: Usage of Universal Intrinsic
- `core/include/opencv2/core/cvdef.h`: Definition of class `hfloat`

If I understand correctly, class `hfloat` acts as a wrapper around FP16 types on different platforms (`__fp16` for ARM and `_Float16` for RISC-V). Any OpenCV generic interface/source code should use `hfloat`, while platform-specific FP16 types are only used in macro-guarded code blocks.

/cc @fengyuentau  @mshabunin 

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
pull/25252/head^2
HAN Liutong 7 months ago committed by GitHub
parent 6a11847d57
commit 1d9ca7160b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 12
      modules/core/include/opencv2/core/cvdef.h
  2. 18
      modules/core/include/opencv2/core/hal/intrin.hpp
  3. 142
      modules/core/include/opencv2/core/hal/intrin_neon.hpp
  4. 52
      modules/core/test/test_intrin_utils.hpp

@ -392,8 +392,10 @@ typedef union Cv16suf
{
short i;
ushort u;
#if CV_FP16_TYPE
#if CV_FP16_TYPE && defined __ARM_FP16_FORMAT_IEEE
__fp16 h;
#elif CV_FP16_TYPE // Other platforms suggested to use _Float16
_Float16 h;
#endif
}
Cv16suf;
@ -834,12 +836,16 @@ class hfloat
{
public:
#if CV_FP16_TYPE
hfloat() : h(0) {}
hfloat() = default;
explicit hfloat(float x) { h = (__fp16)x; }
operator float() const { return (float)h; }
#if defined __ARM_FP16_FORMAT_IEEE
protected:
__fp16 h;
#else
protected:
_Float16 h;
#endif
#else
hfloat() : w(0) {}

@ -166,7 +166,7 @@ CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int);
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int);
#if CV_FP16_TYPE
CV_INTRIN_DEF_TYPE_TRAITS(__fp16, short, ushort, __fp16, float, double, float);
CV_INTRIN_DEF_TYPE_TRAITS(hfloat, short, ushort, hfloat, float, double, float);
#endif
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int);
@ -370,7 +370,7 @@ template<typename _Tp> struct V_RegTraits
CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
#if CV_SIMD128_FP16
CV_DEF_REG_TRAITS(v, v_float16x8, __fp16, f16, v_float16x8, v_float32x4, v_float64x2, v_int16x8, v_int16x8);
CV_DEF_REG_TRAITS(v, v_float16x8, hfloat, f16, v_float16x8, v_float32x4, v_float64x2, v_int16x8, v_int16x8);
#endif
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
@ -570,7 +570,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
#if CV_SIMD_FP16
inline v_float16 vx_setall_f16(__fp16 v) { return VXPREFIX(_setall_f16)(v); }
inline v_float16 vx_setall_f16(hfloat v) { return VXPREFIX(_setall_f16)(v); }
#endif
inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); }
@ -610,7 +610,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
#if CV_SIMD_FP16
inline v_float16 vx_load(const __fp16 * ptr) { return VXPREFIX(_load)(ptr); }
inline v_float16 vx_load(const hfloat * ptr) { return VXPREFIX(_load)(ptr); }
#endif
inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); }
@ -630,7 +630,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#if CV_SIMD_FP16
inline v_float16 vx_load_aligned(const __fp16 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_float16 vx_load_aligned(const hfloat * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#endif
inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); }
@ -650,7 +650,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
#if CV_SIMD_FP16
inline v_float16 vx_load_low(const __fp16 * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_float16 vx_load_low(const hfloat * ptr) { return VXPREFIX(_load_low)(ptr); }
#endif
inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); }
@ -670,7 +670,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#if CV_SIMD_FP16
inline v_float16 vx_load_halves(const __fp16 * ptr0, const __fp16 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_float16 vx_load_halves(const hfloat * ptr0, const hfloat * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#endif
inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
@ -690,7 +690,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#if CV_SIMD_FP16
inline v_float16 vx_lut(const __fp16 * ptr, const int * idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_float16 vx_lut(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut)(ptr, idx); }
#endif
inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
@ -710,7 +710,7 @@ namespace CV__SIMD_NAMESPACE {
inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#if CV_SIMD_FP16
inline v_float16 vx_lut_pairs(const __fp16 * ptr, const int * idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_float16 vx_lut_pairs(const hfloat * ptr, const int * idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#endif
inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }

@ -298,9 +298,9 @@ struct v_float16x8
{
v_float16x8() {}
explicit v_float16x8(float16x8_t v) : val(v) {}
v_float16x8(__fp16 v0, __fp16 v1, __fp16 v2, __fp16 v3, __fp16 v4, __fp16 v5, __fp16 v6, __fp16 v7)
v_float16x8(hfloat v0, hfloat v1, hfloat v2, hfloat v3, hfloat v4, hfloat v5, hfloat v6, hfloat v7)
{
__fp16 v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
__fp16 v[] = {(__fp16)v0, (__fp16)v1, (__fp16)v2, (__fp16)v3, (__fp16)v4, (__fp16)v5, (__fp16)v6, (__fp16)v7};
val = vld1q_f16(v);
}
float16x8_t val;
@ -308,12 +308,12 @@ struct v_float16x8
private:
friend struct VTraits<v_float16x8>;
enum { nlanes = 8 };
typedef __fp16 lane_type;
typedef hfloat lane_type;
friend typename VTraits<v_float16x8>::lane_type v_get0<v_float16x8>(const v_float16x8& v);
__fp16 get0() const
hfloat get0() const
{
return vgetq_lane_f16(val, 0);
return (hfloat)vgetq_lane_f16(val, 0);
}
};
#endif
@ -411,9 +411,9 @@ private:
};
#endif
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, _TpCast, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_TpCast)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix((_TpCast)v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
@ -425,16 +425,16 @@ inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vr
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, int64, s64)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_INIT(float16x8, __fp16, f16);
OPENCV_HAL_IMPL_NEON_INIT(float16x8, hfloat, __fp16, f16);
#define OPENCV_HAL_IMPL_NEON_INIT_FP16(_Tpv, suffix) \
inline v_float16x8 v_reinterpret_as_f16(const v_##_Tpv& v) { return v_float16x8(vreinterpretq_f16_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT_FP16(uint8x16, u8)
@ -450,11 +450,11 @@ OPENCV_HAL_IMPL_NEON_INIT_FP16(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_FP16(float64x2, f64)
#endif
#endif
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, float, f32)
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, double, f64)
OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
@ -1082,7 +1082,7 @@ inline v_float16x8 v_sqrt(const v_float16x8& x)
inline v_float16x8 v_invsqrt(const v_float16x8& x)
{
v_float16x8 one = v_setall_f16(1.0f);
v_float16x8 one = v_setall_f16((hfloat)1.0f);
return v_div(one, v_sqrt(x));
}
#endif
@ -1467,7 +1467,7 @@ OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
#if defined(__clang__) && defined(__aarch64__)
// avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, _TpCast, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
@ -1475,46 +1475,46 @@ uint64 v = *(unaligned_uint64*)ptr; \
return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
}
#else
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, _TpCast, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
{ return _Tpvec(vcombine_##suffix(vld1_##suffix((const _TpCast *)ptr), vdup_n_##suffix((_Tp)0))); }
#endif
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, _TpCast, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
{ return _Tpvec(vld1q_##suffix((const _TpCast *)ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
{ return _Tpvec(vld1q_##suffix((const _TpCast *)ptr)); } \
OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, _TpCast, suffix) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix((const _TpCast *)ptr0), vld1_##suffix((const _TpCast *)ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
{ vst1q_##suffix((_TpCast *)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
{ vst1q_##suffix((_TpCast *)ptr, a.val); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
{ vst1q_##suffix((_TpCast *)ptr, a.val); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vst1q_##suffix(ptr, a.val); } \
{ vst1q_##suffix((_TpCast *)ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
{ vst1_##suffix((_TpCast *)ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
{ vst1_##suffix((_TpCast *)ptr, vget_high_##suffix(a.val)); }
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, int64, s64)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float16x8, __fp16, f16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float16x8, hfloat, __fp16, f16)
#endif
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, double, f64)
#endif
inline unsigned v_reduce_sum(const v_uint8x16& a)
@ -1588,7 +1588,7 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
return v##vectorfunc##vq_##suffix(a.val); \
return (scalartype)v##vectorfunc##vq_##suffix(a.val); \
}
#else // #if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
@ -1605,8 +1605,8 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, max, max, f16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, min, min, f16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, hfloat, max, max, f16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, hfloat, min, min, f16)
#endif
#if CV_NEON_AARCH64
@ -2183,14 +2183,14 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif
#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }
template<int i> inline _Tp v_extract_n(_Tpvec v) { return (_Tp)vgetq_lane_##suffix(v.val, i); }
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float16x8, __fp16, f16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float16x8, hfloat, f16)
#endif
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
@ -2209,7 +2209,7 @@ OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float16x8, __fp16, f16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float16x8, hfloat, f16)
#endif
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
@ -2422,16 +2422,16 @@ inline void v_transpose8x8(const v_float16x8 &a0, const v_float16x8 &a1,
}
#endif
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, _TpCast, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
_Tpvec##x2_t v = vld2q_##suffix(ptr); \
_Tpvec##x2_t v = vld2q_##suffix((const _TpCast *)ptr); \
a.val = v.val[0]; \
b.val = v.val[1]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
_Tpvec##x3_t v = vld3q_##suffix(ptr); \
_Tpvec##x3_t v = vld3q_##suffix((const _TpCast *)ptr); \
a.val = v.val[0]; \
b.val = v.val[1]; \
c.val = v.val[2]; \
@ -2439,7 +2439,7 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
_Tpvec##x4_t v = vld4q_##suffix(ptr); \
_Tpvec##x4_t v = vld4q_##suffix((const _TpCast *)ptr); \
a.val = v.val[0]; \
b.val = v.val[1]; \
c.val = v.val[2]; \
@ -2451,7 +2451,7 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
_Tpvec##x2_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
vst2q_##suffix(ptr, v); \
vst2q_##suffix((_TpCast *)ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
@ -2460,7 +2460,7 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
v.val[0] = a.val; \
v.val[1] = b.val; \
v.val[2] = c.val; \
vst3q_##suffix(ptr, v); \
vst3q_##suffix((_TpCast *)ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, const v_##_Tpvec& d, \
@ -2471,7 +2471,7 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
v.val[1] = b.val; \
v.val[2] = c.val; \
v.val[3] = d.val; \
vst4q_##suffix(ptr, v); \
vst4q_##suffix((_TpCast *)ptr, v); \
}
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
@ -2551,18 +2551,18 @@ inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2&
vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
}
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, short, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float16x8, __fp16, f16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float16x8, hfloat, __fp16, f16)
#endif
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, double, f64)
#endif
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
@ -2748,7 +2748,7 @@ inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_rein
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
#if CV_SIMD128_FP16
inline v_float16x8 v_lut(const float16_t *tab, const int *idx)
inline v_float16x8 v_lut(const hfloat *tab, const int *idx)
{
const __fp16 *t = (const __fp16*)tab;
__fp16 CV_DECL_ALIGNED(32) elems[8] =
@ -2764,7 +2764,7 @@ inline v_float16x8 v_lut(const float16_t *tab, const int *idx)
};
return v_float16x8(vld1q_f16(elems));
}
inline v_float16x8 v_lut_pairs(const float16_t *tab, const int *idx)
inline v_float16x8 v_lut_pairs(const hfloat *tab, const int *idx)
{
const __fp16 *t = (const __fp16*)tab;
__fp16 CV_DECL_ALIGNED(32) elems[8] =
@ -2780,7 +2780,7 @@ inline v_float16x8 v_lut_pairs(const float16_t *tab, const int *idx)
};
return v_float16x8(vld1q_f16(elems));
}
inline v_float16x8 v_lut_quads(const float16_t *tab, const int *idx)
inline v_float16x8 v_lut_quads(const hfloat *tab, const int *idx)
{
const __fp16 *t = (const __fp16*)tab;
return v_float16x8(vcombine_f16(vld1_f16(t + idx[0]), vld1_f16(t + idx[1])));

@ -55,13 +55,13 @@ template <typename R> struct Data
template <typename T> Data<R> & operator*=(T m)
{
for (int i = 0; i < VTraits<R>::vlanes(); ++i)
d[i] *= (LaneType)m;
d[i] = (LaneType)(d[i] * m);
return *this;
}
template <typename T> Data<R> & operator+=(T m)
{
for (int i = 0; i < VTraits<R>::vlanes(); ++i)
d[i] += (LaneType)m;
d[i] = (LaneType)(d[i] + m);
return *this;
}
void fill(LaneType val, int s, int c = VTraits<R>::vlanes())
@ -113,9 +113,9 @@ template <typename R> struct Data
}
LaneType sum(int s, int c)
{
LaneType res = 0;
LaneType res = (LaneType)0;
for (int i = s; i < s + c; ++i)
res += d[i];
res = (LaneType)(res + d[i]);
return res;
}
LaneType sum()
@ -131,7 +131,7 @@ template <typename R> struct Data
}
void clear()
{
fill(0);
fill((LaneType)0);
}
bool isZero() const
{
@ -183,7 +183,7 @@ template<> inline void EXPECT_COMPARE_EQ_<double>(const double a, const double b
}
#if CV_SIMD_FP16
template<> inline void EXPECT_COMPARE_EQ_<__fp16>(const __fp16 a, const __fp16 b)
template<> inline void EXPECT_COMPARE_EQ_<hfloat>(const hfloat a, const hfloat b)
{
EXPECT_LT(std::abs(float(a - b)), 0.126);
}
@ -352,9 +352,9 @@ template<typename R> struct TheTest
TheTest & test_interleave()
{
Data<R> data1, data2, data3, data4;
data2 += 20;
data3 += 40;
data4 += 60;
data2 += (LaneType)20;
data3 += (LaneType)40;
data4 += (LaneType)60;
R a = data1, b = data2, c = data3;
@ -366,7 +366,7 @@ template<typename R> struct TheTest
v_store_interleave(buf3, a, b, c);
v_store_interleave(buf4, d, e, f, g);
Data<R> z(0);
Data<R> z((LaneType)0);
a = b = c = d = e = f = g = z;
v_load_deinterleave(buf3, a, b, c);
@ -647,9 +647,9 @@ template<typename R> struct TheTest
TheTest & test_abs_fp16()
{
typedef typename V_RegTraits<R>::u_reg Ru; // v_float16x8
typedef typename VTraits<Ru>::lane_type u_type; // __fp16
typedef typename VTraits<R>::lane_type R_type; // __fp16
Data<R> dataA, dataB(10);
typedef typename VTraits<Ru>::lane_type u_type; // hfloat
typedef typename VTraits<R>::lane_type R_type; // hfloat
Data<R> dataA, dataB((LaneType)10);
R a = dataA, b = dataB;
a = v_sub(a, b);
@ -659,7 +659,7 @@ template<typename R> struct TheTest
for (int i = 0; i < VTraits<Ru>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
R_type ssub = (dataA[i] - dataB[i]) < R_type_lowest ? R_type_lowest : dataA[i] - dataB[i];
R_type ssub = (R_type)((dataA[i] - dataB[i]) < R_type_lowest ? R_type_lowest : dataA[i] - dataB[i]);
EXPECT_EQ((u_type)std::abs(ssub), resC[i]);
}
@ -930,10 +930,10 @@ template<typename R> struct TheTest
{
Data<R> dataA(std::numeric_limits<LaneType>::max()),
dataB(std::numeric_limits<LaneType>::min());
dataA[0] = -1;
dataB[0] = 1;
dataA[1] = 2;
dataB[1] = -2;
dataA[0] = (LaneType)-1;
dataB[0] = (LaneType)1;
dataA[1] = (LaneType)2;
dataB[1] = (LaneType)-2;
R a = dataA, b = dataB;
Data<R> resC = v_absdiff(a, b);
for (int i = 0; i < VTraits<R>::vlanes(); ++i)
@ -1008,9 +1008,9 @@ template<typename R> struct TheTest
typedef typename VTraits<int_reg>::lane_type int_type;
typedef typename VTraits<uint_reg>::lane_type uint_type;
Data<R> dataA, dataB(0), dataC, dataD(1), dataE(2);
Data<R> dataA, dataB((LaneType)0), dataC, dataD((LaneType)1), dataE((LaneType)2);
dataA[0] = (LaneType)std::numeric_limits<int_type>::max();
dataA[1] *= (LaneType)-1;
dataA[1] = (LaneType)(dataA[1] * (LaneType)-1);
union
{
LaneType l;
@ -1025,7 +1025,7 @@ template<typename R> struct TheTest
dataB[VTraits<R>::vlanes() / 2] = mask_one;
dataC *= (LaneType)-1;
R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE;
dataC[VTraits<R>::vlanes() - 1] = 0;
dataC[VTraits<R>::vlanes() - 1] = (LaneType)0;
R nl = dataC;
EXPECT_EQ(2, v_signmask(a));
@ -1586,14 +1586,15 @@ template<typename R> struct TheTest
int i = 0;
for (int j = i; j < i + 8; ++j) {
SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
LaneType val = dataV[i] * data0[j] +
LaneType val = (LaneType)(
dataV[i] * data0[j] +
dataV[i + 1] * data1[j] +
dataV[i + 2] * data2[j] +
dataV[i + 3] * data3[j] +
dataV[i + 4] * data4[j] +
dataV[i + 5] * data5[j] +
dataV[i + 6] * data6[j] +
dataV[i + 7] * data7[j];
dataV[i + 7] * data7[j]);
EXPECT_COMPARE_EQ(val, res[j]);
}
@ -1601,14 +1602,15 @@ template<typename R> struct TheTest
i = 0;
for (int j = i; j < i + 8; ++j) {
SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
LaneType val = dataV[i] * data0[j] +
LaneType val = (LaneType)(
dataV[i] * data0[j] +
dataV[i + 1] * data1[j] +
dataV[i + 2] * data2[j] +
dataV[i + 3] * data3[j] +
dataV[i + 4] * data4[j] +
dataV[i + 5] * data5[j] +
dataV[i + 6] * data6[j] +
data7[j];
data7[j]);
EXPECT_COMPARE_EQ(val, resAdd[j]);
}
#else

Loading…
Cancel
Save