Merge pull request #25196 from fengyuentau:fp16_bf16_arithm

core: add universal intrinsics for fp16 #25196

Partially resolves the "Universal intrinsics evolution in OpenCV 5.0" section of https://github.com/opencv/opencv/issues/25019.

Universal intrinsics for bf16 will be added in a subsequent pull request.
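
For orientation, here is a minimal usage sketch (not part of this patch): an element-wise fp16 product written with the new universal intrinsics, assuming an AArch64 build where `CV_SIMD_FP16` is 1 and the compiler provides the `__fp16` type; the function name and buffers are illustrative only.

```cpp
#include <opencv2/core/hal/intrin.hpp>

void mul_f16(const __fp16* a, const __fp16* b, __fp16* dst, int n)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD_FP16
    const int step = VTraits<v_float16>::vlanes();   // 8 lanes for 128-bit registers
    for (; i + step <= n; i += step)
    {
        v_float16 va = vx_load(a + i);
        v_float16 vb = vx_load(b + i);
        v_store(dst + i, v_mul(va, vb));
    }
#endif
    for (; i < n; ++i)   // scalar tail, also the fallback when fp16 SIMD is absent
        dst[i] = a[i] * b[i];
}
```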

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Commit 4422bc9a7f (parent 039cf6592c), authored by Yuantao Feng and committed via GitHub.
Changed files:
- modules/core/include/opencv2/core/hal/intrin.hpp (44 lines changed)
- modules/core/include/opencv2/core/hal/intrin_neon.hpp (367 lines changed)
- modules/core/test/test_intrin_utils.hpp (233 lines changed)

@ -165,6 +165,9 @@ CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned
CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int);
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int);
#if CV_FP16_TYPE
CV_INTRIN_DEF_TYPE_TRAITS(__fp16, short, ushort, __fp16, float, double, float);
#endif
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float);
@ -366,6 +369,9 @@ template<typename _Tp> struct V_RegTraits
CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
#if CV_SIMD128_FP16
CV_DEF_REG_TRAITS(v, v_float16x8, __fp16, f16, v_float16x8, v_float32x4, v_float64x2, v_int16x8, v_int16x8);
#endif
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
#if CV_SIMD128_64F || CV_SIMD128_CPP
@ -499,6 +505,7 @@ using namespace CV__SIMD_NAMESPACE;
#endif
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD CV_SIMD128
#define CV_SIMD_FP16 CV_SIMD128_FP16
#define CV_SIMD_64F CV_SIMD128_64F
#define CV_SIMD_WIDTH 16
//! @addtogroup core_hal_intrin
@ -511,6 +518,10 @@ namespace CV__SIMD_NAMESPACE {
typedef v_uint16x8 v_uint16;
//! @brief Maximum available vector register capacity 16-bit signed integer values
typedef v_int16x8 v_int16;
#if CV_SIMD128_FP16
//! @brief Maximum available vector register capacity 16-bit floating point values (half precision)
typedef v_float16x8 v_float16;
#endif
//! @brief Maximum available vector register capacity 32-bit unsigned integer values
typedef v_uint32x4 v_uint32;
//! @brief Maximum available vector register capacity 32-bit signed integer values
@ -558,6 +569,9 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); }
inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
#if CV_SIMD_FP16
inline v_float16 vx_setall_f16(__fp16 v) { return VXPREFIX(_setall_f16)(v); }
#endif
inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); }
inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); }
@ -575,6 +589,9 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); }
inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); }
inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); }
#if CV_SIMD_FP16
inline v_float16 vx_setzero_f16() { return VXPREFIX(_setzero_f16)(); }
#endif
inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); }
inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); }
inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); }
@ -592,6 +609,9 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
#if CV_SIMD_FP16
inline v_float16 vx_load(const __fp16 * ptr) { return VXPREFIX(_load)(ptr); }
#endif
inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); }
inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); }
@ -609,6 +629,9 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#if CV_SIMD_FP16
inline v_float16 vx_load_aligned(const __fp16 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#endif
inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); }
@ -626,6 +649,9 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
#if CV_SIMD_FP16
inline v_float16 vx_load_low(const __fp16 * ptr) { return VXPREFIX(_load_low)(ptr); }
#endif
inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); }
@ -643,6 +669,9 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#if CV_SIMD_FP16
inline v_float16 vx_load_halves(const __fp16 * ptr0, const __fp16 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#endif
inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
@ -660,6 +689,9 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#if CV_SIMD_FP16
inline v_float16 vx_lut(const __fp16 * ptr, const int * idx) { return VXPREFIX(_lut)(ptr, idx); }
#endif
inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
@ -677,6 +709,9 @@ namespace CV__SIMD_NAMESPACE {
inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#if CV_SIMD_FP16
inline v_float16 vx_lut_pairs(const __fp16 * ptr, const int * idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#endif
inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
@ -1180,6 +1215,9 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
#if CV_SIMD_FP16
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float16)
#endif
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
@ -1196,6 +1234,9 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
#if CV_SIMD_FP16
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float16)
#endif
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
@ -1215,6 +1256,9 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_EXTRACT(v_int32)
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
OPENCV_HAL_WRAP_EXTRACT(v_int64)
#if CV_SIMD_FP16
OPENCV_HAL_WRAP_EXTRACT(v_float16)
#endif
OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64)

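To illustrate what the new `CV_SIMD_FP16` guard adds on top of the existing `CV_FP16` conversion support, here is a hedged dispatch sketch (not taken from the patch): with fp16 vector arithmetic available the loop stays entirely in half precision, otherwise one register's worth of data is widened to fp32 as before. Buffer and function names are illustrative.

```cpp
#include <opencv2/core/hal/intrin.hpp>

void add_one(const cv::hfloat* src, cv::hfloat* dst)
{
    using namespace cv;
#if CV_SIMD_FP16
    // native fp16 arithmetic: load, add, store without leaving half precision
    v_float16 x = vx_load((const __fp16*)src);
    v_store((__fp16*)dst, v_add(x, vx_setall_f16((__fp16)1)));
#elif CV_SIMD && CV_FP16
    // conversion-only support: widen to fp32, compute, narrow back
    // (handles only the first 4 halves in this sketch)
    v_float32 x = vx_load_expand(src);
    v_pack_store(dst, v_add(x, vx_setall_f32(1.f)));
#endif
}
```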
@ -61,6 +61,11 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#else
#define CV_SIMD128_64F 0
#endif
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define CV_SIMD128_FP16 1
#else
#define CV_SIMD128_FP16 0
#endif
// The following macro checks if the code is being compiled for the
// AArch64 execution state of Armv8, to enable the 128-bit
@ -124,6 +129,9 @@ OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float16x8, float16x4, f16);
#endif
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
@ -285,6 +293,31 @@ private:
}
};
#if CV_SIMD128_FP16
struct v_float16x8
{
v_float16x8() {}
explicit v_float16x8(float16x8_t v) : val(v) {}
v_float16x8(__fp16 v0, __fp16 v1, __fp16 v2, __fp16 v3, __fp16 v4, __fp16 v5, __fp16 v6, __fp16 v7)
{
__fp16 v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
val = vld1q_f16(v);
}
float16x8_t val;
private:
friend struct VTraits<v_float16x8>;
enum { nlanes = 8 };
typedef __fp16 lane_type;
friend typename VTraits<v_float16x8>::lane_type v_get0<v_float16x8>(const v_float16x8& v);
__fp16 get0() const
{
return vgetq_lane_f16(val, 0);
}
};
#endif
struct v_float32x4
{
v_float32x4() {}
@ -400,6 +433,23 @@ OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_INIT(float16x8, __fp16, f16);
#define OPENCV_HAL_IMPL_NEON_INIT_FP16(_Tpv, suffix) \
inline v_float16x8 v_reinterpret_as_f16(const v_##_Tpv& v) { return v_float16x8(vreinterpretq_f16_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT_FP16(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_FP16(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_FP16(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_INIT_FP16(int16x8, s16)
OPENCV_HAL_IMPL_NEON_INIT_FP16(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_FP16(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_FP16(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_FP16(int64x2, s64)
OPENCV_HAL_IMPL_NEON_INIT_FP16(float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INIT_FP16(float64x2, f64)
#endif
#endif
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
@ -413,6 +463,9 @@ OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_INIT_64(float16x8, f16)
#endif
OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
#endif
@ -505,6 +558,47 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
return v_float32x4(res);
}
#if CV_SIMD128_FP16
// res = m0 * v[0] + m1 * v[1] + ... + m7 * v[7]
inline v_float16x8 v_matmul(const v_float16x8 &v,
const v_float16x8 &m0, const v_float16x8 &m1,
const v_float16x8 &m2, const v_float16x8 &m3,
const v_float16x8 &m4, const v_float16x8 &m5,
const v_float16x8 &m6, const v_float16x8 &m7)
{
float16x4_t vl = vget_low_f16(v.val), vh = vget_high_f16(v.val);
float16x8_t res = vmulq_lane_f16(m0.val, vl, 0);
res = vfmaq_lane_f16(res, m1.val, vl, 1);
res = vfmaq_lane_f16(res, m2.val, vl, 2);
res = vfmaq_lane_f16(res, m3.val, vl, 3);
res = vfmaq_lane_f16(res, m4.val, vh, 0);
res = vfmaq_lane_f16(res, m5.val, vh, 1);
res = vfmaq_lane_f16(res, m6.val, vh, 2);
res = vfmaq_lane_f16(res, m7.val, vh, 3);
return v_float16x8(res);
}
// res = m0 * v[0] + m1 * v[1] + ... + m6 * v[6] + a
inline v_float16x8 v_matmuladd(const v_float16x8 &v,
const v_float16x8 &m0, const v_float16x8 &m1,
const v_float16x8 &m2, const v_float16x8 &m3,
const v_float16x8 &m4, const v_float16x8 &m5,
const v_float16x8 &m6,
const v_float16x8 &a)
{
float16x4_t vl = vget_low_f16(v.val), vh = vget_high_f16(v.val);
float16x8_t res = vmulq_lane_f16(m0.val, vl, 0);
res = vfmaq_lane_f16(res, m1.val, vl, 1);
res = vfmaq_lane_f16(res, m2.val, vl, 2);
res = vfmaq_lane_f16(res, m3.val, vl, 3);
res = vfmaq_lane_f16(res, m4.val, vh, 0);
res = vfmaq_lane_f16(res, m5.val, vh, 1);
res = vfmaq_lane_f16(res, m6.val, vh, 2);
res = vaddq_f16(res, a.val);
return v_float16x8(res);
}
#endif
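Usage sketch for the block above (not part of the patch): a full 8x8 half-precision matrix-vector product, assuming `CV_SIMD128_FP16` and a column-major matrix so that each of `m0`..`m7` holds one column; pointer names are illustrative.

```cpp
#include <opencv2/core/hal/intrin.hpp>

static void matvec8x8_f16(const __fp16* Mp /* column-major, 64 elements */,
                          const __fp16* xp, __fp16* yp)
{
    using namespace cv;
    v_float16x8 x  = v_load(xp);
    v_float16x8 m0 = v_load(Mp +  0), m1 = v_load(Mp +  8),
                m2 = v_load(Mp + 16), m3 = v_load(Mp + 24),
                m4 = v_load(Mp + 32), m5 = v_load(Mp + 40),
                m6 = v_load(Mp + 48), m7 = v_load(Mp + 56);
    v_store(yp, v_matmul(x, m0, m1, m2, m3, m4, m5, m6, m7));  // yp = M * xp
}
```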
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
@ -525,6 +619,12 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float16x8, vaddq_f16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float16x8, vsubq_f16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float16x8, vmulq_f16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float16x8, vdivq_f16)
#endif
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32)
@ -944,6 +1044,21 @@ OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
#if CV_SIMD128_FP16
#define OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(bin_op, intrin) \
inline v_float16x8 bin_op (const v_float16x8& a, const v_float16x8& b) \
{ \
return v_float16x8(vreinterpretq_f16_s16(intrin(vreinterpretq_s16_f16(a.val), vreinterpretq_s16_f16(b.val)))); \
}
OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(v_and, vandq_s16)
OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(v_or, vorrq_s16)
OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(v_xor, veorq_s16)
inline v_float16x8 v_not (const v_float16x8& a)
{
return v_float16x8(vreinterpretq_f16_s16(vmvnq_s16(vreinterpretq_s16_f16(a.val))));
}
#endif
#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
@ -959,6 +1074,19 @@ inline v_float32x4 v_not (const v_float32x4& a)
return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}
#if CV_SIMD128_FP16
inline v_float16x8 v_sqrt(const v_float16x8& x)
{
return v_float16x8(vsqrtq_f16(x.val));
}
inline v_float16x8 v_invsqrt(const v_float16x8& x)
{
v_float16x8 one = v_setall_f16(1.0f);
return v_div(one, v_sqrt(x));
}
#endif
#if CV_SIMD128_64F
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
@ -996,9 +1124,14 @@ OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
inline v_float32x4 v_abs(v_float32x4 x)
inline v_float32x4 v_abs(const v_float32x4 &x)
{ return v_float32x4(vabsq_f32(x.val)); }
#if CV_SIMD128_FP16
inline v_float16x8 v_abs(const v_float16x8 &x)
{ return v_float16x8(vabsq_f16(x.val)); }
#endif
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \
@ -1052,6 +1185,10 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float16x8, v_min, vminq_f16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float16x8, v_max, vmaxq_f16)
#endif
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
@ -1075,6 +1212,9 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float16x8, vreinterpretq_f16_u16, f16, u16)
#endif
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
@ -1139,6 +1279,10 @@ static inline v_int64x2 v_lt (const v_int64x2& a, const v_int64x2& b)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
#endif
#if CV_SIMD128_FP16
inline v_float16x8 v_not_nan(const v_float16x8& a)
{ return v_float16x8(vreinterpretq_f16_u16(vceqq_f16(a.val, a.val))); }
#endif
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
#if CV_SIMD128_64F
@ -1162,6 +1306,9 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float16x8, v_absdiff, vabdq_f16)
#endif
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
@ -1183,6 +1330,29 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_abs
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
#if CV_SIMD128_FP16
inline v_float16x8 v_magnitude(const v_float16x8& a, const v_float16x8& b)
{
v_float16x8 x(vaddq_f16(vmulq_f16(a.val, a.val), vmulq_f16(b.val, b.val)));
return v_sqrt(x);
}
inline v_float16x8 v_sqr_magnitude(const v_float16x8& a, const v_float16x8& b)
{
return v_float16x8(vaddq_f16(vmulq_f16(a.val, a.val), vmulq_f16(b.val, b.val)));
}
inline v_float16x8 v_fma(const v_float16x8& a, const v_float16x8& b, const v_float16x8& c)
{
return v_float16x8(vfmaq_f16(c.val, a.val, b.val));
}
inline v_float16x8 v_muladd(const v_float16x8& a, const v_float16x8& b, const v_float16x8& c)
{
return v_fma(a, b, c);
}
#endif
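A small usage sketch for the fp16 math helpers above (not part of the patch, assumes `CV_SIMD128_FP16`): per-lane magnitude of 2-D vectors whose x and y components live in separate half-precision arrays.

```cpp
#include <opencv2/core/hal/intrin.hpp>

static void magnitude8_f16(const __fp16* xs, const __fp16* ys, __fp16* mag)
{
    using namespace cv;
    v_float16x8 vx = v_load(xs), vy = v_load(ys);
    v_store(mag, v_magnitude(vx, vy));   // mag[i] = sqrt(xs[i]*xs[i] + ys[i]*ys[i])
}
```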
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
@ -1285,6 +1455,9 @@ OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float16x8, f16)
#endif
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
@ -1336,6 +1509,9 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float16x8, __fp16, f16)
#endif
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
@ -1428,6 +1604,10 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, max, max, f16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, min, min, f16)
#endif
#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
@ -1498,6 +1678,24 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
#endif // #if CV_NEON_AARCH64
}
#if CV_SIMD128_FP16
inline v_float16x8 v_reduce_sum8(const v_float16x8 &a, const v_float16x8 &b,
const v_float16x8 &c, const v_float16x8 &d,
const v_float16x8 &w, const v_float16x8 &x,
const v_float16x8 &y, const v_float16x8 &z)
{
float16x8_t ab = vpaddq_f16(a.val, b.val); // a0+a1 a2+a3 a4+a5 a6+a7 b0+b1 b2+b3 b4+b5 b6+b7
float16x8_t cd = vpaddq_f16(c.val, d.val); // c0+c1 c2+c3 c4+c5 c6+c7 d0+d1 d2+d3 d4+d5 d6+d7
float16x8_t wx = vpaddq_f16(w.val, x.val); // w0+w1 w2+w3 w4+w5 w6+w7 x0+x1 x2+x3 x4+x5 x6+x7
float16x8_t yz = vpaddq_f16(y.val, z.val); // y0+y1 y2+y3 y4+y5 y6+y7 z0+z1 z2+z3 z4+z5 z6+z7
float16x8_t abcd = vpaddq_f16(ab, cd); // a0+a1+a2+a3 a4+a5+a6+a7 b0+b1+b2+b3 b4+b5+b6+b7 c0+c1+c2+c3 c4+c5+c6+c7 d0+d1+d2+d3 d4+d5+d6+d7
float16x8_t wxyz = vpaddq_f16(wx, yz); // w0+w1+w2+w3 w4+w5+w6+w7 x0+x1+x2+x3 x4+x5+x6+x7 y0+y1+y2+y3 y4+y5+y6+y7 z0+z1+z2+z3 z4+z5+z6+z7
return v_float16x8(vpaddq_f16(abcd, wxyz));
}
#endif
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_AARCH64
@ -1635,6 +1833,10 @@ inline int v_signmask(const v_uint16x8& a)
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }
#if CV_SIMD128_FP16
inline int v_signmask(const v_float16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }
#endif
inline int v_signmask(const v_uint32x4& a)
{
@ -1678,6 +1880,9 @@ inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmas
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
#if CV_SIMD128_FP16
inline int v_scan_forward(const v_float16x8& a) { return trailingZeros32(v_signmask(a)); }
#endif
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
@ -1732,6 +1937,12 @@ inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
#if CV_SIMD128_FP16
inline bool v_check_all(const v_float16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_float16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
#endif
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
@ -1767,6 +1978,9 @@ OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_SELECT(v_float16x8, f16, u16)
#endif
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
@ -1884,6 +2098,9 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_UNPACKS(float16x8, f16)
#endif
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
@ -1909,6 +2126,11 @@ inline v_uint16x8 v_reverse(const v_uint16x8 &a)
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
#if CV_SIMD128_FP16
inline v_float16x8 v_reverse(const v_float16x8 &a)
{ return v_reinterpret_as_f16(v_reverse(v_reinterpret_as_u16(a))); }
#endif
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
uint32x4_t vec = vrev64q_u32(a.val);
@ -1948,6 +2170,9 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_EXTRACT(float16x8, f16)
#endif
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
@ -1964,6 +2189,9 @@ OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float16x8, __fp16, f16)
#endif
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
@ -1980,6 +2208,9 @@ OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float16x8, __fp16, f16)
#endif
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
@ -1989,6 +2220,32 @@ OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
#endif
#if CV_SIMD128_FP16
inline v_int16x8 v_round(const v_float16x8 &a)
{
return v_int16x8(vcvtnq_s16_f16(a.val));
}
inline v_int16x8 v_floor(const v_float16x8 &a)
{
int16x8_t a1 = vcvtq_s16_f16(a.val);
uint16x8_t mask = vcgtq_f16(vcvtq_f16_s16(a1), a.val);
return v_int16x8(vaddq_s16(a1, vreinterpretq_s16_u16(mask)));
}
inline v_int16x8 v_ceil(const v_float16x8 &a)
{
int16x8_t a1 = vcvtq_s16_f16(a.val);
uint16x8_t mask = vcgtq_f16(a.val, vcvtq_f16_s16(a1));
return v_int16x8(vsubq_s16(a1, vreinterpretq_s16_u16(mask)));
}
inline v_int16x8 v_trunc(const v_float16x8 &a)
{
return v_int16x8(vcvtq_s16_f16(a.val));
}
#endif
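For reference, a scalar model of the `v_floor` trick above (a sketch, not part of the patch): truncate toward zero, then subtract 1 in lanes where truncation overshot, i.e. where the truncated value compares greater than the input, which happens exactly for negative non-integers. `float` stands in for `__fp16` to keep the snippet portable.

```cpp
static inline short floor_model(float a)
{
    short a1 = (short)a;                     // vcvtq_s16_f16: truncation toward zero
    short mask = ((float)a1 > a) ? -1 : 0;   // an all-ones comparison lane reads as -1
    return (short)(a1 + mask);               // e.g. a = -1.5: (-1) + (-1) = -2
}
```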
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float32x4& a)
{
@ -2124,6 +2381,47 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#endif // #if CV_NEON_AARCH64
#if CV_SIMD128_FP16
inline void v_transpose8x8(const v_float16x8 &a0, const v_float16x8 &a1,
const v_float16x8 &a2, const v_float16x8 &a3,
const v_float16x8 &a4, const v_float16x8 &a5,
const v_float16x8 &a6, const v_float16x8 &a7,
v_float16x8 &b0, v_float16x8 &b1,
v_float16x8 &b2, v_float16x8 &b3,
v_float16x8 &b4, v_float16x8 &b5,
v_float16x8 &b6, v_float16x8 &b7)
{
float32x4_t s0 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a0.val), vreinterpretq_f64_f16(a4.val)));
float32x4_t s1 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a1.val), vreinterpretq_f64_f16(a5.val)));
float32x4_t s2 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a2.val), vreinterpretq_f64_f16(a6.val)));
float32x4_t s3 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a3.val), vreinterpretq_f64_f16(a7.val)));
float32x4_t s4 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a0.val), vreinterpretq_f64_f16(a4.val)));
float32x4_t s5 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a1.val), vreinterpretq_f64_f16(a5.val)));
float32x4_t s6 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a2.val), vreinterpretq_f64_f16(a6.val)));
float32x4_t s7 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a3.val), vreinterpretq_f64_f16(a7.val)));
float16x8_t t0 = vreinterpretq_f16_f32(vtrn1q_f32(s0, s2));
float16x8_t t1 = vreinterpretq_f16_f32(vtrn1q_f32(s1, s3));
float16x8_t t2 = vreinterpretq_f16_f32(vtrn2q_f32(s0, s2));
float16x8_t t3 = vreinterpretq_f16_f32(vtrn2q_f32(s1, s3));
float16x8_t t4 = vreinterpretq_f16_f32(vtrn1q_f32(s4, s6));
float16x8_t t5 = vreinterpretq_f16_f32(vtrn1q_f32(s5, s7));
float16x8_t t6 = vreinterpretq_f16_f32(vtrn2q_f32(s4, s6));
float16x8_t t7 = vreinterpretq_f16_f32(vtrn2q_f32(s5, s7));
b0.val = vtrn1q_f16(t0, t1);
b1.val = vtrn2q_f16(t0, t1);
b2.val = vtrn1q_f16(t2, t3);
b3.val = vtrn2q_f16(t2, t3);
b4.val = vtrn1q_f16(t4, t5);
b5.val = vtrn2q_f16(t4, t5);
b6.val = vtrn1q_f16(t6, t7);
b7.val = vtrn2q_f16(t6, t7);
}
#endif
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
@ -2257,6 +2555,9 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float16x8, __fp16, f16)
#endif
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
@ -2267,6 +2568,30 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
#if CV_SIMD128_FP16
inline v_float16x8 v_cvt_f16(const v_float32x4 &a)
{
float16x4_t zero = vdup_n_f16((__fp16)0.0f);
return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), zero));
}
inline v_float16x8 v_cvt_f16(const v_float32x4 &a, const v_float32x4 &b)
{
return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val)));
}
inline v_float16x8 v_cvt_f16(const v_int16x8 &a)
{
return v_float16x8(vcvtq_f16_s16(a.val));
}
inline v_float32x4 v_cvt_f32(const v_float16x8 &a)
{
return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val)));
}
inline v_float32x4 v_cvt_f32_high(const v_float16x8 &a)
{
return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val)));
}
#endif
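Round-trip sketch using the conversions above (not part of the patch, assumes `CV_SIMD128_FP16`): widen one fp16 register into two fp32 halves, then narrow back.

```cpp
#include <opencv2/core/hal/intrin.hpp>

static void roundtrip_f16(const __fp16* p, __fp16* out)
{
    using namespace cv;
    v_float16x8 h  = v_load(p);
    v_float32x4 lo = v_cvt_f32(h);        // lanes 0..3 as fp32
    v_float32x4 hi = v_cvt_f32_high(h);   // lanes 4..7 as fp32
    v_store(out, v_cvt_f16(lo, hi));      // back to 8 half-precision lanes
}
```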
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
return v_float32x4(vcvtq_f32_s32(a.val));
@ -2422,6 +2747,46 @@ inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpre
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
#if CV_SIMD128_FP16
inline v_float16x8 v_lut(const float16_t *tab, const int *idx)
{
const __fp16 *t = (const __fp16*)tab;
__fp16 CV_DECL_ALIGNED(32) elems[8] =
{
t[idx[0]],
t[idx[1]],
t[idx[2]],
t[idx[3]],
t[idx[4]],
t[idx[5]],
t[idx[6]],
t[idx[7]],
};
return v_float16x8(vld1q_f16(elems));
}
inline v_float16x8 v_lut_pairs(const float16_t *tab, const int *idx)
{
const __fp16 *t = (const __fp16*)tab;
__fp16 CV_DECL_ALIGNED(32) elems[8] =
{
t[idx[0]],
t[idx[0] + 1],
t[idx[1]],
t[idx[1] + 1],
t[idx[2]],
t[idx[2] + 1],
t[idx[3]],
t[idx[3] + 1],
};
return v_float16x8(vld1q_f16(elems));
}
inline v_float16x8 v_lut_quads(const float16_t *tab, const int *idx)
{
const __fp16 *t = (const __fp16*)tab;
return v_float16x8(vcombine_f16(vld1_f16(t + idx[0]), vld1_f16(t + idx[1])));
}
#endif
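Gather sketch for the new overloads above (not part of the patch, assumes `CV_SIMD128_FP16`): pick eight fp16 samples by index; the table uses `float16_t` storage as in the signatures above.

```cpp
#include <opencv2/core/hal/intrin.hpp>

static void gather8_f16(const cv::float16_t* tab, const int* idx, __fp16* out)
{
    using namespace cv;
    v_store(out, v_lut(tab, idx));   // out[i] = tab[idx[i]] for i = 0..7
}
```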
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =

@ -182,6 +182,13 @@ template<> inline void EXPECT_COMPARE_EQ_<double>(const double a, const double b
EXPECT_DOUBLE_EQ( a, b );
}
#if CV_SIMD_FP16
template<> inline void EXPECT_COMPARE_EQ_<__fp16>(const __fp16 a, const __fp16 b)
{
EXPECT_LT(std::abs(float(a - b)), 0.126);
}
#endif
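The 0.126 bound above presumably admits a one-ULP error at the largest magnitudes the tests produce; a quick sketch of that reasoning (assumption: operands stay below 256):

```cpp
#include <cmath>

// fp16 has 10 fraction bits, so for values in [2^7, 2^8) one ULP is
// 2^(7-10) = 0.125; a tolerance of 0.126 therefore allows a single-ULP error.
static float fp16_ulp(float x)   // ULP of the nearest normal fp16 to x
{
    int e = (int)std::floor(std::log2(std::fabs(x)));
    return std::ldexp(1.0f, e - 10);
}
// fp16_ulp(200.f) == 0.125f
```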
// pack functions do not do saturation when converting from 64-bit types
template<typename T, typename W>
inline T pack_saturate_cast(W a) { return saturate_cast<T>(a); }
@ -554,6 +561,27 @@ template<typename R> struct TheTest
return *this;
}
// Handle accuracy for fp16
TheTest & test_div_fp16()
{
#if CV_SIMD_FP16
Data<R> dataA, dataB;
dataB.reverse();
R a = dataA, b = dataB;
Data<R> resC = v_div(a, b);
for (int i = 0; i < VTraits<R>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_LT(std::abs(float((dataA[i] / dataB[i]) - resC[i])), 2e-4);
}
#else
std::cout << "SKIP: test_div_fp16, CV_SIMD_FP16 is not available" << std::endl;
#endif
return *this;
}
TheTest & test_mul_expand()
{
typedef typename V_RegTraits<R>::w_reg Rx2;
@ -604,11 +632,34 @@ template<typename R> struct TheTest
a = v_sub(a, b);
Data<Ru> resC = v_abs(a);
auto R_type_lowest = std::numeric_limits<R_type>::lowest();
for (int i = 0; i < VTraits<Ru>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
R_type ssub = (dataA[i] - dataB[i]) < R_type_lowest ? R_type_lowest : dataA[i] - dataB[i];
EXPECT_EQ((u_type)std::abs(ssub), resC[i]);
}
return *this;
}
TheTest & test_abs_fp16()
{
typedef typename V_RegTraits<R>::u_reg Ru; // v_float16x8
typedef typename VTraits<Ru>::lane_type u_type; // __fp16
typedef typename VTraits<R>::lane_type R_type; // __fp16
Data<R> dataA, dataB(10);
R a = dataA, b = dataB;
a = v_sub(a, b);
Data<Ru> resC = v_abs(a);
R_type R_type_lowest = R_type(-65504); // 0 11110 1111111111
for (int i = 0; i < VTraits<Ru>::vlanes(); ++i)
{
SCOPED_TRACE(cv::format("i=%d", i));
R_type ssub = dataA[i] - dataB[i] < std::numeric_limits<R_type>::lowest() ? std::numeric_limits<R_type>::lowest() : dataA[i] - dataB[i];
R_type ssub = (dataA[i] - dataB[i]) < R_type_lowest ? R_type_lowest : dataA[i] - dataB[i];
EXPECT_EQ((u_type)std::abs(ssub), resC[i]);
}
@ -1492,6 +1543,54 @@ template<typename R> struct TheTest
return *this;
}
TheTest & test_matmul_fp16()
{
#if CV_SIMD_FP16
Data<R> dataV, data0, data1, data2, data3, data4, data5, data6, data7;
data1.reverse();
data2 += 2;
data3 *= 0.3;
data5.reverse();
data6 += 1;
data7 *= 0.4;
R v = dataV, m0 = data0, m1 = data1, m2 = data2, m3 = data3, m4 = data4, m5 = data5, m6 = data6, m7 = data7;
Data<R> res = v_matmul(v, m0, m1, m2, m3, m4, m5, m6, m7);
int i = 0;
for (int j = i; j < i + 8; ++j) {
SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
LaneType val = dataV[i] * data0[j] +
dataV[i + 1] * data1[j] +
dataV[i + 2] * data2[j] +
dataV[i + 3] * data3[j] +
dataV[i + 4] * data4[j] +
dataV[i + 5] * data5[j] +
dataV[i + 6] * data6[j] +
dataV[i + 7] * data7[j];
EXPECT_COMPARE_EQ(val, res[j]);
}
Data<R> resAdd = v_matmuladd(v, m0, m1, m2, m3, m4, m5, m6, m7);
i = 0;
for (int j = i; j < i + 8; ++j) {
SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
LaneType val = dataV[i] * data0[j] +
dataV[i + 1] * data1[j] +
dataV[i + 2] * data2[j] +
dataV[i + 3] * data3[j] +
dataV[i + 4] * data4[j] +
dataV[i + 5] * data5[j] +
dataV[i + 6] * data6[j] +
data7[j];
EXPECT_COMPARE_EQ(val, resAdd[j]);
}
#else
std::cout << "SKIP: test_matmul_fp16, CV_SIMD_FP16 is not available" << std::endl;
#endif
return *this;
}
TheTest & test_transpose()
{
Data<R> dataA, dataB, dataC, dataD;
@ -1527,6 +1626,41 @@ template<typename R> struct TheTest
return *this;
}
TheTest & test_transpose8x8_fp16()
{
#if CV_SIMD_FP16
Data<R> dataA0, dataA1, dataA2, dataA3, dataA4, dataA5, dataA6, dataA7;
dataA1 *= 2;
dataA2 *= 4;
dataA3 *= 6;
dataA4 *= 8;
dataA5 *= 10;
dataA6 *= 12;
dataA7 *= 14;
R a0 = dataA0, a1 = dataA1, a2 = dataA2, a3 = dataA3,
a4 = dataA4, a5 = dataA5, a6 = dataA6, a7 = dataA7;
R b0, b1, b2, b3, b4, b5, b6, b7;
v_transpose8x8(a0, a1, a2, a3, a4, a5, a6, a7,
b0, b1, b2, b3, b4, b5, b6, b7);
Data<R> res0 = b0, res1 = b1, res2 = b2, res3 = b3, res4 = b4, res5 = b5, res6 = b6, res7 = b7;
const Data<R> ref[] = {dataA0, dataA1, dataA2, dataA3, dataA4, dataA5, dataA6, dataA7};
const Data<R> res[] = { res0, res1, res2, res3, res4, res5, res6, res7};
for (int i = 0; i < 8; i++) {
for (int j = 0; j < 8; j++) {
SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
EXPECT_EQ(ref[i][j], res[j][i]);
}
}
#else
std::cout << "SKIP: test_transpose8x8_fp16, CV_SIMD_FP16 is not available" << std::endl;
#endif
return *this;
}
TheTest & test_reduce_sum4()
{
Data<R> dataA, dataB, dataC, dataD;
@ -1548,9 +1682,43 @@ template<typename R> struct TheTest
return *this;
}
TheTest & test_reduce_sum8()
{
#if CV_SIMD_FP16
Data<R> dataA, dataB, dataC, dataD, dataW, dataX, dataY, dataZ;
dataB *= 0.01f;
dataC *= 0.001f;
dataD *= 0.002f;
dataW += 0.1f;
dataX *= 0.2f;
dataY += 1;
dataZ *= 2;
R a = dataA, b = dataB, c = dataC, d = dataD,
w = dataW, x = dataX, y = dataY, z = dataZ;
Data<R> res = v_reduce_sum8(a, b, c, d, w, x, y, z);
for (int i = 0; i < VTraits<R>::vlanes(); i += 8)
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_COMPARE_EQ(dataA.sum(i, 8), res[i]);
EXPECT_COMPARE_EQ(dataB.sum(i, 8), res[i + 1]);
EXPECT_COMPARE_EQ(dataC.sum(i, 8), res[i + 2]);
EXPECT_COMPARE_EQ(dataD.sum(i, 8), res[i + 3]);
EXPECT_COMPARE_EQ(dataW.sum(i, 8), res[i + 4]);
EXPECT_COMPARE_EQ(dataX.sum(i, 8), res[i + 5]);
EXPECT_COMPARE_EQ(dataY.sum(i, 8), res[i + 6]);
EXPECT_COMPARE_EQ(dataZ.sum(i, 8), res[i + 7]);
}
#else
std::cout << "SKIP: test_reduce_sum8, CV_SIMD_FP16 is not available" << std::endl;
#endif
return *this;
}
TheTest & test_loadstore_fp16_f32()
{
printf("test_loadstore_fp16_f32 ...\n");
AlignedData<v_uint16> data; data.a.clear();
data.a.d[0] = 0x3c00; // 1.0
data.a.d[VTraits<R>::vlanes() - 1] = (unsigned short)0xc000; // -2.0
@ -1573,22 +1741,21 @@ template<typename R> struct TheTest
return *this;
}
#if 0
TheTest & test_loadstore_fp16()
{
printf("test_loadstore_fp16 ...\n");
#if CV_SIMD_FP16
AlignedData<R> data;
AlignedData<R> out;
// check if addresses are aligned and unaligned respectively
EXPECT_EQ((size_t)0, (size_t)&data.a.d % VTraits<R>::max_nlanes);
EXPECT_NE((size_t)0, (size_t)&data.u.d % VTraits<R>::max_nlanes);
EXPECT_EQ((size_t)0, (size_t)&out.a.d % VTraits<R>::max_nlanes);
EXPECT_NE((size_t)0, (size_t)&out.u.d % VTraits<R>::max_nlanes);
EXPECT_EQ((size_t)0, (size_t)&data.a.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
EXPECT_NE((size_t)0, (size_t)&data.u.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
EXPECT_EQ((size_t)0, (size_t)&out.a.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
EXPECT_NE((size_t)0, (size_t)&out.u.d % (sizeof(typename VTraits<R>::lane_type) * VTraits<R>::vlanes()));
// check some initialization methods
R r1 = data.u;
R r2 = vx_load_expand((const hfloat*)data.a.d);
R r2 = vx_load(data.a.d);
R r3(r2);
EXPECT_EQ(data.u[0], v_get0(r1));
EXPECT_EQ(data.a[0], v_get0(r2));
@ -1598,24 +1765,30 @@ template<typename R> struct TheTest
out.a.clear();
v_store(out.a.d, r1);
EXPECT_EQ(data.a, out.a);
#else
std::cout << "SKIP: test_loadstore_fp16, CV_SIMD_FP16 is not available" << std::endl;
#endif
return *this;
}
TheTest & test_float_cvt_fp16()
{
printf("test_float_cvt_fp16 ...\n");
#if CV_SIMD_FP16
AlignedData<v_float32> data;
// check conversion
v_float32 r1 = vx_load(data.a.d);
v_float16 r2 = v_cvt_f16(r1, vx_setzero_f32());
v_float32 r3 = v_cvt_f32(r2);
EXPECT_EQ(0x3c00, v_get0(r2));
EXPECT_EQ(1, v_get0(r2));
EXPECT_EQ(v_get0(r3), v_get0(r1));
#else
std::cout << "SKIP: test_float_cvt_fp16, CV_SIMD_FP16 is not available" << std::endl;
#endif
return *this;
}
#endif
void do_check_cmp64(const Data<R>& dataA, const Data<R>& dataB)
{
@ -2029,11 +2202,32 @@ void test_hal_intrin_float16()
{
DUMP_ENTRY(v_float16);
#if CV_FP16
TheTest<v_float32>()
.test_loadstore_fp16_f32()
TheTest<v_float32>().test_loadstore_fp16_f32();
#if CV_SIMD_FP16
TheTest<v_float16>()
.test_loadstore_fp16()
.test_float_cvt_fp16()
.test_interleave()
.test_addsub()
.test_mul()
.test_div_fp16()
.test_abs_fp16()
.test_cmp()
.test_sqrt_abs()
.test_min_max()
.test_float_absdiff()
.test_mask()
.test_unpack()
.test_float_math()
.test_matmul_fp16()
.test_transpose8x8_fp16()
.test_reduce_sum8()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
.test_extract_highest()
.test_broadcast_element<0>().test_broadcast_element<1>()
.test_extract_n<0>().test_extract_n<1>()
#endif
;
#else
@ -2041,17 +2235,6 @@ void test_hal_intrin_float16()
#endif
}
/*#if defined(CV_CPU_DISPATCH_MODE_FP16) && CV_CPU_DISPATCH_MODE == FP16
void test_hal_intrin_float16()
{
TheTest<v_float16>()
.test_loadstore_fp16()
.test_float_cvt_fp16()
;
}
#endif*/
#endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
//CV_CPU_OPTIMIZATION_NAMESPACE_END
