@@ -61,6 +61,11 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#else
#define CV_SIMD128_64F 0
#endif
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define CV_SIMD128_FP16 1
#else
#define CV_SIMD128_FP16 0
#endif
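// Usage sketch (not part of the patch): downstream kernels are expected to key
// half-precision paths off the new macro and fall back to f32 otherwise.
//
//   #if CV_SIMD128_FP16
//       v_float16x8 v = v_setall_f16((__fp16)0.5f);   // 8 half lanes
//   #else
//       v_float32x4 v = v_setall_f32(0.5f);           // 4 float lanes
//   #endif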
// The following macro checks if the code is being compiled for the
// AArch64 execution state of Armv8, to enable the 128-bit
@@ -124,6 +129,9 @@ OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float16x8, float16x4, f16);
#endif
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
@@ -285,6 +293,31 @@ private:
    }
};
#if CV_SIMD128_FP16
struct v_float16x8
{
    v_float16x8() {}
    explicit v_float16x8(float16x8_t v) : val(v) {}
    v_float16x8(__fp16 v0, __fp16 v1, __fp16 v2, __fp16 v3, __fp16 v4, __fp16 v5, __fp16 v6, __fp16 v7)
    {
        __fp16 v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = vld1q_f16(v);
    }
    float16x8_t val;
private:
    friend struct VTraits<v_float16x8>;
    enum { nlanes = 8 };
    typedef __fp16 lane_type;
    friend typename VTraits<v_float16x8>::lane_type v_get0<v_float16x8>(const v_float16x8& v);
    __fp16 get0() const
    {
        return vgetq_lane_f16(val, 0);
    }
};
#endif
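// Usage sketch (not part of the patch): build an 8-lane half vector from
// scalars and read lane 0 back through the friended v_get0 accessor.
//
//   v_float16x8 v((__fp16)1, (__fp16)2, (__fp16)3, (__fp16)4,
//                 (__fp16)5, (__fp16)6, (__fp16)7, (__fp16)8);
//   __fp16 first = v_get0(v);   // 1.0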
struct v_float32x4
{
    v_float32x4() {}
@@ -400,6 +433,23 @@ OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_INIT(float16x8, __fp16, f16);
#define OPENCV_HAL_IMPL_NEON_INIT_FP16(_Tpv, suffix) \
inline v_float16x8 v_reinterpret_as_f16(const v_##_Tpv& v) { return v_float16x8(vreinterpretq_f16_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT_FP16(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_INIT_FP16(int8x16, s8)
OPENCV_HAL_IMPL_NEON_INIT_FP16(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_INIT_FP16(int16x8, s16)
OPENCV_HAL_IMPL_NEON_INIT_FP16(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_FP16(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_FP16(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_FP16(int64x2, s64)
OPENCV_HAL_IMPL_NEON_INIT_FP16(float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_INIT_FP16(float64x2, f64)
#endif
#endif
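// Usage sketch (not part of the patch): the INIT macros above provide
// v_setzero_f16()/v_setall_f16() plus a v_reinterpret_as_f16() overload for
// every other 128-bit register type, so bit-exact round trips are possible.
//
//   v_float16x8 h    = v_setall_f16((__fp16)0.5f);
//   v_uint16x8  bits = v_reinterpret_as_u16(h);    // raw IEEE 754 half bits
//   v_float16x8 back = v_reinterpret_as_f16(bits); // identical to h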
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) \
@@ -413,6 +463,9 @@ OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_INIT_64(float16x8, f16)
#endif
OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
#endif
@@ -505,6 +558,47 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
    return v_float32x4(res);
}
#if CV_SIMD128_FP16
// res = m0 * v[0] + m1 * v[1] + ... + m7 * v[7]
inline v_float16x8 v_matmul(const v_float16x8& v,
                            const v_float16x8& m0, const v_float16x8& m1,
                            const v_float16x8& m2, const v_float16x8& m3,
                            const v_float16x8& m4, const v_float16x8& m5,
                            const v_float16x8& m6, const v_float16x8& m7)
{
    float16x4_t vl = vget_low_f16(v.val), vh = vget_high_f16(v.val);
    float16x8_t res = vmulq_lane_f16(m0.val, vl, 0);
    res = vfmaq_lane_f16(res, m1.val, vl, 1);
    res = vfmaq_lane_f16(res, m2.val, vl, 2);
    res = vfmaq_lane_f16(res, m3.val, vl, 3);
    res = vfmaq_lane_f16(res, m4.val, vh, 0);
    res = vfmaq_lane_f16(res, m5.val, vh, 1);
    res = vfmaq_lane_f16(res, m6.val, vh, 2);
    res = vfmaq_lane_f16(res, m7.val, vh, 3);
    return v_float16x8(res);
}
// res = m0 * v[0] + m1 * v[1] + ... + m6 * v[6] + a
inline v_float16x8 v_matmuladd(const v_float16x8& v,
                               const v_float16x8& m0, const v_float16x8& m1,
                               const v_float16x8& m2, const v_float16x8& m3,
                               const v_float16x8& m4, const v_float16x8& m5,
                               const v_float16x8& m6,
                               const v_float16x8& a)
{
    float16x4_t vl = vget_low_f16(v.val), vh = vget_high_f16(v.val);
    float16x8_t res = vmulq_lane_f16(m0.val, vl, 0);
    res = vfmaq_lane_f16(res, m1.val, vl, 1);
    res = vfmaq_lane_f16(res, m2.val, vl, 2);
    res = vfmaq_lane_f16(res, m3.val, vl, 3);
    res = vfmaq_lane_f16(res, m4.val, vh, 0);
    res = vfmaq_lane_f16(res, m5.val, vh, 1);
    res = vfmaq_lane_f16(res, m6.val, vh, 2);
    res = vaddq_f16(res, a.val);
    return v_float16x8(res);
}
#endif
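// Usage sketch (not part of the patch): multiply an 8x8 half matrix M by the
// vector vec; m is a hypothetical column-major __fp16[64] buffer, so passing
// the columns as m0..m7 makes the result M * vec.
//
//   v_float16x8 mv = v_matmul(vec,
//                             v_load(m),      v_load(m + 8),  v_load(m + 16), v_load(m + 24),
//                             v_load(m + 32), v_load(m + 40), v_load(m + 48), v_load(m + 56));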
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \
{ \
@@ -525,6 +619,12 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float16x8, vaddq_f16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float16x8, vsubq_f16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float16x8, vmulq_f16)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float16x8, vdivq_f16)
#endif
OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32)
@@ -944,6 +1044,21 @@ OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
#if CV_SIMD128_FP16
#define OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(bin_op, intrin) \
inline v_float16x8 bin_op(const v_float16x8& a, const v_float16x8& b) \
{ \
    return v_float16x8(vreinterpretq_f16_s16(intrin(vreinterpretq_s16_f16(a.val), vreinterpretq_s16_f16(b.val)))); \
}
OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(v_and, vandq_s16)
OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(v_or, vorrq_s16)
OPENCV_HAL_IMPL_NEON_FP16_BIT_OP(v_xor, veorq_s16)
inline v_float16x8 v_not(const v_float16x8& a)
{
    return v_float16x8(vreinterpretq_f16_s16(vmvnq_s16(vreinterpretq_s16_f16(a.val))));
}
#endif
#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \
{ \
@@ -959,6 +1074,19 @@ inline v_float32x4 v_not (const v_float32x4& a)
    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}
#if CV_SIMD128_FP16
inline v_float16x8 v_sqrt(const v_float16x8& x)
{
    return v_float16x8(vsqrtq_f16(x.val));
}
inline v_float16x8 v_invsqrt(const v_float16x8& x)
{
    v_float16x8 one = v_setall_f16(1.0f);
    return v_div(one, v_sqrt(x));
}
#endif
#if CV_SIMD128_64F
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
@@ -996,9 +1124,14 @@ OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(vabsq_f32(x.val)); }
#if CV_SIMD128_FP16
inline v_float16x8 v_abs(const v_float16x8& x)
{ return v_float16x8(vabsq_f16(x.val)); }
#endif
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \
@@ -1052,6 +1185,10 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float16x8, v_min, vminq_f16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float16x8, v_max, vmaxq_f16)
#endif
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
@@ -1075,6 +1212,9 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float16x8, vreinterpretq_f16_u16, f16, u16)
#endif
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
@@ -1139,6 +1279,10 @@ static inline v_int64x2 v_lt (const v_int64x2& a, const v_int64x2& b)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
#endif
#if CV_SIMD128_FP16
inline v_float16x8 v_not_nan(const v_float16x8& a)
{ return v_float16x8(vreinterpretq_f16_u16(vceqq_f16(a.val, a.val))); }
#endif
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
#if CV_SIMD128_64F
@@ -1162,6 +1306,9 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float16x8, v_absdiff, vabdq_f16)
#endif
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
@@ -1183,6 +1330,29 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_abs
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
#if CV_SIMD128_FP16
inline v_float16x8 v_magnitude(const v_float16x8& a, const v_float16x8& b)
{
    v_float16x8 x(vaddq_f16(vmulq_f16(a.val, a.val), vmulq_f16(b.val, b.val)));
    return v_sqrt(x);
}
inline v_float16x8 v_sqr_magnitude(const v_float16x8& a, const v_float16x8& b)
{
    return v_float16x8(vaddq_f16(vmulq_f16(a.val, a.val), vmulq_f16(b.val, b.val)));
}
inline v_float16x8 v_fma(const v_float16x8& a, const v_float16x8& b, const v_float16x8& c)
{
    return v_float16x8(vfmaq_f16(c.val, a.val, b.val));
}
inline v_float16x8 v_muladd(const v_float16x8& a, const v_float16x8& b, const v_float16x8& c)
{
    return v_fma(a, b, c);
}
#endif
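// Usage sketch (not part of the patch): v_fma lowers to a single vfmaq_f16, so
// a*b + c is rounded once per lane; a, b, dx, dy below are hypothetical
// v_float16x8 values.
//
//   v_float16x8 acc  = v_setzero_f16();
//   acc = v_fma(a, b, acc);                  // acc += a * b (fused)
//   v_float16x8 dist = v_magnitude(dx, dy);  // per-lane sqrt(dx*dx + dy*dy)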
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
@@ -1285,6 +1455,9 @@ OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float16x8, f16)
#endif
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
@@ -1336,6 +1509,9 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float16x8, __fp16, f16)
#endif
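// Usage sketch (not part of the patch): __fp16 buffers round-trip through the
// generated load/store entry points.
//
//   __fp16 CV_DECL_ALIGNED(16) buf[8];
//   // ... fill buf ...
//   v_float16x8 v = v_load_aligned(buf);  // v_load(buf) for unaligned data
//   v_store(buf, v_mul(v, v));            // write back the per-lane squares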
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
@@ -1428,6 +1604,10 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, max, max, f16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_float16x8, float16x4, __fp16, min, min, f16)
#endif
#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
@@ -1498,6 +1678,24 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
#endif // #if CV_NEON_AARCH64
}
#if CV_SIMD128_FP16
inline v_float16x8 v_reduce_sum8(const v_float16x8& a, const v_float16x8& b,
                                 const v_float16x8& c, const v_float16x8& d,
                                 const v_float16x8& w, const v_float16x8& x,
                                 const v_float16x8& y, const v_float16x8& z)
{
    float16x8_t ab = vpaddq_f16(a.val, b.val); // a0+a1 a2+a3 a4+a5 a6+a7 b0+b1 b2+b3 b4+b5 b6+b7
    float16x8_t cd = vpaddq_f16(c.val, d.val); // c0+c1 c2+c3 c4+c5 c6+c7 d0+d1 d2+d3 d4+d5 d6+d7
    float16x8_t wx = vpaddq_f16(w.val, x.val); // w0+w1 w2+w3 w4+w5 w6+w7 x0+x1 x2+x3 x4+x5 x6+x7
    float16x8_t yz = vpaddq_f16(y.val, z.val); // y0+y1 y2+y3 y4+y5 y6+y7 z0+z1 z2+z3 z4+z5 z6+z7
    float16x8_t abcd = vpaddq_f16(ab, cd); // a0+a1+a2+a3 a4+a5+a6+a7 b0+b1+b2+b3 b4+b5+b6+b7 c0+c1+c2+c3 c4+c5+c6+c7 d0+d1+d2+d3 d4+d5+d6+d7
    float16x8_t wxyz = vpaddq_f16(wx, yz); // w0+w1+w2+w3 w4+w5+w6+w7 x0+x1+x2+x3 x4+x5+x6+x7 y0+y1+y2+y3 y4+y5+y6+y7 z0+z1+z2+z3 z4+z5+z6+z7
    return v_float16x8(vpaddq_f16(abcd, wxyz));
}
#endif
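// Usage sketch (not part of the patch): lane i of the result is the horizontal
// sum of the i-th input, so eight dot-product accumulators can be finalised in
// one call; acc0..acc7 are hypothetical.
//
//   v_float16x8 sums = v_reduce_sum8(acc0, acc1, acc2, acc3,
//                                    acc4, acc5, acc6, acc7);
//   // v_get0(sums) == acc0[0] + acc0[1] + ... + acc0[7]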
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_AARCH64
@@ -1635,6 +1833,10 @@ inline int v_signmask(const v_uint16x8& a)
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }
#if CV_SIMD128_FP16
inline int v_signmask(const v_float16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }
#endif
inline int v_signmask(const v_uint32x4& a)
{
@@ -1678,6 +1880,9 @@ inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmas
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
#if CV_SIMD128_FP16
inline int v_scan_forward(const v_float16x8& a) { return trailingZeros32(v_signmask(a)); }
#endif
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
@@ -1732,6 +1937,12 @@ inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
#if CV_SIMD128_FP16
inline bool v_check_all(const v_float16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_float16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
#endif
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
@@ -1767,6 +1978,9 @@ OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_SELECT(v_float16x8, f16, u16)
#endif
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
@@ -1884,6 +2098,9 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_UNPACKS(float16x8, f16)
#endif
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
@@ -1909,6 +2126,11 @@ inline v_uint16x8 v_reverse(const v_uint16x8 &a)
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
#if CV_SIMD128_FP16
inline v_float16x8 v_reverse(const v_float16x8 &a)
{ return v_reinterpret_as_f16(v_reverse(v_reinterpret_as_u16(a))); }
#endif
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
    uint32x4_t vec = vrev64q_u32(a.val);
@@ -1948,6 +2170,9 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_EXTRACT(float16x8, f16)
#endif
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
@@ -1964,6 +2189,9 @@ OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float16x8, __fp16, f16)
#endif
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
@@ -1980,6 +2208,9 @@ OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float16x8, __fp16, f16)
#endif
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
@@ -1989,6 +2220,32 @@ OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
#endif
#if CV_SIMD128_FP16
inline v_int16x8 v_round(const v_float16x8& a)
{
    return v_int16x8(vcvtnq_s16_f16(a.val));
}
inline v_int16x8 v_floor(const v_float16x8& a)
{
    int16x8_t a1 = vcvtq_s16_f16(a.val);
    uint16x8_t mask = vcgtq_f16(vcvtq_f16_s16(a1), a.val);
    return v_int16x8(vaddq_s16(a1, vreinterpretq_s16_u16(mask)));
}
inline v_int16x8 v_ceil(const v_float16x8& a)
{
    int16x8_t a1 = vcvtq_s16_f16(a.val);
    uint16x8_t mask = vcgtq_f16(a.val, vcvtq_f16_s16(a1));
    return v_int16x8(vsubq_s16(a1, vreinterpretq_s16_u16(mask)));
}
inline v_int16x8 v_trunc(const v_float16x8& a)
{
    return v_int16x8(vcvtq_s16_f16(a.val));
}
#endif
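// Worked example (not part of the patch): the truncating convert is corrected
// with the comparison mask, whose all-ones lanes reinterpret as -1. For a lane
// holding -1.5:
//
//   v_floor: trunc(-1.5) = -1, mask = (-1.0 > -1.5) -> 0xFFFF, -1 + (-1) = -2
//   v_ceil : trunc(-1.5) = -1, mask = (-1.5 > -1.0) -> 0x0000, -1 -   0  = -1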
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float32x4& a)
{
@@ -2124,6 +2381,47 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#endif // #if CV_NEON_AARCH64
#if CV_SIMD128_FP16
inline void v_transpose8x8(const v_float16x8& a0, const v_float16x8& a1,
                           const v_float16x8& a2, const v_float16x8& a3,
                           const v_float16x8& a4, const v_float16x8& a5,
                           const v_float16x8& a6, const v_float16x8& a7,
                           v_float16x8& b0, v_float16x8& b1,
                           v_float16x8& b2, v_float16x8& b3,
                           v_float16x8& b4, v_float16x8& b5,
                           v_float16x8& b6, v_float16x8& b7)
{
    float32x4_t s0 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a0.val), vreinterpretq_f64_f16(a4.val)));
    float32x4_t s1 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a1.val), vreinterpretq_f64_f16(a5.val)));
    float32x4_t s2 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a2.val), vreinterpretq_f64_f16(a6.val)));
    float32x4_t s3 = vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f16(a3.val), vreinterpretq_f64_f16(a7.val)));
    float32x4_t s4 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a0.val), vreinterpretq_f64_f16(a4.val)));
    float32x4_t s5 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a1.val), vreinterpretq_f64_f16(a5.val)));
    float32x4_t s6 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a2.val), vreinterpretq_f64_f16(a6.val)));
    float32x4_t s7 = vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f16(a3.val), vreinterpretq_f64_f16(a7.val)));
    float16x8_t t0 = vreinterpretq_f16_f32(vtrn1q_f32(s0, s2));
    float16x8_t t1 = vreinterpretq_f16_f32(vtrn1q_f32(s1, s3));
    float16x8_t t2 = vreinterpretq_f16_f32(vtrn2q_f32(s0, s2));
    float16x8_t t3 = vreinterpretq_f16_f32(vtrn2q_f32(s1, s3));
    float16x8_t t4 = vreinterpretq_f16_f32(vtrn1q_f32(s4, s6));
    float16x8_t t5 = vreinterpretq_f16_f32(vtrn1q_f32(s5, s7));
    float16x8_t t6 = vreinterpretq_f16_f32(vtrn2q_f32(s4, s6));
    float16x8_t t7 = vreinterpretq_f16_f32(vtrn2q_f32(s5, s7));
    b0.val = vtrn1q_f16(t0, t1);
    b1.val = vtrn2q_f16(t0, t1);
    b2.val = vtrn1q_f16(t2, t3);
    b3.val = vtrn2q_f16(t2, t3);
    b4.val = vtrn1q_f16(t4, t5);
    b5.val = vtrn2q_f16(t4, t5);
    b6.val = vtrn1q_f16(t6, t7);
    b7.val = vtrn2q_f16(t6, t7);
}
#endif
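// Usage sketch (not part of the patch): transpose an 8x8 tile of halves held
// in registers, so element (i, j) of the inputs lands at (j, i) of the
// outputs; r0..r7 are hypothetical.
//
//   v_float16x8 c0, c1, c2, c3, c4, c5, c6, c7;
//   v_transpose8x8(r0, r1, r2, r3, r4, r5, r6, r7,
//                  c0, c1, c2, c3, c4, c5, c6, c7);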
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
@@ -2257,6 +2555,9 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float16x8, __fp16, f16)
#endif
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
@@ -2267,6 +2568,30 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
#if CV_SIMD128_FP16
inline v_float16x8 v_cvt_f16(const v_float32x4& a)
{
    float16x4_t zero = vdup_n_f16((__fp16)0.0f);
    return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), zero));
}
inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
    return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val)));
}
inline v_float16x8 v_cvt_f16(const v_int16x8& a)
{
    return v_float16x8(vcvtq_f16_s16(a.val));
}
inline v_float32x4 v_cvt_f32(const v_float16x8& a)
{
    return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val)));
}
inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
{
    return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val)));
}
#endif
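// Usage sketch (not part of the patch): widen to f32 for a computation that
// needs the extra precision, then narrow back; h is a hypothetical v_float16x8.
//
//   v_float32x4 lo  = v_cvt_f32(h);        // lanes 0..3
//   v_float32x4 hi  = v_cvt_f32_high(h);   // lanes 4..7
//   v_float16x8 out = v_cvt_f16(lo, hi);   // repack all 8 lanes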
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(vcvtq_f32_s32(a.val));
@@ -2422,6 +2747,46 @@ inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpre
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
#if CV_SIMD128_FP16
inline v_float16x8 v_lut(const float16_t* tab, const int* idx)
{
    const __fp16* t = (const __fp16*)tab;
    __fp16 CV_DECL_ALIGNED(32) elems[8] =
    {
        t[idx[0]],
        t[idx[1]],
        t[idx[2]],
        t[idx[3]],
        t[idx[4]],
        t[idx[5]],
        t[idx[6]],
        t[idx[7]],
    };
    return v_float16x8(vld1q_f16(elems));
}
inline v_float16x8 v_lut_pairs(const float16_t* tab, const int* idx)
{
    const __fp16* t = (const __fp16*)tab;
    __fp16 CV_DECL_ALIGNED(32) elems[8] =
    {
        t[idx[0]],
        t[idx[0] + 1],
        t[idx[1]],
        t[idx[1] + 1],
        t[idx[2]],
        t[idx[2] + 1],
        t[idx[3]],
        t[idx[3] + 1],
    };
    return v_float16x8(vld1q_f16(elems));
}
inline v_float16x8 v_lut_quads(const float16_t* tab, const int* idx)
{
    const __fp16* t = (const __fp16*)tab;
    return v_float16x8(vcombine_f16(vld1_f16(t + idx[0]), vld1_f16(t + idx[1])));
}
#endif
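// Usage sketch (not part of the patch): gather eight arbitrary table entries;
// tab is a hypothetical const float16_t* lookup table.
//
//   int idx[8] = { 3, 1, 4, 1, 5, 9, 2, 6 };
//   v_float16x8 g = v_lut(tab, idx);        // g[i] = tab[idx[i]]
//   v_float16x8 p = v_lut_pairs(tab, idx);  // tab[idx[0]], tab[idx[0]+1], ...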
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
    int CV_DECL_ALIGNED(32) elems[4] =