@@ -62,6 +62,22 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD128_64F 0
#endif
// The following macro checks if the code is being compiled for the
// AArch64 execution state of Armv8, to enable the 128-bit
// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
// the Arm C Language Extensions (ACLE) specifications [1] to check the
// availability of 128-bit intrinsics, and it is supported by clang
// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
// Visual Studio [2].
//
// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
#define CV_NEON_AARCH64 1
#else
#define CV_NEON_AARCH64 0
#endif
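For context, a minimal sketch (not part of the header) that probes the same predefined macros from user code, assuming an Arm toolchain; it only reports which execution state the translation unit targets:

#include <cstdio>

int main()
{
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
    // AArch64: 128-bit forms such as vmlal_high_s16 / vaddvq_f64 are available.
    std::printf("targeting AArch64\n");
#else
    std::printf("targeting 32-bit Arm (or a non-Arm architecture)\n");
#endif
    return 0;
}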
// TODO
#define CV_NEON_DOT 0
@@ -726,41 +742,61 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
#if CV_NEON_AARCH64
    int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
#else
    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmull_s16(a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
#endif
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
#if CV_NEON_AARCH64
    int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
#else
    int16x4_t a0 = vget_low_s16(a.val);
    int16x4_t a1 = vget_high_s16(a.val);
    int16x4_t b0 = vget_low_s16(b.val);
    int16x4_t b1 = vget_high_s16(b.val);
    int32x4_t p = vmlal_s16(c.val, a0, b0);
    return v_int32x4(vmlal_s16(p, a1, b1));
#endif
}
// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
#if CV_NEON_AARCH64
    int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
#else
    int32x2_t a0 = vget_low_s32(a.val);
    int32x2_t a1 = vget_high_s32(a.val);
    int32x2_t b0 = vget_low_s32(b.val);
    int32x2_t b1 = vget_high_s32(b.val);
    int64x2_t p = vmull_s32(a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
#endif
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
#if CV_NEON_AARCH64
    int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
#else
    int32x2_t a0 = vget_low_s32(a.val);
    int32x2_t a1 = vget_high_s32(a.val);
    int32x2_t b0 = vget_low_s32(b.val);
    int32x2_t b1 = vget_high_s32(b.val);
    int64x2_t p = vmlal_s32(c.val, a0, b0);
    return v_int64x2(vmlal_s32(p, a1, b1));
#endif
}
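For context, a note not taken from the patch itself: the AArch64 branches above use the vmlal_high_* forms so the high halves of the operands are multiplied and accumulated directly, instead of first being extracted with vget_high_*. Either way, v_dotprod_fast pairs lanes in an implementation-defined order, so it is meant to be consumed by a reduction. A minimal usage sketch of the public universal-intrinsics API, assuming an OpenCV build where CV_SIMD128 is enabled:

#include <cstdio>
#include <opencv2/core/hal/intrin.hpp>

int main()
{
#if CV_SIMD128
    short a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    short b[8] = { 1, 1, 1, 1, 2, 2, 2, 2 };
    cv::v_int16x8 va = cv::v_load(a);
    cv::v_int16x8 vb = cv::v_load(b);
    // Widen the 16-bit lanes, multiply, and reduce: 1+2+3+4 + 2*(5+6+7+8) = 62,
    // regardless of how v_dotprod_fast pairs the lanes internally.
    int dot = cv::v_reduce_sum(cv::v_dotprod_fast(va, vb));
    std::printf("dot = %d\n", dot);
#endif
    return 0;
}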
// 8 >> 32
@@ -1292,7 +1328,7 @@ inline int64 v_reduce_sum(const v_int64x2& a)
#if CV_SIMD128_64F
inline double v_reduce_sum(const v_float64x2& a)
{
-    return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
+    return vaddvq_f64(a.val);
}
#endif
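For context, a small equivalence sketch (not part of the header): vaddvq_f64 is the AArch64 across-vector add, so the new form sums the two double lanes in a single instruction instead of two lane extractions plus a scalar add. It assumes <arm_neon.h> on an AArch64 target:

#include <arm_neon.h>
#include <cassert>

int main()
{
    float64x2_t v = vsetq_lane_f64(2.5, vdupq_n_f64(1.5), 1);   // v = { 1.5, 2.5 }
    double s_old = vgetq_lane_f64(v, 0) + vgetq_lane_f64(v, 1); // previous form: two extracts + add
    double s_new = vaddvq_f64(v);                               // new form: single across-vector add
    assert(s_old == s_new);                                     // both equal 4.0
    return 0;
}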