@@ -62,6 +62,22 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD128_64F 0
 #endif
 
+// The following macro checks if the code is being compiled for the
+// AArch64 execution state of Armv8, to enable the 128-bit
+// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
+// the Arm C Language Extension (ACLE) specifications [1] to check the
+// availability of 128-bit intrinsics, and it is supported by clang
+// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
+// Visual Studio [2].
+//
+// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
+// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
+#define CV_NEON_AARCH64 1
+#else
+#define CV_NEON_AARCH64 0
+#endif
+
 // TODO
 #define CV_NEON_DOT 0
 
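For readers who want to sanity-check the new detection logic outside of OpenCV, the sketch below is not part of the patch; it simply mirrors the same preprocessor test so it can be compiled standalone with clang, gcc, or MSVC.

// Standalone sketch (not part of the patch): mirrors the AArch64 detection
// added above so it can be verified in isolation.
#include <cstdio>

#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
#define CV_NEON_AARCH64 1
#else
#define CV_NEON_AARCH64 0
#endif

int main()
{
    // Prints 1 when building for the AArch64 execution state, 0 otherwise.
    std::printf("CV_NEON_AARCH64 = %d\n", CV_NEON_AARCH64);
    return 0;
}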
@@ -726,41 +742,61 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
 // 16 >> 32
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
 {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
     int16x4_t a0 = vget_low_s16(a.val);
     int16x4_t a1 = vget_high_s16(a.val);
     int16x4_t b0 = vget_low_s16(b.val);
     int16x4_t b1 = vget_high_s16(b.val);
     int32x4_t p = vmull_s16(a0, b0);
     return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
 }
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
 {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
     int16x4_t a0 = vget_low_s16(a.val);
     int16x4_t a1 = vget_high_s16(a.val);
     int16x4_t b0 = vget_low_s16(b.val);
     int16x4_t b1 = vget_high_s16(b.val);
     int32x4_t p = vmlal_s16(c.val, a0, b0);
     return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
 }
 
 // 32 >> 64
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
 {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
     int32x2_t a0 = vget_low_s32(a.val);
     int32x2_t a1 = vget_high_s32(a.val);
     int32x2_t b0 = vget_low_s32(b.val);
     int32x2_t b1 = vget_high_s32(b.val);
     int64x2_t p = vmull_s32(a0, b0);
     return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
 }
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
 {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
     int32x2_t a0 = vget_low_s32(a.val);
     int32x2_t a1 = vget_high_s32(a.val);
     int32x2_t b0 = vget_low_s32(b.val);
     int32x2_t b1 = vget_high_s32(b.val);
     int64x2_t p = vmlal_s32(c.val, a0, b0);
     return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
 }
 
 // 8 >> 32
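The AArch64 branches above are drop-in equivalents of the generic ones: vmlal_high_s16 and vmlal_high_s32 multiply-accumulate the high halves of the full 128-bit registers directly, so the explicit vget_high_* extractions disappear. Below is a minimal sketch (not part of the patch, assuming an AArch64 toolchain with <arm_neon.h>) showing that both formulations of the 16 >> 32 case produce the same lanes.

// Equivalence sketch for the 16 >> 32 case (assumes an AArch64 compiler).
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main()
{
#if defined(__ARM_64BIT_STATE)
    const int16_t av[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    const int16_t bv[8] = {8, 7, 6, 5, 4, 3, 2, 1};
    int16x8_t a = vld1q_s16(av), b = vld1q_s16(bv);

    // Generic NEON path: extract the halves, then multiply-accumulate.
    int32x4_t p0 = vmull_s16(vget_low_s16(a), vget_low_s16(b));
    int32x4_t generic = vmlal_s16(p0, vget_high_s16(a), vget_high_s16(b));

    // AArch64 path: vmlal_high_s16 consumes the high halves in place.
    int32x4_t p1 = vmull_s16(vget_low_s16(a), vget_low_s16(b));
    int32x4_t fast = vmlal_high_s16(p1, a, b);

    // Both reductions print 120 (= 1*8 + 2*7 + ... + 8*1).
    std::printf("generic=%d fast=%d\n", vaddvq_s32(generic), vaddvq_s32(fast));
#endif
    return 0;
}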
@@ -1292,7 +1328,7 @@ inline int64 v_reduce_sum(const v_int64x2& a)
 #if CV_SIMD128_64F
 inline double v_reduce_sum(const v_float64x2& a)
 {
-    return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
+    return vaddvq_f64(a.val);
 }
 #endif
 
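The v_reduce_sum change swaps two lane extractions plus a scalar add for the single A64 horizontal-add intrinsic. A minimal sketch of the equivalence (again not part of the patch, assuming an AArch64 toolchain):

// Equivalence sketch for the float64x2 reduction (assumes an AArch64 compiler).
#include <arm_neon.h>
#include <cstdio>

int main()
{
#if defined(__ARM_64BIT_STATE)
    const double v[2] = {1.5, 2.25};
    float64x2_t x = vld1q_f64(v);

    double by_lanes = vgetq_lane_f64(x, 0) + vgetq_lane_f64(x, 1); // old form
    double by_addv  = vaddvq_f64(x);                               // new form

    std::printf("%f %f\n", by_lanes, by_addv); // both print 3.750000
#endif
    return 0;
}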