@@ -875,13 +875,27 @@ OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
 #endif
 
+#if defined(__clang__) && defined(__aarch64__)
+// avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
+#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ \
+typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
+uint64 v = *(unaligned_uint64*)ptr; \
+return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
+}
+#else
+#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
 inline _Tpvec v_load(const _Tp* ptr) \
 { return _Tpvec(vld1q_##suffix(ptr)); } \
 inline _Tpvec v_load_aligned(const _Tp* ptr) \
 { return _Tpvec(vld1q_##suffix(ptr)); } \
-inline _Tpvec v_load_low(const _Tp* ptr) \
-{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); } \
+OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
 { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
 inline void v_store(_Tp* ptr, const _Tpvec& a) \
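
For context on the new OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP above: per the issue linked in the comment (#14863), clang on aarch64 can fuse the vld1/vcombine form of v_load_low into a single LD2 load that touches more memory than the intended 8 bytes. Below is a minimal standalone sketch of what the two branches boil down to for the u8 case; the function names, the memcpy standing in for the CV_DECL_ALIGNED(1) typedef, and the explicit vcreate/vcombine expansion of the v_uint64x2 constructor are illustrative assumptions, not the literal macro expansion.

#include <arm_neon.h>
#include <cstdint>
#include <cstring>

// Default branch: load 8 bytes into the low half, zero the high half.
// This is the pattern the linked issue reports clang fusing into an LD2
// that reads past the intended 8 bytes.
static inline uint8x16_t load_low_u8_default(const uint8_t* ptr)
{
    return vcombine_u8(vld1_u8(ptr), vdup_n_u8(0));
}

// Workaround branch: read exactly 8 bytes as a scalar (memcpy stands in for
// the unaligned_uint64 typedef used in the macro) and build the vector from
// it, so the compiler cannot widen the access. The upper lane is a
// don't-care filler, mirroring the (uint64)123456 constant in the macro.
static inline uint8x16_t load_low_u8_workaround(const uint8_t* ptr)
{
    uint64_t v;
    std::memcpy(&v, ptr, sizeof(v));
    uint64x2_t q = vcombine_u64(vcreate_u64(v), vcreate_u64(123456u));
    return vreinterpretq_u8_u64(q);
}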
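The LOADSTORE_OP macro itself only generates the per-type wrappers; elsewhere in intrin_neon.hpp it is instantiated once per element type, roughly like the following (the exact list of invocations is assumed here, not part of this hunk):

// One invocation per element type generates v_load/v_store wrappers
// mapping _Tp to the matching vld1q/vst1q suffix:
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)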