diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 0a59e9baed..0fc78ca519 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -99,7 +99,7 @@ block and to save contents of the register to memory block. @ref v_setall_s8, @ref v_setall_u8, ..., @ref v_setzero_u8, @ref v_setzero_s8, ... - Memory operations: -@ref v_load, @ref v_load_aligned, @ref v_load_halves, +@ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves, @ref v_store, @ref v_store_aligned, @ref v_store_high, @ref v_store_low @@ -1080,6 +1080,26 @@ inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr) return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr); } +/** @brief Load 64-bits of data to lower part (high part is undefined). + +@param ptr memory block containing data for first half (0..n/2) + +@code{.cpp} +int lo[2] = { 1, 2 }; +v_int32x4 r = v_load_low(lo); +@endcode + */ +template +inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_low(const _Tp* ptr) +{ + v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c; + for( int i = 0; i < c.nlanes/2; i++ ) + { + c.s[i] = ptr[i]; + } + return c; +} + /** @brief Load register contents from two memory blocks @param loptr memory block containing data for first half (0..n/2) diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index ce661c3ce6..175750e06a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -763,6 +763,8 @@ inline _Tpvec v_load(const _Tp* ptr) \ { return _Tpvec(vld1q_##suffix(ptr)); } \ inline _Tpvec v_load_aligned(const _Tp* ptr) \ { return _Tpvec(vld1q_##suffix(ptr)); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); } \ inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \ inline void v_store(_Tp* ptr, const _Tpvec& a) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index b40f1de777..47ea2a2f54 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1016,6 +1016,8 @@ inline _Tpvec v_load(const _Tp* ptr) \ { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \ inline _Tpvec v_load_aligned(const _Tp* ptr) \ { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \ inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ { \ return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \ @@ -1044,6 +1046,8 @@ inline _Tpvec v_load(const _Tp* ptr) \ { return _Tpvec(_mm_loadu_##suffix(ptr)); } \ inline _Tpvec v_load_aligned(const _Tp* ptr) \ { return _Tpvec(_mm_load_##suffix(ptr)); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \ inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ { \ return _Tpvec(_mm_castsi128_##suffix( \ diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 3d15945de7..ea5213d39b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -281,6 +281,8 @@ inline _Tpvec v_load(const _Tp* ptr) \ { return _Tpvec(ld_func(0, ptr)); } \ inline _Tpvec v_load_aligned(const _Tp* ptr) \ { return _Tpvec(ld_func(0, ptr)); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(vec_ld_l8(ptr)); } \ inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ { return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \ inline void v_store(_Tp* ptr, const _Tpvec& a) \ diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp index 3ce190b9b6..29f6fa5117 100644 --- a/modules/core/include/opencv2/core/vsx_utils.hpp +++ b/modules/core/include/opencv2/core/vsx_utils.hpp @@ -556,17 +556,12 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs) * vec_ld_l8(ptr) -> Load 64-bits of integer data to lower part * vec_ldz_l8(ptr) -> Load 64-bits of integer data to lower part and zero upper part **/ -#if defined(__clang__) && !defined(__IBMCPP__) -# define __VSX_LOAD_L8(Tvec, p) (Tvec)((vec_udword2)*((uint64*)(p))) -#else -# define __VSX_LOAD_L8(Tvec, p) *((Tvec*)(p)) -#endif - #define VSX_IMPL_LOAD_L8(Tvec, Tp) \ FORCE_INLINE(Tvec) vec_ld_l8(const Tp *p) \ -{ return __VSX_LOAD_L8(Tvec, p); } \ +{ return ((Tvec)vec_promote(*((uint64*)p), 0)); } \ FORCE_INLINE(Tvec) vec_ldz_l8(const Tp *p) \ { \ + /* TODO: try (Tvec)(vec_udword2{*((uint64*)p), 0}) */ \ static const vec_bdword2 mask = {0xFFFFFFFFFFFFFFFF, 0x0000000000000000}; \ return vec_and(vec_ld_l8(p), (Tvec)mask); \ } diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index c4e74fa1e1..249ef38947 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -198,6 +198,22 @@ template struct TheTest EXPECT_EQ(data.a[0], r3.get0()); EXPECT_EQ(data.u[0], r4.get0()); + R r_low = v_load_low((LaneType*)data.u.d); + EXPECT_EQ(data.u[0], r_low.get0()); + v_store(out.u.d, r_low); + for (int i = 0; i < R::nlanes/2; ++i) + { + EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]); + } + + R r_low_align8byte = v_load_low((LaneType*)((char*)data.u.d + 8)); + EXPECT_EQ(data.u[R::nlanes/2], r_low_align8byte.get0()); + v_store(out.u.d, r_low_align8byte); + for (int i = 0; i < R::nlanes/2; ++i) + { + EXPECT_EQ((LaneType)data.u[i + R::nlanes/2], (LaneType)out.u[i]); + } + // check some store methods out.u.clear(); out.a.clear();