From 6a6ccf60325e7bec9386ba7b5cd232836088111c Mon Sep 17 00:00:00 2001
From: Maksim Shabunin
Date: Tue, 21 Jul 2015 17:23:58 +0300
Subject: [PATCH] v_extract universal intrinsic

Add v_extract<s>(a, b) to the C++, NEON and SSE backends. The result is
lanes s..n-1 of `a` followed by lanes 0..s-1 of `b`, where n is the
number of lanes in the vector type.

Also add the missing `return c;` to the C++ v_combine_low and
v_combine_high, and instantiate the NEON load/store ops for
v_uint64x2 and v_int64x2.

---
Review notes (this section is ignored by `git am`):

 * The C++ fallback originally produced a[n-s..n-1] ++ b[0..n-s-1], which
   disagrees with the NEON (vextq) and SSE (srli/slli) versions below for
   every s != n/2. The loop bounds and indices are corrected here so all
   three backends return a[s..n-1] ++ b[0..s-1]. The number of added lines
   is unchanged, so the hunk headers and diffstat remain valid; the
   post-image blob id on the intrin_cpp.hpp `index` line was not
   regenerated.
 * Template parameter lists were missing from the copy of this patch under
   review and have been restored from the identifiers each definition
   uses. The exact spelling and indentation of context lines should be
   confirmed against the target tree before applying.
 * NOTE(review): the SSE template passes a.val directly to
   _mm_srli_si128/_mm_slli_si128. The UNPACKS context lines cast float
   vectors with _mm_castps_si128/_mm_castpd_si128, so v_extract on
   v_float32x4/v_float64x2 presumably does not compile as written --
   confirm and add cast-based overloads if needed.

 .../hal/include/opencv2/hal/intrin_cpp.hpp  | 14 ++++++++++++++
 .../hal/include/opencv2/hal/intrin_neon.hpp | 19 +++++++++++++++++++
 .../hal/include/opencv2/hal/intrin_sse.hpp  | 11 +++++++++++
 3 files changed, 44 insertions(+)

diff --git a/modules/hal/include/opencv2/hal/intrin_cpp.hpp b/modules/hal/include/opencv2/hal/intrin_cpp.hpp
index e0140a8632..683305cc22 100644
--- a/modules/hal/include/opencv2/hal/intrin_cpp.hpp
+++ b/modules/hal/include/opencv2/hal/intrin_cpp.hpp
@@ -566,6 +566,7 @@ inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>&
         c.s[i] = a.s[i];
         c.s[i+(n/2)] = b.s[i];
     }
+    return c;
 }
 
 template<typename _Tp, int n>
@@ -577,6 +578,7 @@ inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>&
         c.s[i] = a.s[i+(n/2)];
         c.s[i+(n/2)] = b.s[i+(n/2)];
     }
+    return c;
 }
 
 template<typename _Tp, int n>
@@ -592,6 +594,18 @@ inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
     }
 }
 
+template<int s, typename _Tp, int n>
+inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> r;
+    int i = 0;
+    for (; i < n-s; ++i)
+        r.s[i] = a.s[i+s];
+    for (; i < n; ++i)
+        r.s[i] = b.s[i-(n-s)];
+    return r;
+}
+
 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
 {
     v_reg<int, n> c;
diff --git a/modules/hal/include/opencv2/hal/intrin_neon.hpp b/modules/hal/include/opencv2/hal/intrin_neon.hpp
index 31c3e30646..e326696d63 100644
--- a/modules/hal/include/opencv2/hal/intrin_neon.hpp
+++ b/modules/hal/include/opencv2/hal/intrin_neon.hpp
@@ -557,6 +557,8 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
 
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
@@ -720,6 +722,23 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
 OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
 OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
 
+#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
+template <int s> \
+inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
+}
+
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
+OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
+
 inline v_int32x4 v_round(const v_float32x4& a)
 {
     static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
diff --git a/modules/hal/include/opencv2/hal/intrin_sse.hpp b/modules/hal/include/opencv2/hal/intrin_sse.hpp
index 69171e2516..0c30f7d5b6 100644
--- a/modules/hal/include/opencv2/hal/intrin_sse.hpp
+++ b/modules/hal/include/opencv2/hal/intrin_sse.hpp
@@ -1149,6 +1149,17 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
 
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{
+    const int w = sizeof(typename _Tpvec::lane_type);
+    const int n = _Tpvec::nlanes;
+    __m128i ra, rb;
+    ra = _mm_srli_si128(a.val, s*w);
+    rb = _mm_slli_si128(b.val, (n-s)*w);
+    return _Tpvec(_mm_or_si128(ra, rb));
+}
+
 inline v_int32x4 v_round(const v_float32x4& a)
 { return v_int32x4(_mm_cvtps_epi32(a.val)); }
 