From 6a6ccf60325e7bec9386ba7b5cd232836088111c Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Tue, 21 Jul 2015 17:23:58 +0300
Subject: [PATCH] v_extract universal intrinsic

---
 .../hal/include/opencv2/hal/intrin_cpp.hpp    | 14 ++++++++++++++
 .../hal/include/opencv2/hal/intrin_neon.hpp   | 19 +++++++++++++++++++
 .../hal/include/opencv2/hal/intrin_sse.hpp    | 11 +++++++++++
 3 files changed, 44 insertions(+)
diff --git a/modules/hal/include/opencv2/hal/intrin_cpp.hpp b/modules/hal/include/opencv2/hal/intrin_cpp.hpp
index e0140a8632..683305cc22 100644
--- a/modules/hal/include/opencv2/hal/intrin_cpp.hpp
+++ b/modules/hal/include/opencv2/hal/intrin_cpp.hpp
@@ -566,6 +566,7 @@ inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>&
         c.s[i] = a.s[i];
         c.s[i+(n/2)] = b.s[i];
     }
+    return c;
 }
 
 template<typename _Tp, int n>
@@ -577,6 +578,7 @@ inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>&
         c.s[i] = a.s[i+(n/2)];
         c.s[i+(n/2)] = b.s[i+(n/2)];
     }
+    return c;
 }
 
 template<typename _Tp, int n>
@@ -592,6 +594,18 @@ inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
     }
 }
 
+template<int s, typename _Tp, int n>
+inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> r;
+    int i = 0;
+    for (; i < s; ++i)
+        r.s[i] = a.s[i+n-s];
+    for (; i < n; ++i)
+        r.s[i] = b.s[i-s];
+    return r;
+}
+
 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
 {
     v_reg<int, n> c;
diff --git a/modules/hal/include/opencv2/hal/intrin_neon.hpp b/modules/hal/include/opencv2/hal/intrin_neon.hpp
index 31c3e30646..e326696d63 100644
--- a/modules/hal/include/opencv2/hal/intrin_neon.hpp
+++ b/modules/hal/include/opencv2/hal/intrin_neon.hpp
@@ -557,6 +557,8 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
 
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
@@ -720,6 +722,23 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
 OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
 OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
 
+#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
+template <int s> \
+inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
+}
+
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
+OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
+
 inline v_int32x4 v_round(const v_float32x4& a)
 {
     static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
diff --git a/modules/hal/include/opencv2/hal/intrin_sse.hpp b/modules/hal/include/opencv2/hal/intrin_sse.hpp
index 69171e2516..0c30f7d5b6 100644
--- a/modules/hal/include/opencv2/hal/intrin_sse.hpp
+++ b/modules/hal/include/opencv2/hal/intrin_sse.hpp
@@ -1149,6 +1149,17 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
 
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{
+    const int w = sizeof(typename _Tpvec::lane_type);
+    const int n = _Tpvec::nlanes;
+    __m128i ra, rb;
+    ra = _mm_srli_si128(a.val, s*w);
+    rb = _mm_slli_si128(b.val, (n-s)*w);
+    return _Tpvec(_mm_or_si128(ra, rb));
+}
+
 inline v_int32x4 v_round(const v_float32x4& a)
 { return v_int32x4(_mm_cvtps_epi32(a.val)); }