From 9678d48e1aacfb57d4efcdad30401ba5b6b24185 Mon Sep 17 00:00:00 2001
From: Matthew Self <dmz@mself.com>
Date: Wed, 17 Aug 2016 00:17:45 -0700
Subject: [PATCH] 2-channel interleaved load/store for universal intrinsics
 (float only)

* Added 2-channel ops to match existing 3-channel and 4-channel ops

* v_load_deinterleave() and v_store_interleave()

* Implements float32x4 only on SSE (but all types on NEON and CPP)

* Includes tests

* Will be used to vectorize 2D functions, such as estimateAffine2D()
---
 .../include/opencv2/core/hal/intrin_cpp.hpp   | 55 ++++++++++++++++---
 .../include/opencv2/core/hal/intrin_neon.hpp  | 13 +++++
 .../include/opencv2/core/hal/intrin_sse.hpp   | 24 ++++++++
 modules/core/test/test_intrin.cpp             | 27 +++++++++
 4 files changed, 111 insertions(+), 8 deletions(-)
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 04b1d928b2..97ea5ea0b6 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -103,7 +103,7 @@ block and to save contents of the register to memory block.
 
 These operations allow to reorder or recombine elements in one or multiple vectors.
 
-- Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
+- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
 - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
 - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
@@ -1075,12 +1075,31 @@ v_load_expand_q(const _Tp* ptr)
     return c;
 }
 
-/** @brief Load and deinterleave (4 channels)
+/** @brief Load and deinterleave (2 channels)
 
-Load data from memory deinterleave and store to 4 registers.
+Load data from memory deinterleave and store to 2 registers.
 Scheme:
 @code
-{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
+{A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
+                                                            v_reg<_Tp, n>& b)
+{
+    int i, i2;
+    for( i = i2 = 0; i < n; i++, i2 += 2 )
+    {
+        a.s[i] = ptr[i2];
+        b.s[i] = ptr[i2+1];
+    }
+}
+
+/** @brief Load and deinterleave (3 channels)
+
+Load data from memory deinterleave and store to 3 registers.
+Scheme:
+@code
+{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
 @endcode
 For all types except 64-bit. */
 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
@@ -1095,12 +1114,12 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_
     }
 }
 
-/** @brief Load and deinterleave (3 channels)
+/** @brief Load and deinterleave (4 channels)
 
-Load data from memory deinterleave and store to 3 registers.
+Load data from memory deinterleave and store to 4 registers.
 Scheme:
 @code
-{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
+{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
 @endcode
 For all types except 64-bit. */
 template<typename _Tp, int n>
@@ -1118,12 +1137,32 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
     }
 }
 
+/** @brief Interleave and store (2 channels)
+
+Interleave and store data from 2 registers to memory.
+Scheme:
+@code
+{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n>
+inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
+                               const v_reg<_Tp, n>& b)
+{
+    int i, i2;
+    for( i = i2 = 0; i < n; i++, i2 += 2 )
+    {
+        ptr[i2] = a.s[i];
+        ptr[i2+1] = b.s[i];
+    }
+}
+
 /** @brief Interleave and store (3 channels)
 
 Interleave and store data from 3 registers to memory.
 Scheme:
 @code
-{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
+{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
 @endcode
 For all types except 64-bit. */
 template<typename _Tp, int n>
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index f3e47ca8ba..dbfd6bda2b 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -809,6 +809,12 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
 
 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
+{ \
+    _Tpvec##x2_t v = vld2q_##suffix(ptr); \
+    a.val = v.val[0]; \
+    b.val = v.val[1]; \
+} \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
 { \
     _Tpvec##x3_t v = vld3q_##suffix(ptr); \
@@ -825,6 +831,13 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
     c.val = v.val[2]; \
     d.val = v.val[3]; \
 } \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    _Tpvec##x2_t v; \
+    v.val[0] = a.val; \
+    v.val[1] = b.val; \
+    vst2q_##suffix(ptr, v); \
+} \
 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
 { \
     _Tpvec##x3_t v; \
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index 1840e0305d..1a9f58e263 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1374,6 +1374,18 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
     v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
 }
 
+// 2-channel, float only
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
+{
+    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
+
+    __m128 u0 = _mm_loadu_ps(ptr);       // a0 b0 a1 b1
+    __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
+
+    a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
+    b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
+}
+
 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                 const v_uint8x16& c )
 {
@@ -1529,6 +1541,18 @@ inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint3
     v_store(ptr + 12, t3);
 }
 
+// 2-channel, float only
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
+{
+    // a0 a1 a2 a3 ...
+    // b0 b1 b2 b3 ...
+    __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
+    __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
+
+    _mm_storeu_ps(ptr, u0);
+    _mm_storeu_ps((ptr + 4), u1);
+}
+
 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
 inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                  _Tpvec& b0, _Tpvec& c0 ) \
diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp
index ca9d3dc7b7..6b6817c690 100644
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@@ -132,6 +132,32 @@ template<typename R> struct TheTest
         return *this;
     }
 
+    // float32x4 only
+    TheTest & test_interleave_2channel()
+    {
+        Data<R> data1, data2;
+        data2 += 20;
+
+        R a = data1, b = data2;
+
+        LaneType buf2[R::nlanes * 2];
+
+        v_store_interleave(buf2, a, b);
+
+        Data<R> z(0);
+        a = b = z;
+
+        v_load_deinterleave(buf2, a, b);
+
+        for (int i = 0; i < R::nlanes; ++i)
+        {
+            EXPECT_EQ(data1, Data<R>(a));
+            EXPECT_EQ(data2, Data<R>(b));
+        }
+
+        return *this;
+    }
+
     // v_expand and v_load_expand
     TheTest & test_expand()
     {
@@ -846,6 +872,7 @@ TEST(hal_intrin, float32x4) {
     TheTest<v_float32x4>()
         .test_loadstore()
         .test_interleave()
+        .test_interleave_2channel()
         .test_addsub()
         .test_mul()
         .test_div()