From 9678d48e1aacfb57d4efcdad30401ba5b6b24185 Mon Sep 17 00:00:00 2001 From: Matthew Self Date: Wed, 17 Aug 2016 00:17:45 -0700 Subject: [PATCH] 2-channel interleaved load/store for universal intrinsics (float only) * Added 2-channel ops to match existing 3-channel and 4-channel ops * v_load_deinterleave() and v_store_interleave() * Implements float32x4 only on SSE (but all types on NEON and CPP) * Includes tests * Will be used to vectorize 2D functions, such as estimateAffine2D() --- .../include/opencv2/core/hal/intrin_cpp.hpp | 55 ++++++++++++++++--- .../include/opencv2/core/hal/intrin_neon.hpp | 13 +++++ .../include/opencv2/core/hal/intrin_sse.hpp | 24 ++++++++ modules/core/test/test_intrin.cpp | 27 +++++++++ 4 files changed, 111 insertions(+), 8 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 04b1d928b2..97ea5ea0b6 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -103,7 +103,7 @@ block and to save contents of the register to memory block. These operations allow to reorder or recombine elements in one or multiple vectors. -- Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave +- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u, @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store @@ -1075,12 +1075,31 @@ v_load_expand_q(const _Tp* ptr) return c; } -/** @brief Load and deinterleave (4 channels) +/** @brief Load and deinterleave (2 channels) -Load data from memory deinterleave and store to 4 registers. +Load data from memory deinterleave and store to 2 registers. 
Scheme: @code -{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} +{A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...} +@endcode +For all types except 64-bit. */ +template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, + v_reg<_Tp, n>& b) +{ + int i, i2; + for( i = i2 = 0; i < n; i++, i2 += 2 ) + { + a.s[i] = ptr[i2]; + b.s[i] = ptr[i2+1]; + } +} + +/** @brief Load and deinterleave (3 channels) + +Load data from memory deinterleave and store to 3 registers. +Scheme: +@code +{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} @endcode For all types except 64-bit. */ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, @@ -1095,12 +1114,12 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_ } } -/** @brief Load and deinterleave (3 channels) +/** @brief Load and deinterleave (4 channels) -Load data from memory deinterleave and store to 3 registers. +Load data from memory deinterleave and store to 4 registers. Scheme: @code -{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} +{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} @endcode For all types except 64-bit. */ template<typename _Tp, int n> @@ -1118,12 +1137,32 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, } } +/** @brief Interleave and store (2 channels) + +Interleave and store data from 2 registers to memory. +Scheme: +@code +{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...} +@endcode +For all types except 64-bit. */ +template<typename _Tp, int n> +inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, + const v_reg<_Tp, n>& b) +{ + int i, i2; + for( i = i2 = 0; i < n; i++, i2 += 2 ) + { + ptr[i2] = a.s[i]; + ptr[i2+1] = b.s[i]; + } +} + /** @brief Interleave and store (3 channels) Interleave and store data from 3 registers to memory. 
Scheme: @code -{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...} +{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...} @endcode For all types except 64-bit. */ template<typename _Tp, int n> diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index f3e47ca8ba..dbfd6bda2b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -809,6 +809,12 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32) OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32) #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ +{ \ + _Tpvec##x2_t v = vld2q_##suffix(ptr); \ + a.val = v.val[0]; \ + b.val = v.val[1]; \ +} \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ { \ _Tpvec##x3_t v = vld3q_##suffix(ptr); \ @@ -825,6 +831,13 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ c.val = v.val[2]; \ d.val = v.val[3]; \ } \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \ +{ \ + _Tpvec##x2_t v; \ + v.val[0] = a.val; \ + v.val[1] = b.val; \ + vst2q_##suffix(ptr, v); \ +} \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \ { \ _Tpvec##x3_t v; \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 1840e0305d..1a9f58e263 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1374,6 +1374,18 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& v_transpose4x4(u0, u1, u2, u3, a, b, c, d); } +// 2-channel, float only +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b) +{ + 
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + + __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1 + __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3 + + a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3 + b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3 +} + inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, const v_uint8x16& c ) { @@ -1529,6 +1541,18 @@ inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint3 v_store(ptr + 12, t3); } +// 2-channel, float only +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b) +{ + // a0 a1 a2 a3 ... + // b0 b1 b2 b3 ... + __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1 + __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3 + + _mm_storeu_ps(ptr, u0); + _mm_storeu_ps((ptr + 4), u1); +} + #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \ inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \ _Tpvec& b0, _Tpvec& c0 ) \ diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index ca9d3dc7b7..6b6817c690 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -132,6 +132,32 @@ template<typename R> struct TheTest return *this; } + // float32x4 only + TheTest & test_interleave_2channel() + { + Data<R> data1, data2; + data2 += 20; + + R a = data1, b = data2; + + LaneType buf2[R::nlanes * 2]; + + v_store_interleave(buf2, a, b); + + Data<R> z(0); + a = b = z; + + v_load_deinterleave(buf2, a, b); + + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(data1, Data<R>(a)); + EXPECT_EQ(data2, Data<R>(b)); + } + + return *this; + } + // v_expand and v_load_expand TheTest & test_expand() { @@ -846,6 +872,7 @@ TEST(hal_intrin, float32x4) { TheTest<v_float32x4>() .test_loadstore() .test_interleave() + .test_interleave_2channel() .test_addsub() .test_mul() .test_div()