Merge pull request #7182 from mself:two_channel_universal_intrinsics

pull/7212/head
Maksim Shabunin 9 years ago
commit 595fd2757c
  1. 55
      modules/core/include/opencv2/core/hal/intrin_cpp.hpp
  2. 13
      modules/core/include/opencv2/core/hal/intrin_neon.hpp
  3. 24
      modules/core/include/opencv2/core/hal/intrin_sse.hpp
  4. 27
      modules/core/test/test_intrin.cpp

@ -103,7 +103,7 @@ block and to save contents of the register to memory block.
These operations allow reordering or recombining elements in one or multiple vectors.
- Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
@ -1075,12 +1075,31 @@ v_load_expand_q(const _Tp* ptr)
return c;
}
/** @brief Load and deinterleave (4 channels)
/** @brief Load and deinterleave (2 channels)
Load data from memory deinterleave and store to 4 registers.
Load data from memory, deinterleave it, and store the result to 2 registers.
Scheme:
@code
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
{A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
                                                              v_reg<_Tp, n>& b)
{
    // Memory holds n consecutive (a, b) pairs; lane k of each register
    // receives the matching half of pair k.
    const _Tp* pair = ptr;
    for( int lane = 0; lane < n; ++lane, pair += 2 )
    {
        a.s[lane] = pair[0];
        b.s[lane] = pair[1];
    }
}
/** @brief Load and deinterleave (3 channels)
Load data from memory deinterleave and store to 3 registers.
Scheme:
@code
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
@ -1095,12 +1114,12 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_
}
}
/** @brief Load and deinterleave (3 channels)
/** @brief Load and deinterleave (4 channels)
Load data from memory deinterleave and store to 3 registers.
Load data from memory deinterleave and store to 4 registers.
Scheme:
@code
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>
@ -1118,12 +1137,32 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
}
}
/** @brief Interleave and store (2 channels)
Interleave and store data from 2 registers to memory.
Scheme:
@code
{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                                const v_reg<_Tp, n>& b)
{
    // Emit n consecutive (a, b) pairs: lane k of each register becomes
    // elements 2k and 2k+1 of the destination.
    _Tp* out = ptr;
    for( int lane = 0; lane < n; ++lane, out += 2 )
    {
        out[0] = a.s[lane];
        out[1] = b.s[lane];
    }
}
/** @brief Interleave and store (3 channels)
Interleave and store data from 3 registers to memory.
Scheme:
@code
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>

@ -809,6 +809,12 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
_Tpvec##x2_t v = vld2q_##suffix(ptr); \
a.val = v.val[0]; \
b.val = v.val[1]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
_Tpvec##x3_t v = vld3q_##suffix(ptr); \
@ -825,6 +831,13 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
c.val = v.val[2]; \
d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
_Tpvec##x2_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
{ \
_Tpvec##x3_t v; \

@ -1374,6 +1374,18 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}
// 2-channel, float only
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
    // Two unaligned loads pick up eight floats, i.e. four interleaved (a, b) pairs.
    const __m128 pairs01 = _mm_loadu_ps(ptr);     // a0 b0 a1 b1
    const __m128 pairs23 = _mm_loadu_ps(ptr + 4); // a2 b2 a3 b3

    // Elements 0 and 2 of each loaded vector belong to the first channel,
    // elements 1 and 3 to the second.
    a.val = _mm_shuffle_ps(pairs01, pairs23, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3
    b.val = _mm_shuffle_ps(pairs01, pairs23, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 b2 b3
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
const v_uint8x16& c )
{
@ -1529,6 +1541,18 @@ inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint3
v_store(ptr + 12, t3);
}
// 2-channel, float only
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
{
    // Inputs:  a = a0 a1 a2 a3,  b = b0 b1 b2 b3
    // Zip the low halves and the high halves of the two registers first,
    // then write the eight resulting floats with two unaligned stores.
    const __m128 pairs01 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
    const __m128 pairs23 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3

    _mm_storeu_ps(ptr, pairs01);
    _mm_storeu_ps(ptr + 4, pairs23);
}
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
_Tpvec& b0, _Tpvec& c0 ) \

@ -132,6 +132,32 @@ template<typename R> struct TheTest
return *this;
}
// float32x4 only
// Round-trip test for the 2-channel overloads: store two registers interleaved
// into a buffer, overwrite the registers, load them back deinterleaved, and
// expect the original contents. Returns *this so tests can be chained.
TheTest & test_interleave_2channel()
{
    Data<R> data1, data2;
    data2 += 20; // makes the second channel differ from the first (presumably a per-lane offset -- confirm against Data)

    R a = data1, b = data2;

    // Destination for both channels interleaved: two values per lane.
    LaneType buf2[R::nlanes * 2];

    v_store_interleave(buf2, a, b);

    // Overwrite both registers so values left over from before the store
    // cannot make the comparison below pass by accident.
    Data<R> z(0);
    a = b = z;

    v_load_deinterleave(buf2, a, b);

    // Each EXPECT_EQ compares the complete Data objects, so a single pass is
    // enough; iterating over lanes would only repeat identical checks (and
    // duplicate any failure message R::nlanes times).
    EXPECT_EQ(data1, Data<R>(a));
    EXPECT_EQ(data2, Data<R>(b));

    return *this;
}
// v_expand and v_load_expand
TheTest & test_expand()
{
@ -846,6 +872,7 @@ TEST(hal_intrin, float32x4) {
TheTest<v_float32x4>()
.test_loadstore()
.test_interleave()
.test_interleave_2channel()
.test_addsub()
.test_mul()
.test_div()

Loading…
Cancel
Save