converted split() & merge() to wide univ intrinsics (#12044)

* fixed/updated v_load_deinterleave and v_store_interleave intrinsics; modified split() and merge() functions to use those intrinsics

* fixed a few compile errors and a bug in v_load_deinterleave(ptr, v_uint32x4& a, v_uint32x4& b)

* fixed a few more compile errors
Vadim Pisarevsky, committed via GitHub
parent 8de08e0463
commit 9c7040802c
Changed files:
  1. modules/core/include/opencv2/core/hal/intrin_avx.hpp (822 lines changed)
  2. modules/core/include/opencv2/core/hal/intrin_neon.hpp (77 lines changed)
  3. modules/core/include/opencv2/core/hal/intrin_sse.hpp (327 lines changed)
  4. modules/core/include/opencv2/core/hal/intrin_vsx.hpp (2 lines changed)
  5. modules/core/src/merge.cpp (347 lines changed)
  6. modules/core/src/split.cpp (361 lines changed)
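
For context: the change lets split() and merge() process whole vectors of pixels with a single deinterleave/interleave intrinsic instead of per-element copies. Below is a minimal sketch of a 3-channel split kernel written in that style; it uses the 128-bit universal-intrinsic types for brevity (the commit adds the matching 256-bit overloads such as v_uint8x32), and split3_u8_sketch is an illustrative name, not the actual code in modules/core/src/split.cpp.

// Illustrative sketch only -- not the code from this commit.
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

static void split3_u8_sketch(const uchar* src, uchar* b, uchar* g, uchar* r, int len)
{
    int i = 0;
#if CV_SIMD128
    const int nlanes = v_uint8x16::nlanes;           // 16 bytes per register
    for (; i <= len - nlanes; i += nlanes)
    {
        v_uint8x16 vb, vg, vr;
        v_load_deinterleave(src + i*3, vb, vg, vr);  // b0 g0 r0 b1 g1 r1 ... -> three planes
        v_store(b + i, vb);
        v_store(g + i, vg);
        v_store(r + i, vr);
    }
#endif
    for (; i < len; i++)                             // scalar tail
    {
        b[i] = src[i*3];
        g[i] = src[i*3 + 1];
        r[i] = src[i*3 + 2];
    }
}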

@@ -1609,392 +1609,592 @@ OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
/** Reinterpret **/
// it's up there with the load and store operations
/* de&interleave */
#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \
{ return v256_load_deinterleave_##suffix(ptr, a, b); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \
{ return v256_store_interleave_2ch(ptr, a, b); }
///////////////////// load deinterleave /////////////////////////////
#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave \
(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) \
{ return v256_load_deinterleave_##suffix(ptr, a, b, c); } \
inline void v_store_interleave \
(_Tp* ptr, const _Tpvec& a,const _Tpvec& b, const _Tpvec& c) \
{ return v256_store_interleave_##suffix(ptr, a, b, c); }
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
{
__m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave \
(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ return v256_load_deinterleave_##suffix(ptr, a, b, c, d); } \
inline void v_store_interleave \
(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) \
{ return v256_store_interleave_##suffix(ptr, a, b, c, d); }
static const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
__m256i p0 = _mm256_shuffle_epi8(ab0, sh);
__m256i p1 = _mm256_shuffle_epi8(ab1, sh);
__m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
__m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
__m256i a0 = _mm256_unpacklo_epi64(pl, ph);
__m256i b0 = _mm256_unpackhi_epi64(pl, ph);
a = v_uint8x32(a0);
b = v_uint8x32(b0);
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
{
__m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
static const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
__m256i p0 = _mm256_shuffle_epi8(ab0, sh);
__m256i p1 = _mm256_shuffle_epi8(ab1, sh);
__m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
__m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
__m256i a0 = _mm256_unpacklo_epi64(pl, ph);
__m256i b0 = _mm256_unpackhi_epi64(pl, ph);
a = v_uint16x16(a0);
b = v_uint16x16(b0);
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
{
__m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
const int sh = 0+2*4+1*16+3*64;
__m256i p0 = _mm256_shuffle_epi32(ab0, sh);
__m256i p1 = _mm256_shuffle_epi32(ab1, sh);
__m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
__m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
__m256i a0 = _mm256_unpacklo_epi64(pl, ph);
__m256i b0 = _mm256_unpackhi_epi64(pl, ph);
a = v_uint32x8(a0);
b = v_uint32x8(b0);
}
#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) \
OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \
OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix)
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
{
__m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(_Tpvec, _Tp, suffix) \
OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \
OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix)
__m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
__m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
__m256i a0 = _mm256_unpacklo_epi64(pl, ph);
__m256i b0 = _mm256_unpackhi_epi64(pl, ph);
a = v_uint64x4(a0);
b = v_uint64x4(b0);
}
/* **** */
//
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_2ch(_Tp* ptr, const _Tpvec& a, const _Tpvec& b)
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r )
{
_Tpvec ab0, ab1;
v_zip(a, b, ab0, ab1);
v_store(ptr, ab0);
v_store(ptr + _Tpvec::nlanes, ab1);
}
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
{
_Tpvec ab0 = v256_load(ptr);
_Tpvec ab1 = v256_load(ptr + _Tpvec::nlanes);
_Tpvec ab00, ab11;
v_recombine(ab0, ab1, ab00, ab11);
v256_zip(ab00, ab11, a, b);
}
__m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
__m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
///
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
{
_Tpvec abc0 = v256_load(ptr);
_Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
_Tpvec abc2 = v256_load(ptr + _Tpvec::nlanes * 2);
static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
static const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
-1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
_Tpvec ab0 = v256_combine_diagonal(abc0, abc1);
_Tpvec bc1 = v256_combine_diagonal(abc1, abc2);
_Tpvec ac1 = v256_reverse_64(v256_combine_diagonal(abc2, abc0));
__m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
__m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
__m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
a = v256_unpacklo(ab0, ac1);
c = v256_unpackhi(ac1, bc1);
b = v256_alignr_64(bc1, ab0);
}
static const __m256i
sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
b0 = _mm256_shuffle_epi8(b0, sh_b);
g0 = _mm256_shuffle_epi8(g0, sh_g);
r0 = _mm256_shuffle_epi8(r0, sh_r);
b = v_uint8x32(b0);
g = v_uint8x32(g0);
r = v_uint8x32(r0);
}
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
{
_Tpvec ab0 = v256_unpacklo(a, b);
_Tpvec bc1 = v256_unpackhi(b, c);
_Tpvec ca10 = v256_swap_halves(v256_blend<0xa>(c, a));
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
__m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
__m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
-1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
__m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
__m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
__m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
static const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
b0 = _mm256_shuffle_epi8(b0, sh_b);
g0 = _mm256_shuffle_epi8(g0, sh_g);
r0 = _mm256_shuffle_epi8(r0, sh_r);
b = v_uint16x16(b0);
g = v_uint16x16(g0);
r = v_uint16x16(r0);
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
__m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
__m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
__m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
__m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
__m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
b0 = _mm256_shuffle_epi32(b0, 0x6c);
g0 = _mm256_shuffle_epi32(g0, 0xb1);
r0 = _mm256_shuffle_epi32(r0, 0xc6);
b = v_uint32x8(b0);
g = v_uint32x8(g0);
r = v_uint32x8(r0);
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
__m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
__m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
__m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
__m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
__m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
__m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
b = v_uint64x4(b0);
g = v_uint64x4(g0);
r = v_uint64x4(r0);
}
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r, v_uint8x32& a )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
__m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
__m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
__m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
__m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
__m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
__m256i p01l = _mm256_unpacklo_epi32(p0, p1);
__m256i p01h = _mm256_unpackhi_epi32(p0, p1);
__m256i p23l = _mm256_unpacklo_epi32(p2, p3);
__m256i p23h = _mm256_unpackhi_epi32(p2, p3);
__m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
__m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
__m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
__m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
__m256i b0 = _mm256_unpacklo_epi32(pll, plh);
__m256i g0 = _mm256_unpackhi_epi32(pll, plh);
__m256i r0 = _mm256_unpacklo_epi32(phl, phh);
__m256i a0 = _mm256_unpackhi_epi32(phl, phh);
v_store(ptr, v256_combine_diagonal(ab0, ca10));
v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0));
v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ca10, bc1));
b = v_uint8x32(b0);
g = v_uint8x32(g0);
r = v_uint8x32(r0);
a = v_uint8x32(a0);
}
////
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r, v_uint16x16& a )
{
_Tpvec abcd0 = v256_load(ptr);
_Tpvec abcd1 = v256_load(ptr + _Tpvec::nlanes);
_Tpvec abcd2 = v256_load(ptr + _Tpvec::nlanes * 2);
_Tpvec abcd3 = v256_load(ptr + _Tpvec::nlanes * 3);
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
__m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
static const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
__m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
__m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
__m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
__m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
__m256i p01l = _mm256_unpacklo_epi32(p0, p1);
__m256i p01h = _mm256_unpackhi_epi32(p0, p1);
__m256i p23l = _mm256_unpacklo_epi32(p2, p3);
__m256i p23h = _mm256_unpackhi_epi32(p2, p3);
_Tpvec cd0ab0 = v256_alignr_128(abcd0, abcd2);
_Tpvec cd1ab1 = v256_alignr_128(abcd1, abcd3);
__m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
__m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
__m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
__m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
_Tpvec ab0 = v256_combine_diagonal(abcd0, cd0ab0);
_Tpvec ab1 = v256_combine_diagonal(abcd1, cd1ab1);
_Tpvec cd0 = v256_combine_diagonal(cd0ab0, abcd2);
_Tpvec cd1 = v256_combine_diagonal(cd1ab1, abcd3);
__m256i b0 = _mm256_unpacklo_epi32(pll, plh);
__m256i g0 = _mm256_unpackhi_epi32(pll, plh);
__m256i r0 = _mm256_unpacklo_epi32(phl, phh);
__m256i a0 = _mm256_unpackhi_epi32(phl, phh);
v256_zip(ab0, ab1, a, b);
v256_zip(cd0, cd1, c, d);
b = v_uint16x16(b0);
g = v_uint16x16(g0);
r = v_uint16x16(r0);
a = v_uint16x16(a0);
}
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r, v_uint32x8& a )
{
_Tpvec ab0, ab1, cd0, cd1;
v256_zip(a, b, ab0, ab1);
v256_zip(c, d, cd0, cd1);
_Tpvec ab0cd0 = v256_alignr_128(ab0, cd0);
_Tpvec ab1cd1 = v256_alignr_128(ab1, cd1);
v_store(ptr, v256_combine_diagonal(ab0, ab0cd0));
v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(ab1, ab1cd1));
v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ab0cd0, cd0));
v_store(ptr + _Tpvec::nlanes * 3, v256_combine_diagonal(ab1cd1, cd1));
}
__m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
__m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
__m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint64x4, uint64, l4)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int64x4, int64, l4)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float64x4, double, l4)
__m256i p01l = _mm256_unpacklo_epi32(p0, p1);
__m256i p01h = _mm256_unpackhi_epi32(p0, p1);
__m256i p23l = _mm256_unpacklo_epi32(p2, p3);
__m256i p23h = _mm256_unpackhi_epi32(p2, p3);
/* **** **** */
//
inline void v256_load_deinterleave_l8(const float* ptr, v_float32x8& a, v_float32x8& b)
{
v_float32x8 ab0 = v256_load(ptr);
v_float32x8 ab1 = v256_load(ptr + 8);
__m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
__m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
__m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
__m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
v_float32x8 ab0ab2, ab1ab3;
v_recombine(ab0, ab1, ab0ab2, ab1ab3);
__m256i b0 = _mm256_unpacklo_epi32(pll, plh);
__m256i g0 = _mm256_unpackhi_epi32(pll, plh);
__m256i r0 = _mm256_unpacklo_epi32(phl, phh);
__m256i a0 = _mm256_unpackhi_epi32(phl, phh);
a.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(2, 0, 2, 0));
b.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(3, 1, 3, 1));
b = v_uint32x8(b0);
g = v_uint32x8(g0);
r = v_uint32x8(r0);
a = v_uint32x8(a0);
}
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r, v_uint64x4& a )
{
v_float32x8 fa, fb;
v256_load_deinterleave_l8((float*)ptr, fa, fb);
a.val = v_reinterpret_as_u32(fa).val;
b.val = v_reinterpret_as_u32(fb).val;
}
///
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
{
_Tpvec ab0, ab1, bc0, bc1;
v256_zip(a, b, ab0, ab1);
v256_zip(b, c, bc0, bc1);
__m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
__m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
__m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
_Tpvec cazg = v256_blend<0xaa>(c, a);
_Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val));
_Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val));
_Tpvec abc2abc0 = v256_reverse_64(v256_blend<0xcc>(ab1, bc0));
__m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
__m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
__m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
__m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
_Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
_Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
_Tpvec abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);
__m256i b0 = _mm256_unpacklo_epi64(l02, l13);
__m256i g0 = _mm256_unpackhi_epi64(l02, l13);
__m256i r0 = _mm256_unpacklo_epi64(h02, h13);
__m256i a0 = _mm256_unpackhi_epi64(h02, h13);
v_store(ptr, abc0);
v_store(ptr + _Tpvec::nlanes, abc1);
v_store(ptr + _Tpvec::nlanes * 2, abc2);
b = v_uint64x4(b0);
g = v_uint64x4(g0);
r = v_uint64x4(r0);
a = v_uint64x4(a0);
}
inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_float32x8& b, const v_float32x8& c)
{
v_float32x8 ab0, ab1, bc0, bc1;
v256_zip(a, b, ab0, ab1);
v256_zip(b, c, bc0, bc1);
///////////////////////////// store interleave /////////////////////////////////////
v_float32x8 cazg = v256_blend<0xaa>(c, a);
v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0)));
v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2)));
v_float32x8 abc0abc2(_mm256_shuffle_ps(bc0.val, ab1.val, _MM_SHUFFLE(1, 0, 3, 2)));
v_float32x8 abc2abc0 = v256_swap_halves(abc0abc2);
inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y )
{
__m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);
v_float32x8 abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
v_float32x8 abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
v_float32x8 abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
v_store(ptr, abc0);
v_store(ptr + 8, abc1);
v_store(ptr + 16, abc2);
_mm256_storeu_si256((__m256i*)ptr, xy0);
_mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
}
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y )
{
_Tpvec abc02 = v256_load(ptr);
_Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
_Tpvec abc20 = v256_load(ptr + _Tpvec::nlanes * 2);
__m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);
_Tpvec abc2 = v256_alignr_128(abc02, abc20);
_Tpvec abc0 = v256_combine_diagonal(abc02, abc20);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
a = v256_blend<0x92>(abc0, abc1);
a = v256_blend<0x44>(a, abc2);
_mm256_storeu_si256((__m256i*)ptr, xy0);
_mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
}
b = v256_blend<0x24>(abc0, abc1);
b = v256_blend<0x99>(b, abc2);
inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y )
{
__m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);
c = v256_blend<0x49>(abc0, abc1);
c = v256_blend<0x22>(c, abc2);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a);
b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b);
c = v256_shuffle<_MM_SHUFFLE(3, 0, 1, 2)>(c);
}
/////
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
_Tpvec ab0, ab1, cd0, cd1;
v256_load_deinterleave_l4(ptr, ab0, cd0, ab1, cd1);
v256_zip(ab0, ab1, a, b);
v256_zip(cd0, cd1, c, d);
_mm256_storeu_si256((__m256i*)ptr, xy0);
_mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
}
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y )
{
_Tpvec ac0, ac1, bd0, bd1;
v256_zip(a, c, ac0, ac1);
v256_zip(b, d, bd0, bd1);
_Tpvec abcd0, abcd1, abcd2, abcd3;
v256_zip(ac0, bd0, abcd0, abcd1);
v256_zip(ac1, bd1, abcd2, abcd3);
__m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);
_Tpvec abcd01, abcd23, abcd45, abcd67;
v_recombine(abcd0, abcd1, abcd01, abcd45);
v_recombine(abcd2, abcd3, abcd23, abcd67);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
v_store(ptr, abcd01);
v_store(ptr + _Tpvec::nlanes, abcd23);
v_store(ptr + _Tpvec::nlanes * 2, abcd45);
v_store(ptr + _Tpvec::nlanes * 3, abcd67);
_mm256_storeu_si256((__m256i*)ptr, xy0);
_mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
}
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint32x8, unsigned, l8)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int32x8, int, l8)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float32x8, float, l8)
/* ******** ******** */
//
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r )
{
const __m256i sep = _mm256_setr_epi8(
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
);
static const __m256i sh_b = _mm256_setr_epi8(
0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
static const __m256i sh_g = _mm256_setr_epi8(
5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
static const __m256i sh_r = _mm256_setr_epi8(
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
_Tpvec ab0, ab1;
v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);
__m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
__m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
__m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
__m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
__m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);
a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
}
///
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
{
v_uint32x8 ab0 = v_reinterpret_as_u32(v256_unpacklo(a, b));
v_uint32x8 ab1 = v_reinterpret_as_u32(v256_unpackhi(a, b));
v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c));
v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c));
static const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
static const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0xaa>(c, a));
cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg);
__m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
__m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
__m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
v_uint32x8 ac1ab1 = v256_blend<0xaa>(ab1, bc1);
ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1);
__m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
__m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
__m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
v_uint32x8 abc001 = v256_blend<0xaa>(ab0, cazg);
v_uint32x8 cabc0 = v256_blend<0xaa>(cazg, bc0);
_mm256_storeu_si256((__m256i*)ptr, bgr0);
_mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
_mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
}
v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1);
v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001);
inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r )
{
static const __m256i sh_b = _mm256_setr_epi8(
0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
static const __m256i sh_g = _mm256_setr_epi8(
10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
static const __m256i sh_r = _mm256_setr_epi8(
4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
v_uint64x4 abc01 = v256_unpacklo(v_reinterpret_as_u64(abc001), v_reinterpret_as_u64(bcab0));
v_uint64x4 abc21 = v256_unpackhi(v_reinterpret_as_u64(cabc0), v_reinterpret_as_u64(bcab0));
abc21 = v256_swap_halves(abc21);
v_uint64x4 abc12 = v_reinterpret_as_u64(v256_alignr_64(cabc1, ac1ab1));
__m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
__m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
__m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
v_uint64x4 abc0 = v256_combine_diagonal(abc01, abc21);
v_uint64x4 abc1 = v256_combine_diagonal(abc12, abc01);
v_uint64x4 abc2 = v256_combine_diagonal(abc21, abc12);
static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
-1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
v_store(ptr, _Tpvec(abc0.val));
v_store(ptr + _Tpvec::nlanes, _Tpvec(abc1.val));
v_store(ptr + _Tpvec::nlanes * 2, _Tpvec(abc2.val));
}
// todo:
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l16(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
{}
////
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
_Tpvec ab0, ab1, cd0, cd1;
v256_load_deinterleave_l8(ptr, ab0, cd0, ab1, cd1);
v256_zip(ab0, ab1, a, b);
v256_zip(cd0, cd1, c, d);
}
__m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
__m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
__m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
{ v256_store_interleave_l8(ptr, a, b, c, d); }
__m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
//__m256i bgr1 = p1;
__m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint16x16, ushort, l16)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int16x16, short, l16)
_mm256_storeu_si256((__m256i*)ptr, bgr0);
_mm256_storeu_si256((__m256i*)(ptr + 16), p1);
_mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
}
/* **************** **************** */
//
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r )
{
const __m256i sep = _mm256_setr_epi8(
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
);
__m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c);
__m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1);
__m256i r0 = _mm256_shuffle_epi32(r.val, 0xc6);
_Tpvec ab0, ab1;
v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);
__m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
__m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
__m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
__m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
__m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);
__m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
//__m256i bgr1 = p2;
__m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
_mm256_storeu_si256((__m256i*)ptr, bgr0);
_mm256_storeu_si256((__m256i*)(ptr + 8), p2);
_mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
}
/// todo
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l32(_Tp*, const _Tpvec&, const _Tpvec&, const _Tpvec&)
{}
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l32(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
{}
////
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r )
{
const __m256i sep = _mm256_setr_epi8(
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
);
_Tpvec abcd0, abcd1, abcd2, abcd3;
v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes * 2), abcd0, abcd1);
v_recombine(v256_load(ptr + _Tpvec::nlanes), v256_load(ptr + _Tpvec::nlanes * 3), abcd2, abcd3);
__m256i s01 = _mm256_unpacklo_epi64(b.val, g.val);
__m256i s12 = _mm256_unpackhi_epi64(g.val, r.val);
__m256i s20 = _mm256_blend_epi32(r.val, b.val, 0xcc);
__m256i ab0cd0 = _mm256_shuffle_epi8(abcd0.val, sep);
__m256i ab1cd1 = _mm256_shuffle_epi8(abcd1.val, sep);
__m256i ab2cd2 = _mm256_shuffle_epi8(abcd2.val, sep);
__m256i ab3cd3 = _mm256_shuffle_epi8(abcd3.val, sep);
__m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
__m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
__m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
__m256i ab0 = _mm256_unpacklo_epi32(ab0cd0, ab1cd1);
__m256i ab1 = _mm256_unpacklo_epi32(ab2cd2, ab3cd3);
__m256i cd0 = _mm256_unpackhi_epi32(ab0cd0, ab1cd1);
__m256i cd1 = _mm256_unpackhi_epi32(ab2cd2, ab3cd3);
a.val = _mm256_unpacklo_epi64(ab0, ab1);
b.val = _mm256_unpackhi_epi64(ab0, ab1);
c.val = _mm256_unpacklo_epi64(cd0, cd1);
d.val = _mm256_unpackhi_epi64(cd0, cd1);
}
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l32(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
{ v256_store_interleave_l8(ptr, a, b, c, d); }
_mm256_storeu_si256((__m256i*)ptr, bgr0);
_mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
_mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
}
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint8x32, uchar, l32)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int8x32, schar, l32)
inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, const v_uint8x32& a )
{
__m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val);
__m256i ra0 = _mm256_unpacklo_epi8(r.val, a.val);
__m256i ra1 = _mm256_unpackhi_epi8(r.val, a.val);
__m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
__m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
__m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
__m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
__m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
__m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
__m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
__m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
_mm256_storeu_si256((__m256i*)ptr, bgra0);
_mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
_mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
_mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
}
inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
const v_uint16x16& r, const v_uint16x16& a )
{
__m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val);
__m256i ra0 = _mm256_unpacklo_epi16(r.val, a.val);
__m256i ra1 = _mm256_unpackhi_epi16(r.val, a.val);
__m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
__m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
__m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
__m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
__m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
__m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
__m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
__m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
_mm256_storeu_si256((__m256i*)ptr, bgra0);
_mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
_mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
_mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
const v_uint32x8& r, const v_uint32x8& a )
{
__m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val);
__m256i ra0 = _mm256_unpacklo_epi32(r.val, a.val);
__m256i ra1 = _mm256_unpackhi_epi32(r.val, a.val);
__m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
__m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
__m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
__m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
__m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
__m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
__m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
__m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
_mm256_storeu_si256((__m256i*)ptr, bgra0);
_mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
_mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
_mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
}
inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g,
const v_uint64x4& r, const v_uint64x4& a )
{
__m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val);
__m256i ra0 = _mm256_unpacklo_epi64(r.val, a.val);
__m256i ra1 = _mm256_unpackhi_epi64(r.val, a.val);
__m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
__m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);
__m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
__m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);
_mm256_storeu_si256((__m256i*)ptr, bgra0);
_mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
_mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
_mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
}
#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
_Tpvec1 a1, b1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
_Tpvec1 a1, b1, c1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
_Tpvec1 a1, b1, c1, d1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
c0 = v_reinterpret_as_##suffix0(c1); \
d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
v_store_interleave((_Tp1*)ptr, a1, b1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
const _Tpvec0& c0, const _Tpvec0& d0 ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
_Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
}
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
inline void v256_cleanup() { _mm256_zeroupper(); }
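
A quick usage sketch for the new 2-channel 256-bit overloads defined above: v_load_deinterleave splits an "x0 y0 x1 y1 ..." stream into two planar registers and v_store_interleave is its inverse, so the round trip below reproduces the input. This snippet is illustrative and not part of the patch; the CV_AVX2 guard and the demo function name are assumptions.

// Illustrative round-trip for the v_uint8x32 2-channel overloads (not part of the patch).
#if CV_AVX2
inline void v256_deinterleave_roundtrip_demo(const uchar interleaved[64], uchar out[64])
{
    v_uint8x32 x, y;
    v_load_deinterleave(interleaved, x, y); // x = x0..x31, y = y0..y31
    v_store_interleave(out, x, y);          // writes the original interleaved order back
    v256_cleanup();                         // _mm256_zeroupper() to avoid AVX/SSE transition penalties
}
#endif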

@@ -1318,6 +1318,80 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
vst4q_##suffix(ptr, v); \
}
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
{ \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t a1 = vld1_##suffix(ptr + 2); \
tp##x1_t b1 = vld1_##suffix(ptr + 3); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
} \
\
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
v_##tp##x2& b, v_##tp##x2& c ) \
{ \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t c0 = vld1_##suffix(ptr + 2); \
tp##x1_t a1 = vld1_##suffix(ptr + 3); \
tp##x1_t b1 = vld1_##suffix(ptr + 4); \
tp##x1_t c1 = vld1_##suffix(ptr + 5); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
} \
\
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
v_##tp##x2& c, v_##tp##x2& d ) \
{ \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t c0 = vld1_##suffix(ptr + 2); \
tp##x1_t d0 = vld1_##suffix(ptr + 3); \
tp##x1_t a1 = vld1_##suffix(ptr + 4); \
tp##x1_t b1 = vld1_##suffix(ptr + 5); \
tp##x1_t c1 = vld1_##suffix(ptr + 6); \
tp##x1_t d1 = vld1_##suffix(ptr + 7); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b ) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
const v_##tp##x2& b, const v_##tp##x2& c ) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
const v_##tp##x2& c, const v_##tp##x2& d ) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
}
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
@@ -1329,6 +1403,9 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
#endif
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
return v_float32x4(vcvtq_f32_s32(a.val));

@@ -58,17 +58,6 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x16;
struct v_int8x16;
struct v_uint16x8;
struct v_int16x8;
struct v_uint32x4;
struct v_int32x4;
struct v_float32x4;
struct v_uint64x2;
struct v_int64x2;
struct v_float64x2;
struct v_uint8x16
{
typedef uchar lane_type;
@@ -1660,7 +1649,7 @@ OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_N
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
// adopted from sse_utils.hpp
// load deinterleave
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
@@ -1681,7 +1670,25 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
#if CV_SSSE3
#if CV_SSE4_1
static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
__m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
__m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
__m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
__m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
__m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
static const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
static const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
static const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
a0 = _mm_shuffle_epi8(a0, sh_b);
b0 = _mm_shuffle_epi8(b0, sh_g);
c0 = _mm_shuffle_epi8(c0, sh_r);
a.val = a0;
b.val = b0;
c.val = c0;
#elif CV_SSSE3
static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
@@ -1753,8 +1760,41 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
d.val = _mm_unpackhi_epi8(v2, v3);
}
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
__m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
__m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
__m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
__m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
}
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
#if CV_SSE4_1
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
__m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
__m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
__m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
__m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
static const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
a0 = _mm_shuffle_epi8(a0, sh_a);
b0 = _mm_shuffle_epi8(b0, sh_b);
c0 = _mm_shuffle_epi8(c0, sh_c);
a.val = a0;
b.val = b0;
c.val = c0;
#else
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
__m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
@@ -1770,6 +1810,7 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b,
a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
#endif
}
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
@@ -1795,6 +1836,18 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b,
d.val = _mm_unpackhi_epi16(u2, u3);
}
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3
__m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
__m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3
a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
}
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
@@ -1812,12 +1865,23 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
}
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
}
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
@@ -1853,77 +1917,43 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b
d.val = _mm_unpackhi_ps(t02hi, t13hi);
}
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
{
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
__m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));
a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
}
inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
{
v_uint64x2 t0, t1, t2;
v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
a = v_reinterpret_as_s64(t0);
b = v_reinterpret_as_s64(t1);
c = v_reinterpret_as_s64(t2);
}
inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
{
v_uint64x2 t0, t1, t2;
v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
a = v_reinterpret_as_f64(t0);
b = v_reinterpret_as_f64(t1);
c = v_reinterpret_as_f64(t2);
a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
}
// 2-channel
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
__m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1
__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0
a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
}
inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b)
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
__m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
__m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
__m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
__m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
__m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
__m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1
a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
}
inline void v_load_deinterleave(const ushort*ptr, v_uint16x8& a, v_uint16x8& b)
{
v_int16x8 sa, sb;
v_load_deinterleave((const short*)ptr, sa, sb);
a = v_reinterpret_as_u16(sa);
b = v_reinterpret_as_u16(sb);
}
inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
{
__m128i t0, t1;
t0 = _mm_unpacklo_epi16(a.val, b.val);
t1 = _mm_unpackhi_epi16(a.val, b.val);
_mm_storeu_si128((__m128i*)(ptr), t0);
_mm_storeu_si128((__m128i*)(ptr + 8), t1);
}
// store interleave
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
{
@@ -1937,7 +1967,24 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
const v_uint8x16& c )
{
#if CV_SSSE3
#if CV_SSE4_1
static const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
static const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
static const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
__m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
__m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
__m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
__m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
__m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
__m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
_mm_storeu_si128((__m128i*)(ptr), v0);
_mm_storeu_si128((__m128i*)(ptr + 16), v1);
_mm_storeu_si128((__m128i*)(ptr + 32), v2);
#elif CV_SSSE3
static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
@@ -2025,10 +2072,35 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1
_mm_storeu_si128((__m128i*)(ptr + 48), v3);
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b )
{
__m128i t0, t1;
t0 = _mm_unpacklo_epi16(a.val, b.val);
t1 = _mm_unpackhi_epi16(a.val, b.val);
_mm_storeu_si128((__m128i*)(ptr), t0);
_mm_storeu_si128((__m128i*)(ptr + 8), t1);
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
const v_uint16x8& b,
const v_uint16x8& c )
{
#if CV_SSE4_1
static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
static const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
__m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
__m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
__m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
__m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
__m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
__m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
_mm_storeu_si128((__m128i*)ptr, v0);
_mm_storeu_si128((__m128i*)(ptr + 8), v1);
_mm_storeu_si128((__m128i*)(ptr + 16), v2);
#else
__m128i z = _mm_setzero_si128();
__m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
__m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
@@ -2060,6 +2132,7 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
_mm_storeu_si128((__m128i*)(ptr), v0);
_mm_storeu_si128((__m128i*)(ptr + 8), v1);
_mm_storeu_si128((__m128i*)(ptr + 16), v2);
#endif
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
@@ -2085,6 +2158,15 @@ inline void v_store_interleave( ushort* ptr, const v_uint16
_mm_storeu_si128((__m128i*)(ptr + 24), v3);
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b )
{
__m128i t0 = _mm_unpacklo_epi32(a.val, b.val);
__m128i t1 = _mm_unpackhi_epi32(a.val, b.val);
_mm_storeu_si128((__m128i*)ptr, t0);
_mm_storeu_si128((__m128i*)(ptr + 4), t1);
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c )
{
@@ -2158,6 +2240,15 @@ inline void v_store_interleave(float* ptr, const v_float32
_mm_storeu_ps(ptr + 12, v3);
}
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b)
{
__m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
__m128i t1 = _mm_unpackhi_epi64(a.val, b.val);
_mm_storeu_si128((__m128i*)ptr, t0);
_mm_storeu_si128((__m128i*)(ptr + 2), t1);
}
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
{
__m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
@@ -2169,58 +2260,72 @@ inline void v_store_interleave(uint64 *ptr, const v_uint64x
_mm_storeu_si128((__m128i*)(ptr + 4), t2);
}
inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, const v_uint64x2& d)
{
v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
}
__m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
__m128i t1 = _mm_unpacklo_epi64(c.val, d.val);
__m128i t2 = _mm_unpackhi_epi64(a.val, b.val);
__m128i t3 = _mm_unpackhi_epi64(c.val, d.val);
inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
_mm_storeu_si128((__m128i*)ptr, t0);
_mm_storeu_si128((__m128i*)(ptr + 2), t1);
_mm_storeu_si128((__m128i*)(ptr + 4), t2);
_mm_storeu_si128((__m128i*)(ptr + 6), t3);
}
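All of the v_store_interleave / v_load_deinterleave overloads above implement the same contract; a scalar reference sketch (illustrative only, element type chosen arbitrarily):
static void store_interleave_ref(unsigned* dst, const unsigned* const* planes, int nlanes, int cn)
{
    // Channel k of pixel i lands at dst[i*cn + k]; v_load_deinterleave performs the inverse mapping.
    for( int i = 0; i < nlanes; i++ )
        for( int k = 0; k < cn; k++ )
            dst[i*cn + k] = planes[k][i];
}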
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
_Tpvec& b0, _Tpvec& c0 ) \
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
_Tpvec1 a1, b1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
_Tpvec1 a1, b1, c1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
_Tpuvec a1, b1, c1; \
v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
a0 = v_reinterpret_as_##suffix(a1); \
b0 = v_reinterpret_as_##suffix(b1); \
c0 = v_reinterpret_as_##suffix(c1); \
_Tpvec1 a1, b1, c1, d1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
c0 = v_reinterpret_as_##suffix0(c1); \
d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
_Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \
{ \
_Tpuvec a1, b1, c1, d1; \
v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
a0 = v_reinterpret_as_##suffix(a1); \
b0 = v_reinterpret_as_##suffix(b1); \
c0 = v_reinterpret_as_##suffix(c1); \
d0 = v_reinterpret_as_##suffix(d1); \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
v_store_interleave((_Tp1*)ptr, a1, b1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
const _Tpvec& b0, const _Tpvec& c0 ) \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \
{ \
_Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
_Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
_Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
const _Tpvec& c0, const _Tpvec& d0 ) \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
const _Tpvec0& c0, const _Tpvec0& d0 ) \
{ \
_Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
_Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
_Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
_Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
_Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
}
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
//OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
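To make the reinterpret-based macro concrete, its two-channel store for the signed 8-bit instantiation roughly expands to the following (sketch of the expansion; it only forwards to the unsigned overloads defined earlier in this header):
inline void v_store_interleave( schar* ptr, const v_int8x16& a0, const v_int8x16& b0 )
{
    // Reinterpret the signed vectors as unsigned and reuse the uchar implementation above.
    v_uint8x16 a1 = v_reinterpret_as_u8(a0);
    v_uint8x16 b1 = v_reinterpret_as_u8(b0);
    v_store_interleave((uchar*)ptr, a1, b1);
}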
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{

@@ -298,6 +298,8 @@ OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2)
/* Expand */
#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \

@@ -8,223 +8,49 @@
namespace cv { namespace hal {
#if CV_NEON
template<typename T> struct VMerge2;
template<typename T> struct VMerge3;
template<typename T> struct VMerge4;
#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
store_func(dst, r); \
} \
}
#if CV_SIMD
template<typename T, typename VecT> static void
vecmerge_( const T** src, T* dst, int len, int cn )
{
int i;
const T* src0 = src[0];
const T* src1 = src[1];
#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
store_func(dst, r); \
} \
const int VECSZ = VecT::nlanes;
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
v_store_interleave(dst + i*cn, a, b);
}
}
#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, const data_type* src3, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
r.val[3] = load_func(src3); \
store_func(dst, r); \
} \
else if( cn == 3 )
{
const T* src2 = src[2];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
v_store_interleave(dst + i*cn, a, b, c);
}
}
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
#elif CV_SSE2
template <typename T>
struct VMerge2
{
VMerge2() : support(false) { }
void operator()(const T *, const T *, T *) const { }
bool support;
};
template <typename T>
struct VMerge3
{
VMerge3() : support(false) { }
void operator()(const T *, const T *, const T *, T *) const { }
bool support;
};
template <typename T>
struct VMerge4
{
VMerge4() : support(false) { }
void operator()(const T *, const T *, const T *, const T *, T *) const { }
bool support;
};
#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge2() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
} \
\
bool support; \
}
#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge3() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
} \
\
bool support; \
}
#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge4() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
const data_type * src2, const data_type * src3, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \
reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \
} \
\
bool support; \
else
{
CV_Assert( cn == 4 );
const T* src2 = src[2];
const T* src3 = src[3];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
v_store_interleave(dst + i*cn, a, b, c, d);
}
}
vx_cleanup();
}
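The i = std::min( len - VECSZ, i ) clamp is what lets the vector loop handle a length that is not a multiple of the vector width: the final block is shifted back so it overlaps the previous one instead of running past the end of the row, which is also why the dispatchers below only take this path when len >= nlanes. A tiny standalone illustration with hypothetical numbers:
#include <algorithm>
#include <cstdio>
int main()
{
    const int len = 20, VECSZ = 16;        // e.g. a 20-element row processed with 16-lane vectors
    for( int i = 0; i < len; i += VECSZ )
    {
        i = std::min( len - VECSZ, i );    // clamp the last block instead of overrunning
        std::printf("block covers [%d, %d)\n", i, i + VECSZ);   // prints [0, 16) then [4, 20)
    }
    return 0;
}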
MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
#if CV_SSE4_1
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
#endif
MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
#endif
template<typename T> static void
@@ -242,28 +68,6 @@ merge_( const T** src, T* dst, int len, int cn )
{
const T *src0 = src[0], *src1 = src[1];
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VMerge2<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#elif CV_SSE2
if(cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;
VMerge2<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
@@ -274,28 +78,6 @@ merge_( const T** src, T* dst, int len, int cn )
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VMerge3<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#elif CV_SSE2
if(cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;
VMerge3<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
@@ -307,28 +89,6 @@ merge_( const T** src, T* dst, int len, int cn )
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VMerge4<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#elif CV_SSE2
if(cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;
VMerge4<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
@@ -347,29 +107,48 @@ merge_( const T** src, T* dst, int len, int cn )
}
}
void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<uchar, v_uint8>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}
void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<ushort, v_uint16>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}
void merge32s(const int** src, int* dst, int len, int cn )
{
CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<int, v_int32>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}
void merge64s(const int64** src, int64* dst, int len, int cn )
{
CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<int64, v_int64>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}
}} // cv::hal::
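A hypothetical caller of the 8-bit entry point above, interleaving three planes into one packed row (sketch; assumes the declarations from opencv2/core/hal/hal.hpp):
#include "opencv2/core/hal/hal.hpp"
#include <vector>
int main()
{
    const int len = 640;
    std::vector<unsigned char> b(len, 255), g(len, 128), r(len, 0), bgr(len*3);
    const unsigned char* planes[] = { b.data(), g.data(), r.data() };
    cv::hal::merge8u(planes, bgr.data(), len, 3);   // bgr becomes 255,128,0, 255,128,0, ...
    return 0;
}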

@@ -8,222 +8,57 @@
namespace cv { namespace hal {
#if CV_NEON
template<typename T> struct VSplit2;
template<typename T> struct VSplit3;
template<typename T> struct VSplit4;
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, \
data_type* dst1) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
} \
}
#if CV_SIMD
template<typename T, typename VecT> static void
vecsplit_( const T* src, T** dst, int len, int cn )
{
int i;
T* dst0 = dst[0];
T* dst1 = dst[1];
#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
} \
const int VECSZ = VecT::nlanes;
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a, b;
v_load_deinterleave(src + i*cn, a, b);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
}
}
#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2, data_type* dst3) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
store_func(dst3, r.val[3]); \
} \
else if( cn == 3 )
{
T* dst2 = dst[2];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a, b, c;
v_load_deinterleave(src + i*cn, a, b, c);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst2 + i, c);
}
}
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );
#elif CV_SSE2
template <typename T>
struct VSplit2
{
VSplit2() : support(false) { }
void operator()(const T *, T *, T *) const { }
bool support;
};
template <typename T>
struct VSplit3
{
VSplit3() : support(false) { }
void operator()(const T *, T *, T *, T *) const { }
bool support;
};
template <typename T>
struct VSplit4
{
VSplit4() : support(false) { }
void operator()(const T *, T *, T *, T *, T *) const { }
bool support;
};
#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit2() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
} \
\
bool support; \
}
#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit3() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1, data_type * dst2) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
} \
\
bool support; \
}
#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit4() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
data_type * dst2, data_type * dst3) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
} \
\
bool support; \
else
{
CV_Assert( cn == 4 );
T* dst2 = dst[2];
T* dst3 = dst[3];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a, b, c, d;
v_load_deinterleave(src + i*cn, a, b, c, d);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst2 + i, c);
v_store(dst3 + i, d);
}
}
vx_cleanup();
}
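The same idea outside the dispatcher, as a minimal sketch: deinterleave one packed BGR row into planes with the wide universal intrinsics (assumes opencv2/core/hal/intrin.hpp; the tail is handled with a plain scalar loop here rather than the overlapping trick used above):
#include "opencv2/core/hal/intrin.hpp"
static void deinterleave_bgr_row(const unsigned char* bgr, unsigned char* b,
                                 unsigned char* g, unsigned char* r, int len)
{
    int i = 0;
#if CV_SIMD
    const int VECSZ = cv::v_uint8::nlanes;
    for( ; i + VECSZ <= len; i += VECSZ )
    {
        cv::v_uint8 vb, vg, vr;
        cv::v_load_deinterleave(bgr + i*3, vb, vg, vr);   // gather the B, G and R lanes
        cv::v_store(b + i, vb);
        cv::v_store(g + i, vg);
        cv::v_store(r + i, vr);
    }
    cv::vx_cleanup();
#endif
    for( ; i < len; i++ )                                 // scalar tail
    {
        b[i] = bgr[i*3]; g[i] = bgr[i*3 + 1]; r[i] = bgr[i*3 + 2];
    }
}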
SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
#endif
template<typename T> static void
@@ -250,30 +85,6 @@ split_( const T* src, T** dst, int len, int cn )
T *dst0 = dst[0], *dst1 = dst[1];
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
#elif CV_SSE2
if (cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
@@ -285,31 +96,6 @@ split_( const T* src, T** dst, int len, int cn )
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
#elif CV_SSE2
if (cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
@@ -322,30 +108,6 @@ split_( const T* src, T** dst, int len, int cn )
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
#elif CV_SSE2
if (cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j]; dst1[i] = src[j+1];
@@ -367,25 +129,46 @@ split_( const T* src, T** dst, int len, int cn )
void split8u(const uchar* src, uchar** dst, int len, int cn )
{
CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn)
split_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<uchar, v_uint8>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}
void split16u(const ushort* src, ushort** dst, int len, int cn )
{
CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn)
split_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<ushort, v_uint16>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}
void split32s(const int* src, int** dst, int len, int cn )
{
CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn)
split_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<int, v_int32>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}
void split64s(const int64* src, int64** dst, int len, int cn )
{
CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn)
split_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<int64, v_int64>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}
}} // cv::hal::
