converted split() & merge() to wide univ intrinsics (#12044)

* fixed/updated v_load_deinterleave and v_store_interleave intrinsics; modified split() and merge() functions to use those intrinsics

* fixed a few compile errors and a bug in v_load_deinterleave(ptr, v_uint32x4& a, v_uint32x4& b)

* fixed a few more compile errors
Vadim Pisarevsky, committed via GitHub
parent 8de08e0463
commit 9c7040802c
Changed files:
  1. modules/core/include/opencv2/core/hal/intrin_avx.hpp (822 lines changed)
  2. modules/core/include/opencv2/core/hal/intrin_neon.hpp (77 lines changed)
  3. modules/core/include/opencv2/core/hal/intrin_sse.hpp (327 lines changed)
  4. modules/core/include/opencv2/core/hal/intrin_vsx.hpp (2 lines changed)
  5. modules/core/src/merge.cpp (347 lines changed)
  6. modules/core/src/split.cpp (361 lines changed)
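
For context: the change lets split() and merge() process whole vectors of pixels with a single deinterleave/interleave intrinsic instead of per-element copies. Below is a minimal sketch of a 3-channel split kernel written in that style; it uses the 128-bit universal-intrinsic types for brevity (the commit adds the matching 256-bit overloads such as v_uint8x32), and split3_u8_sketch is an illustrative name, not the actual code in modules/core/src/split.cpp.

// Illustrative sketch only -- not the code from this commit.
#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

static void split3_u8_sketch(const uchar* src, uchar* b, uchar* g, uchar* r, int len)
{
    int i = 0;
#if CV_SIMD128
    const int nlanes = v_uint8x16::nlanes;           // 16 bytes per register
    for (; i <= len - nlanes; i += nlanes)
    {
        v_uint8x16 vb, vg, vr;
        v_load_deinterleave(src + i*3, vb, vg, vr);  // b0 g0 r0 b1 g1 r1 ... -> three planes
        v_store(b + i, vb);
        v_store(g + i, vg);
        v_store(r + i, vr);
    }
#endif
    for (; i < len; i++)                             // scalar tail
    {
        b[i] = src[i*3];
        g[i] = src[i*3 + 1];
        r[i] = src[i*3 + 2];
    }
}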

@@ -1609,392 +1609,592 @@ OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
/** Reinterpret **/
// it's up there with the load and store operations
/* de&interleave */
#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \
{ return v256_load_deinterleave_##suffix(ptr, a, b); } \
inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \
{ return v256_store_interleave_2ch(ptr, a, b); }
///////////////////// load deinterleave /////////////////////////////
#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave \
(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) \
{ return v256_load_deinterleave_##suffix(ptr, a, b, c); } \
inline void v_store_interleave \
(_Tp* ptr, const _Tpvec& a,const _Tpvec& b, const _Tpvec& c) \
{ return v256_store_interleave_##suffix(ptr, a, b, c); }
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
{
__m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave \
(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ return v256_load_deinterleave_##suffix(ptr, a, b, c, d); } \
inline void v_store_interleave \
(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) \
{ return v256_store_interleave_##suffix(ptr, a, b, c, d); }
static const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
__m256i p0 = _mm256_shuffle_epi8(ab0, sh);
__m256i p1 = _mm256_shuffle_epi8(ab1, sh);
__m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
__m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
__m256i a0 = _mm256_unpacklo_epi64(pl, ph);
__m256i b0 = _mm256_unpackhi_epi64(pl, ph);
a = v_uint8x32(a0);
b = v_uint8x32(b0);
}
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
{
__m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
static const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
__m256i p0 = _mm256_shuffle_epi8(ab0, sh);
__m256i p1 = _mm256_shuffle_epi8(ab1, sh);
__m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
__m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
__m256i a0 = _mm256_unpacklo_epi64(pl, ph);
__m256i b0 = _mm256_unpackhi_epi64(pl, ph);
a = v_uint16x16(a0);
b = v_uint16x16(b0);
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
{
__m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
const int sh = 0+2*4+1*16+3*64;
__m256i p0 = _mm256_shuffle_epi32(ab0, sh);
__m256i p1 = _mm256_shuffle_epi32(ab1, sh);
__m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
__m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
__m256i a0 = _mm256_unpacklo_epi64(pl, ph);
__m256i b0 = _mm256_unpackhi_epi64(pl, ph);
a = v_uint32x8(a0);
b = v_uint32x8(b0);
}
#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) \
OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \
OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix)
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
{
__m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(_Tpvec, _Tp, suffix) \
OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \
OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix)
__m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
__m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
__m256i a0 = _mm256_unpacklo_epi64(pl, ph);
__m256i b0 = _mm256_unpackhi_epi64(pl, ph);
a = v_uint64x4(a0);
b = v_uint64x4(b0);
}
/* **** */
//
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_2ch(_Tp* ptr, const _Tpvec& a, const _Tpvec& b)
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r )
{
_Tpvec ab0, ab1;
v_zip(a, b, ab0, ab1);
v_store(ptr, ab0);
v_store(ptr + _Tpvec::nlanes, ab1);
}
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
{
_Tpvec ab0 = v256_load(ptr);
_Tpvec ab1 = v256_load(ptr + _Tpvec::nlanes);
_Tpvec ab00, ab11;
v_recombine(ab0, ab1, ab00, ab11);
v256_zip(ab00, ab11, a, b);
}
__m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
__m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
///
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
{
_Tpvec abc0 = v256_load(ptr);
_Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
_Tpvec abc2 = v256_load(ptr + _Tpvec::nlanes * 2);
static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
static const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
-1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
_Tpvec ab0 = v256_combine_diagonal(abc0, abc1);
_Tpvec bc1 = v256_combine_diagonal(abc1, abc2);
_Tpvec ac1 = v256_reverse_64(v256_combine_diagonal(abc2, abc0));
__m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
__m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
__m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
a = v256_unpacklo(ab0, ac1);
c = v256_unpackhi(ac1, bc1);
b = v256_alignr_64(bc1, ab0);
}
static const __m256i
sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
b0 = _mm256_shuffle_epi8(b0, sh_b);
g0 = _mm256_shuffle_epi8(g0, sh_g);
r0 = _mm256_shuffle_epi8(r0, sh_r);
b = v_uint8x32(b0);
g = v_uint8x32(g0);
r = v_uint8x32(r0);
}
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
{
_Tpvec ab0 = v256_unpacklo(a, b);
_Tpvec bc1 = v256_unpackhi(b, c);
_Tpvec ca10 = v256_swap_halves(v256_blend<0xa>(c, a));
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
__m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
__m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
-1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
__m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
__m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
__m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
static const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
b0 = _mm256_shuffle_epi8(b0, sh_b);
g0 = _mm256_shuffle_epi8(g0, sh_g);
r0 = _mm256_shuffle_epi8(r0, sh_r);
b = v_uint16x16(b0);
g = v_uint16x16(g0);
r = v_uint16x16(r0);
}
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
__m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
__m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
__m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
__m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
__m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
b0 = _mm256_shuffle_epi32(b0, 0x6c);
g0 = _mm256_shuffle_epi32(g0, 0xb1);
r0 = _mm256_shuffle_epi32(r0, 0xc6);
b = v_uint32x8(b0);
g = v_uint32x8(g0);
r = v_uint32x8(r0);
}
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
__m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
__m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
__m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
__m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
__m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
__m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
b = v_uint64x4(b0);
g = v_uint64x4(g0);
r = v_uint64x4(r0);
}
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r, v_uint8x32& a )
{
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
__m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
__m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
__m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
__m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
__m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
__m256i p01l = _mm256_unpacklo_epi32(p0, p1);
__m256i p01h = _mm256_unpackhi_epi32(p0, p1);
__m256i p23l = _mm256_unpacklo_epi32(p2, p3);
__m256i p23h = _mm256_unpackhi_epi32(p2, p3);
__m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
__m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
__m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
__m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
__m256i b0 = _mm256_unpacklo_epi32(pll, plh);
__m256i g0 = _mm256_unpackhi_epi32(pll, plh);
__m256i r0 = _mm256_unpacklo_epi32(phl, phh);
__m256i a0 = _mm256_unpackhi_epi32(phl, phh);
v_store(ptr, v256_combine_diagonal(ab0, ca10));
v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0));
v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ca10, bc1));
b = v_uint8x32(b0);
g = v_uint8x32(g0);
r = v_uint8x32(r0);
a = v_uint8x32(a0);
}
////
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r, v_uint16x16& a )
{
_Tpvec abcd0 = v256_load(ptr);
_Tpvec abcd1 = v256_load(ptr + _Tpvec::nlanes);
_Tpvec abcd2 = v256_load(ptr + _Tpvec::nlanes * 2);
_Tpvec abcd3 = v256_load(ptr + _Tpvec::nlanes * 3);
__m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
__m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
__m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
static const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
__m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
__m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
__m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
__m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
__m256i p01l = _mm256_unpacklo_epi32(p0, p1);
__m256i p01h = _mm256_unpackhi_epi32(p0, p1);
__m256i p23l = _mm256_unpacklo_epi32(p2, p3);
__m256i p23h = _mm256_unpackhi_epi32(p2, p3);
_Tpvec cd0ab0 = v256_alignr_128(abcd0, abcd2);
_Tpvec cd1ab1 = v256_alignr_128(abcd1, abcd3);
__m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
__m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
__m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
__m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
_Tpvec ab0 = v256_combine_diagonal(abcd0, cd0ab0);
_Tpvec ab1 = v256_combine_diagonal(abcd1, cd1ab1);
_Tpvec cd0 = v256_combine_diagonal(cd0ab0, abcd2);
_Tpvec cd1 = v256_combine_diagonal(cd1ab1, abcd3);
__m256i b0 = _mm256_unpacklo_epi32(pll, plh);
__m256i g0 = _mm256_unpackhi_epi32(pll, plh);
__m256i r0 = _mm256_unpacklo_epi32(phl, phh);
__m256i a0 = _mm256_unpackhi_epi32(phl, phh);
v256_zip(ab0, ab1, a, b);
v256_zip(cd0, cd1, c, d);
b = v_uint16x16(b0);
g = v_uint16x16(g0);
r = v_uint16x16(r0);
a = v_uint16x16(a0);
}
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r, v_uint32x8& a )
{
_Tpvec ab0, ab1, cd0, cd1;
v256_zip(a, b, ab0, ab1);
v256_zip(c, d, cd0, cd1);
_Tpvec ab0cd0 = v256_alignr_128(ab0, cd0);
_Tpvec ab1cd1 = v256_alignr_128(ab1, cd1);
v_store(ptr, v256_combine_diagonal(ab0, ab0cd0));
v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(ab1, ab1cd1));
v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ab0cd0, cd0));
v_store(ptr + _Tpvec::nlanes * 3, v256_combine_diagonal(ab1cd1, cd1));
}
__m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
__m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
__m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint64x4, uint64, l4)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int64x4, int64, l4)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float64x4, double, l4)
__m256i p01l = _mm256_unpacklo_epi32(p0, p1);
__m256i p01h = _mm256_unpackhi_epi32(p0, p1);
__m256i p23l = _mm256_unpacklo_epi32(p2, p3);
__m256i p23h = _mm256_unpackhi_epi32(p2, p3);
/* **** **** */
//
inline void v256_load_deinterleave_l8(const float* ptr, v_float32x8& a, v_float32x8& b)
{
v_float32x8 ab0 = v256_load(ptr);
v_float32x8 ab1 = v256_load(ptr + 8);
__m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
__m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
__m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
__m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
v_float32x8 ab0ab2, ab1ab3;
v_recombine(ab0, ab1, ab0ab2, ab1ab3);
__m256i b0 = _mm256_unpacklo_epi32(pll, plh);
__m256i g0 = _mm256_unpackhi_epi32(pll, plh);
__m256i r0 = _mm256_unpacklo_epi32(phl, phh);
__m256i a0 = _mm256_unpackhi_epi32(phl, phh);
a.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(2, 0, 2, 0));
b.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(3, 1, 3, 1));
b = v_uint32x8(b0);
g = v_uint32x8(g0);
r = v_uint32x8(r0);
a = v_uint32x8(a0);
}
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r, v_uint64x4& a )
{
v_float32x8 fa, fb;
v256_load_deinterleave_l8((float*)ptr, fa, fb);
a.val = v_reinterpret_as_u32(fa).val;
b.val = v_reinterpret_as_u32(fb).val;
}
///
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
{
_Tpvec ab0, ab1, bc0, bc1;
v256_zip(a, b, ab0, ab1);
v256_zip(b, c, bc0, bc1);
__m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
__m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
__m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
_Tpvec cazg = v256_blend<0xaa>(c, a);
_Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val));
_Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val));
_Tpvec abc2abc0 = v256_reverse_64(v256_blend<0xcc>(ab1, bc0));
__m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
__m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
__m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
__m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
_Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
_Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
_Tpvec abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);
__m256i b0 = _mm256_unpacklo_epi64(l02, l13);
__m256i g0 = _mm256_unpackhi_epi64(l02, l13);
__m256i r0 = _mm256_unpacklo_epi64(h02, h13);
__m256i a0 = _mm256_unpackhi_epi64(h02, h13);
v_store(ptr, abc0);
v_store(ptr + _Tpvec::nlanes, abc1);
v_store(ptr + _Tpvec::nlanes * 2, abc2);
b = v_uint64x4(b0);
g = v_uint64x4(g0);
r = v_uint64x4(r0);
a = v_uint64x4(a0);
}
inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_float32x8& b, const v_float32x8& c)
{
v_float32x8 ab0, ab1, bc0, bc1;
v256_zip(a, b, ab0, ab1);
v256_zip(b, c, bc0, bc1);
///////////////////////////// store interleave /////////////////////////////////////
v_float32x8 cazg = v256_blend<0xaa>(c, a);
v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0)));
v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2)));
v_float32x8 abc0abc2(_mm256_shuffle_ps(bc0.val, ab1.val, _MM_SHUFFLE(1, 0, 3, 2)));
v_float32x8 abc2abc0 = v256_swap_halves(abc0abc2);
inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y )
{
__m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);
v_float32x8 abc0 = v256_combine_diagonal(abc0abc1, abc2abc0);
v_float32x8 abc1 = v256_combine_diagonal(abc1abc2, abc0abc1);
v_float32x8 abc2 = v256_combine_diagonal(abc2abc0, abc1abc2);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
v_store(ptr, abc0);
v_store(ptr + 8, abc1);
v_store(ptr + 16, abc2);
_mm256_storeu_si256((__m256i*)ptr, xy0);
_mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
}
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c)
inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y )
{
_Tpvec abc02 = v256_load(ptr);
_Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes);
_Tpvec abc20 = v256_load(ptr + _Tpvec::nlanes * 2);
__m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);
_Tpvec abc2 = v256_alignr_128(abc02, abc20);
_Tpvec abc0 = v256_combine_diagonal(abc02, abc20);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
a = v256_blend<0x92>(abc0, abc1);
a = v256_blend<0x44>(a, abc2);
_mm256_storeu_si256((__m256i*)ptr, xy0);
_mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
}
b = v256_blend<0x24>(abc0, abc1);
b = v256_blend<0x99>(b, abc2);
inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y )
{
__m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);
c = v256_blend<0x49>(abc0, abc1);
c = v256_blend<0x22>(c, abc2);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a);
b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b);
c = v256_shuffle<_MM_SHUFFLE(3, 0, 1, 2)>(c);
}
/////
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
_Tpvec ab0, ab1, cd0, cd1;
v256_load_deinterleave_l4(ptr, ab0, cd0, ab1, cd1);
v256_zip(ab0, ab1, a, b);
v256_zip(cd0, cd1, c, d);
_mm256_storeu_si256((__m256i*)ptr, xy0);
_mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
}
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y )
{
_Tpvec ac0, ac1, bd0, bd1;
v256_zip(a, c, ac0, ac1);
v256_zip(b, d, bd0, bd1);
_Tpvec abcd0, abcd1, abcd2, abcd3;
v256_zip(ac0, bd0, abcd0, abcd1);
v256_zip(ac1, bd1, abcd2, abcd3);
__m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
__m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);
_Tpvec abcd01, abcd23, abcd45, abcd67;
v_recombine(abcd0, abcd1, abcd01, abcd45);
v_recombine(abcd2, abcd3, abcd23, abcd67);
__m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
__m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
v_store(ptr, abcd01);
v_store(ptr + _Tpvec::nlanes, abcd23);
v_store(ptr + _Tpvec::nlanes * 2, abcd45);
v_store(ptr + _Tpvec::nlanes * 3, abcd67);
_mm256_storeu_si256((__m256i*)ptr, xy0);
_mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
}
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint32x8, unsigned, l8)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int32x8, int, l8)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float32x8, float, l8)
/* ******** ******** */
//
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r )
{
const __m256i sep = _mm256_setr_epi8(
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
);
static const __m256i sh_b = _mm256_setr_epi8(
0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
static const __m256i sh_g = _mm256_setr_epi8(
5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
static const __m256i sh_r = _mm256_setr_epi8(
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
_Tpvec ab0, ab1;
v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);
__m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
__m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
__m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
__m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
__m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);
a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
}
///
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)
{
v_uint32x8 ab0 = v_reinterpret_as_u32(v256_unpacklo(a, b));
v_uint32x8 ab1 = v_reinterpret_as_u32(v256_unpackhi(a, b));
v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c));
v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c));
static const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
static const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0xaa>(c, a));
cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg);
__m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
__m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
__m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
v_uint32x8 ac1ab1 = v256_blend<0xaa>(ab1, bc1);
ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1);
__m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
__m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
__m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
v_uint32x8 abc001 = v256_blend<0xaa>(ab0, cazg);
v_uint32x8 cabc0 = v256_blend<0xaa>(cazg, bc0);
_mm256_storeu_si256((__m256i*)ptr, bgr0);
_mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
_mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
}
v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1);
v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001);
inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r )
{
static const __m256i sh_b = _mm256_setr_epi8(
0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
static const __m256i sh_g = _mm256_setr_epi8(
10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
static const __m256i sh_r = _mm256_setr_epi8(
4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
v_uint64x4 abc01 = v256_unpacklo(v_reinterpret_as_u64(abc001), v_reinterpret_as_u64(bcab0));
v_uint64x4 abc21 = v256_unpackhi(v_reinterpret_as_u64(cabc0), v_reinterpret_as_u64(bcab0));
abc21 = v256_swap_halves(abc21);
v_uint64x4 abc12 = v_reinterpret_as_u64(v256_alignr_64(cabc1, ac1ab1));
__m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
__m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
__m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
v_uint64x4 abc0 = v256_combine_diagonal(abc01, abc21);
v_uint64x4 abc1 = v256_combine_diagonal(abc12, abc01);
v_uint64x4 abc2 = v256_combine_diagonal(abc21, abc12);
static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
-1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
v_store(ptr, _Tpvec(abc0.val));
v_store(ptr + _Tpvec::nlanes, _Tpvec(abc1.val));
v_store(ptr + _Tpvec::nlanes * 2, _Tpvec(abc2.val));
}
// todo:
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l16(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
{}
////
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
{
_Tpvec ab0, ab1, cd0, cd1;
v256_load_deinterleave_l8(ptr, ab0, cd0, ab1, cd1);
v256_zip(ab0, ab1, a, b);
v256_zip(cd0, cd1, c, d);
}
__m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
__m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
__m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
{ v256_store_interleave_l8(ptr, a, b, c, d); }
__m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
//__m256i bgr1 = p1;
__m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint16x16, ushort, l16)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int16x16, short, l16)
_mm256_storeu_si256((__m256i*)ptr, bgr0);
_mm256_storeu_si256((__m256i*)(ptr + 16), p1);
_mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
}
/* **************** **************** */
//
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b)
inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r )
{
const __m256i sep = _mm256_setr_epi8(
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
);
__m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c);
__m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1);
__m256i r0 = _mm256_shuffle_epi32(r.val, 0xc6);
_Tpvec ab0, ab1;
v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1);
__m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
__m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
__m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
__m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep);
__m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep);
__m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
//__m256i bgr1 = p2;
__m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
a.val = _mm256_unpacklo_epi64(a0b0, a1b1);
b.val = _mm256_unpackhi_epi64(a0b0, a1b1);
_mm256_storeu_si256((__m256i*)ptr, bgr0);
_mm256_storeu_si256((__m256i*)(ptr + 8), p2);
_mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
}
/// todo
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l32(_Tp*, const _Tpvec&, const _Tpvec&, const _Tpvec&)
{}
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l32(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&)
{}
////
template<typename _Tp, typename _Tpvec>
inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d)
inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r )
{
const __m256i sep = _mm256_setr_epi8(
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
);
_Tpvec abcd0, abcd1, abcd2, abcd3;
v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes * 2), abcd0, abcd1);
v_recombine(v256_load(ptr + _Tpvec::nlanes), v256_load(ptr + _Tpvec::nlanes * 3), abcd2, abcd3);
__m256i s01 = _mm256_unpacklo_epi64(b.val, g.val);
__m256i s12 = _mm256_unpackhi_epi64(g.val, r.val);
__m256i s20 = _mm256_blend_epi32(r.val, b.val, 0xcc);
__m256i ab0cd0 = _mm256_shuffle_epi8(abcd0.val, sep);
__m256i ab1cd1 = _mm256_shuffle_epi8(abcd1.val, sep);
__m256i ab2cd2 = _mm256_shuffle_epi8(abcd2.val, sep);
__m256i ab3cd3 = _mm256_shuffle_epi8(abcd3.val, sep);
__m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
__m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
__m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
__m256i ab0 = _mm256_unpacklo_epi32(ab0cd0, ab1cd1);
__m256i ab1 = _mm256_unpacklo_epi32(ab2cd2, ab3cd3);
__m256i cd0 = _mm256_unpackhi_epi32(ab0cd0, ab1cd1);
__m256i cd1 = _mm256_unpackhi_epi32(ab2cd2, ab3cd3);
a.val = _mm256_unpacklo_epi64(ab0, ab1);
b.val = _mm256_unpackhi_epi64(ab0, ab1);
c.val = _mm256_unpacklo_epi64(cd0, cd1);
d.val = _mm256_unpackhi_epi64(cd0, cd1);
}
template<typename _Tp, typename _Tpvec>
inline void v256_store_interleave_l32(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d)
{ v256_store_interleave_l8(ptr, a, b, c, d); }
_mm256_storeu_si256((__m256i*)ptr, bgr0);
_mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
_mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
}
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint8x32, uchar, l32)
OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int8x32, schar, l32)
inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, const v_uint8x32& a )
{
__m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val);
__m256i ra0 = _mm256_unpacklo_epi8(r.val, a.val);
__m256i ra1 = _mm256_unpackhi_epi8(r.val, a.val);
__m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
__m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
__m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
__m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
__m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
__m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
__m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
__m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
_mm256_storeu_si256((__m256i*)ptr, bgra0);
_mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
_mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
_mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
}
inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
const v_uint16x16& r, const v_uint16x16& a )
{
__m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val);
__m256i ra0 = _mm256_unpacklo_epi16(r.val, a.val);
__m256i ra1 = _mm256_unpackhi_epi16(r.val, a.val);
__m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
__m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
__m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
__m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
__m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
__m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
__m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
__m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
_mm256_storeu_si256((__m256i*)ptr, bgra0);
_mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
_mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
_mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
const v_uint32x8& r, const v_uint32x8& a )
{
__m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val);
__m256i ra0 = _mm256_unpacklo_epi32(r.val, a.val);
__m256i ra1 = _mm256_unpackhi_epi32(r.val, a.val);
__m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
__m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
__m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
__m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
__m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
__m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
__m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
__m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
_mm256_storeu_si256((__m256i*)ptr, bgra0);
_mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
_mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
_mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
}
inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g,
const v_uint64x4& r, const v_uint64x4& a )
{
__m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val);
__m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val);
__m256i ra0 = _mm256_unpacklo_epi64(r.val, a.val);
__m256i ra1 = _mm256_unpackhi_epi64(r.val, a.val);
__m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
__m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);
__m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
__m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);
_mm256_storeu_si256((__m256i*)ptr, bgra0);
_mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
_mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
_mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
}
#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
_Tpvec1 a1, b1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
_Tpvec1 a1, b1, c1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
_Tpvec1 a1, b1, c1, d1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
c0 = v_reinterpret_as_##suffix0(c1); \
d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
v_store_interleave((_Tp1*)ptr, a1, b1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
const _Tpvec0& c0, const _Tpvec0& d0 ) \
{ \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
_Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
}
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
inline void v256_cleanup() { _mm256_zeroupper(); }
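
A quick usage sketch for the new 2-channel 256-bit overloads defined above: v_load_deinterleave splits an "x0 y0 x1 y1 ..." stream into two planar registers and v_store_interleave is its inverse, so the round trip below reproduces the input. This snippet is illustrative and not part of the patch; the CV_AVX2 guard and the demo function name are assumptions.

// Illustrative round-trip for the v_uint8x32 2-channel overloads (not part of the patch).
#if CV_AVX2
inline void v256_deinterleave_roundtrip_demo(const uchar interleaved[64], uchar out[64])
{
    v_uint8x32 x, y;
    v_load_deinterleave(interleaved, x, y); // x = x0..x31, y = y0..y31
    v_store_interleave(out, x, y);          // writes the original interleaved order back
    v256_cleanup();                         // _mm256_zeroupper() to avoid AVX/SSE transition penalties
}
#endif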

@@ -1318,6 +1318,80 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
vst4q_##suffix(ptr, v); \
}
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
{ \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t a1 = vld1_##suffix(ptr + 2); \
tp##x1_t b1 = vld1_##suffix(ptr + 3); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
} \
\
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
v_##tp##x2& b, v_##tp##x2& c ) \
{ \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t c0 = vld1_##suffix(ptr + 2); \
tp##x1_t a1 = vld1_##suffix(ptr + 3); \
tp##x1_t b1 = vld1_##suffix(ptr + 4); \
tp##x1_t c1 = vld1_##suffix(ptr + 5); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
} \
\
inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
v_##tp##x2& c, v_##tp##x2& d ) \
{ \
tp##x1_t a0 = vld1_##suffix(ptr); \
tp##x1_t b0 = vld1_##suffix(ptr + 1); \
tp##x1_t c0 = vld1_##suffix(ptr + 2); \
tp##x1_t d0 = vld1_##suffix(ptr + 3); \
tp##x1_t a1 = vld1_##suffix(ptr + 4); \
tp##x1_t b1 = vld1_##suffix(ptr + 5); \
tp##x1_t c1 = vld1_##suffix(ptr + 6); \
tp##x1_t d1 = vld1_##suffix(ptr + 7); \
a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b ) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
const v_##tp##x2& b, const v_##tp##x2& c ) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
} \
\
inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
const v_##tp##x2& c, const v_##tp##x2& d ) \
{ \
vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
}
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
@@ -1329,6 +1403,9 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)
#endif
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
return v_float32x4(vcvtq_f32_s32(a.val));

@@ -58,17 +58,6 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x16;
struct v_int8x16;
struct v_uint16x8;
struct v_int16x8;
struct v_uint32x4;
struct v_int32x4;
struct v_float32x4;
struct v_uint64x2;
struct v_int64x2;
struct v_float64x2;
struct v_uint8x16
{
typedef uchar lane_type;
@@ -1660,7 +1649,7 @@ OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_N
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
// adopted from sse_utils.hpp
// load deinterleave
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
@@ -1681,7 +1670,25 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
#if CV_SSSE3
#if CV_SSE4_1
static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
__m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
__m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
__m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
__m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
__m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
static const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
static const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
static const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
a0 = _mm_shuffle_epi8(a0, sh_b);
b0 = _mm_shuffle_epi8(b0, sh_g);
c0 = _mm_shuffle_epi8(c0, sh_r);
a.val = a0;
b.val = b0;
c.val = c0;
#elif CV_SSSE3
static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
@@ -1753,8 +1760,41 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
d.val = _mm_unpackhi_epi8(v2, v3);
}
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
__m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
__m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
__m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
__m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
}
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
#if CV_SSE4_1
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
__m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
__m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
__m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
__m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
static const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
a0 = _mm_shuffle_epi8(a0, sh_a);
b0 = _mm_shuffle_epi8(b0, sh_b);
c0 = _mm_shuffle_epi8(c0, sh_c);
a.val = a0;
b.val = b0;
c.val = c0;
#else
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
__m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
@@ -1770,6 +1810,7 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b,
a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
#endif
}
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
@@ -1795,6 +1836,18 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b,
d.val = _mm_unpackhi_epi16(u2, u3);
}
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3
__m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
__m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3
a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
}
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
__m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
@@ -1812,12 +1865,23 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
}
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
}
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
@@ -1853,77 +1917,43 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b
d.val = _mm_unpackhi_ps(t02hi, t13hi);
}
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
{
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
__m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));
a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
}
inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
{
v_uint64x2 t0, t1, t2;
v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
a = v_reinterpret_as_s64(t0);
b = v_reinterpret_as_s64(t1);
c = v_reinterpret_as_s64(t2);
}
inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
{
v_uint64x2 t0, t1, t2;
v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
a = v_reinterpret_as_f64(t0);
b = v_reinterpret_as_f64(t1);
c = v_reinterpret_as_f64(t2);
a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
}
// 2-channel
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
__m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1
__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0
a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
}
inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b)
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
__m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
__m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
__m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
__m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
__m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
__m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
__m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
__m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1
a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
}
inline void v_load_deinterleave(const ushort*ptr, v_uint16x8& a, v_uint16x8& b)
{
v_int16x8 sa, sb;
v_load_deinterleave((const short*)ptr, sa, sb);
a = v_reinterpret_as_u16(sa);
b = v_reinterpret_as_u16(sb);
}
inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
{
__m128i t0, t1;
t0 = _mm_unpacklo_epi16(a.val, b.val);
t1 = _mm_unpackhi_epi16(a.val, b.val);
_mm_storeu_si128((__m128i*)(ptr), t0);
_mm_storeu_si128((__m128i*)(ptr + 8), t1);
}
// store interleave
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
{
@@ -1937,7 +1967,24 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
const v_uint8x16& c )
{
#if CV_SSSE3
#if CV_SSE4_1
static const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
static const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
static const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
__m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
__m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
__m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
__m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
__m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
__m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
_mm_storeu_si128((__m128i*)(ptr), v0);
_mm_storeu_si128((__m128i*)(ptr + 16), v1);
_mm_storeu_si128((__m128i*)(ptr + 32), v2);
#elif CV_SSSE3
static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
@@ -2025,10 +2072,35 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1
_mm_storeu_si128((__m128i*)(ptr + 48), v3);
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b )
{
__m128i t0, t1;
t0 = _mm_unpacklo_epi16(a.val, b.val);
t1 = _mm_unpackhi_epi16(a.val, b.val);
_mm_storeu_si128((__m128i*)(ptr), t0);
_mm_storeu_si128((__m128i*)(ptr + 8), t1);
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
const v_uint16x8& b,
const v_uint16x8& c )
{
#if CV_SSE4_1
static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
static const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
__m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
__m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
__m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
__m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
__m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
__m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
_mm_storeu_si128((__m128i*)ptr, v0);
_mm_storeu_si128((__m128i*)(ptr + 8), v1);
_mm_storeu_si128((__m128i*)(ptr + 16), v2);
#else
__m128i z = _mm_setzero_si128();
__m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
__m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
@@ -2060,6 +2132,7 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
_mm_storeu_si128((__m128i*)(ptr), v0);
_mm_storeu_si128((__m128i*)(ptr + 8), v1);
_mm_storeu_si128((__m128i*)(ptr + 16), v2);
#endif
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
@@ -2085,6 +2158,15 @@ inline void v_store_interleave( ushort* ptr, const v_uint16
_mm_storeu_si128((__m128i*)(ptr + 24), v3);
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b )
{
__m128i t0 = _mm_unpacklo_epi32(a.val, b.val);
__m128i t1 = _mm_unpackhi_epi32(a.val, b.val);
_mm_storeu_si128((__m128i*)ptr, t0);
_mm_storeu_si128((__m128i*)(ptr + 4), t1);
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
const v_uint32x4& c )
{
@@ -2158,6 +2240,15 @@ inline void v_store_interleave(float* ptr, const v_float32
_mm_storeu_ps(ptr + 12, v3);
}
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b)
{
__m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
__m128i t1 = _mm_unpackhi_epi64(a.val, b.val);
_mm_storeu_si128((__m128i*)ptr, t0);
_mm_storeu_si128((__m128i*)(ptr + 2), t1);
}
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
{
__m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
@@ -2169,58 +2260,72 @@ inline void v_store_interleave(uint64 *ptr, const v_uint64x
_mm_storeu_si128((__m128i*)(ptr + 4), t2);
}
inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, const v_uint64x2& d)
{
v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
}
__m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
__m128i t1 = _mm_unpacklo_epi64(c.val, d.val);
__m128i t2 = _mm_unpackhi_epi64(a.val, b.val);
__m128i t3 = _mm_unpackhi_epi64(c.val, d.val);
inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
_mm_storeu_si128((__m128i*)ptr, t0);
_mm_storeu_si128((__m128i*)(ptr + 2), t1);
_mm_storeu_si128((__m128i*)(ptr + 4), t2);
_mm_storeu_si128((__m128i*)(ptr + 6), t3);
}
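All of the v_store_interleave / v_load_deinterleave overloads above implement the same contract; a scalar reference sketch (illustrative only, element type chosen arbitrarily):
static void store_interleave_ref(unsigned* dst, const unsigned* const* planes, int nlanes, int cn)
{
    // Channel k of pixel i lands at dst[i*cn + k]; v_load_deinterleave performs the inverse mapping.
    for( int i = 0; i < nlanes; i++ )
        for( int k = 0; k < cn; k++ )
            dst[i*cn + k] = planes[k][i];
}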
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
_Tpvec& b0, _Tpvec& c0 ) \
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
_Tpvec1 a1, b1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
_Tpvec1 a1, b1, c1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
_Tpuvec a1, b1, c1; \
v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
a0 = v_reinterpret_as_##suffix(a1); \
b0 = v_reinterpret_as_##suffix(b1); \
c0 = v_reinterpret_as_##suffix(c1); \
_Tpvec1 a1, b1, c1, d1; \
v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
a0 = v_reinterpret_as_##suffix0(a1); \
b0 = v_reinterpret_as_##suffix0(b1); \
c0 = v_reinterpret_as_##suffix0(c1); \
d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
_Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \
{ \
_Tpuvec a1, b1, c1, d1; \
v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
a0 = v_reinterpret_as_##suffix(a1); \
b0 = v_reinterpret_as_##suffix(b1); \
c0 = v_reinterpret_as_##suffix(c1); \
d0 = v_reinterpret_as_##suffix(d1); \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
v_store_interleave((_Tp1*)ptr, a1, b1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
const _Tpvec& b0, const _Tpvec& c0 ) \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \
{ \
_Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
_Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
_Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
const _Tpvec& c0, const _Tpvec& d0 ) \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
const _Tpvec0& c0, const _Tpvec0& d0 ) \
{ \
_Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
_Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
_Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
_Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
_Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
_Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
_Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
_Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
}
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
//OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
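To make the reinterpret-based macro concrete, its two-channel store for the signed 8-bit instantiation roughly expands to the following (sketch of the expansion; it only forwards to the unsigned overloads defined earlier in this header):
inline void v_store_interleave( schar* ptr, const v_int8x16& a0, const v_int8x16& b0 )
{
    // Reinterpret the signed vectors as unsigned and reuse the uchar implementation above.
    v_uint8x16 a1 = v_reinterpret_as_u8(a0);
    v_uint8x16 b1 = v_reinterpret_as_u8(b0);
    v_store_interleave((uchar*)ptr, a1, b1);
}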
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{

@@ -298,6 +298,8 @@ OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2)
OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2)
/* Expand */
#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \

@@ -8,223 +8,49 @@
namespace cv { namespace hal {
#if CV_NEON
template<typename T> struct VMerge2;
template<typename T> struct VMerge3;
template<typename T> struct VMerge4;
#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
store_func(dst, r); \
} \
}
#if CV_SIMD
template<typename T, typename VecT> static void
vecmerge_( const T** src, T* dst, int len, int cn )
{
int i;
const T* src0 = src[0];
const T* src1 = src[1];
#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
store_func(dst, r); \
} \
const int VECSZ = VecT::nlanes;
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
v_store_interleave(dst + i*cn, a, b);
}
}
#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, const data_type* src3, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
r.val[3] = load_func(src3); \
store_func(dst, r); \
} \
else if( cn == 3 )
{
const T* src2 = src[2];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
v_store_interleave(dst + i*cn, a, b, c);
}
}
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
#elif CV_SSE2
template <typename T>
struct VMerge2
{
VMerge2() : support(false) { }
void operator()(const T *, const T *, T *) const { }
bool support;
};
template <typename T>
struct VMerge3
{
VMerge3() : support(false) { }
void operator()(const T *, const T *, const T *, T *) const { }
bool support;
};
template <typename T>
struct VMerge4
{
VMerge4() : support(false) { }
void operator()(const T *, const T *, const T *, const T *, T *) const { }
bool support;
};
#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge2() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
} \
\
bool support; \
}
#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge3() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
} \
\
bool support; \
}
#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge4() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
const data_type * src2, const data_type * src3, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \
reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \
} \
\
bool support; \
else
{
CV_Assert( cn == 4 );
const T* src2 = src[2];
const T* src3 = src[3];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
v_store_interleave(dst + i*cn, a, b, c, d);
}
}
vx_cleanup();
}
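The i = std::min( len - VECSZ, i ) clamp is what lets the vector loop handle a length that is not a multiple of the vector width: the final block is shifted back so it overlaps the previous one instead of running past the end of the row, which is also why the dispatchers below only take this path when len >= nlanes. A tiny standalone illustration with hypothetical numbers:
#include <algorithm>
#include <cstdio>
int main()
{
    const int len = 20, VECSZ = 16;        // e.g. a 20-element row processed with 16-lane vectors
    for( int i = 0; i < len; i += VECSZ )
    {
        i = std::min( len - VECSZ, i );    // clamp the last block instead of overrunning
        std::printf("block covers [%d, %d)\n", i, i + VECSZ);   // prints [0, 16) then [4, 20)
    }
    return 0;
}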
MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
#if CV_SSE4_1
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
#endif
MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
#endif
template<typename T> static void
@@ -242,28 +68,6 @@ merge_( const T** src, T* dst, int len, int cn )
{
const T *src0 = src[0], *src1 = src[1];
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VMerge2<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#elif CV_SSE2
if(cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;
VMerge2<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
@@ -274,28 +78,6 @@ merge_( const T** src, T* dst, int len, int cn )
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VMerge3<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#elif CV_SSE2
if(cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;
VMerge3<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
@@ -307,28 +89,6 @@ merge_( const T** src, T* dst, int len, int cn )
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VMerge4<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#elif CV_SSE2
if(cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;
VMerge4<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
@@ -347,29 +107,48 @@ merge_( const T** src, T* dst, int len, int cn )
}
}
void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<uchar, v_uint8>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}
void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<ushort, v_uint16>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}
void merge32s(const int** src, int* dst, int len, int cn )
{
CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<int, v_int32>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}
void merge64s(const int64** src, int64* dst, int len, int cn )
{
CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn)
merge_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
vecmerge_<int64, v_int64>(src, dst, len, cn);
else
#endif
merge_(src, dst, len, cn);
}
}} // cv::hal::
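A hypothetical caller of the 8-bit entry point above, interleaving three planes into one packed row (sketch; assumes the declarations from opencv2/core/hal/hal.hpp):
#include "opencv2/core/hal/hal.hpp"
#include <vector>
int main()
{
    const int len = 640;
    std::vector<unsigned char> b(len, 255), g(len, 128), r(len, 0), bgr(len*3);
    const unsigned char* planes[] = { b.data(), g.data(), r.data() };
    cv::hal::merge8u(planes, bgr.data(), len, 3);   // bgr becomes 255,128,0, 255,128,0, ...
    return 0;
}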

@@ -8,222 +8,57 @@
namespace cv { namespace hal {
#if CV_NEON
template<typename T> struct VSplit2;
template<typename T> struct VSplit3;
template<typename T> struct VSplit4;
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, \
data_type* dst1) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
} \
}
#if CV_SIMD
template<typename T, typename VecT> static void
vecsplit_( const T* src, T** dst, int len, int cn )
{
int i;
T* dst0 = dst[0];
T* dst1 = dst[1];
#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
} \
const int VECSZ = VecT::nlanes;
if( cn == 2 )
{
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a, b;
v_load_deinterleave(src + i*cn, a, b);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
}
}
#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2, data_type* dst3) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
store_func(dst3, r.val[3]); \
} \
else if( cn == 3 )
{
T* dst2 = dst[2];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a, b, c;
v_load_deinterleave(src + i*cn, a, b, c);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst2 + i, c);
}
}
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );
#elif CV_SSE2
template <typename T>
struct VSplit2
{
VSplit2() : support(false) { }
void operator()(const T *, T *, T *) const { }
bool support;
};
template <typename T>
struct VSplit3
{
VSplit3() : support(false) { }
void operator()(const T *, T *, T *, T *) const { }
bool support;
};
template <typename T>
struct VSplit4
{
VSplit4() : support(false) { }
void operator()(const T *, T *, T *, T *, T *) const { }
bool support;
};
#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit2() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
} \
\
bool support; \
}
#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit3() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1, data_type * dst2) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
} \
\
bool support; \
}
#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit4() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
data_type * dst2, data_type * dst3) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
} \
\
bool support; \
else
{
CV_Assert( cn == 4 );
T* dst2 = dst[2];
T* dst3 = dst[3];
for( i = 0; i < len; i += VECSZ )
{
i = std::min( len - VECSZ, i );
VecT a, b, c, d;
v_load_deinterleave(src + i*cn, a, b, c, d);
v_store(dst0 + i, a);
v_store(dst1 + i, b);
v_store(dst2 + i, c);
v_store(dst3 + i, d);
}
}
vx_cleanup();
}
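The same idea outside the dispatcher, as a minimal sketch: deinterleave one packed BGR row into planes with the wide universal intrinsics (assumes opencv2/core/hal/intrin.hpp; the tail is handled with a plain scalar loop here rather than the overlapping trick used above):
#include "opencv2/core/hal/intrin.hpp"
static void deinterleave_bgr_row(const unsigned char* bgr, unsigned char* b,
                                 unsigned char* g, unsigned char* r, int len)
{
    int i = 0;
#if CV_SIMD
    const int VECSZ = cv::v_uint8::nlanes;
    for( ; i + VECSZ <= len; i += VECSZ )
    {
        cv::v_uint8 vb, vg, vr;
        cv::v_load_deinterleave(bgr + i*3, vb, vg, vr);   // gather the B, G and R lanes
        cv::v_store(b + i, vb);
        cv::v_store(g + i, vg);
        cv::v_store(r + i, vr);
    }
    cv::vx_cleanup();
#endif
    for( ; i < len; i++ )                                 // scalar tail
    {
        b[i] = bgr[i*3]; g[i] = bgr[i*3 + 1]; r[i] = bgr[i*3 + 2];
    }
}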
SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
#endif
template<typename T> static void
@@ -250,30 +85,6 @@ split_( const T* src, T** dst, int len, int cn )
T *dst0 = dst[0], *dst1 = dst[1];
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
#elif CV_SSE2
if (cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
@@ -285,31 +96,6 @@ split_( const T* src, T** dst, int len, int cn )
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
#elif CV_SSE2
if (cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
@@ -322,30 +108,6 @@ split_( const T* src, T** dst, int len, int cn )
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
#elif CV_SSE2
if (cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j]; dst1[i] = src[j+1];
@@ -367,25 +129,46 @@ split_( const T* src, T** dst, int len, int cn )
void split8u(const uchar* src, uchar** dst, int len, int cn )
{
CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn)
split_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<uchar, v_uint8>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}
void split16u(const ushort* src, ushort** dst, int len, int cn )
{
CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn)
split_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<ushort, v_uint16>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}
void split32s(const int* src, int** dst, int len, int cn )
{
CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn)
split_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<int, v_int32>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}
void split64s(const int64* src, int64** dst, int len, int cn )
{
CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn)
split_(src, dst, len, cn);
#if CV_SIMD
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
vecsplit_<int64, v_int64>(src, dst, len, cn);
else
#endif
split_(src, dst, len, cn);
}
}} // cv::hal::
