|
|
@@ -2044,7 +2044,7 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b
     b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1));
 }

-inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, v_uint8x64& r )
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c )
 {
     __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
@@ -2062,22 +2062,22 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g,
             77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97,
             94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49,
             46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2);
-    b = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
-    g = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
-    r = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
+    a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
+    b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
+    c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
 #elif CV_AVX_512VBMI
     __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0);
     __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
     __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
-    b = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
+    a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80,
             77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48,
             46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24,
             23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2));
-    g = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40,
+    b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40,
             39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17,
             15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114,
             111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0));
-    r = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18,
+    c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18,
             15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98,
             95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50,
             47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1));
@@ -2092,13 +2092,13 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g,
     __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
             14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
     __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
-    b = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
-    r = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
-    g = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
+    a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
+    c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
+    b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
 #endif
 }

-inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& g, v_uint16x32& r )
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c )
 {
     __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
@@ -2110,13 +2110,13 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32&
     __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
     __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);

-    b = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
-    g = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
+    a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
+    b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
             14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0));
-    r = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
+    c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
 }

-inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x16& g, v_uint32x16& r )
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c )
 {
     __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
@@ -2127,12 +2127,12 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x1
     __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2);
     __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0);

-    b = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
-    g = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
-    r = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
+    a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
+    b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
+    c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
 }

-inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g, v_uint64x8& r )
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c )
 {
     __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
@@ -2143,12 +2143,12 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g
     __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2);
     __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0);

-    b = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
-    r = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
-    g = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
+    a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
+    c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
+    b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
 }

-inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, v_uint8x64& r, v_uint8x64& a )
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d )
 {
     __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
@@ -2170,10 +2170,10 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g,
     __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3);
     __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3);

-    b = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
-    r = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
-    g = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
-    a = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
+    a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
+    c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
+    b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
+    d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
 #else
     __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
     __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask);
@@ -2189,14 +2189,14 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g,
     __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3);
     __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3);

-    b = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
-    r = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
-    g = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
-    a = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
+    a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
+    c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
+    b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
+    d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
 #endif
 }

-inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& g, v_uint16x32& r, v_uint16x32& a )
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d )
 {
     __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
@@ -2213,13 +2213,13 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32&
     __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3);
     __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3);

-    b = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
-    r = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
-    g = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
-    a = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
+    a = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
+    c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
+    b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
+    d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
 }

-inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x16& g, v_uint32x16& r, v_uint32x16& a )
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d )
 {
     __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
@@ -2234,13 +2234,13 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x1
     __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3);
     __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3);

-    b = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
-    r = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
-    g = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
-    a = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
+    a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
+    c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
+    b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
+    d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
 }

-inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g, v_uint64x8& r, v_uint64x8& a )
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d )
 {
     __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
     __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
@@ -2255,10 +2255,10 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g
     __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3);
     __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3);

-    b = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
-    r = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
-    g = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
-    a = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
+    a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
+    c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
+    b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
+    d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
 }

 ///////////////////////////// store interleave /////////////////////////////////////
@@ -2351,7 +2351,7 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64
     }
 }

-inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x64& g, const v_uint8x64& r,
+inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
 {
 #if CV_AVX_512VBMI
@@ -2367,18 +2367,18 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x6
             122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117,
             53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48,
             90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85);
-    __m512i r2g0r0 = _mm512_permutex2var_epi8(g.val, mask0, r.val);
-    __m512i b0r1b1 = _mm512_permutex2var_epi8(b.val, mask1, r.val);
-    __m512i g1b2g2 = _mm512_permutex2var_epi8(b.val, mask2, g.val);
+    __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val);
+    __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
+    __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);

     __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1);
     __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
     __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
 #else
-    __m512i g1g0 = _mm512_shuffle_epi8(g.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
-    __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, g1g0);
-    __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r.val, b.val);
-    __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, r.val);
+    __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
+    __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0);
+    __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val);
+    __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val);

     __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
             5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0);
@@ -2415,7 +2415,7 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x6
     }
 }

-inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint16x32& g, const v_uint16x32& r,
+inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c,
                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
 {
     __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37,
@@ -2424,9 +2424,9 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint1
             47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
     __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
             26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
-    __m512i b0g0b2 = _mm512_permutex2var_epi16(b.val, mask0, g.val);
-    __m512i r1b1r0 = _mm512_permutex2var_epi16(b.val, mask1, r.val);
-    __m512i g2r2g1 = _mm512_permutex2var_epi16(g.val, mask2, r.val);
+    __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val);
+    __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
+    __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);

     __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
     __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
@@ -2452,15 +2452,15 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint1
     }
 }

-inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uint32x16& g, const v_uint32x16& r,
+inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c,
                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
 {
     __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21);
     __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26);
-    __m512i g1b2g2 = _mm512_permutex2var_epi32(b.val, mask0, g.val);
-    __m512i r2r1b1 = _mm512_permutex2var_epi32(b.val, mask1, r.val);
+    __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
+    __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);

-    __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, b.val), 0x2492, g.val), 0x4924, r.val);
+    __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val);
     __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2);
     __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1);

@@ -2484,15 +2484,15 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uin
     }
 }

-inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64x8& g, const v_uint64x8& r,
+inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
 {
     __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10);
     __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5);
-    __m512i r1b1b2 = _mm512_permutex2var_epi64(b.val, mask0, r.val);
-    __m512i g2r2g1 = _mm512_permutex2var_epi64(g.val, mask1, r.val);
+    __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
+    __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);

-    __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, b.val), 0x92, g.val), 0x24, r.val);
+    __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val);
     __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2);
     __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1);

@@ -2516,13 +2516,13 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64
     }
 }

-inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x64& g,
-                                const v_uint8x64& r, const v_uint8x64& a,
+inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b,
+                                const v_uint8x64& c, const v_uint8x64& d,
                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
 {
     v_uint8x64 br01, br23, ga01, ga23;
-    v_zip(b, r, br01, br23);
-    v_zip(g, a, ga01, ga23);
+    v_zip(a, c, br01, br23);
+    v_zip(b, d, ga01, ga23);
     v_uint8x64 bgra0, bgra1, bgra2, bgra3;
     v_zip(br01, ga01, bgra0, bgra1);
     v_zip(br23, ga23, bgra2, bgra3);
@@ -2550,13 +2550,13 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x6
     }
 }

-inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint16x32& g,
-                                const v_uint16x32& r, const v_uint16x32& a,
+inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
+                                const v_uint16x32& c, const v_uint16x32& d,
                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
 {
     v_uint16x32 br01, br23, ga01, ga23;
-    v_zip(b, r, br01, br23);
-    v_zip(g, a, ga01, ga23);
+    v_zip(a, c, br01, br23);
+    v_zip(b, d, ga01, ga23);
     v_uint16x32 bgra0, bgra1, bgra2, bgra3;
     v_zip(br01, ga01, bgra0, bgra1);
     v_zip(br23, ga23, bgra2, bgra3);
@@ -2584,13 +2584,13 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint1
     }
 }

-inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uint32x16& g,
-                                const v_uint32x16& r, const v_uint32x16& a,
+inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
+                                const v_uint32x16& c, const v_uint32x16& d,
                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
 {
     v_uint32x16 br01, br23, ga01, ga23;
-    v_zip(b, r, br01, br23);
-    v_zip(g, a, ga01, ga23);
+    v_zip(a, c, br01, br23);
+    v_zip(b, d, ga01, ga23);
     v_uint32x16 bgra0, bgra1, bgra2, bgra3;
     v_zip(br01, ga01, bgra0, bgra1);
     v_zip(br23, ga23, bgra2, bgra3);
@@ -2618,13 +2618,13 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uin
     }
 }

-inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64x8& g,
-                                const v_uint64x8& r, const v_uint64x8& a,
+inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
+                                const v_uint64x8& c, const v_uint64x8& d,
                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
 {
     v_uint64x8 br01, br23, ga01, ga23;
-    v_zip(b, r, br01, br23);
-    v_zip(g, a, ga01, ga23);
+    v_zip(a, c, br01, br23);
+    v_zip(b, d, ga01, ga23);
     v_uint64x8 bgra0, bgra1, bgra2, bgra3;
     v_zip(br01, ga01, bgra0, bgra1);
     v_zip(br23, ga23, bgra2, bgra3);