From a29e59a770ce33c0da447727ce07985f4d9265e5 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Thu, 13 Jun 2019 16:46:22 +0300 Subject: [PATCH] Rename parameters in AVX512 implementation of v_load_deinterleave and v_store_interleave --- .../opencv2/core/hal/intrin_avx512.hpp | 164 +++++++++--------- 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp index 4c2394adc3..58bc48ea5f 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -2044,7 +2044,7 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1)); } -inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, v_uint8x64& r ) +inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c ) { __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); @@ -2062,22 +2062,22 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, 77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97, 94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2); - b = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01)); - g = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0)); - r = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2)); + a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01)); + b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0)); + c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2)); #elif CV_AVX_512VBMI __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0); __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1); __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2); - b = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, + a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2)); - g = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40, + b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0)); - r = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, + c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50, 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1)); @@ -2092,13 +2092,13 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0); __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11); - b = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1)); - r = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1)); - g = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001))); + a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1)); + c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1)); + b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001))); #endif } -inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& g, v_uint16x32& r ) +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c ) { __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); @@ -2110,13 +2110,13 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2); __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0); - b = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2)); - g = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, + a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2)); + b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0)); - r = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11)); + c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11)); } -inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x16& g, v_uint32x16& r ) +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c ) { __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); @@ -2127,12 +2127,12 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x1 __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2); __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0); - b = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2)); - g = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11)); - r = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0)); + a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2)); + b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11)); + c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0)); } -inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g, v_uint64x8& r ) +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c ) { __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); @@ -2143,12 +2143,12 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2); __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0); - b = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2)); - r = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6)); - g = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0)); + a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2)); + c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6)); + b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0)); } -inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, v_uint8x64& r, v_uint8x64& a ) +inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d ) { __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); @@ -2170,10 +2170,10 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3); __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3); - b = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23)); - r = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23)); - g = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23)); - a = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23)); + a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23)); + c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23)); + b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23)); + d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23)); #else __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask); @@ -2189,14 +2189,14 @@ inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& b, v_uint8x64& g, __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3); __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3); - b = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23)); - r = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23)); - g = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23)); - a = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23)); + a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23)); + c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23)); + b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23)); + d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23)); #endif } -inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& g, v_uint16x32& r, v_uint16x32& a ) +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d ) { __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); @@ -2213,13 +2213,13 @@ inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& b, v_uint16x32& __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3); __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3); - b = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23)); - r = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23)); - g = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23)); - a = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23)); + a = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23)); + c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23)); + b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23)); + d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23)); } -inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x16& g, v_uint32x16& r, v_uint32x16& a ) +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d ) { __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); @@ -2234,13 +2234,13 @@ inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& b, v_uint32x1 __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3); __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3); - b = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23)); - r = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23)); - g = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23)); - a = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23)); + a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23)); + c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23)); + b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23)); + d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23)); } -inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g, v_uint64x8& r, v_uint64x8& a ) +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d ) { __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); @@ -2255,10 +2255,10 @@ inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& b, v_uint64x8& g __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3); __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3); - b = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23)); - r = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23)); - g = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23)); - a = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23)); + a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23)); + c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23)); + b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23)); + d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23)); } ///////////////////////////// store interleave ///////////////////////////////////// @@ -2351,7 +2351,7 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64 } } -inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x64& g, const v_uint8x64& r, +inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c, hal::StoreMode mode=hal::STORE_UNALIGNED ) { #if CV_AVX_512VBMI @@ -2367,18 +2367,18 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x6 122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117, 53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48, 90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85); - __m512i r2g0r0 = _mm512_permutex2var_epi8(g.val, mask0, r.val); - __m512i b0r1b1 = _mm512_permutex2var_epi8(b.val, mask1, r.val); - __m512i g1b2g2 = _mm512_permutex2var_epi8(b.val, mask2, g.val); + __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val); + __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val); + __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val); __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1); __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2); __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0); #else - __m512i g1g0 = _mm512_shuffle_epi8(g.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)); - __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, g1g0); - __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r.val, b.val); - __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, r.val); + __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)); + __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0); + __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val); + __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val); __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37, 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0); @@ -2415,7 +2415,7 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x6 } } -inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint16x32& g, const v_uint16x32& r, +inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c, hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37, @@ -2424,9 +2424,9 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint1 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42); __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58, 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21); - __m512i b0g0b2 = _mm512_permutex2var_epi16(b.val, mask0, g.val); - __m512i r1b1r0 = _mm512_permutex2var_epi16(b.val, mask1, r.val); - __m512i g2r2g1 = _mm512_permutex2var_epi16(g.val, mask2, r.val); + __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val); + __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val); + __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val); __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0); __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1); @@ -2452,15 +2452,15 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint1 } } -inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uint32x16& g, const v_uint32x16& r, +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c, hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21); __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26); - __m512i g1b2g2 = _mm512_permutex2var_epi32(b.val, mask0, g.val); - __m512i r2r1b1 = _mm512_permutex2var_epi32(b.val, mask1, r.val); + __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val); + __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val); - __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, b.val), 0x2492, g.val), 0x4924, r.val); + __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val); __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2); __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1); @@ -2484,15 +2484,15 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uin } } -inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64x8& g, const v_uint64x8& r, +inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c, hal::StoreMode mode=hal::STORE_UNALIGNED ) { __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10); __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5); - __m512i r1b1b2 = _mm512_permutex2var_epi64(b.val, mask0, r.val); - __m512i g2r2g1 = _mm512_permutex2var_epi64(g.val, mask1, r.val); + __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val); + __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val); - __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, b.val), 0x92, g.val), 0x24, r.val); + __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val); __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2); __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1); @@ -2516,13 +2516,13 @@ inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64 } } -inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x64& g, - const v_uint8x64& r, const v_uint8x64& a, +inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, + const v_uint8x64& c, const v_uint8x64& d, hal::StoreMode mode=hal::STORE_UNALIGNED ) { v_uint8x64 br01, br23, ga01, ga23; - v_zip(b, r, br01, br23); - v_zip(g, a, ga01, ga23); + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); v_uint8x64 bgra0, bgra1, bgra2, bgra3; v_zip(br01, ga01, bgra0, bgra1); v_zip(br23, ga23, bgra2, bgra3); @@ -2550,13 +2550,13 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x64& b, const v_uint8x6 } } -inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint16x32& g, - const v_uint16x32& r, const v_uint16x32& a, +inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, + const v_uint16x32& c, const v_uint16x32& d, hal::StoreMode mode=hal::STORE_UNALIGNED ) { v_uint16x32 br01, br23, ga01, ga23; - v_zip(b, r, br01, br23); - v_zip(g, a, ga01, ga23); + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); v_uint16x32 bgra0, bgra1, bgra2, bgra3; v_zip(br01, ga01, bgra0, bgra1); v_zip(br23, ga23, bgra2, bgra3); @@ -2584,13 +2584,13 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x32& b, const v_uint1 } } -inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uint32x16& g, - const v_uint32x16& r, const v_uint32x16& a, +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, + const v_uint32x16& c, const v_uint32x16& d, hal::StoreMode mode=hal::STORE_UNALIGNED ) { v_uint32x16 br01, br23, ga01, ga23; - v_zip(b, r, br01, br23); - v_zip(g, a, ga01, ga23); + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); v_uint32x16 bgra0, bgra1, bgra2, bgra3; v_zip(br01, ga01, bgra0, bgra1); v_zip(br23, ga23, bgra2, bgra3); @@ -2618,13 +2618,13 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x16& b, const v_uin } } -inline void v_store_interleave( uint64* ptr, const v_uint64x8& b, const v_uint64x8& g, - const v_uint64x8& r, const v_uint64x8& a, +inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, + const v_uint64x8& c, const v_uint64x8& d, hal::StoreMode mode=hal::STORE_UNALIGNED ) { v_uint64x8 br01, br23, ga01, ga23; - v_zip(b, r, br01, br23); - v_zip(g, a, ga01, ga23); + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); v_uint64x8 bgra0, bgra1, bgra2, bgra3; v_zip(br01, ga01, bgra0, bgra1); v_zip(br23, ga23, bgra2, bgra3);