From 59373a1ae168e382248dfbe0cd96c240cb093ddd Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov
Date: Wed, 31 May 2017 14:06:45 +0300
Subject: [PATCH] AVX and SSE optimizations for resize NN

---
 modules/imgproc/src/imgwarp.cpp | 435 +++++++++++++++++++++++++++++++-
 1 file changed, 433 insertions(+), 2 deletions(-)

diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index c49f1ec287..866c9be7ab 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -417,6 +417,403 @@ private:
     resizeNNInvoker& operator=(const resizeNNInvoker&);
 };
 
+#if CV_AVX2
+class resizeNNInvokerAVX4 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
+
+#pragma optimization_parameter target_arch=AVX
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        int avxWidth = width - (width & 0x7);
+#if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
+        const __declspec(align(64)) __m256i mask = _mm256_set1_epi32(-1);
+#elif defined __GNUC__ && __GNUC__ >= 4
+        const __m256i mask __attribute__((aligned(64))) = _mm256_set1_epi32(-1);
+#else
+        const __m256i mask = _mm256_set1_epi32(-1);
+#endif
+        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 8)
+                {
+                    #if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
+                    __declspec(align(64)) const __m256i *addr = (__m256i*)(x_ofs + x);
+                    __declspec(align(64)) __m256i indices = _mm256_lddqu_si256(addr);
+                    __declspec(align(64)) __m256i pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    #elif defined __GNUC__ && __GNUC__ >= 4
+                    const __m256i *addr __attribute__((aligned(64))) = (__m256i*)(x_ofs + x);
+                    __m256i indices __attribute__((aligned(64))) = _mm256_lddqu_si256(addr);
+                    __m256i pixels __attribute__((aligned(64))) = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    #else
+                    const __m256i *addr = (__m256i*)(x_ofs + x);
+                    __m256i indices = _mm256_lddqu_si256(addr);
+                    __m256i pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    #endif
+                    _mm256_maskstore_epi32((int*)D, mask, pixels);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
+                }
+            }
+        }
+        else
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 8)
+                {
+                    #if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
+                    __declspec(align(64)) const __m256i *addr = (__m256i*)(x_ofs + x);
+                    __declspec(align(64)) __m256i indices = _mm256_lddqu_si256(addr);
+                    __declspec(align(64)) __m256i pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    #elif defined __GNUC__ && __GNUC__ >= 4
+                    const __m256i *addr __attribute__((aligned(64))) = (__m256i*)(x_ofs + x);
+                    __m256i indices __attribute__((aligned(64))) = _mm256_lddqu_si256(addr);
+                    __m256i pixels __attribute__((aligned(64))) = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    #else
+                    const __m256i *addr = (__m256i*)(x_ofs + x);
+                    __m256i indices = _mm256_lddqu_si256(addr);
+                    __m256i pixels = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    #endif
+                    _mm256_storeu_si256((__m256i*)D, pixels);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
+                }
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerAVX4(const resizeNNInvokerAVX4&);
+    resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&);
+};
+
+class resizeNNInvokerAVX2 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
+
+#pragma optimization_parameter target_arch=AVX
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        //int avxWidth = (width - 1) - ((width - 1) & 0x7);
+        int avxWidth = width - (width & 0xf);
+#if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
+        const __declspec(align(64)) __m256i mask = _mm256_set1_epi32(-1);
+        const __declspec(align(64)) __m256i shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
+                                                                           15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
+        const __declspec(align(64)) __m256i permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
+        const __declspec(align(64)) __m256i shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
+                                                                                 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
+#elif defined __GNUC__ && __GNUC__ >= 4
+        const __m256i mask __attribute__((aligned(64))) = _mm256_set1_epi32(-1);
+        const __m256i shuffle_mask __attribute__((aligned(64))) = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
+                                                                                  15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
+        const __m256i permute_mask __attribute__((aligned(64))) = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
+        const __m256i shift_shuffle_mask __attribute__((aligned(64))) = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
+                                                                                        13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
+
+#else
+        const __m256i mask = _mm256_set1_epi32(-1);
+        const __m256i shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
+                                                     15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
+        const __m256i permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
+        const __m256i shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
+                                                           13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
+#endif
+        if(((int64)(dst.data + dst.step) & 0x1f) == 0)
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+                const uchar* S2 = S - 2;
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 16)
+                {
+                    #if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
+                    __declspec(align(64)) const __m256i *addr = (__m256i*)(x_ofs + x);
+                    __declspec(align(64)) __m256i indices = _mm256_lddqu_si256(addr);
+                    __declspec(align(64)) __m256i pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    __declspec(align(64)) const __m256i *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __declspec(align(64)) __m256i indices2 = _mm256_lddqu_si256(addr2);
+                    __declspec(align(64)) __m256i pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __declspec(align(64)) __m256i unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __declspec(align(64)) __m256i bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __declspec(align(64)) __m256i ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
+                    #elif defined __GNUC__ && __GNUC__ >= 4
+                    const __m256i *addr __attribute__((aligned(64))) = (__m256i*)(x_ofs + x);
+                    __m256i indices __attribute__((aligned(64))) = _mm256_lddqu_si256(addr);
+                    __m256i pixels1 __attribute__((aligned(64))) = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    const __m256i *addr2 __attribute__((aligned(64))) = (__m256i*)(x_ofs + x + 8);
+                    __m256i indices2 __attribute__((aligned(64))) = _mm256_lddqu_si256(addr2);
+                    __m256i pixels2 __attribute__((aligned(64))) = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __m256i unpacked __attribute__((aligned(64))) = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __m256i bytes_shuffled __attribute__((aligned(64))) = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __m256i ints_permuted __attribute__((aligned(64))) = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
+                    #else
+                    const __m256i *addr = (__m256i*)(x_ofs + x);
+                    __m256i indices = _mm256_lddqu_si256(addr);
+                    __m256i pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    const __m256i *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __m256i indices2 = _mm256_lddqu_si256(addr2);
+                    __m256i pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __m256i unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __m256i bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __m256i ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
+                    #endif
+                    _mm256_maskstore_epi32((int*)D, mask, ints_permuted);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
+                }
+
+            }
+        }
+        else
+        {
+            for(y = range.start; y < range.end; y++)
+            {
+                uchar* D = dst.data + dst.step*y;
+                uchar* Dstart = D;
+                int sy = std::min(cvFloor(y*ify), ssize.height-1);
+                const uchar* S = src.data + sy*src.step;
+                const uchar* S2 = S - 2;
+#pragma unroll(4)
+                for(x = 0; x < avxWidth; x += 16)
+                {
+                    #if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
+                    __declspec(align(64)) const __m256i *addr = (__m256i*)(x_ofs + x);
+                    __declspec(align(64)) __m256i indices = _mm256_lddqu_si256(addr);
+                    __declspec(align(64)) __m256i pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    __declspec(align(64)) const __m256i *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __declspec(align(64)) __m256i indices2 = _mm256_lddqu_si256(addr2);
+                    __declspec(align(64)) __m256i pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __declspec(align(64)) __m256i unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __declspec(align(64)) __m256i bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __declspec(align(64)) __m256i ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
+                    #elif defined __GNUC__ && __GNUC__ >= 4
+                    const __m256i *addr __attribute__((aligned(64))) = (__m256i*)(x_ofs + x);
+                    __m256i indices __attribute__((aligned(64))) = _mm256_lddqu_si256(addr);
+                    __m256i pixels1 __attribute__((aligned(64))) = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    const __m256i *addr2 __attribute__((aligned(64))) = (__m256i*)(x_ofs + x + 8);
+                    __m256i indices2 __attribute__((aligned(64))) = _mm256_lddqu_si256(addr2);
+                    __m256i pixels2 __attribute__((aligned(64))) = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __m256i unpacked __attribute__((aligned(64))) = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __m256i bytes_shuffled __attribute__((aligned(64))) = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __m256i ints_permuted __attribute__((aligned(64))) = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
+                    #else
+                    const __m256i *addr = (__m256i*)(x_ofs + x);
+                    __m256i indices = _mm256_lddqu_si256(addr);
+                    __m256i pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1);
+                    const __m256i *addr2 = (__m256i*)(x_ofs + x + 8);
+                    __m256i indices2 = _mm256_lddqu_si256(addr2);
+                    __m256i pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1);
+                    __m256i unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa);
+
+                    __m256i bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask);
+                    __m256i ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask);
+                    #endif
+                    _mm256_storeu_si256((__m256i*)D, ints_permuted);
+                    D += 32;
+                }
+                for(; x < width; x++)
+                {
+                    *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
+                }
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerAVX2(const resizeNNInvokerAVX2&);
+    resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&);
+};
+#endif
+
+#if CV_SSE4_1
+class resizeNNInvokerSSE2 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
+
+#pragma optimization_parameter target_arch=SSE4.2
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        int sseWidth = width - (width & 0x7);
+        for(y = range.start; y < range.end; y++)
+        {
+            uchar* D = dst.data + dst.step*y;
+            uchar* Dstart = D;
+            int sy = std::min(cvFloor(y*ify), ssize.height-1);
+            const uchar* S = src.data + sy*src.step;
+            #if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
+            __declspec(align(64)) __m128i pixels = _mm_set1_epi16(0);
+            #elif defined __GNUC__ && __GNUC__ >= 4
+            __m128i pixels __attribute__((aligned(64))) = _mm_set1_epi16(0);
+            #else
+            __m128i pixels = _mm_set1_epi16(0);
+            #endif
+            for(x = 0; x < sseWidth; x += 8)
+            {
+                ushort imm = *(ushort*)(S + x_ofs[x + 0]);
+                pixels = _mm_insert_epi16(pixels, imm, 0);
+                imm = *(ushort*)(S + x_ofs[x + 1]);
+                pixels = _mm_insert_epi16(pixels, imm, 1);
+                imm = *(ushort*)(S + x_ofs[x + 2]);
+                pixels = _mm_insert_epi16(pixels, imm, 2);
+                imm = *(ushort*)(S + x_ofs[x + 3]);
+                pixels = _mm_insert_epi16(pixels, imm, 3);
+                imm = *(ushort*)(S + x_ofs[x + 4]);
+                pixels = _mm_insert_epi16(pixels, imm, 4);
+                imm = *(ushort*)(S + x_ofs[x + 5]);
+                pixels = _mm_insert_epi16(pixels, imm, 5);
+                imm = *(ushort*)(S + x_ofs[x + 6]);
+                pixels = _mm_insert_epi16(pixels, imm, 6);
+                imm = *(ushort*)(S + x_ofs[x + 7]);
+                pixels = _mm_insert_epi16(pixels, imm, 7);
+                _mm_storeu_si128((__m128i*)D, pixels);
+                D += 16;
+            }
+            for(; x < width; x++)
+            {
+                *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]);
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerSSE2(const resizeNNInvokerSSE2&);
+    resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&);
+};
+
+class resizeNNInvokerSSE4 :
+    public ParallelLoopBody
+{
+public:
+    resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
+        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
+        ify(_ify)
+    {
+    }
+#pragma optimization_parameter target_arch=SSE4.2
+    virtual void operator() (const Range& range) const
+    {
+        Size ssize = src.size(), dsize = dst.size();
+        int y, x, pix_size = (int)src.elemSize();
+        int width = dsize.width;
+        int sseWidth = width - (width & 0x3);
+        for(y = range.start; y < range.end; y++)
+        {
+            uchar* D = dst.data + dst.step*y;
+            uchar* Dstart = D;
+            int sy = std::min(cvFloor(y*ify), ssize.height-1);
+            const uchar* S = src.data + sy*src.step;
+            #if (defined WIN32 || defined _WIN32 || defined __CYGWIN__)
+            __declspec(align(64)) __m128i pixels = _mm_set1_epi16(0);
+            #elif defined __GNUC__ && __GNUC__ >= 4
+            __m128i pixels __attribute__((aligned(64))) = _mm_set1_epi16(0);
+            #else
+            __m128i pixels = _mm_set1_epi16(0);
+            #endif
+            for(x = 0; x < sseWidth; x += 4)
+            {
+                int imm = *(int*)(S + x_ofs[x + 0]);
+                pixels = _mm_insert_epi32(pixels, imm, 0);
+                imm = *(int*)(S + x_ofs[x + 1]);
+                pixels = _mm_insert_epi32(pixels, imm, 1);
+                imm = *(int*)(S + x_ofs[x + 2]);
+                pixels = _mm_insert_epi32(pixels, imm, 2);
+                imm = *(int*)(S + x_ofs[x + 3]);
+                pixels = _mm_insert_epi32(pixels, imm, 3);
+                _mm_storeu_si128((__m128i*)D, pixels);
+                D += 16;
+            }
+            for(; x < width; x++)
+            {
+                *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]);
+            }
+        }
+    }
+
+private:
+    const Mat src;
+    Mat dst;
+    int* x_ofs, pix_size4;
+    double ify;
+
+    resizeNNInvokerSSE4(const resizeNNInvokerSSE4&);
+    resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&);
+};
+#endif
+
 static void
 resizeNN( const Mat& src, Mat& dst, double fx, double fy )
 {
@@ -435,8 +832,42 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
     }
 
     Range range(0, dsize.height);
-    resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
-    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+#if CV_AVX2
+    if(checkHardwareSupport(CV_CPU_AVX2) && ((pix_size == 2) || (pix_size == 4)))
+    {
+        if(pix_size == 2)
+        {
+            resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+        else if (pix_size == 4)
+        {
+            resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+    }
+    else
+#endif
+#if CV_SSE4_1
+    if(checkHardwareSupport(CV_CPU_SSE4_1) && ((pix_size == 2) || (pix_size == 4)))
+    {
+        if(pix_size == 2)
+        {
+            resizeNNInvokerSSE2 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+        else if(pix_size == 4)
+        {
+            resizeNNInvokerSSE4 invoker(src, dst, x_ofs, pix_size4, ify);
+            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+        }
+    }
+    else
+#endif
+    {
+        resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
+        parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+    }
 }