From 3681dcef1a6d395d5640b67fff6a846b69b5108e Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Mon, 3 Jul 2017 13:49:16 +0300 Subject: [PATCH] AVX optimized implementation of resize and warp functions migrated to separate file --- modules/imgproc/src/imgwarp.avx2.cpp | 289 +++++++++++++++++++ modules/imgproc/src/imgwarp.cpp | 375 +------------------------ modules/imgproc/src/imgwarp.hpp | 73 +++++ modules/imgproc/src/imgwarp.sse4_1.cpp | 192 +++++++++++++ 4 files changed, 569 insertions(+), 360 deletions(-) create mode 100644 modules/imgproc/src/imgwarp.avx2.cpp create mode 100644 modules/imgproc/src/imgwarp.hpp create mode 100644 modules/imgproc/src/imgwarp.sse4_1.cpp diff --git a/modules/imgproc/src/imgwarp.avx2.cpp b/modules/imgproc/src/imgwarp.avx2.cpp new file mode 100644 index 0000000000..321957f39e --- /dev/null +++ b/modules/imgproc/src/imgwarp.avx2.cpp @@ -0,0 +1,289 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. 
+// +// */ + +#include "precomp.hpp" +#include "imgwarp.hpp" + +namespace cv +{ +namespace opt_AVX2 +{ + +class resizeNNInvokerAVX4 : + public ParallelLoopBody +{ +public: + resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } + +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=AVX +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int avxWidth = width - (width & 0x7); + const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); + if(((int64)(dst.data + dst.step) & 0x1f) == 0) + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; +#pragma unroll(4) + for(x = 0; x < avxWidth; x += 8) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); + _mm256_maskstore_epi32((int*)D, mask, pixels); + D += 32; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + else + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; +#pragma unroll(4) + for(x = 0; x < avxWidth; x += 8) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); + _mm256_storeu_si256((__m256i*)D, pixels); + D += 32; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerAVX4(const resizeNNInvokerAVX4&); + resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&); +}; + +class resizeNNInvokerAVX2 : + public ParallelLoopBody +{ +public: + resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } + +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=AVX +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + //int avxWidth = (width - 1) - ((width - 1) & 0x7); + int avxWidth = width - (width & 0xf); + const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); + const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0, + 15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0); + const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2, + 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2); + if(((int64)(dst.data + dst.step) & 0x1f) == 0) + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + const uchar* 
S2 = S - 2; +#pragma unroll(4) + for(x = 0; x < avxWidth; x += 16) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); + const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); + __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); + __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); + __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); + + __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); + __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); + _mm256_maskstore_epi32((int*)D, mask, ints_permuted); + D += 32; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + + } + } + else + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + const uchar* S2 = S - 2; +#pragma unroll(4) + for(x = 0; x < avxWidth; x += 16) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); + const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); + __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); + __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); + __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); + + __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); + __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); + _mm256_storeu_si256((__m256i*)D, ints_permuted); + D += 32; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + } + } + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerAVX2(const resizeNNInvokerAVX2&); + resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&); +}; + +void resizeNN2_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) +{ + resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +void resizeNN4_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) +{ + resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) +{ + const int AB_BITS = MAX(10, (int)INTER_BITS); + int x1 = 0; + __m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1); + __m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0); + for (; x1 <= bw - 16; x1 += 16) + { + __m256i tx0, tx1, ty0, ty1; + tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1)), XX); + ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1)), YY); + tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x1 + 8)), XX); + ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x1 + 
8)), YY); + + tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS); + ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS); + tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS); + ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS); + + __m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask), + _mm256_and_si256(tx1, fxy_mask)); + __m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask), + _mm256_and_si256(ty1, fxy_mask)); + tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS), + _mm256_srai_epi32(tx1, INTER_BITS)); + ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS), + _mm256_srai_epi32(ty1, INTER_BITS)); + fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS)); + fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0); + + _mm256_storeu_si256((__m256i*)(xy + x1 * 2), _mm256_unpacklo_epi16(tx0, ty0)); + _mm256_storeu_si256((__m256i*)(xy + x1 * 2 + 16), _mm256_unpackhi_epi16(tx0, ty0)); + _mm256_storeu_si256((__m256i*)(alpha + x1), fx_); + } + _mm256_zeroupper(); + return x1; +} + +} +} +/* End of file. */ diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index fd0387d6a4..4f040a91cc 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -52,6 +52,7 @@ #include "hal_replacement.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" +#include "imgwarp.hpp" using namespace cv; @@ -417,308 +418,6 @@ private: resizeNNInvoker& operator=(const resizeNNInvoker&); }; -#if CV_AVX2 -class resizeNNInvokerAVX4 : - public ParallelLoopBody -{ -public: - resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } - -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=AVX -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x, pix_size = (int)src.elemSize(); - int width = dsize.width; - int avxWidth = width - (width & 0x7); - const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); - if(((int64)(dst.data + dst.step) & 0x1f) == 0) - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; -#pragma unroll(4) - for(x = 0; x < avxWidth; x += 8) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); - _mm256_maskstore_epi32((int*)D, mask, pixels); - D += 32; - } - for(; x < width; x++) - { - *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); - } - } - } - else - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; -#pragma unroll(4) - for(x = 0; x < avxWidth; x += 8) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); - _mm256_storeu_si256((__m256i*)D, pixels); - D += 32; - } - for(; x < width; x++) - { - *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); - } - } - } - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - 
resizeNNInvokerAVX4(const resizeNNInvokerAVX4&); - resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&); -}; - -class resizeNNInvokerAVX2 : - public ParallelLoopBody -{ -public: - resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } - -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=AVX -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x, pix_size = (int)src.elemSize(); - int width = dsize.width; - //int avxWidth = (width - 1) - ((width - 1) & 0x7); - int avxWidth = width - (width & 0xf); - const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); - const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0, - 15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0); - const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); - const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2, - 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2); - if(((int64)(dst.data + dst.step) & 0x1f) == 0) - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - const uchar* S2 = S - 2; -#pragma unroll(4) - for(x = 0; x < avxWidth; x += 16) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); - const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); - __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); - __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); - __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); - - __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); - __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); - _mm256_maskstore_epi32((int*)D, mask, ints_permuted); - D += 32; - } - for(; x < width; x++) - { - *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); - } - - } - } - else - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - const uchar* S2 = S - 2; -#pragma unroll(4) - for(x = 0; x < avxWidth; x += 16) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); - const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); - __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); - __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); - __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); - - __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); - __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); - _mm256_storeu_si256((__m256i*)D, ints_permuted); - D += 32; - } - 
for(; x < width; x++) - { - *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); - } - } - } - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvokerAVX2(const resizeNNInvokerAVX2&); - resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&); -}; -#endif - -#if CV_SSE4_1 -class resizeNNInvokerSSE2 : - public ParallelLoopBody -{ -public: - resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } - -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=SSE4.2 -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x; - int width = dsize.width; - int sseWidth = width - (width & 0x7); - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); - for(x = 0; x < sseWidth; x += 8) - { - ushort imm = *(ushort*)(S + x_ofs[x + 0]); - pixels = _mm_insert_epi16(pixels, imm, 0); - imm = *(ushort*)(S + x_ofs[x + 1]); - pixels = _mm_insert_epi16(pixels, imm, 1); - imm = *(ushort*)(S + x_ofs[x + 2]); - pixels = _mm_insert_epi16(pixels, imm, 2); - imm = *(ushort*)(S + x_ofs[x + 3]); - pixels = _mm_insert_epi16(pixels, imm, 3); - imm = *(ushort*)(S + x_ofs[x + 4]); - pixels = _mm_insert_epi16(pixels, imm, 4); - imm = *(ushort*)(S + x_ofs[x + 5]); - pixels = _mm_insert_epi16(pixels, imm, 5); - imm = *(ushort*)(S + x_ofs[x + 6]); - pixels = _mm_insert_epi16(pixels, imm, 6); - imm = *(ushort*)(S + x_ofs[x + 7]); - pixels = _mm_insert_epi16(pixels, imm, 7); - _mm_storeu_si128((__m128i*)D, pixels); - D += 16; - } - for(; x < width; x++) - { - *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); - } - } - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvokerSSE2(const resizeNNInvokerSSE2&); - resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&); -}; - -class resizeNNInvokerSSE4 : - public ParallelLoopBody -{ -public: - resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=SSE4.2 -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x; - int width = dsize.width; - int sseWidth = width - (width & 0x3); - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); - for(x = 0; x < sseWidth; x += 4) - { - int imm = *(int*)(S + x_ofs[x + 0]); - pixels = _mm_insert_epi32(pixels, imm, 0); - imm = *(int*)(S + x_ofs[x + 1]); - pixels = _mm_insert_epi32(pixels, imm, 1); - imm = *(int*)(S + x_ofs[x + 2]); - pixels = _mm_insert_epi32(pixels, imm, 2); - imm = *(int*)(S + x_ofs[x + 3]); - pixels = _mm_insert_epi32(pixels, imm, 3); - _mm_storeu_si128((__m128i*)D, pixels); - D += 16; - } - for(; x < width; x++) - { - *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); - } - } - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double 
ify; - - resizeNNInvokerSSE4(const resizeNNInvokerSSE4&); - resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&); -}; -#endif - static void resizeNN( const Mat& src, Mat& dst, double fx, double fy ) { @@ -737,35 +436,23 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy ) } Range range(0, dsize.height); -#if CV_AVX2 - if(checkHardwareSupport(CV_CPU_AVX2) && ((pix_size == 2) || (pix_size == 4))) +#if CV_TRY_AVX2 + if(CV_CPU_HAS_SUPPORT_AVX2 && ((pix_size == 2) || (pix_size == 4))) { if(pix_size == 2) - { - resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total()/(double)(1<<16)); - } - else if (pix_size == 4) - { - resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total()/(double)(1<<16)); - } + opt_AVX2::resizeNN2_AVX2(range, src, dst, x_ofs, pix_size4, ify); + else + opt_AVX2::resizeNN4_AVX2(range, src, dst, x_ofs, pix_size4, ify); } else #endif -#if CV_SSE4_1 - if(checkHardwareSupport(CV_CPU_SSE4_1) && ((pix_size == 2) || (pix_size == 4))) +#if CV_TRY_SSE4_1 + if(CV_CPU_HAS_SUPPORT_SSE4_1 && ((pix_size == 2) || (pix_size == 4))) { if(pix_size == 2) - { - resizeNNInvokerSSE2 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total()/(double)(1<<16)); - } - else if(pix_size == 4) - { - resizeNNInvokerSSE4 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total()/(double)(1<<16)); - } + opt_SSE41::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); + else + opt_SSE41::resizeNN4_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); } else #endif @@ -5864,8 +5551,8 @@ public: const int AB_BITS = MAX(10, (int)INTER_BITS); const int AB_SCALE = 1 << AB_BITS; int round_delta = interpolation == INTER_NEAREST ? 
AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1; - #if CV_AVX2 - bool useAVX2 = checkHardwareSupport(CV_CPU_AVX2); + #if CV_TRY_AVX2 + bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #endif #if CV_SSE2 bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); @@ -5947,41 +5634,9 @@ public: { short* alpha = A + y1*bw; x1 = 0; - #if CV_AVX2 + #if CV_TRY_AVX2 if ( useAVX2 ) - { - __m256i fxy_mask = _mm256_set1_epi32(INTER_TAB_SIZE - 1); - __m256i XX = _mm256_set1_epi32(X0), YY = _mm256_set1_epi32(Y0); - for( ; x1 <= bw - 16; x1 += 16 ) - { - __m256i tx0, tx1, ty0, ty1; - tx0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x + x1)), XX); - ty0 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x + x1)), YY); - tx1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(adelta + x + x1 + 8)), XX); - ty1 = _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)(bdelta + x + x1 + 8)), YY); - - tx0 = _mm256_srai_epi32(tx0, AB_BITS - INTER_BITS); - ty0 = _mm256_srai_epi32(ty0, AB_BITS - INTER_BITS); - tx1 = _mm256_srai_epi32(tx1, AB_BITS - INTER_BITS); - ty1 = _mm256_srai_epi32(ty1, AB_BITS - INTER_BITS); - - __m256i fx_ = _mm256_packs_epi32(_mm256_and_si256(tx0, fxy_mask), - _mm256_and_si256(tx1, fxy_mask)); - __m256i fy_ = _mm256_packs_epi32(_mm256_and_si256(ty0, fxy_mask), - _mm256_and_si256(ty1, fxy_mask)); - tx0 = _mm256_packs_epi32(_mm256_srai_epi32(tx0, INTER_BITS), - _mm256_srai_epi32(tx1, INTER_BITS)); - ty0 = _mm256_packs_epi32(_mm256_srai_epi32(ty0, INTER_BITS), - _mm256_srai_epi32(ty1, INTER_BITS)); - fx_ = _mm256_adds_epi16(fx_, _mm256_slli_epi16(fy_, INTER_BITS)); - fx_ = _mm256_permute4x64_epi64(fx_, (3 << 6) + (1 << 4) + (2 << 2) + 0); - - _mm256_storeu_si256((__m256i*)(xy + x1*2), _mm256_unpacklo_epi16(tx0, ty0)); - _mm256_storeu_si256((__m256i*)(xy + x1*2 + 16), _mm256_unpackhi_epi16(tx0, ty0)); - _mm256_storeu_si256((__m256i*)(alpha + x1), fx_); - } - _mm256_zeroupper(); - } + x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw); #endif #if CV_SSE2 if( useSSE2 ) diff --git a/modules/imgproc/src/imgwarp.hpp b/modules/imgproc/src/imgwarp.hpp new file mode 100644 index 0000000000..428bcc51e7 --- /dev/null +++ b/modules/imgproc/src/imgwarp.hpp @@ -0,0 +1,73 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. +// +// */ + +#ifndef OPENCV_IMGPROC_IMGWARP_HPP +#define OPENCV_IMGPROC_IMGWARP_HPP +#include "precomp.hpp" + +namespace cv +{ +namespace opt_AVX2 +{ +#if CV_TRY_AVX2 +void resizeNN2_AVX2(const Range&, const Mat&, Mat&, int*, int, double); +void resizeNN4_AVX2(const Range&, const Mat&, Mat&, int*, int, double); +int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw); +#endif +} +namespace opt_SSE41 +{ +#if CV_TRY_SSE4_1 +void resizeNN2_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); +void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); +#endif +} +} +#endif +/* End of file. */ diff --git a/modules/imgproc/src/imgwarp.sse4_1.cpp b/modules/imgproc/src/imgwarp.sse4_1.cpp new file mode 100644 index 0000000000..b326ffcb1a --- /dev/null +++ b/modules/imgproc/src/imgwarp.sse4_1.cpp @@ -0,0 +1,192 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. +// +// */ + +#include "precomp.hpp" +#include "imgwarp.hpp" + +namespace cv +{ +namespace opt_SSE41 +{ + +class resizeNNInvokerSSE2 : + public ParallelLoopBody +{ +public: + resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } + +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=SSE4.2 +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int sseWidth = width - (width & 0x7); + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); + for(x = 0; x < sseWidth; x += 8) + { + ushort imm = *(ushort*)(S + x_ofs[x + 0]); + pixels = _mm_insert_epi16(pixels, imm, 0); + imm = *(ushort*)(S + x_ofs[x + 1]); + pixels = _mm_insert_epi16(pixels, imm, 1); + imm = *(ushort*)(S + x_ofs[x + 2]); + pixels = _mm_insert_epi16(pixels, imm, 2); + imm = *(ushort*)(S + x_ofs[x + 3]); + pixels = _mm_insert_epi16(pixels, imm, 3); + imm = *(ushort*)(S + x_ofs[x + 4]); + pixels = _mm_insert_epi16(pixels, imm, 4); + imm = *(ushort*)(S + x_ofs[x + 5]); + pixels = _mm_insert_epi16(pixels, imm, 5); + imm = *(ushort*)(S + x_ofs[x + 6]); + pixels = _mm_insert_epi16(pixels, imm, 6); + imm = *(ushort*)(S + x_ofs[x + 7]); + pixels = _mm_insert_epi16(pixels, imm, 7); + _mm_storeu_si128((__m128i*)D, pixels); + D += 16; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + } + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerSSE2(const resizeNNInvokerSSE2&); + resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&); +}; + +class resizeNNInvokerSSE4 : + public ParallelLoopBody +{ +public: + resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=SSE4.2 +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int sseWidth = width - (width & 0x3); + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* 
Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); + for(x = 0; x < sseWidth; x += 4) + { + int imm = *(int*)(S + x_ofs[x + 0]); + pixels = _mm_insert_epi32(pixels, imm, 0); + imm = *(int*)(S + x_ofs[x + 1]); + pixels = _mm_insert_epi32(pixels, imm, 1); + imm = *(int*)(S + x_ofs[x + 2]); + pixels = _mm_insert_epi32(pixels, imm, 2); + imm = *(int*)(S + x_ofs[x + 3]); + pixels = _mm_insert_epi32(pixels, imm, 3); + _mm_storeu_si128((__m128i*)D, pixels); + D += 16; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerSSE4(const resizeNNInvokerSSE4&); + resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&); +}; + +void resizeNN2_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) +{ + resizeNNInvokerSSE2 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +void resizeNN4_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) +{ + resizeNNInvokerSSE4 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +} +} + +/* End of file. */
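
Note on the layout this patch establishes: imgwarp.cpp keeps only a compile-time guard (CV_TRY_AVX2 / CV_TRY_SSE4_1) plus a runtime CPU check (CV_CPU_HAS_SUPPORT_AVX2 / CV_CPU_HAS_SUPPORT_SSE4_1) and forwards to thin wrappers (opt_AVX2::resizeNN2_AVX2, opt_AVX2::warpAffineBlockline, opt_SSE41::resizeNN2_SSE4_1, ...) that are declared in imgwarp.hpp and defined in the per-ISA translation units imgwarp.avx2.cpp and imgwarp.sse4_1.cpp, which the build system compiles with the matching instruction-set flags. The sketch below is a minimal, self-contained illustration of that split-compilation-plus-runtime-dispatch pattern; every identifier in it (HAS_AVX2_BUILD, cpuSupportsAVX2, resizeRowBaseline, opt_AVX2::resizeRow) is a hypothetical stand-in and is not taken from the OpenCV sources touched here.

// dispatch_sketch.cpp -- illustrative only; in the real layout the body inside
// namespace opt_AVX2 would live in a separate *.avx2.cpp file that is the only
// translation unit built with AVX2 enabled.
#include <cstdio>

#if defined(__AVX2__)
#  define HAS_AVX2_BUILD 1   // this translation unit was compiled with AVX2 enabled
#else
#  define HAS_AVX2_BUILD 0
#endif

namespace opt_AVX2 {
#if HAS_AVX2_BUILD
// Optimized variant; a placeholder copy loop stands in for the vectorized body.
void resizeRow(const unsigned char* src, unsigned char* dst, int n)
{
    for (int i = 0; i < n; ++i) dst[i] = src[i];
}
#endif
}

// Baseline fallback, always available regardless of build flags.
static void resizeRowBaseline(const unsigned char* src, unsigned char* dst, int n)
{
    for (int i = 0; i < n; ++i) dst[i] = src[i];
}

// Runtime feature check; a real implementation would query cpuid, but the
// compile-time flag is reused here to keep the sketch self-contained.
static bool cpuSupportsAVX2() { return HAS_AVX2_BUILD != 0; }

// Dispatcher: compiled without AVX2, it only ever calls the optimized path
// through the wrapper, and only when the running CPU reports support.
void resizeRowDispatch(const unsigned char* src, unsigned char* dst, int n)
{
#if HAS_AVX2_BUILD
    if (cpuSupportsAVX2()) { opt_AVX2::resizeRow(src, dst, n); return; }
#endif
    resizeRowBaseline(src, dst, n);
}

int main()
{
    unsigned char in[8] = {1, 2, 3, 4, 5, 6, 7, 8}, out[8] = {0};
    resizeRowDispatch(in, out, 8);
    std::printf("%d\n", out[7]);  // prints 8 on either path
    return 0;
}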