diff --git a/modules/imgproc/src/imgwarp.avx2.cpp b/modules/imgproc/src/imgwarp.avx2.cpp index 63ef2ae52d..6a795a7c92 100644 --- a/modules/imgproc/src/imgwarp.avx2.cpp +++ b/modules/imgproc/src/imgwarp.avx2.cpp @@ -55,207 +55,6 @@ namespace cv namespace opt_AVX2 { -class resizeNNInvokerAVX4 : - public ParallelLoopBody -{ -public: - resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } - -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=AVX -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x; - int width = dsize.width; - int avxWidth = width - (width & 0x7); - const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); - if(((int64)(dst.data + dst.step) & 0x1f) == 0) - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; -#ifdef CV_ICC -#pragma unroll(4) -#endif - for(x = 0; x < avxWidth; x += 8) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); - _mm256_maskstore_epi32((int*)D, mask, pixels); - D += 32; - } - for(; x < width; x++) - { - *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); - } - } - } - else - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; -#ifdef CV_ICC -#pragma unroll(4) -#endif - for(x = 0; x < avxWidth; x += 8) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); - _mm256_storeu_si256((__m256i*)D, pixels); - D += 32; - } - for(; x < width; x++) - { - *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); - } - } - } - _mm256_zeroupper(); - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvokerAVX4(const resizeNNInvokerAVX4&); - resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&); -}; - -class resizeNNInvokerAVX2 : - public ParallelLoopBody -{ -public: - resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } - -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=AVX -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x; - int width = dsize.width; - //int avxWidth = (width - 1) - ((width - 1) & 0x7); - int avxWidth = width - (width & 0xf); - const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); - const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0, - 15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0); - const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); - //const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2, - // 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2); - if(((int64)(dst.data + dst.step) 
& 0x1f) == 0) - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - const uchar* S2 = S - 2; -#ifdef CV_ICC -#pragma unroll(4) -#endif - for(x = 0; x < avxWidth; x += 16) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); - const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); - __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); - __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); - __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); - - __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); - __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); - _mm256_maskstore_epi32((int*)D, mask, ints_permuted); - D += 32; - } - for(; x < width; x++) - { - *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); - } - - } - } - else - { - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - const uchar* S2 = S - 2; -#ifdef CV_ICC -#pragma unroll(4) -#endif - for(x = 0; x < avxWidth; x += 16) - { - const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); - __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); - __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); - const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); - __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); - __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); - __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); - - __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); - __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); - _mm256_storeu_si256((__m256i*)D, ints_permuted); - D += 32; - } - for(; x < width; x++) - { - *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); - } - } - } - _mm256_zeroupper(); - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvokerAVX2(const resizeNNInvokerAVX2&); - resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&); -}; - -void resizeNN2_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) -{ - resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); -} - -void resizeNN4_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) -{ - resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); -} - int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw) { const int AB_BITS = MAX(10, (int)INTER_BITS); diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 69ccbac527..8c854a41ea 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -122,3408 +122,208 @@ static bool 
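The two invokers removed above share one trick: the caller precomputes x_ofs[], the source byte offset of every destination column, so each output row becomes eight-pixel gathers (_mm256_i32gather_epi32) followed by one 256-bit store; the aligned and unaligned branches differ only in the store instruction, and the 2-byte-pixel variant additionally gathers from S and S-2 and blends/shuffles to pull out 16-bit pixels. A minimal sketch of the 4-byte-per-pixel case, assuming AVX2; the free function and its names are illustrative only, not the removed class:

// Illustrative sketch (not the removed OpenCV code): gather-based
// nearest-neighbor resize of one row of 4-byte pixels, given a
// precomputed table x_ofs[] of source byte offsets per destination column.
#include <immintrin.h>

static void nn_resize_row_4byte(const unsigned char* S,  // source row
                                unsigned char* D,        // destination row
                                const int* x_ofs,        // byte offset for each dst column
                                int width)               // dst width in pixels
{
    int x = 0;
    const int avxWidth = width - (width & 7);            // largest multiple of 8
    for( ; x < avxWidth; x += 8 )
    {
        // load 8 byte offsets, gather one 32-bit pixel per offset, store 8 pixels
        __m256i idx = _mm256_loadu_si256((const __m256i*)(x_ofs + x));
        __m256i pix = _mm256_i32gather_epi32((const int*)S, idx, 1); // scale=1: offsets are bytes
        _mm256_storeu_si256((__m256i*)(D + x*4), pix);
    }
    for( ; x < width; x++ )                               // scalar tail, same as the removed code
        *(int*)(D + x*4) = *(const int*)(S + x_ofs[x]);
}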
IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSiz /************** interpolation formulas and tables ***************/ -const int INTER_RESIZE_COEF_BITS=11; -const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS; - const int INTER_REMAP_COEF_BITS=15; -const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS; - -static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2]; - -static float BilinearTab_f[INTER_TAB_SIZE2][2][2]; -static short BilinearTab_i[INTER_TAB_SIZE2][2][2]; - -#if CV_SSE2 || CV_NEON -static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8]; -static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16); -#endif - -static float BicubicTab_f[INTER_TAB_SIZE2][4][4]; -static short BicubicTab_i[INTER_TAB_SIZE2][4][4]; - -static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8]; -static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8]; - -static inline void interpolateLinear( float x, float* coeffs ) -{ - coeffs[0] = 1.f - x; - coeffs[1] = x; -} - -static inline void interpolateCubic( float x, float* coeffs ) -{ - const float A = -0.75f; - - coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A; - coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1; - coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1; - coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; -} - -static inline void interpolateLanczos4( float x, float* coeffs ) -{ - static const double s45 = 0.70710678118654752440084436210485; - static const double cs[][2]= - {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}}; - - if( x < FLT_EPSILON ) - { - for( int i = 0; i < 8; i++ ) - coeffs[i] = 0; - coeffs[3] = 1; - return; - } - - float sum = 0; - double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0); - for(int i = 0; i < 8; i++ ) - { - double y = -(x+3-i)*CV_PI*0.25; - coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y)); - sum += coeffs[i]; - } - - sum = 1.f/sum; - for(int i = 0; i < 8; i++ ) - coeffs[i] *= sum; -} - -static void initInterTab1D(int method, float* tab, int tabsz) -{ - float scale = 1.f/tabsz; - if( method == INTER_LINEAR ) - { - for( int i = 0; i < tabsz; i++, tab += 2 ) - interpolateLinear( i*scale, tab ); - } - else if( method == INTER_CUBIC ) - { - for( int i = 0; i < tabsz; i++, tab += 4 ) - interpolateCubic( i*scale, tab ); - } - else if( method == INTER_LANCZOS4 ) - { - for( int i = 0; i < tabsz; i++, tab += 8 ) - interpolateLanczos4( i*scale, tab ); - } - else - CV_Error( CV_StsBadArg, "Unknown interpolation method" ); -} - - -static const void* initInterTab2D( int method, bool fixpt ) -{ - static bool inittab[INTER_MAX+1] = {false}; - float* tab = 0; - short* itab = 0; - int ksize = 0; - if( method == INTER_LINEAR ) - tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2; - else if( method == INTER_CUBIC ) - tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4; - else if( method == INTER_LANCZOS4 ) - tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8; - else - CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" ); - - if( !inittab[method] ) - { - AutoBuffer _tab(8*INTER_TAB_SIZE); - int i, j, k1, k2; - initInterTab1D(method, _tab, INTER_TAB_SIZE); - for( i = 0; i < INTER_TAB_SIZE; i++ ) - for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize ) - { - int isum = 0; - NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2; - NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2; - - for( k1 = 0; k1 < ksize; k1++ ) - { - float vy = _tab[i*ksize + k1]; - 
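For reference, the bicubic weights deleted above come from the Keys kernel with A = -0.75; for any fractional offset the four weights sum to 1, and initInterTab2D() builds the 2-D tables as outer products of these 1-D rows, then adjusts one of the central fixed-point entries so every kernel sums to exactly INTER_REMAP_COEF_SCALE. A small standalone check of the coefficient formula (hypothetical test program, not part of the removed file):

// Sketch: reproduce the interpolateCubic() weights and verify they sum to 1.
#include <cstdio>

static void cubic_coeffs(float x, float* c)            // same formula as interpolateCubic
{
    const float A = -0.75f;
    c[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    c[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    c[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    c[3] = 1.f - c[0] - c[1] - c[2];
}

int main()
{
    float c[4];
    cubic_coeffs(0.25f, c);                             // e.g. tab index 8 of INTER_TAB_SIZE=32
    printf("%f %f %f %f  sum=%f\n", c[0], c[1], c[2], c[3], c[0]+c[1]+c[2]+c[3]);
    return 0;
}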
for( k2 = 0; k2 < ksize; k2++ ) - { - float v = vy*_tab[j*ksize + k2]; - tab[k1*ksize + k2] = v; - isum += itab[k1*ksize + k2] = saturate_cast(v*INTER_REMAP_COEF_SCALE); - } - } - - if( isum != INTER_REMAP_COEF_SCALE ) - { - int diff = isum - INTER_REMAP_COEF_SCALE; - int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2; - for( k1 = ksize2; k1 < ksize2+2; k1++ ) - for( k2 = ksize2; k2 < ksize2+2; k2++ ) - { - if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] ) - mk1 = k1, mk2 = k2; - else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] ) - Mk1 = k1, Mk2 = k2; - } - if( diff < 0 ) - itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff); - else - itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff); - } - } - tab -= INTER_TAB_SIZE2*ksize*ksize; - itab -= INTER_TAB_SIZE2*ksize*ksize; -#if CV_SSE2 || CV_NEON - if( method == INTER_LINEAR ) - { - for( i = 0; i < INTER_TAB_SIZE2; i++ ) - for( j = 0; j < 4; j++ ) - { - BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0]; - BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1]; - BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0]; - BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1]; - } - } -#endif - inittab[method] = true; - } - return fixpt ? (const void*)itab : (const void*)tab; -} - -#ifndef __MINGW32__ -static bool initAllInterTab2D() -{ - return initInterTab2D( INTER_LINEAR, false ) && - initInterTab2D( INTER_LINEAR, true ) && - initInterTab2D( INTER_CUBIC, false ) && - initInterTab2D( INTER_CUBIC, true ) && - initInterTab2D( INTER_LANCZOS4, false ) && - initInterTab2D( INTER_LANCZOS4, true ); -} - -static volatile bool doInitAllInterTab2D = initAllInterTab2D(); -#endif - -template struct Cast -{ - typedef ST type1; - typedef DT rtype; - - DT operator()(ST val) const { return saturate_cast
<DT>(val); }
-};
-
-template<typename ST, typename DT, int bits> struct FixedPtCast
-{
-    typedef ST type1;
-    typedef DT rtype;
-    enum { SHIFT = bits, DELTA = 1 << (bits-1) };
-
-    DT operator()(ST val) const { return saturate_cast<DT>
((val + DELTA)>>SHIFT); } -}; - -/****************************************************************************************\ -* Resize * -\****************************************************************************************/ - -class resizeNNInvoker : - public ParallelLoopBody -{ -public: - resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } - - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x, pix_size = (int)src.elemSize(); - - for( y = range.start; y < range.end; y++ ) - { - uchar* D = dst.data + dst.step*y; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.ptr(sy); - - switch( pix_size ) - { - case 1: - for( x = 0; x <= dsize.width - 2; x += 2 ) - { - uchar t0 = S[x_ofs[x]]; - uchar t1 = S[x_ofs[x+1]]; - D[x] = t0; - D[x+1] = t1; - } - - for( ; x < dsize.width; x++ ) - D[x] = S[x_ofs[x]]; - break; - case 2: - for( x = 0; x < dsize.width; x++ ) - *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]); - break; - case 3: - for( x = 0; x < dsize.width; x++, D += 3 ) - { - const uchar* _tS = S + x_ofs[x]; - D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2]; - } - break; - case 4: - for( x = 0; x < dsize.width; x++ ) - *(int*)(D + x*4) = *(int*)(S + x_ofs[x]); - break; - case 6: - for( x = 0; x < dsize.width; x++, D += 6 ) - { - const ushort* _tS = (const ushort*)(S + x_ofs[x]); - ushort* _tD = (ushort*)D; - _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; - } - break; - case 8: - for( x = 0; x < dsize.width; x++, D += 8 ) - { - const int* _tS = (const int*)(S + x_ofs[x]); - int* _tD = (int*)D; - _tD[0] = _tS[0]; _tD[1] = _tS[1]; - } - break; - case 12: - for( x = 0; x < dsize.width; x++, D += 12 ) - { - const int* _tS = (const int*)(S + x_ofs[x]); - int* _tD = (int*)D; - _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; - } - break; - default: - for( x = 0; x < dsize.width; x++, D += pix_size ) - { - const int* _tS = (const int*)(S + x_ofs[x]); - int* _tD = (int*)D; - for( int k = 0; k < pix_size4; k++ ) - _tD[k] = _tS[k]; - } - } - } - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvoker(const resizeNNInvoker&); - resizeNNInvoker& operator=(const resizeNNInvoker&); -}; - -static void -resizeNN( const Mat& src, Mat& dst, double fx, double fy ) -{ - Size ssize = src.size(), dsize = dst.size(); - AutoBuffer _x_ofs(dsize.width); - int* x_ofs = _x_ofs; - int pix_size = (int)src.elemSize(); - int pix_size4 = (int)(pix_size / sizeof(int)); - double ifx = 1./fx, ify = 1./fy; - int x; - - for( x = 0; x < dsize.width; x++ ) - { - int sx = cvFloor(x*ifx); - x_ofs[x] = std::min(sx, ssize.width-1)*pix_size; - } - - Range range(0, dsize.height); -#if CV_TRY_AVX2 - if(CV_CPU_HAS_SUPPORT_AVX2 && ((pix_size == 2) || (pix_size == 4))) - { - if(pix_size == 2) - opt_AVX2::resizeNN2_AVX2(range, src, dst, x_ofs, pix_size4, ify); - else - opt_AVX2::resizeNN4_AVX2(range, src, dst, x_ofs, pix_size4, ify); - } - else -#endif -#if CV_TRY_SSE4_1 - if(CV_CPU_HAS_SUPPORT_SSE4_1 && ((pix_size == 2) || (pix_size == 4))) - { - if(pix_size == 2) - opt_SSE4_1::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); - else - opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); - } - else -#endif - { - resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total()/(double)(1<<16)); - } -} - - -struct 
VResizeNoVec -{ - int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; } -}; - -struct HResizeNoVec -{ - int operator()(const uchar**, uchar**, int, const int*, - const uchar*, int, int, int, int, int) const { return 0; } -}; - -#if CV_SSE2 - -struct VResizeLinearVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const int** src = (const int**)_src; - const short* beta = (const short*)_beta; - const int *S0 = src[0], *S1 = src[1]; - int x = 0; - __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]); - __m128i delta = _mm_set1_epi16(2); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_load_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_load_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_load_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m128i x0, x1, x2, y0, y1, y2; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); - x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); - y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); - - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8)); - x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8)); - y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12)); - x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); - y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); - - x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); - x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); - - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); - _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); - } - - for( ; x < width - 4; x += 4 ) - { - __m128i x0, y0; - x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4); - y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4); - x0 = _mm_packs_epi32(x0, x0); - y0 = _mm_packs_epi32(y0, y0); - x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1)); - x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); - x0 = _mm_packus_epi16(x0, x0); - *(int*)(dst + x) = _mm_cvtsi128_si32(x0); - } - - return x; - } -}; - - -template struct VResizeLinearVec_32f16 -{ - int operator()(const uchar** _src, uchar* _dst, 
const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - ushort* dst = (ushort*)_dst; - int x = 0; - - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = _mm_set1_epi16((short)shiftval); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_load_ps(S0 + x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_load_ps(S0 + x + 8); - x1 = _mm_load_ps(S0 + x + 12); - y0 = _mm_load_ps(S1 + x + 8); - y1 = _mm_load_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } - else - for( ; x <= width - 16; x += 16 ) - { - __m128 x0, x1, y0, y1; - __m128i t0, t1, t2; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); - - x0 = _mm_loadu_ps(S0 + x + 8); - x1 = _mm_loadu_ps(S0 + x + 12); - y0 = _mm_loadu_ps(S1 + x + 8); - y1 = _mm_loadu_ps(S1 + x + 12); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); - t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); - - _mm_storeu_si128( (__m128i*)(dst + x), t0); - _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); - } - - for( ; x < width - 4; x += 4 ) - { - __m128 x0, y0; - __m128i t0; - x0 = _mm_loadu_ps(S0 + x); - y0 = _mm_loadu_ps(S1 + x); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift); - _mm_storel_epi64( (__m128i*)(dst + x), t0); - } - - return x; - } -}; - -typedef VResizeLinearVec_32f16 VResizeLinearVec_32f16u; -typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s; - -struct VResizeLinearVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - float* dst = (float*)_dst; - int x = 0; - - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); - - if( (((size_t)S0|(size_t)S1)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_load_ps(S0 + 
x); - x1 = _mm_load_ps(S0 + x + 4); - y0 = _mm_load_ps(S1 + x); - y1 = _mm_load_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); - x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); - - _mm_storeu_ps( dst + x, x0); - _mm_storeu_ps( dst + x + 4, x1); - } - - return x; - } -}; - - -struct VResizeCubicVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const int** src = (const int**)_src; - const short* beta = (const short*)_beta; - const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - int x = 0; - float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); - __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale), - b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale); - - if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - __m128 s0, s1, f0, f1; - x0 = _mm_load_si128((const __m128i*)(S0 + x)); - x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S1 + x)); - y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_load_si128((const __m128i*)(S2 + x)); - x1 = _mm_load_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_load_si128((const __m128i*)(S3 + x)); - y1 = _mm_load_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } - else - for( ; x <= width - 8; x += 8 ) - { - __m128i x0, x1, y0, y1; - __m128 s0, s1, f0, f1; - x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); - - s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); - s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = _mm_loadu_si128((const __m128i*)(S2 + x)); - x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4)); - y0 = _mm_loadu_si128((const __m128i*)(S3 + x)); - y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4)); - - f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); - f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); - s0 = _mm_add_ps(s0, f0); - s1 = _mm_add_ps(s1, f1); - - x0 = 
_mm_cvtps_epi32(s0); - x1 = _mm_cvtps_epi32(s1); - - x0 = _mm_packs_epi32(x0, x1); - _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); - } - - return x; - } -}; - - -template struct VResizeCubicVec_32f16 -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE2) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - ushort* dst = (ushort*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); - __m128i preshift = _mm_set1_epi32(shiftval); - __m128i postshift = _mm_set1_epi16((short)shiftval); - - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1, s0, s1; - __m128i t0, t1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - s0 = _mm_mul_ps(x0, b0); - s1 = _mm_mul_ps(x1, b0); - y0 = _mm_mul_ps(y0, b1); - y1 = _mm_mul_ps(y1, b1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - x0 = _mm_loadu_ps(S2 + x); - x1 = _mm_loadu_ps(S2 + x + 4); - y0 = _mm_loadu_ps(S3 + x); - y1 = _mm_loadu_ps(S3 + x + 4); - - x0 = _mm_mul_ps(x0, b2); - x1 = _mm_mul_ps(x1, b2); - y0 = _mm_mul_ps(y0, b3); - y1 = _mm_mul_ps(y1, b3); - s0 = _mm_add_ps(s0, x0); - s1 = _mm_add_ps(s1, x1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift); - t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift); - - t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift); - _mm_storeu_si128( (__m128i*)(dst + x), t0); - } - - return x; - } -}; - -typedef VResizeCubicVec_32f16 VResizeCubicVec_32f16u; -typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s; - -struct VResizeCubicVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if( !checkHardwareSupport(CV_CPU_SSE) ) - return 0; - - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - float* dst = (float*)_dst; - int x = 0; - __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), - b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); - - for( ; x <= width - 8; x += 8 ) - { - __m128 x0, x1, y0, y1, s0, s1; - x0 = _mm_loadu_ps(S0 + x); - x1 = _mm_loadu_ps(S0 + x + 4); - y0 = _mm_loadu_ps(S1 + x); - y1 = _mm_loadu_ps(S1 + x + 4); - - s0 = _mm_mul_ps(x0, b0); - s1 = _mm_mul_ps(x1, b0); - y0 = _mm_mul_ps(y0, b1); - y1 = _mm_mul_ps(y1, b1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - x0 = _mm_loadu_ps(S2 + x); - x1 = _mm_loadu_ps(S2 + x + 4); - y0 = _mm_loadu_ps(S3 + x); - y1 = _mm_loadu_ps(S3 + x + 4); - - x0 = _mm_mul_ps(x0, b2); - x1 = _mm_mul_ps(x1, b2); - y0 = _mm_mul_ps(y0, b3); - y1 = _mm_mul_ps(y1, b3); - s0 = _mm_add_ps(s0, x0); - s1 = _mm_add_ps(s1, x1); - s0 = _mm_add_ps(s0, y0); - s1 = _mm_add_ps(s1, y1); - - _mm_storeu_ps( dst + x, s0); - _mm_storeu_ps( dst + x + 4, s1); - } - - return x; - } -}; - -#if CV_TRY_SSE4_1 - -struct VResizeLanczos4Vec_32f16u -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width); - else return 0; - } -}; - -#else - -typedef VResizeNoVec VResizeLanczos4Vec_32f16u; - -#endif - -struct 
VResizeLanczos4Vec_32f16s -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - short * dst = (short*)_dst; - int x = 0; - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for( ; x <= width - 8; x += 8 ) - { - __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); - - __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); - __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1)); - } - - return x; - } -}; - - -struct VResizeLanczos4Vec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - float* dst = (float*)_dst; - int x = 0; - - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for( ; x <= width - 4; x += 4 ) - { - __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - _mm_storeu_ps(dst + x, v_dst); - } - - return x; - } -}; - - -#elif CV_NEON - -struct VResizeLinearVec_32s8u -{ - int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const - { - const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1]; - const short* beta = (const short*)_beta; - int x = 0; - 
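All of the Lanczos4 vertical kernels above (and the NEON ones that follow) vectorize the same 8-tap accumulation: each output row is a weighted sum of the eight buffered, horizontally resized rows with the beta[] weights. A scalar reference for what they compute (illustrative helper, hypothetical name):

// Scalar equivalent of the 8-tap vertical Lanczos pass vectorized above.
static void vresize_lanczos4_row(const float* const S[8],  // 8 horizontally-resized rows
                                 float* dst,
                                 const float beta[8],      // vertical Lanczos weights
                                 int width)
{
    for( int x = 0; x < width; x++ )
    {
        float v = 0.f;
        for( int k = 0; k < 8; k++ )
            v += S[k][x] * beta[k];                        // same sum the SSE/NEON kernels form
        dst[x] = v;
    }
}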
int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2); - - for( ; x <= width - 16; x += 16) - { - int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4); - int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4); - - int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); - int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); - - int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), - vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); - v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2); - - v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4); - v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4); - v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4); - v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4); - - v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); - v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); - - int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), - vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); - v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2); - - vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1))); - } - - return x; - } -}; - -struct VResizeLinearVec_32f16u -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - ushort* dst = (ushort*)_dst; - int x = 0; - - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); - float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeLinearVec_32f16s -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - short* dst = (short*)_dst; - int x = 0; - - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); - - float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); - float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeLinearVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1]; - float* dst = (float*)_dst; - int x = 0; - - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); - float32x4_t v_src10 = vld1q_f32(S1 + x), 
v_src11 = vld1q_f32(S1 + x + 4); - - vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); - vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); - } - - return x; - } -}; - -typedef VResizeNoVec VResizeCubicVec_32s8u; - -struct VResizeCubicVec_32f16u -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - ushort* dst = (ushort*)_dst; - int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeCubicVec_32f16s -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - short* dst = (short*)_dst; - int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeCubicVec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - float* dst = (float*)_dst; - int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); - - for( ; x <= width - 8; x += 8 ) - { - vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x))); - vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4))); - } - - return x; - } -}; - -struct VResizeLanczos4Vec_32f16u -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - ushort * dst = (ushort*)_dst; - int x = 0; - float32x4_t 
v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); - - v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), - v_b5, vld1q_f32(S5 + x + 4)), - v_b6, vld1q_f32(S6 + x + 4)), - v_b7, vld1q_f32(S7 + x + 4)); - v_dst1 = vaddq_f32(v_dst0, v_dst1); - - vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeLanczos4Vec_32f16s -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - short * dst = (short*)_dst; - int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); - - for( ; x <= width - 8; x += 8 ) - { - float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); - - v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), - v_b1, vld1q_f32(S1 + x + 4)), - v_b2, vld1q_f32(S2 + x + 4)), - v_b3, vld1q_f32(S3 + x + 4)); - v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), - v_b5, vld1q_f32(S5 + x + 4)), - v_b6, vld1q_f32(S6 + x + 4)), - v_b7, vld1q_f32(S7 + x + 4)); - v_dst1 = vaddq_f32(v_dst0, v_dst1); - - vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); - } - - return x; - } -}; - -struct VResizeLanczos4Vec_32f -{ - int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const - { - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - float* dst = (float*)_dst; - int x = 0; - float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), - v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), - v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), - v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); - - for( ; x <= width - 4; x += 4 ) - { - float32x4_t v_dst0 = 
vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), - v_b1, vld1q_f32(S1 + x)), - v_b2, vld1q_f32(S2 + x)), - v_b3, vld1q_f32(S3 + x)); - float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), - v_b5, vld1q_f32(S5 + x)), - v_b6, vld1q_f32(S6 + x)), - v_b7, vld1q_f32(S7 + x)); - vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1)); - } - - return x; - } -}; - -#else - -typedef VResizeNoVec VResizeLinearVec_32s8u; -typedef VResizeNoVec VResizeLinearVec_32f16u; -typedef VResizeNoVec VResizeLinearVec_32f16s; -typedef VResizeNoVec VResizeLinearVec_32f; - -typedef VResizeNoVec VResizeCubicVec_32s8u; -typedef VResizeNoVec VResizeCubicVec_32f16u; -typedef VResizeNoVec VResizeCubicVec_32f16s; -typedef VResizeNoVec VResizeCubicVec_32f; - -typedef VResizeNoVec VResizeLanczos4Vec_32f16u; -typedef VResizeNoVec VResizeLanczos4Vec_32f16s; -typedef VResizeNoVec VResizeLanczos4Vec_32f; - -#endif - -typedef HResizeNoVec HResizeLinearVec_8u32s; -typedef HResizeNoVec HResizeLinearVec_16u32f; -typedef HResizeNoVec HResizeLinearVec_16s32f; -typedef HResizeNoVec HResizeLinearVec_32f; -typedef HResizeNoVec HResizeLinearVec_64f; - - -template -struct HResizeLinear -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const T** src, WT** dst, int count, - const int* xofs, const AT* alpha, - int swidth, int dwidth, int cn, int xmin, int xmax ) const - { - int dx, k; - VecOp vecOp; - - int dx0 = vecOp((const uchar**)src, (uchar**)dst, count, - xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax ); - - for( k = 0; k <= count - 2; k++ ) - { - const T *S0 = src[k], *S1 = src[k+1]; - WT *D0 = dst[k], *D1 = dst[k+1]; - for( dx = dx0; dx < xmax; dx++ ) - { - int sx = xofs[dx]; - WT a0 = alpha[dx*2], a1 = alpha[dx*2+1]; - WT t0 = S0[sx]*a0 + S0[sx + cn]*a1; - WT t1 = S1[sx]*a0 + S1[sx + cn]*a1; - D0[dx] = t0; D1[dx] = t1; - } - - for( ; dx < dwidth; dx++ ) - { - int sx = xofs[dx]; - D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE); - } - } - - for( ; k < count; k++ ) - { - const T *S = src[k]; - WT *D = dst[k]; - for( dx = 0; dx < xmax; dx++ ) - { - int sx = xofs[dx]; - D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1]; - } - - for( ; dx < dwidth; dx++ ) - D[dx] = WT(S[xofs[dx]]*ONE); - } - } -}; - - -template -struct VResizeLinear -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const WT** src, T* dst, const AT* beta, int width ) const - { - WT b0 = beta[0], b1 = beta[1]; - const WT *S0 = src[0], *S1 = src[1]; - CastOp castOp; - VecOp vecOp; - - int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - WT t0, t1; - t0 = S0[x]*b0 + S1[x]*b1; - t1 = S0[x+1]*b0 + S1[x+1]*b1; - dst[x] = castOp(t0); dst[x+1] = castOp(t1); - t0 = S0[x+2]*b0 + S1[x+2]*b1; - t1 = S0[x+3]*b0 + S1[x+3]*b1; - dst[x+2] = castOp(t0); dst[x+3] = castOp(t1); - } - #endif - for( ; x < width; x++ ) - dst[x] = castOp(S0[x]*b0 + S1[x]*b1); - } -}; - -template<> -struct VResizeLinear, VResizeLinearVec_32s8u> -{ - typedef uchar value_type; - typedef int buf_type; - typedef short alpha_type; - - void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const - { - alpha_type b0 = beta[0], b1 = beta[1]; - const buf_type *S0 = src[0], *S1 = src[1]; - VResizeLinearVec_32s8u vecOp; - - int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { 
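            // Fixed-point bookkeeping for the statements below (explanatory note, not part of
            // the original diff): S0/S1 carry a 2^11 scale from the horizontal alpha weights
            // and b0/b1 carry another 2^11 from the vertical beta weights, so 22 bits in
            // total must be shifted out.  Splitting that as (>>4), (>>16), (>>2) mirrors the
            // SSE kernel above, where the >>16 is the implicit shift of _mm_mulhi_epi16 on
            // the pre-shifted 16-bit rows; the "+ 2" rounds to nearest before the final >>2.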
- dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2); - dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2); - dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2); - dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2); - } - #endif - for( ; x < width; x++ ) - dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2); - } -}; - - -template -struct HResizeCubic -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const T** src, WT** dst, int count, - const int* xofs, const AT* alpha, - int swidth, int dwidth, int cn, int xmin, int xmax ) const - { - for( int k = 0; k < count; k++ ) - { - const T *S = src[k]; - WT *D = dst[k]; - int dx = 0, limit = xmin; - for(;;) - { - for( ; dx < limit; dx++, alpha += 4 ) - { - int j, sx = xofs[dx] - cn; - WT v = 0; - for( j = 0; j < 4; j++ ) - { - int sxj = sx + j*cn; - if( (unsigned)sxj >= (unsigned)swidth ) - { - while( sxj < 0 ) - sxj += cn; - while( sxj >= swidth ) - sxj -= cn; - } - v += S[sxj]*alpha[j]; - } - D[dx] = v; - } - if( limit == dwidth ) - break; - for( ; dx < xmax; dx++, alpha += 4 ) - { - int sx = xofs[dx]; - D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] + - S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3]; - } - limit = dwidth; - } - alpha -= dwidth*4; - } - } -}; - - -template -struct VResizeCubic -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const WT** src, T* dst, const AT* beta, int width ) const - { - WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; - const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; - CastOp castOp; - VecOp vecOp; - - int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); - for( ; x < width; x++ ) - dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3); - } -}; - - -template -struct HResizeLanczos4 -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const T** src, WT** dst, int count, - const int* xofs, const AT* alpha, - int swidth, int dwidth, int cn, int xmin, int xmax ) const - { - for( int k = 0; k < count; k++ ) - { - const T *S = src[k]; - WT *D = dst[k]; - int dx = 0, limit = xmin; - for(;;) - { - for( ; dx < limit; dx++, alpha += 8 ) - { - int j, sx = xofs[dx] - cn*3; - WT v = 0; - for( j = 0; j < 8; j++ ) - { - int sxj = sx + j*cn; - if( (unsigned)sxj >= (unsigned)swidth ) - { - while( sxj < 0 ) - sxj += cn; - while( sxj >= swidth ) - sxj -= cn; - } - v += S[sxj]*alpha[j]; - } - D[dx] = v; - } - if( limit == dwidth ) - break; - for( ; dx < xmax; dx++, alpha += 8 ) - { - int sx = xofs[dx]; - D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] + - S[sx-cn]*alpha[2] + S[sx]*alpha[3] + - S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] + - S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7]; - } - limit = dwidth; - } - alpha -= dwidth*8; - } - } -}; - - -template -struct VResizeLanczos4 -{ - typedef T value_type; - typedef WT buf_type; - typedef AT alpha_type; - - void operator()(const WT** src, T* dst, const AT* beta, int width ) const - { - CastOp castOp; - VecOp vecOp; - int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - WT b = beta[0]; - const WT* S = src[0]; - WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b; - - for( int k = 1; k < 8; k++ ) - { - b = beta[k]; S = src[k]; - s0 += 
S[x]*b; s1 += S[x+1]*b; - s2 += S[x+2]*b; s3 += S[x+3]*b; - } - - dst[x] = castOp(s0); dst[x+1] = castOp(s1); - dst[x+2] = castOp(s2); dst[x+3] = castOp(s3); - } - #endif - for( ; x < width; x++ ) - { - dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] + - src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] + - src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]); - } - } -}; - - -static inline int clip(int x, int a, int b) -{ - return x >= a ? (x < b ? x : b-1) : a; -} - -static const int MAX_ESIZE=16; - -template -class resizeGeneric_Invoker : - public ParallelLoopBody -{ -public: - typedef typename HResize::value_type T; - typedef typename HResize::buf_type WT; - typedef typename HResize::alpha_type AT; - - resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs, - const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize, - int _ksize, int _xmin, int _xmax) : - ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs), - alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize), - ksize(_ksize), xmin(_xmin), xmax(_xmax) - { - CV_Assert(ksize <= MAX_ESIZE); - } - - virtual void operator() (const Range& range) const - { - int dy, cn = src.channels(); - HResize hresize; - VResize vresize; - - int bufstep = (int)alignSize(dsize.width, 16); - AutoBuffer _buffer(bufstep*ksize); - const T* srows[MAX_ESIZE]={0}; - WT* rows[MAX_ESIZE]={0}; - int prev_sy[MAX_ESIZE]; - - for(int k = 0; k < ksize; k++ ) - { - prev_sy[k] = -1; - rows[k] = (WT*)_buffer + bufstep*k; - } - - const AT* beta = _beta + ksize * range.start; - - for( dy = range.start; dy < range.end; dy++, beta += ksize ) - { - int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2; - - for(int k = 0; k < ksize; k++ ) - { - int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height); - for( k1 = std::max(k1, k); k1 < ksize; k1++ ) - { - if( k1 < MAX_ESIZE && sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it. - { - if( k1 > k ) - memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) ); - break; - } - } - if( k1 == ksize ) - k0 = std::min(k0, k); // remember the first row that needs to be computed - srows[k] = src.template ptr(sy); - prev_sy[k] = sy; - } - - if( k0 < ksize ) - hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha), - ssize.width, dsize.width, cn, xmin, xmax ); - vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width ); - } - } - -private: - Mat src; - Mat dst; - const int* xofs, *yofs; - const AT* alpha, *_beta; - Size ssize, dsize; - const int ksize, xmin, xmax; - - resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&); -}; - -template -static void resizeGeneric_( const Mat& src, Mat& dst, - const int* xofs, const void* _alpha, - const int* yofs, const void* _beta, - int xmin, int xmax, int ksize ) -{ - typedef typename HResize::alpha_type AT; - - const AT* beta = (const AT*)_beta; - Size ssize = src.size(), dsize = dst.size(); - int cn = src.channels(); - ssize.width *= cn; - dsize.width *= cn; - xmin *= cn; - xmax *= cn; - // image resize is a separable operation. 
In case of not too strong - - Range range(0, dsize.height); - resizeGeneric_Invoker invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta, - ssize, dsize, ksize, xmin, xmax); - parallel_for_(range, invoker, dst.total()/(double)(1<<16)); -} - -template -struct ResizeAreaFastNoVec -{ - ResizeAreaFastNoVec(int, int) { } - ResizeAreaFastNoVec(int, int, int, int) { } - int operator() (const T*, T*, int) const - { return 0; } -}; - -#if CV_NEON - -class ResizeAreaFastVec_SIMD_8u -{ -public: - ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : - cn(_cn), step(_step) - { - } - - int operator() (const uchar* S, uchar* D, int w) const - { - int dx = 0; - const uchar* S0 = S, * S1 = S0 + step; - - uint16x8_t v_2 = vdupq_n_u16(2); - - if (cn == 1) - { - for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16) - { - uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1); - - uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1])); - v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1]))); - v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2); - - uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1])); - v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1]))); - v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2); - - vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); - } - } - else if (cn == 4) - { - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) - { - uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1); - - uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0)); - uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0)); - uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1)); - uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1)); - - uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)), - vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10))); - uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)), - vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11))); - uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2); - - vst1_u8(D, vmovn_u16(v_dst)); - } - } - - return dx; - } - -private: - int cn, step; -}; - -class ResizeAreaFastVec_SIMD_16u -{ -public: - ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : - cn(_cn), step(_step) - { - } - - int operator() (const ushort * S, ushort * D, int w) const - { - int dx = 0; - const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step); - - uint32x4_t v_2 = vdupq_n_u32(2); - - if (cn == 1) - { - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) - { - uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1); - - uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1])); - v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1]))); - v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2); - - uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1])); - v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1]))); - v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2); - - vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))); - } - } - else if (cn == 4) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1); - uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)), - 
vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1))); - vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2))); - } - } - - return dx; - } - -private: - int cn, step; -}; - -class ResizeAreaFastVec_SIMD_16s -{ -public: - ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : - cn(_cn), step(_step) - { - } - - int operator() (const short * S, short * D, int w) const - { - int dx = 0; - const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step); - - int32x4_t v_2 = vdupq_n_s32(2); - - if (cn == 1) - { - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) - { - int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1); - - int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1])); - v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1]))); - v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2); - - int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1])); - v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1]))); - v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2); - - vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1))); - } - } - else if (cn == 4) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1); - int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)), - vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1))); - vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2))); - } - } - - return dx; - } - -private: - int cn, step; -}; - -struct ResizeAreaFastVec_SIMD_32f -{ - ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : - cn(_cn), step(_step) - { - fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); - } - - int operator() (const float * S, float * D, int w) const - { - if (!fast_mode) - return 0; - - const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); - int dx = 0; - - float32x4_t v_025 = vdupq_n_f32(0.25f); - - if (cn == 1) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1); - - float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]); - float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]); - - vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); - } - } - else if (cn == 4) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4)); - float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4)); - - vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); - } - } - - return dx; - } - -private: - int cn; - bool fast_mode; - int step; -}; - -#elif CV_SSE2 - -class ResizeAreaFastVec_SIMD_8u -{ -public: - ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const uchar* S, uchar* D, int w) const - { - if (!use_simd) - return 0; - - int dx = 0; - const uchar* S0 = S; - const uchar* S1 = S0 + step; - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi16(2); - - if (cn == 1) - { - __m128i masklow = _mm_set1_epi16(0x00ff); - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); - __m128i 
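// Every ResizeAreaFastVec_SIMD_* variant above and below computes the same
// quantity: each output pixel is the rounded mean of a 2x2 source block,
// (s00 + s01 + s10 + s11 + 2) >> 2.  NEON gets the horizontal pairs for free
// from the de-interleaving vld2q loads; the SSE2 code that follows rebuilds
// them with mask/shift.  Scalar reference for one grey (cn == 1) row pair,
// illustrative only:
static void area2x2_row_ref(const unsigned char* S0, const unsigned char* S1,
                            unsigned char* D, int dwidth)
{
    for (int dx = 0; dx < dwidth; dx++)
    {
        int sx = dx * 2;
        D[dx] = (unsigned char)((S0[sx] + S0[sx + 1] + S1[sx] + S1[sx + 1] + 2) >> 2);
    }
}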
s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow)); - s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - - _mm_storel_epi64((__m128i*)D, s0); - } - } - else if (cn == 3) - for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); - __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero); - __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); - __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero); - - __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6)); - __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - - s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6)); - s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); - _mm_storel_epi64((__m128i*)(D+3), s0); - } - else - { - CV_Assert(cn == 4); - int v[] = { 0, 0, -1, -1 }; - __m128i mask = _mm_loadu_si128((const __m128i*)v); - - for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); - __m128i r0_16h = _mm_unpackhi_epi8(r0, zero); - __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); - __m128i r1_16h = _mm_unpackhi_epi8(r1, zero); - - __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8)); - __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - __m128i res0 = _mm_srli_epi16(s0, 2); - - s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8)); - s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8)); - s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); - __m128i res1 = _mm_srli_epi16(s0, 2); - s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0), - _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero); - _mm_storel_epi64((__m128i*)(D), s0); - } - } - - return dx; - } - -private: - int cn; - bool use_simd; - int step; -}; - -class ResizeAreaFastVec_SIMD_16u -{ -public: - ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const ushort* S, ushort* D, int w) const - { - if (!use_simd) - return 0; - - int dx = 0; - const ushort* S0 = (const ushort*)S; - const ushort* S1 = (const ushort*)((const uchar*)(S) + step); - __m128i masklow = _mm_set1_epi32(0x0000ffff); - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi32(2); - -#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero) - - if (cn == 1) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow)); - __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow)); - s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); - s0 = _mm_srli_epi32(s0, 2); - s0 = _mm_packus_epi32(s0, zero); - - _mm_storel_epi64((__m128i*)D, s0); - } - } - else if (cn == 3) - for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D 
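// SSE2 has no unsigned 32->16 saturating pack (_mm_packus_epi32 is SSE4.1),
// so the macro above fakes one.  The trick is safe here because a 2x2 mean of
// ushort values always fits in 16 bits, so only the low halves need to be
// kept intact: sign-extending the low 16 bits (slli + srai) and then using
// the signed pack reproduces the exact bit pattern.  Equivalent standalone
// helper, for reference (assumes every 32-bit lane is < 65536):
#include <emmintrin.h>

static inline __m128i pack_low16_of_u32(__m128i a)
{
    __m128i lo16 = _mm_srai_epi32(_mm_slli_epi32(a, 16), 16);  // sign-extend low halves
    return _mm_packs_epi32(lo16, _mm_setzero_si128());         // signed pack keeps the bits
}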
+= 3) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_unpacklo_epi16(r0, zero); - __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero); - __m128i r1_16l = _mm_unpacklo_epi16(r1, zero); - __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero); - - __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); - __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); - s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); - s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - } - else - { - CV_Assert(cn == 4); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_32l = _mm_unpacklo_epi16(r0, zero); - __m128i r0_32h = _mm_unpackhi_epi16(r0, zero); - __m128i r1_32l = _mm_unpacklo_epi16(r1, zero); - __m128i r1_32h = _mm_unpackhi_epi16(r1, zero); - - __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); - __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); - s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); - s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - } - } - -#undef _mm_packus_epi32 - - return dx; - } - -private: - int cn; - int step; - bool use_simd; -}; - -class ResizeAreaFastVec_SIMD_16s -{ -public: - ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : - cn(_cn), step(_step) - { - use_simd = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short* S, short* D, int w) const - { - if (!use_simd) - return 0; - - int dx = 0; - const short* S0 = (const short*)S; - const short* S1 = (const short*)((const uchar*)(S) + step); - __m128i masklow = _mm_set1_epi32(0x0000ffff); - __m128i zero = _mm_setzero_si128(); - __m128i delta2 = _mm_set1_epi32(2); - - if (cn == 1) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16)); - __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16)); - s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); - s0 = _mm_srai_epi32(s0, 2); - s0 = _mm_packs_epi32(s0, zero); - - _mm_storel_epi64((__m128i*)D, s0); - } - } - else if (cn == 3) - for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); - __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16); - __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); - __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16); - - __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); - __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); - s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); - s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - } - else - { - CV_Assert(cn == 4); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128i r0 = _mm_loadu_si128((const __m128i*)S0); - __m128i r1 = _mm_loadu_si128((const __m128i*)S1); - - __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); - __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16); - __m128i r1_32l = 
_mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); - __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16); - - __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); - __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); - s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); - s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); - _mm_storel_epi64((__m128i*)D, s0); - } - } - - return dx; - } - -private: - int cn; - int step; - bool use_simd; -}; - -struct ResizeAreaFastVec_SIMD_32f -{ - ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : - cn(_cn), step(_step) - { - fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); - fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const float * S, float * D, int w) const - { - if (!fast_mode) - return 0; - - const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); - int dx = 0; - - __m128 v_025 = _mm_set1_ps(0.25f); - - if (cn == 1) - { - const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), - v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4); - - __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo), - _mm_shuffle_ps(v_row00, v_row01, shuffle_hi)); - __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo), - _mm_shuffle_ps(v_row10, v_row11, shuffle_hi)); - - _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); - } - } - else if (cn == 4) - { - for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) - { - __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4)); - __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4)); - - _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); - } - } - - return dx; - } - -private: - int cn; - bool fast_mode; - int step; -}; - -#else - -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_8u; -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16u; -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16s; -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_32f; - -#endif - -template -struct ResizeAreaFastVec -{ - ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) : - scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step) - { - fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); - } - - int operator() (const T* S, T* D, int w) const - { - if (!fast_mode) - return 0; - - const T* nextS = (const T*)((const uchar*)S + step); - int dx = vecOp(S, D, w); - - if (cn == 1) - for( ; dx < w; ++dx ) - { - int index = dx*2; - D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2); - } - else if (cn == 3) - for( ; dx < w; dx += 3 ) - { - int index = dx*2; - D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2); - D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2); - D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2); - } - else - { - CV_Assert(cn == 4); - for( ; dx < w; dx += 4 ) - { - int index = dx*2; - D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2); - D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2); - D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2); - D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2); 
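// The 16s paths above rely on another SSE2 idiom: _mm_cvtepi16_epi32 does not
// exist before SSE4.1, so a signed short is widened by unpacking it against
// zero into the *high* half of each 32-bit lane and arithmetic-shifting it
// back down, which also performs the sign extension.  Isolated for reference:
#include <emmintrin.h>

static inline __m128i widen_s16_lo_to_s32(__m128i v)
{
    // lanes become v0, v1, v2, v3 sign-extended to 32 bit
    return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v), 16);
}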
- } - } - - return dx; - } - -private: - int scale_x, scale_y; - int cn; - bool fast_mode; - int step; - SIMDVecOp vecOp; -}; - -template -class resizeAreaFast_Invoker : - public ParallelLoopBody -{ -public: - resizeAreaFast_Invoker(const Mat &_src, Mat &_dst, - int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) : - ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x), - scale_y(_scale_y), ofs(_ofs), xofs(_xofs) - { - } - - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int cn = src.channels(); - int area = scale_x*scale_y; - float scale = 1.f/(area); - int dwidth1 = (ssize.width/scale_x)*cn; - dsize.width *= cn; - ssize.width *= cn; - int dy, dx, k = 0; - - VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/); - - for( dy = range.start; dy < range.end; dy++ ) - { - T* D = (T*)(dst.data + dst.step*dy); - int sy0 = dy*scale_y; - int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0; - - if( sy0 >= ssize.height ) - { - for( dx = 0; dx < dsize.width; dx++ ) - D[dx] = 0; - continue; - } - - dx = vop(src.template ptr(sy0), D, w); - for( ; dx < w; dx++ ) - { - const T* S = src.template ptr(sy0) + xofs[dx]; - WT sum = 0; - k = 0; - #if CV_ENABLE_UNROLLED - for( ; k <= area - 4; k += 4 ) - sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]]; - #endif - for( ; k < area; k++ ) - sum += S[ofs[k]]; - - D[dx] = saturate_cast(sum * scale); - } - - for( ; dx < dsize.width; dx++ ) - { - WT sum = 0; - int count = 0, sx0 = xofs[dx]; - if( sx0 >= ssize.width ) - D[dx] = 0; - - for( int sy = 0; sy < scale_y; sy++ ) - { - if( sy0 + sy >= ssize.height ) - break; - const T* S = src.template ptr(sy0 + sy) + sx0; - for( int sx = 0; sx < scale_x*cn; sx += cn ) - { - if( sx0 + sx >= ssize.width ) - break; - sum += S[sx]; - count++; - } - } - - D[dx] = saturate_cast((float)sum/count); - } - } - } - -private: - Mat src; - Mat dst; - int scale_x, scale_y; - const int *ofs, *xofs; -}; - -template -static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs, - int scale_x, int scale_y ) -{ - Range range(0, dst.rows); - resizeAreaFast_Invoker invoker(src, dst, scale_x, - scale_y, ofs, xofs); - parallel_for_(range, invoker, dst.total()/(double)(1<<16)); -} - -struct DecimateAlpha -{ - int si, di; - float alpha; -}; - - -template class ResizeArea_Invoker : - public ParallelLoopBody -{ -public: - ResizeArea_Invoker( const Mat& _src, Mat& _dst, - const DecimateAlpha* _xtab, int _xtab_size, - const DecimateAlpha* _ytab, int _ytab_size, - const int* _tabofs ) - { - src = &_src; - dst = &_dst; - xtab0 = _xtab; - xtab_size0 = _xtab_size; - ytab = _ytab; - ytab_size = _ytab_size; - tabofs = _tabofs; - } - - virtual void operator() (const Range& range) const - { - Size dsize = dst->size(); - int cn = dst->channels(); - dsize.width *= cn; - AutoBuffer _buffer(dsize.width*2); - const DecimateAlpha* xtab = xtab0; - int xtab_size = xtab_size0; - WT *buf = _buffer, *sum = buf + dsize.width; - int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di; - - for( dx = 0; dx < dsize.width; dx++ ) - sum[dx] = (WT)0; - - for( j = j_start; j < j_end; j++ ) - { - WT beta = ytab[j].alpha; - int dy = ytab[j].di; - int sy = ytab[j].si; - - { - const T* S = src->template ptr(sy); - for( dx = 0; dx < dsize.width; dx++ ) - buf[dx] = (WT)0; - - if( cn == 1 ) - for( k = 0; k < xtab_size; k++ ) - { - int dxn = xtab[k].di; - WT alpha = xtab[k].alpha; - buf[dxn] += S[xtab[k].si]*alpha; - } 
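// resizeAreaFast_Invoker above handles the ragged right/bottom edge
// explicitly: w is shortened (or zeroed) whenever a full scale_x x scale_y
// block would stick out of the source, and the trailing loop averages only
// the pixels that actually exist (hence the count variable).  A scalar,
// single-channel sketch of the whole integer-factor path, illustrative only:
#include <algorithm>

static void area_downscale_u8(const unsigned char* src, int sw, int sh, int sstep,
                              unsigned char* dst, int dw, int dh, int dstep,
                              int scale_x, int scale_y)
{
    for (int dy = 0; dy < dh; dy++)
        for (int dx = 0; dx < dw; dx++)
        {
            int sx0 = dx * scale_x, sy0 = dy * scale_y;
            int nx = std::min(scale_x, sw - sx0), ny = std::min(scale_y, sh - sy0);
            if (nx <= 0 || ny <= 0) { dst[dy * dstep + dx] = 0; continue; }
            int sum = 0;
            for (int y = 0; y < ny; y++)
                for (int x = 0; x < nx; x++)
                    sum += src[(sy0 + y) * sstep + sx0 + x];   // partial block at the edge
            dst[dy * dstep + dx] = (unsigned char)((sum + nx * ny / 2) / (nx * ny));
        }
}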
- else if( cn == 2 ) - for( k = 0; k < xtab_size; k++ ) - { - int sxn = xtab[k].si; - int dxn = xtab[k].di; - WT alpha = xtab[k].alpha; - WT t0 = buf[dxn] + S[sxn]*alpha; - WT t1 = buf[dxn+1] + S[sxn+1]*alpha; - buf[dxn] = t0; buf[dxn+1] = t1; - } - else if( cn == 3 ) - for( k = 0; k < xtab_size; k++ ) - { - int sxn = xtab[k].si; - int dxn = xtab[k].di; - WT alpha = xtab[k].alpha; - WT t0 = buf[dxn] + S[sxn]*alpha; - WT t1 = buf[dxn+1] + S[sxn+1]*alpha; - WT t2 = buf[dxn+2] + S[sxn+2]*alpha; - buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2; - } - else if( cn == 4 ) - { - for( k = 0; k < xtab_size; k++ ) - { - int sxn = xtab[k].si; - int dxn = xtab[k].di; - WT alpha = xtab[k].alpha; - WT t0 = buf[dxn] + S[sxn]*alpha; - WT t1 = buf[dxn+1] + S[sxn+1]*alpha; - buf[dxn] = t0; buf[dxn+1] = t1; - t0 = buf[dxn+2] + S[sxn+2]*alpha; - t1 = buf[dxn+3] + S[sxn+3]*alpha; - buf[dxn+2] = t0; buf[dxn+3] = t1; - } - } - else - { - for( k = 0; k < xtab_size; k++ ) - { - int sxn = xtab[k].si; - int dxn = xtab[k].di; - WT alpha = xtab[k].alpha; - for( int c = 0; c < cn; c++ ) - buf[dxn + c] += S[sxn + c]*alpha; - } - } - } - - if( dy != prev_dy ) - { - T* D = dst->template ptr(prev_dy); - - for( dx = 0; dx < dsize.width; dx++ ) - { - D[dx] = saturate_cast(sum[dx]); - sum[dx] = beta*buf[dx]; - } - prev_dy = dy; - } - else - { - for( dx = 0; dx < dsize.width; dx++ ) - sum[dx] += beta*buf[dx]; - } - } - - { - T* D = dst->template ptr(prev_dy); - for( dx = 0; dx < dsize.width; dx++ ) - D[dx] = saturate_cast(sum[dx]); - } - } - -private: - const Mat* src; - Mat* dst; - const DecimateAlpha* xtab0; - const DecimateAlpha* ytab; - int xtab_size0, ytab_size; - const int* tabofs; -}; - - -template -static void resizeArea_( const Mat& src, Mat& dst, - const DecimateAlpha* xtab, int xtab_size, - const DecimateAlpha* ytab, int ytab_size, - const int* tabofs ) -{ - parallel_for_(Range(0, dst.rows), - ResizeArea_Invoker(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs), - dst.total()/((double)(1 << 16))); -} - - -typedef void (*ResizeFunc)( const Mat& src, Mat& dst, - const int* xofs, const void* alpha, - const int* yofs, const void* beta, - int xmin, int xmax, int ksize ); - -typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst, - const int* ofs, const int *xofs, - int scale_x, int scale_y ); - -typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst, - const DecimateAlpha* xtab, int xtab_size, - const DecimateAlpha* ytab, int ytab_size, - const int* yofs); - - -static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab ) -{ - int k = 0; - for(int dx = 0; dx < dsize; dx++ ) - { - double fsx1 = dx * scale; - double fsx2 = fsx1 + scale; - double cellWidth = std::min(scale, ssize - fsx1); - - int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); - - sx2 = std::min(sx2, ssize - 1); - sx1 = std::min(sx1, sx2); - - if( sx1 - fsx1 > 1e-3 ) - { - assert( k < ssize*2 ); - tab[k].di = dx * cn; - tab[k].si = (sx1 - 1) * cn; - tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth); - } - - for(int sx = sx1; sx < sx2; sx++ ) - { - assert( k < ssize*2 ); - tab[k].di = dx * cn; - tab[k].si = sx * cn; - tab[k++].alpha = float(1.0 / cellWidth); - } - - if( fsx2 - sx2 > 1e-3 ) - { - assert( k < ssize*2 ); - tab[k].di = dx * cn; - tab[k].si = sx2 * cn; - tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); - } - } - return k; -} - -#ifdef HAVE_OPENCL -static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab, - float * const alpha_tab, int 
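// computeResizeAreaTab above gives each destination pixel a weight for every
// source pixel it covers, equal to the covered fraction divided by the total
// covered width, so the per-destination weights sum to 1.  Worked 1-D example
// for 5 -> 2 columns (scale 2.5): destination 0 covers source 0 and 1 fully
// and half of source 2, giving weights 0.4, 0.4, 0.2.  A small check built on
// the DecimateAlpha struct defined above (single channel assumed), useful
// when experimenting with the table code:
#include <cstdio>
#include <vector>

static void check_area_tab(const DecimateAlpha* tab, int tab_size, int dsize)
{
    std::vector<float> sums(dsize, 0.f);
    for (int k = 0; k < tab_size; k++)
        sums[tab[k].di] += tab[k].alpha;        // di is the destination index for cn == 1
    for (int dx = 0; dx < dsize; dx++)
        std::printf("dst %d: weight sum = %.6f\n", dx, sums[dx]);   // expect ~1.0
}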
* const ofs_tab) -{ - int k = 0, dx = 0; - for ( ; dx < dsize; dx++) - { - ofs_tab[dx] = k; - - double fsx1 = dx * scale; - double fsx2 = fsx1 + scale; - double cellWidth = std::min(scale, ssize - fsx1); - - int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); - - sx2 = std::min(sx2, ssize - 1); - sx1 = std::min(sx1, sx2); - - if (sx1 - fsx1 > 1e-3) - { - map_tab[k] = sx1 - 1; - alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth); - } - - for (int sx = sx1; sx < sx2; sx++) - { - map_tab[k] = sx; - alpha_tab[k++] = float(1.0 / cellWidth); - } - - if (fsx2 - sx2 > 1e-3) - { - map_tab[k] = sx2; - alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); - } - } - ofs_tab[dx] = k; -} - -static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, - double fx, double fy, int interpolation) -{ - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy; - float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy; - int iscale_x = saturate_cast(inv_fx), iscale_y = saturate_cast(inv_fx); - bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON && - std::abs(inv_fy - iscale_y) < DBL_EPSILON; - - // in case of scale_x && scale_y is equal to 2 - // INTER_AREA (fast) also is equal to INTER_LINEAR - if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) - /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower - - if( !(cn <= 4 && - (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || - (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) ) - return false; - - UMat src = _src.getUMat(); - _dst.create(dsize, type); - UMat dst = _dst.getUMat(); - - Size ssize = src.size(); - ocl::Kernel k; - size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows }; - - ocl::Image2D srcImage; - - // See if this could be done with a sampler. We stick with integer - // datatypes because the observed error is low. - bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() && - ocl::Image2D::canCreateAlias(src) && depth <= 4 && - ocl::Image2D::isFormatSupported(depth, cn, true) && - src.offset==0); - if (useSampler) - { - int wdepth = std::max(depth, CV_32S); - char buf[2][32]; - cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s " - "-D convertToDT=%s -D cn=%d", - depth, ocl::typeToStr(type), ocl::typeToStr(depth), - ocl::convertTypeStr(wdepth, depth, cn, buf[1]), - cn); - k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts); - - if (k.empty()) - useSampler = false; - else - { - // Convert the input into an OpenCL image type, using normalized channel data types - // and aliasing the UMat. 
- srcImage = ocl::Image2D(src, true, true); - k.args(srcImage, ocl::KernelArg::WriteOnly(dst), - (float)inv_fx, (float)inv_fy); - } - } - - if (interpolation == INTER_LINEAR && !useSampler) - { - char buf[2][32]; - - // integer path is slower because of CPU part, so it's disabled - if (depth == CV_8U && ((void)0, 0)) - { - AutoBuffer _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2)); - int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width; - short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2; - float fxx, fyy; - int sx, sy; - - for (int dx = 0; dx < dsize.width; dx++) - { - fxx = (float)((dx+0.5)*inv_fx - 0.5); - sx = cvFloor(fxx); - fxx -= sx; - - if (sx < 0) - fxx = 0, sx = 0; - - if (sx >= ssize.width-1) - fxx = 0, sx = ssize.width-1; - - xofs[dx] = sx; - ialpha[dx*2 + 0] = saturate_cast((1.f - fxx) * INTER_RESIZE_COEF_SCALE); - ialpha[dx*2 + 1] = saturate_cast(fxx * INTER_RESIZE_COEF_SCALE); - } - - for (int dy = 0; dy < dsize.height; dy++) - { - fyy = (float)((dy+0.5)*inv_fy - 0.5); - sy = cvFloor(fyy); - fyy -= sy; - - yofs[dy] = sy; - ibeta[dy*2 + 0] = saturate_cast((1.f - fyy) * INTER_RESIZE_COEF_SCALE); - ibeta[dy*2 + 1] = saturate_cast(fyy * INTER_RESIZE_COEF_SCALE); - } - - int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); - UMat coeffs; - Mat(1, static_cast(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs); - - k.create("resizeLN", ocl::imgproc::resize_oclsrc, - format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s " - "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " - "-D INTER_RESIZE_COEF_BITS=%d", - depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), - ocl::convertTypeStr(depth, wdepth, cn, buf[0]), - ocl::convertTypeStr(wdepth, depth, cn, buf[1]), - cn, INTER_RESIZE_COEF_BITS)); - if (k.empty()) - return false; - - k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), - ocl::KernelArg::PtrReadOnly(coeffs)); - } - else - { - int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); - k.create("resizeLN", ocl::imgproc::resize_oclsrc, - format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s " - "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " - "-D INTER_RESIZE_COEF_BITS=%d", - depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), - ocl::convertTypeStr(depth, wdepth, cn, buf[0]), - ocl::convertTypeStr(wdepth, depth, cn, buf[1]), - cn, INTER_RESIZE_COEF_BITS)); - if (k.empty()) - return false; - - k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), - (float)inv_fx, (float)inv_fy); - } - } - else if (interpolation == INTER_NEAREST) - { - k.create("resizeNN", ocl::imgproc::resize_oclsrc, - format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d", - ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn)); - if (k.empty()) - return false; - - k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), - (float)inv_fx, (float)inv_fy); - } - else if (interpolation == INTER_AREA) - { - int wdepth = std::max(depth, is_area_fast ? 
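// The coordinate mapping used throughout this file, fxx = (dx + 0.5)*inv_fx - 0.5,
// aligns pixel *centres*: destination centre dx + 0.5 lands on source
// coordinate (dx + 0.5)*inv_fx, and subtracting 0.5 converts back from centre
// coordinates to the index of the left/top sample plus a fractional weight.
// Minimal helper showing the convention (no clamping, names illustrative):
#include <cmath>

static inline void map_pixel_center(int dx, double src_over_dst, int& sx, float& fx)
{
    float f = (float)((dx + 0.5) * src_over_dst - 0.5);
    sx = (int)std::floor(f);   // left/top neighbour
    fx = f - sx;               // weight of the right/bottom neighbour, in [0, 1)
}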
CV_32S : CV_32F); - int wtype = CV_MAKE_TYPE(wdepth, cn); - - char cvt[2][40]; - String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d", - ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), - ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn); - - UMat alphaOcl, tabofsOcl, mapOcl; - UMat dmap, smap; - - if (is_area_fast) - { - int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn); - buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST" - " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff", - ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]), - ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]), - iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y)); - - k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption); - if (k.empty()) - return false; - } - else - { - buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0])); - k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption); - if (k.empty()) - return false; - - int xytab_size = (ssize.width + ssize.height) << 1; - int tabofs_size = dsize.height + dsize.width + 2; - - AutoBuffer _xymap_tab(xytab_size), _xyofs_tab(tabofs_size); - AutoBuffer _xyalpha_tab(xytab_size); - int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1); - float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1); - int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1; - - ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab); - ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab); - - // loading precomputed arrays to GPU - Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl); - Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl); - Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl); - } - - ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst); - - if (is_area_fast) - k.args(srcarg, dstarg); - else - k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl), - ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl)); - - return k.run(2, globalsize, NULL, false); - } - - return k.run(2, globalsize, 0, false); -} - -#endif - -#ifdef HAVE_IPP -#define IPP_RESIZE_PARALLEL 1 - -#ifdef HAVE_IPP_IW -class ipp_resizeParallel: public ParallelLoopBody -{ -public: - ipp_resizeParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok): - m_src(src), m_dst(dst), m_ok(ok) {} - ~ipp_resizeParallel() - { - } - - void Init(IppiInterpolationType inter) - { - iwiResize.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, inter, ::ipp::IwiResizeParams(0, 0, 0.75, 4), ippBorderRepl); - - m_ok = true; - } - - virtual void operator() (const Range& range) const - { - CV_INSTRUMENT_REGION_IPP() - - if(!m_ok) - return; - - try - { - ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); - CV_INSTRUMENT_FUN_IPP(iwiResize, m_src, m_dst, ippBorderRepl, tile); - } - catch(::ipp::IwException) - { - m_ok = false; - return; - } - } -private: - ::ipp::IwiImage &m_src; - ::ipp::IwiImage &m_dst; - - mutable ::ipp::IwiResize iwiResize; - - volatile bool &m_ok; - const ipp_resizeParallel& operator= (const ipp_resizeParallel&); -}; - -class ipp_resizeAffineParallel: public 
ParallelLoopBody -{ -public: - ipp_resizeAffineParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok): - m_src(src), m_dst(dst), m_ok(ok) {} - ~ipp_resizeAffineParallel() - { - } - - void Init(IppiInterpolationType inter, double scaleX, double scaleY) - { - double shift = (inter == ippNearest)?-1e-10:-0.5; - double coeffs[2][3] = { - {scaleX, 0, shift+0.5*scaleX}, - {0, scaleY, shift+0.5*scaleY} - }; - - iwiWarpAffine.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, coeffs, iwTransForward, inter, ::ipp::IwiWarpAffineParams(0, 0, 0.75), ippBorderRepl); - - m_ok = true; - } - - virtual void operator() (const Range& range) const - { - CV_INSTRUMENT_REGION_IPP() - - if(!m_ok) - return; - - try - { - ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); - CV_INSTRUMENT_FUN_IPP(iwiWarpAffine, m_src, m_dst, tile); - } - catch(::ipp::IwException) - { - m_ok = false; - return; - } - } -private: - ::ipp::IwiImage &m_src; - ::ipp::IwiImage &m_dst; - - mutable ::ipp::IwiWarpAffine iwiWarpAffine; - - volatile bool &m_ok; - const ipp_resizeAffineParallel& operator= (const ipp_resizeAffineParallel&); -}; -#endif - -static bool ipp_resize(const uchar * src_data, size_t src_step, int src_width, int src_height, - uchar * dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, - int depth, int channels, int interpolation) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP() - - IppDataType ippDataType = ippiGetDataType(depth); - IppiInterpolationType ippInter = ippiGetInterpolation(interpolation); - if(ippInter < 0) - return false; - - // Resize which doesn't match OpenCV exactly - if(!cv::ipp::useIPP_NE()) - { - if(ippInter == ippNearest || ippInter == ippSuper || (ippDataType == ipp8u && ippInter == ippLinear)) - return false; - } - - if(ippInter != ippLinear && ippDataType == ipp64f) - return false; - -#if IPP_VERSION_X100 < 201801 - // Degradations on int^2 linear downscale - if(ippDataType != ipp64f && ippInter == ippLinear && inv_scale_x < 1 && inv_scale_y < 1) // if downscale - { - int scale_x = (int)(1/inv_scale_x); - int scale_y = (int)(1/inv_scale_y); - if(1/inv_scale_x - scale_x < DBL_EPSILON && 1/inv_scale_y - scale_y < DBL_EPSILON) // if integer - { - if(!(scale_x&(scale_x-1)) && !(scale_y&(scale_y-1))) // if power of 2 - return false; - } - } -#endif - - bool affine = false; - const double IPP_RESIZE_EPS = (depth == CV_64F)?0:1e-10; - double ex = fabs((double)dst_width / src_width - inv_scale_x) / inv_scale_x; - double ey = fabs((double)dst_height / src_height - inv_scale_y) / inv_scale_y; - - // Use affine transform resize to allow sub-pixel accuracy - if(ex > IPP_RESIZE_EPS || ey > IPP_RESIZE_EPS) - affine = true; - - // Affine doesn't support Lanczos and Super interpolations - if(affine && (ippInter == ippLanczos || ippInter == ippSuper)) - return false; - - try - { - ::ipp::IwiImage iwSrc(::ipp::IwiSize(src_width, src_height), ippDataType, channels, 0, (void*)src_data, src_step); - ::ipp::IwiImage iwDst(::ipp::IwiSize(dst_width, dst_height), ippDataType, channels, 0, (void*)dst_data, dst_step); - - bool ok; - int threads = ippiSuggestThreadsNum(iwDst, 1+((double)(src_width*src_height)/(dst_width*dst_height))); - Range range(0, dst_height); - ipp_resizeParallel invokerGeneral(iwSrc, iwDst, ok); - ipp_resizeAffineParallel invokerAffine(iwSrc, iwDst, ok); - ParallelLoopBody *pInvoker = NULL; - if(affine) - { - pInvoker = &invokerAffine; - invokerAffine.Init(ippInter, 
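// The affine fallback below reproduces a resize under the same pixel-centre
// convention: requiring x_dst + 0.5 == scale*(x_src + 0.5) gives
// x_dst = scale*x_src + (0.5*scale - 0.5), i.e. the translation term is
// shift + 0.5*scale with shift = -0.5; the -1e-10 used for nearest-neighbour
// presumably only biases exact .5 ties.  The derivation as a helper,
// illustrative only:
static inline double resize_affine_offset(double scale, bool nearest)
{
    double shift = nearest ? -1e-10 : -0.5;
    return shift + 0.5 * scale;
}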
inv_scale_x, inv_scale_y); - } - else - { - pInvoker = &invokerGeneral; - invokerGeneral.Init(ippInter); - } - - if(IPP_RESIZE_PARALLEL && threads > 1) - parallel_for_(range, *pInvoker, threads*4); - else - pInvoker->operator()(range); - - if(!ok) - return false; - } - catch(::ipp::IwException) - { - return false; - } - return true; -#else - CV_UNUSED(src_data); CV_UNUSED(src_step); CV_UNUSED(src_width); CV_UNUSED(src_height); CV_UNUSED(dst_data); CV_UNUSED(dst_step); - CV_UNUSED(dst_width); CV_UNUSED(dst_height); CV_UNUSED(inv_scale_x); CV_UNUSED(inv_scale_y); CV_UNUSED(depth); - CV_UNUSED(channels); CV_UNUSED(interpolation); - return false; -#endif -} +const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS; + +static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2]; + +static float BilinearTab_f[INTER_TAB_SIZE2][2][2]; +static short BilinearTab_i[INTER_TAB_SIZE2][2][2]; + +#if CV_SSE2 || CV_NEON +static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8]; +static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16); #endif -//================================================================================================== +static float BicubicTab_f[INTER_TAB_SIZE2][4][4]; +static short BicubicTab_i[INTER_TAB_SIZE2][4][4]; -namespace hal { +static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8]; +static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8]; -void resize(int src_type, - const uchar * src_data, size_t src_step, int src_width, int src_height, - uchar * dst_data, size_t dst_step, int dst_width, int dst_height, - double inv_scale_x, double inv_scale_y, int interpolation) +static inline void interpolateLinear( float x, float* coeffs ) { - CV_INSTRUMENT_REGION() - - CV_Assert((dst_width * dst_height > 0) || (inv_scale_x > 0 && inv_scale_y > 0)); - if (inv_scale_x < DBL_EPSILON || inv_scale_y < DBL_EPSILON) - { - inv_scale_x = static_cast(dst_width) / src_width; - inv_scale_y = static_cast(dst_height) / src_height; - } + coeffs[0] = 1.f - x; + coeffs[1] = x; +} - CALL_HAL(resize, cv_hal_resize, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation); +static inline void interpolateCubic( float x, float* coeffs ) +{ + const float A = -0.75f; - int depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type); - Size dsize = Size(saturate_cast(src_width*inv_scale_x), - saturate_cast(src_height*inv_scale_y)); - CV_Assert( dsize.area() > 0 ); + coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A; + coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1; + coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} - CV_IPP_RUN_FAST(ipp_resize(src_data, src_step, src_width, src_height, dst_data, dst_step, dsize.width, dsize.height, inv_scale_x, inv_scale_y, depth, cn, interpolation)) +static inline void interpolateLanczos4( float x, float* coeffs ) +{ + static const double s45 = 0.70710678118654752440084436210485; + static const double cs[][2]= + {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}}; - static ResizeFunc linear_tab[] = + if( x < FLT_EPSILON ) { - resizeGeneric_< - HResizeLinear, - VResizeLinear, - VResizeLinearVec_32s8u> >, - 0, - resizeGeneric_< - HResizeLinear, - VResizeLinear, - VResizeLinearVec_32f16u> >, - resizeGeneric_< - HResizeLinear, - VResizeLinear, - VResizeLinearVec_32f16s> >, - 0, - resizeGeneric_< - HResizeLinear, - VResizeLinear, - VResizeLinearVec_32f> >, - resizeGeneric_< - 
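// interpolateCubic above is the standard cubic convolution kernel with
// A = -0.75, and interpolateLanczos4 evaluates the Lanczos-4 window
// L(t) = sinc(t)*sinc(t/4) at the eight taps t = x + 3 - i, using a
// precomputed angle table so only one sin/cos pair is needed per call.
// A direct (slower) reference that should agree after the final
// normalisation, illustrative only:
#include <cmath>

static void lanczos4_weights_ref(float x, float* w)   // x in [0, 1)
{
    const double pi = 3.14159265358979323846;
    double sum = 0;
    for (int i = 0; i < 8; i++)
    {
        double t = x + 3 - i;                      // distance from the sample point
        double v = std::fabs(t) < 1e-7 ? 1.0
                 : std::sin(pi * t) * std::sin(pi * t / 4) / (pi * t * pi * t / 4);
        w[i] = (float)v;
        sum += v;
    }
    for (int i = 0; i < 8; i++)
        w[i] = (float)(w[i] / sum);                // weights sum to 1
}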
HResizeLinear, - VResizeLinear, - VResizeNoVec> >, - 0 - }; + for( int i = 0; i < 8; i++ ) + coeffs[i] = 0; + coeffs[3] = 1; + return; + } - static ResizeFunc cubic_tab[] = + float sum = 0; + double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0); + for(int i = 0; i < 8; i++ ) { - resizeGeneric_< - HResizeCubic, - VResizeCubic, - VResizeCubicVec_32s8u> >, - 0, - resizeGeneric_< - HResizeCubic, - VResizeCubic, - VResizeCubicVec_32f16u> >, - resizeGeneric_< - HResizeCubic, - VResizeCubic, - VResizeCubicVec_32f16s> >, - 0, - resizeGeneric_< - HResizeCubic, - VResizeCubic, - VResizeCubicVec_32f> >, - resizeGeneric_< - HResizeCubic, - VResizeCubic, - VResizeNoVec> >, - 0 - }; + double y = -(x+3-i)*CV_PI*0.25; + coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y)); + sum += coeffs[i]; + } - static ResizeFunc lanczos4_tab[] = - { - resizeGeneric_, - VResizeLanczos4, - VResizeNoVec> >, - 0, - resizeGeneric_, - VResizeLanczos4, - VResizeLanczos4Vec_32f16u> >, - resizeGeneric_, - VResizeLanczos4, - VResizeLanczos4Vec_32f16s> >, - 0, - resizeGeneric_, - VResizeLanczos4, - VResizeLanczos4Vec_32f> >, - resizeGeneric_, - VResizeLanczos4, - VResizeNoVec> >, - 0 - }; + sum = 1.f/sum; + for(int i = 0; i < 8; i++ ) + coeffs[i] *= sum; +} - static ResizeAreaFastFunc areafast_tab[] = +static void initInterTab1D(int method, float* tab, int tabsz) +{ + float scale = 1.f/tabsz; + if( method == INTER_LINEAR ) { - resizeAreaFast_ >, - 0, - resizeAreaFast_ >, - resizeAreaFast_ >, - 0, - resizeAreaFast_, - resizeAreaFast_ >, - 0 - }; - - static ResizeAreaFunc area_tab[] = + for( int i = 0; i < tabsz; i++, tab += 2 ) + interpolateLinear( i*scale, tab ); + } + else if( method == INTER_CUBIC ) { - resizeArea_, 0, resizeArea_, - resizeArea_, 0, resizeArea_, - resizeArea_, 0 - }; - - double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y; - - int iscale_x = saturate_cast(scale_x); - int iscale_y = saturate_cast(scale_y); - - bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON && - std::abs(scale_y - iscale_y) < DBL_EPSILON; - - Mat src(Size(src_width, src_height), src_type, const_cast(src_data), src_step); - Mat dst(dsize, src_type, dst_data, dst_step); - - if( interpolation == INTER_NEAREST ) + for( int i = 0; i < tabsz; i++, tab += 4 ) + interpolateCubic( i*scale, tab ); + } + else if( method == INTER_LANCZOS4 ) { - resizeNN( src, dst, inv_scale_x, inv_scale_y ); - return; + for( int i = 0; i < tabsz; i++, tab += 8 ) + interpolateLanczos4( i*scale, tab ); } + else + CV_Error( CV_StsBadArg, "Unknown interpolation method" ); +} - int k, sx, sy, dx, dy; +static const void* initInterTab2D( int method, bool fixpt ) +{ + static bool inittab[INTER_MAX+1] = {false}; + float* tab = 0; + short* itab = 0; + int ksize = 0; + if( method == INTER_LINEAR ) + tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2; + else if( method == INTER_CUBIC ) + tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4; + else if( method == INTER_LANCZOS4 ) + tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8; + else + CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" ); + if( !inittab[method] ) { - // in case of scale_x && scale_y is equal to 2 - // INTER_AREA (fast) also is equal to INTER_LINEAR - if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) - interpolation = INTER_AREA; - - // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1). 
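// initInterTab2D (continued below) builds every 2-D weight table as the outer
// product of two 1-D kernels: weight(k1, k2) = ky[k1]*kx[k2], evaluated for
// each (i, j) pair of fractional offsets on the INTER_TAB_SIZE grid.  The core
// of that loop, isolated for one offset pair:
static void make_tab2d(const float* kx, const float* ky, int ksize, float* tab)
{
    for (int k1 = 0; k1 < ksize; k1++)
        for (int k2 = 0; k2 < ksize; k2++)
            tab[k1 * ksize + k2] = ky[k1] * kx[k2];
}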
- // In other cases it is emulated using some variant of bilinear interpolation - if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 ) - { - if( is_area_fast ) + AutoBuffer _tab(8*INTER_TAB_SIZE); + int i, j, k1, k2; + initInterTab1D(method, _tab, INTER_TAB_SIZE); + for( i = 0; i < INTER_TAB_SIZE; i++ ) + for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize ) { - int area = iscale_x*iscale_y; - size_t srcstep = src_step / src.elemSize1(); - AutoBuffer _ofs(area + dsize.width*cn); - int* ofs = _ofs; - int* xofs = ofs + area; - ResizeAreaFastFunc func = areafast_tab[depth]; - CV_Assert( func != 0 ); - - for( sy = 0, k = 0; sy < iscale_y; sy++ ) - for( sx = 0; sx < iscale_x; sx++ ) - ofs[k++] = (int)(sy*srcstep + sx*cn); - - for( dx = 0; dx < dsize.width; dx++ ) + int isum = 0; + NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2; + NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2; + + for( k1 = 0; k1 < ksize; k1++ ) { - int j = dx * cn; - sx = iscale_x * j; - for( k = 0; k < cn; k++ ) - xofs[j + k] = sx + k; + float vy = _tab[i*ksize + k1]; + for( k2 = 0; k2 < ksize; k2++ ) + { + float v = vy*_tab[j*ksize + k2]; + tab[k1*ksize + k2] = v; + isum += itab[k1*ksize + k2] = saturate_cast(v*INTER_REMAP_COEF_SCALE); + } } - func( src, dst, ofs, xofs, iscale_x, iscale_y ); - return; - } - - ResizeAreaFunc func = area_tab[depth]; - CV_Assert( func != 0 && cn <= 4 ); - - AutoBuffer _xytab((src_width + src_height)*2); - DecimateAlpha* xtab = _xytab, *ytab = xtab + src_width*2; - - int xtab_size = computeResizeAreaTab(src_width, dsize.width, cn, scale_x, xtab); - int ytab_size = computeResizeAreaTab(src_height, dsize.height, 1, scale_y, ytab); - - AutoBuffer _tabofs(dsize.height + 1); - int* tabofs = _tabofs; - for( k = 0, dy = 0; k < ytab_size; k++ ) - { - if( k == 0 || ytab[k].di != ytab[k-1].di ) + if( isum != INTER_REMAP_COEF_SCALE ) { - assert( ytab[k].di == dy ); - tabofs[dy++] = k; + int diff = isum - INTER_REMAP_COEF_SCALE; + int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2; + for( k1 = ksize2; k1 < ksize2+2; k1++ ) + for( k2 = ksize2; k2 < ksize2+2; k2++ ) + { + if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] ) + mk1 = k1, mk2 = k2; + else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] ) + Mk1 = k1, Mk2 = k2; + } + if( diff < 0 ) + itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff); + else + itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff); } } - tabofs[dy] = ytab_size; - - func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs ); - return; - } - } - - int xmin = 0, xmax = dsize.width, width = dsize.width*cn; - bool area_mode = interpolation == INTER_AREA; - bool fixpt = depth == CV_8U; - float fx, fy; - ResizeFunc func=0; - int ksize=0, ksize2; - if( interpolation == INTER_CUBIC ) - ksize = 4, func = cubic_tab[depth]; - else if( interpolation == INTER_LANCZOS4 ) - ksize = 8, func = lanczos4_tab[depth]; - else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA ) - ksize = 2, func = linear_tab[depth]; - else - CV_Error( CV_StsBadArg, "Unknown interpolation method" ); - ksize2 = ksize/2; - - CV_Assert( func != 0 ); - - AutoBuffer _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize)); - int* xofs = (int*)(uchar*)_buffer; - int* yofs = xofs + width; - float* alpha = (float*)(yofs + dsize.height); - short* ialpha = (short*)alpha; - float* beta = alpha + width*ksize; - short* ibeta = ialpha + width*ksize; - float cbuf[MAX_ESIZE]; - - for( dx = 0; dx < dsize.width; dx++ ) - { - if( 
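// Rounding each fixed-point weight separately can make their sum differ
// slightly from the exact scale; the isum correction above pushes that
// difference into one of the four central taps so flat image areas stay
// exactly flat after remapping.  Minimal illustration (assuming the usual
// 1 << 15 remap coefficient scale; the real code picks the largest or
// smallest central tap instead of a fixed one):
static void fix_weight_sum(short* w, int n, int scale = 1 << 15)
{
    int sum = 0;
    for (int i = 0; i < n; i++)
        sum += w[i];
    w[n / 2] = (short)(w[n / 2] - (sum - scale));   // absorb the rounding drift
}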
!area_mode ) - { - fx = (float)((dx+0.5)*scale_x - 0.5); - sx = cvFloor(fx); - fx -= sx; - } - else - { - sx = cvFloor(dx*scale_x); - fx = (float)((dx+1) - (sx+1)*inv_scale_x); - fx = fx <= 0 ? 0.f : fx - cvFloor(fx); - } - - if( sx < ksize2-1 ) - { - xmin = dx+1; - if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) - fx = 0, sx = 0; - } - - if( sx + ksize2 >= src_width ) - { - xmax = std::min( xmax, dx ); - if( sx >= src_width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) - fx = 0, sx = src_width-1; - } - - for( k = 0, sx *= cn; k < cn; k++ ) - xofs[dx*cn + k] = sx + k; - - if( interpolation == INTER_CUBIC ) - interpolateCubic( fx, cbuf ); - else if( interpolation == INTER_LANCZOS4 ) - interpolateLanczos4( fx, cbuf ); - else - { - cbuf[0] = 1.f - fx; - cbuf[1] = fx; - } - if( fixpt ) - { - for( k = 0; k < ksize; k++ ) - ialpha[dx*cn*ksize + k] = saturate_cast(cbuf[k]*INTER_RESIZE_COEF_SCALE); - for( ; k < cn*ksize; k++ ) - ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize]; - } - else - { - for( k = 0; k < ksize; k++ ) - alpha[dx*cn*ksize + k] = cbuf[k]; - for( ; k < cn*ksize; k++ ) - alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize]; - } - } - - for( dy = 0; dy < dsize.height; dy++ ) - { - if( !area_mode ) - { - fy = (float)((dy+0.5)*scale_y - 0.5); - sy = cvFloor(fy); - fy -= sy; - } - else - { - sy = cvFloor(dy*scale_y); - fy = (float)((dy+1) - (sy+1)*inv_scale_y); - fy = fy <= 0 ? 0.f : fy - cvFloor(fy); - } - - yofs[dy] = sy; - if( interpolation == INTER_CUBIC ) - interpolateCubic( fy, cbuf ); - else if( interpolation == INTER_LANCZOS4 ) - interpolateLanczos4( fy, cbuf ); - else - { - cbuf[0] = 1.f - fy; - cbuf[1] = fy; - } - - if( fixpt ) - { - for( k = 0; k < ksize; k++ ) - ibeta[dy*ksize + k] = saturate_cast(cbuf[k]*INTER_RESIZE_COEF_SCALE); - } - else + tab -= INTER_TAB_SIZE2*ksize*ksize; + itab -= INTER_TAB_SIZE2*ksize*ksize; +#if CV_SSE2 || CV_NEON + if( method == INTER_LINEAR ) { - for( k = 0; k < ksize; k++ ) - beta[dy*ksize + k] = cbuf[k]; + for( i = 0; i < INTER_TAB_SIZE2; i++ ) + for( j = 0; j < 4; j++ ) + { + BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0]; + BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1]; + BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0]; + BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1]; + } } +#endif + inittab[method] = true; } - - func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs, - fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize ); + return fixpt ? 
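// For 8-bit images the resize coefficients are stored as 16-bit fixed point
// (value * INTER_RESIZE_COEF_SCALE, assumed here to be 1 << 11), which turns
// each interpolation step into an integer multiply-accumulate.  A
// one-dimensional sketch of the quantisation and the rounding shift; the real
// code keeps 2*11 fractional bits through the combined horizontal + vertical
// pass and uses saturate_cast for the final store:
static inline unsigned char lerp_u8_fixpt(unsigned char a, unsigned char b, float fx)
{
    const int BITS = 11, SCALE = 1 << BITS;          // assumed coefficient scale
    short ia = (short)((1.f - fx) * SCALE + 0.5f);
    short ib = (short)(fx * SCALE + 0.5f);
    int acc = a * ia + b * ib;                       // 8-bit value with 11 fractional bits
    return (unsigned char)((acc + (SCALE >> 1)) >> BITS);
}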
(const void*)itab : (const void*)tab; } -} // cv::hal:: -} // cv:: - -//================================================================================================== - -void cv::resize( InputArray _src, OutputArray _dst, Size dsize, - double inv_scale_x, double inv_scale_y, int interpolation ) +#ifndef __MINGW32__ +static bool initAllInterTab2D() { - CV_INSTRUMENT_REGION() + return initInterTab2D( INTER_LINEAR, false ) && + initInterTab2D( INTER_LINEAR, true ) && + initInterTab2D( INTER_CUBIC, false ) && + initInterTab2D( INTER_CUBIC, true ) && + initInterTab2D( INTER_LANCZOS4, false ) && + initInterTab2D( INTER_LANCZOS4, true ); +} - Size ssize = _src.size(); +static volatile bool doInitAllInterTab2D = initAllInterTab2D(); +#endif - CV_Assert( ssize.width > 0 && ssize.height > 0 ); - CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) ); - if( dsize.area() == 0 ) - { - dsize = Size(saturate_cast(ssize.width*inv_scale_x), - saturate_cast(ssize.height*inv_scale_y)); - CV_Assert( dsize.area() > 0 ); - } - else - { - inv_scale_x = (double)dsize.width/ssize.width; - inv_scale_y = (double)dsize.height/ssize.height; - } +template struct Cast +{ + typedef ST type1; + typedef DT rtype; - CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10, - ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation)) + DT operator()(ST val) const { return saturate_cast
(val); } +}; - Mat src = _src.getMat(); - _dst.create(dsize, src.type()); - Mat dst = _dst.getMat(); +template struct FixedPtCast +{ + typedef ST type1; + typedef DT rtype; + enum { SHIFT = bits, DELTA = 1 << (bits-1) }; - if (dsize == ssize) - { - // Source and destination are of same size. Use simple copy. - src.copyTo(dst); - return; - } + DT operator()(ST val) const { return saturate_cast
((val + DELTA)>>SHIFT); } +}; - hal::resize(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, inv_scale_x, inv_scale_y, interpolation); +static inline int clip(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b-1) : a; } - /****************************************************************************************\ * General warping (affine, perspective, remap) * \****************************************************************************************/ -namespace cv -{ - template static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy, int borderType, const Scalar& _borderValue ) @@ -6434,16 +3234,6 @@ cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst) return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data); } -CV_IMPL void -cvResize( const CvArr* srcarr, CvArr* dstarr, int method ) -{ - cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); - CV_Assert( src.type() == dst.type() ); - cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols, - (double)dst.rows/src.rows, method ); -} - - CV_IMPL void cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr, int flags, CvScalar fillval ) diff --git a/modules/imgproc/src/imgwarp.hpp b/modules/imgproc/src/imgwarp.hpp index ed1146dcd3..ef1f6646a2 100644 --- a/modules/imgproc/src/imgwarp.hpp +++ b/modules/imgproc/src/imgwarp.hpp @@ -56,8 +56,6 @@ namespace cv namespace opt_AVX2 { #if CV_TRY_AVX2 -void resizeNN2_AVX2(const Range&, const Mat&, Mat&, int*, int, double); -void resizeNN4_AVX2(const Range&, const Mat&, Mat&, int*, int, double); int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw); #endif } @@ -65,10 +63,6 @@ int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X namespace opt_SSE4_1 { #if CV_TRY_SSE4_1 -void resizeNN2_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); -void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); - -int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width); void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width); void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width); void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width); diff --git a/modules/imgproc/src/imgwarp.sse4_1.cpp b/modules/imgproc/src/imgwarp.sse4_1.cpp index 49954b1dbc..c25967fcc9 100644 --- a/modules/imgproc/src/imgwarp.sse4_1.cpp +++ b/modules/imgproc/src/imgwarp.sse4_1.cpp @@ -55,179 +55,6 @@ namespace cv namespace opt_SSE4_1 { -class resizeNNInvokerSSE2 : - public ParallelLoopBody -{ -public: - resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } - -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=SSE4.2 -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x; - int width = dsize.width; - int sseWidth = width - (width & 0x7); - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); - for(x = 0; x < sseWidth; x += 8) - { 
- ushort imm = *(ushort*)(S + x_ofs[x + 0]); - pixels = _mm_insert_epi16(pixels, imm, 0); - imm = *(ushort*)(S + x_ofs[x + 1]); - pixels = _mm_insert_epi16(pixels, imm, 1); - imm = *(ushort*)(S + x_ofs[x + 2]); - pixels = _mm_insert_epi16(pixels, imm, 2); - imm = *(ushort*)(S + x_ofs[x + 3]); - pixels = _mm_insert_epi16(pixels, imm, 3); - imm = *(ushort*)(S + x_ofs[x + 4]); - pixels = _mm_insert_epi16(pixels, imm, 4); - imm = *(ushort*)(S + x_ofs[x + 5]); - pixels = _mm_insert_epi16(pixels, imm, 5); - imm = *(ushort*)(S + x_ofs[x + 6]); - pixels = _mm_insert_epi16(pixels, imm, 6); - imm = *(ushort*)(S + x_ofs[x + 7]); - pixels = _mm_insert_epi16(pixels, imm, 7); - _mm_storeu_si128((__m128i*)D, pixels); - D += 16; - } - for(; x < width; x++) - { - *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); - } - } - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvokerSSE2(const resizeNNInvokerSSE2&); - resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&); -}; - -class resizeNNInvokerSSE4 : - public ParallelLoopBody -{ -public: - resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : - ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), - ify(_ify) - { - } -#if defined(__INTEL_COMPILER) -#pragma optimization_parameter target_arch=SSE4.2 -#endif - virtual void operator() (const Range& range) const - { - Size ssize = src.size(), dsize = dst.size(); - int y, x; - int width = dsize.width; - int sseWidth = width - (width & 0x3); - for(y = range.start; y < range.end; y++) - { - uchar* D = dst.data + dst.step*y; - uchar* Dstart = D; - int sy = std::min(cvFloor(y*ify), ssize.height-1); - const uchar* S = src.data + sy*src.step; - __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); - for(x = 0; x < sseWidth; x += 4) - { - int imm = *(int*)(S + x_ofs[x + 0]); - pixels = _mm_insert_epi32(pixels, imm, 0); - imm = *(int*)(S + x_ofs[x + 1]); - pixels = _mm_insert_epi32(pixels, imm, 1); - imm = *(int*)(S + x_ofs[x + 2]); - pixels = _mm_insert_epi32(pixels, imm, 2); - imm = *(int*)(S + x_ofs[x + 3]); - pixels = _mm_insert_epi32(pixels, imm, 3); - _mm_storeu_si128((__m128i*)D, pixels); - D += 16; - } - for(; x < width; x++) - { - *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); - } - } - } - -private: - const Mat src; - Mat dst; - int* x_ofs, pix_size4; - double ify; - - resizeNNInvokerSSE4(const resizeNNInvokerSSE4&); - resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&); -}; - -void resizeNN2_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) -{ - resizeNNInvokerSSE2 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); -} - -void resizeNN4_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) -{ - resizeNNInvokerSSE4 invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); -} - -int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width) -{ - const float** src = (const float**)_src; - const float* beta = (const float*)_beta; - const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], - *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; - short * dst = (short*)_dst; - int x = 0; - __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), - v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), - v_b4 = _mm_set1_ps(beta[4]), v_b5 = 
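// All of the nearest-neighbour invokers (SSE2/SSE4/AVX2 and the scalar tails)
// gather pixels through the same precomputed x_ofs[] table, which stores the
// byte offset of the chosen source pixel for every destination column, so a
// destination row is produced by pure gathering.  Scalar reference, together
// with the table construction the callers perform once per image (names
// illustrative):
#include <algorithm>
#include <cmath>
#include <cstring>

static void build_x_ofs(int* x_ofs, int dwidth, int swidth, double ifx, int pix_size)
{
    for (int x = 0; x < dwidth; x++)
        x_ofs[x] = std::min((int)std::floor(x * ifx), swidth - 1) * pix_size;
}

static void resize_nn_row(const unsigned char* S, unsigned char* D,
                          const int* x_ofs, int dwidth, int pix_size)
{
    for (int x = 0; x < dwidth; x++)
        std::memcpy(D + x * pix_size, S + x_ofs[x], pix_size);   // copy one pixel
}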
_mm_set1_ps(beta[5]), - v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); - - for (; x <= width - 8; x += 8) - { - __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); - v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); - - __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); - v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); - - __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); - __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1)); - } - - return x; -} - void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width) { int x = 0; diff --git a/modules/imgproc/src/resize.avx2.cpp b/modules/imgproc/src/resize.avx2.cpp new file mode 100644 index 0000000000..0d934da169 --- /dev/null +++ b/modules/imgproc/src/resize.avx2.cpp @@ -0,0 +1,261 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. +// +// */ + +#include "precomp.hpp" +#include "resize.hpp" + +namespace cv +{ +namespace opt_AVX2 +{ + +class resizeNNInvokerAVX4 : + public ParallelLoopBody +{ +public: + resizeNNInvokerAVX4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } + +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=AVX +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int avxWidth = width - (width & 0x7); + const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); + if(((int64)(dst.data + dst.step) & 0x1f) == 0) + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 8) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); + _mm256_maskstore_epi32((int*)D, mask, pixels); + D += 32; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + else + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 8) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels = _mm256_i32gather_epi32((const int*)S, indices, 1); + _mm256_storeu_si256((__m256i*)D, pixels); + D += 32; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + _mm256_zeroupper(); + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerAVX4(const resizeNNInvokerAVX4&); + resizeNNInvokerAVX4& operator=(const resizeNNInvokerAVX4&); +}; + +class resizeNNInvokerAVX2 : + public ParallelLoopBody +{ +public: + resizeNNInvokerAVX2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } + +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=AVX +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + //int avxWidth = (width - 1) - ((width - 1) & 
0x7); + int avxWidth = width - (width & 0xf); + const __m256i CV_DECL_ALIGNED(64) mask = _mm256_set1_epi32(-1); + const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0, + 15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0); + const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + //const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2, + // 13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2); + if(((int64)(dst.data + dst.step) & 0x1f) == 0) + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + const uchar* S2 = S - 2; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 16) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); + const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); + __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); + __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); + __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); + + __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); + __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); + _mm256_maskstore_epi32((int*)D, mask, ints_permuted); + D += 32; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + + } + } + else + { + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + const uchar* S2 = S - 2; +#ifdef CV_ICC +#pragma unroll(4) +#endif + for(x = 0; x < avxWidth; x += 16) + { + const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x); + __m256i CV_DECL_ALIGNED(64) indices = _mm256_lddqu_si256(addr); + __m256i CV_DECL_ALIGNED(64) pixels1 = _mm256_i32gather_epi32((const int*)S, indices, 1); + const __m256i CV_DECL_ALIGNED(64) *addr2 = (__m256i*)(x_ofs + x + 8); + __m256i CV_DECL_ALIGNED(64) indices2 = _mm256_lddqu_si256(addr2); + __m256i CV_DECL_ALIGNED(64) pixels2 = _mm256_i32gather_epi32((const int*)S2, indices2, 1); + __m256i CV_DECL_ALIGNED(64) unpacked = _mm256_blend_epi16(pixels1, pixels2, 0xaa); + + __m256i CV_DECL_ALIGNED(64) bytes_shuffled = _mm256_shuffle_epi8(unpacked, shuffle_mask); + __m256i CV_DECL_ALIGNED(64) ints_permuted = _mm256_permutevar8x32_epi32(bytes_shuffled, permute_mask); + _mm256_storeu_si256((__m256i*)D, ints_permuted); + D += 32; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + } + } + _mm256_zeroupper(); + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerAVX2(const resizeNNInvokerAVX2&); + resizeNNInvokerAVX2& operator=(const resizeNNInvokerAVX2&); +}; + +void resizeNN2_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) +{ + resizeNNInvokerAVX2 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +void resizeNN4_AVX2(const Range& range, const Mat& src, Mat &dst, int *x_ofs, 
int pix_size4, double ify) +{ + resizeNNInvokerAVX4 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +} +} +/* End of file. */ diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp new file mode 100644 index 0000000000..13dbf3b795 --- /dev/null +++ b/modules/imgproc/src/resize.cpp @@ -0,0 +1,3330 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, 2017, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. 
+// +// */ + +#include "precomp.hpp" +#include "opencl_kernels_imgproc.hpp" +#include "hal_replacement.hpp" + +#include "opencv2/core/openvx/ovx_defs.hpp" +#include "resize.hpp" + +using namespace cv; + +namespace cv +{ + +/************** interpolation formulas and tables ***************/ + +const int INTER_RESIZE_COEF_BITS=11; +const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS; + +static inline void interpolateCubic( float x, float* coeffs ) +{ + const float A = -0.75f; + + coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A; + coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1; + coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +static inline void interpolateLanczos4( float x, float* coeffs ) +{ + static const double s45 = 0.70710678118654752440084436210485; + static const double cs[][2]= + {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}}; + + if( x < FLT_EPSILON ) + { + for( int i = 0; i < 8; i++ ) + coeffs[i] = 0; + coeffs[3] = 1; + return; + } + + float sum = 0; + double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0); + for(int i = 0; i < 8; i++ ) + { + double y = -(x+3-i)*CV_PI*0.25; + coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y)); + sum += coeffs[i]; + } + + sum = 1.f/sum; + for(int i = 0; i < 8; i++ ) + coeffs[i] *= sum; +} + +template<typename ST, typename DT> struct Cast +{ + typedef ST type1; + typedef DT rtype; + + DT operator()(ST val) const { return saturate_cast<DT>
(val); } +}; + +template<typename ST, typename DT, int bits> struct FixedPtCast +{ + typedef ST type1; + typedef DT rtype; + enum { SHIFT = bits, DELTA = 1 << (bits-1) }; + + DT operator()(ST val) const { return saturate_cast<DT>
((val + DELTA)>>SHIFT); } +}; + +/****************************************************************************************\ +* Resize * +\****************************************************************************************/ + +class resizeNNInvoker : + public ParallelLoopBody +{ +public: + resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } + + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x, pix_size = (int)src.elemSize(); + + for( y = range.start; y < range.end; y++ ) + { + uchar* D = dst.data + dst.step*y; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.ptr(sy); + + switch( pix_size ) + { + case 1: + for( x = 0; x <= dsize.width - 2; x += 2 ) + { + uchar t0 = S[x_ofs[x]]; + uchar t1 = S[x_ofs[x+1]]; + D[x] = t0; + D[x+1] = t1; + } + + for( ; x < dsize.width; x++ ) + D[x] = S[x_ofs[x]]; + break; + case 2: + for( x = 0; x < dsize.width; x++ ) + *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]); + break; + case 3: + for( x = 0; x < dsize.width; x++, D += 3 ) + { + const uchar* _tS = S + x_ofs[x]; + D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2]; + } + break; + case 4: + for( x = 0; x < dsize.width; x++ ) + *(int*)(D + x*4) = *(int*)(S + x_ofs[x]); + break; + case 6: + for( x = 0; x < dsize.width; x++, D += 6 ) + { + const ushort* _tS = (const ushort*)(S + x_ofs[x]); + ushort* _tD = (ushort*)D; + _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; + } + break; + case 8: + for( x = 0; x < dsize.width; x++, D += 8 ) + { + const int* _tS = (const int*)(S + x_ofs[x]); + int* _tD = (int*)D; + _tD[0] = _tS[0]; _tD[1] = _tS[1]; + } + break; + case 12: + for( x = 0; x < dsize.width; x++, D += 12 ) + { + const int* _tS = (const int*)(S + x_ofs[x]); + int* _tD = (int*)D; + _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; + } + break; + default: + for( x = 0; x < dsize.width; x++, D += pix_size ) + { + const int* _tS = (const int*)(S + x_ofs[x]); + int* _tD = (int*)D; + for( int k = 0; k < pix_size4; k++ ) + _tD[k] = _tS[k]; + } + } + } + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvoker(const resizeNNInvoker&); + resizeNNInvoker& operator=(const resizeNNInvoker&); +}; + +static void +resizeNN( const Mat& src, Mat& dst, double fx, double fy ) +{ + Size ssize = src.size(), dsize = dst.size(); + AutoBuffer _x_ofs(dsize.width); + int* x_ofs = _x_ofs; + int pix_size = (int)src.elemSize(); + int pix_size4 = (int)(pix_size / sizeof(int)); + double ifx = 1./fx, ify = 1./fy; + int x; + + for( x = 0; x < dsize.width; x++ ) + { + int sx = cvFloor(x*ifx); + x_ofs[x] = std::min(sx, ssize.width-1)*pix_size; + } + + Range range(0, dsize.height); +#if CV_TRY_AVX2 + if(CV_CPU_HAS_SUPPORT_AVX2 && ((pix_size == 2) || (pix_size == 4))) + { + if(pix_size == 2) + opt_AVX2::resizeNN2_AVX2(range, src, dst, x_ofs, pix_size4, ify); + else + opt_AVX2::resizeNN4_AVX2(range, src, dst, x_ofs, pix_size4, ify); + } + else +#endif +#if CV_TRY_SSE4_1 + if(CV_CPU_HAS_SUPPORT_SSE4_1 && ((pix_size == 2) || (pix_size == 4))) + { + if(pix_size == 2) + opt_SSE4_1::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); + else + opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, pix_size4, ify); + } + else +#endif + { + resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total()/(double)(1<<16)); + } +} + + +struct 
VResizeNoVec +{ + int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; } +}; + +struct HResizeNoVec +{ + int operator()(const uchar**, uchar**, int, const int*, + const uchar*, int, int, int, int, int) const { return 0; } +}; + +#if CV_SSE2 + +struct VResizeLinearVec_32s8u +{ + int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE2) ) + return 0; + + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1]; + int x = 0; + __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]); + __m128i delta = _mm_set1_epi16(2); + + if( (((size_t)S0|(size_t)S1)&15) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m128i x0, x1, x2, y0, y1, y2; + x0 = _mm_load_si128((const __m128i*)(S0 + x)); + x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_load_si128((const __m128i*)(S1 + x)); + y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); + x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); + y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); + + x1 = _mm_load_si128((const __m128i*)(S0 + x + 8)); + x2 = _mm_load_si128((const __m128i*)(S0 + x + 12)); + y1 = _mm_load_si128((const __m128i*)(S1 + x + 8)); + y2 = _mm_load_si128((const __m128i*)(S1 + x + 12)); + x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); + y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); + + x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); + x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); + + x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); + x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); + _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m128i x0, x1, x2, y0, y1, y2; + x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); + x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); + y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); + x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); + y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); + + x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8)); + x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12)); + y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8)); + y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12)); + x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); + y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); + + x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); + x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); + + x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); + x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); + _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); + } + + for( ; x < width - 4; x += 4 ) + { + __m128i x0, y0; + x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4); + y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4); + x0 = _mm_packs_epi32(x0, x0); + y0 = _mm_packs_epi32(y0, y0); + x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1)); + x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); + x0 = _mm_packus_epi16(x0, x0); + *(int*)(dst + x) = _mm_cvtsi128_si32(x0); + } + + return x; + } +}; + + +template struct VResizeLinearVec_32f16 +{ + int operator()(const uchar** _src, uchar* _dst, 
const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE2) ) + return 0; + + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + ushort* dst = (ushort*)_dst; + int x = 0; + + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); + __m128i preshift = _mm_set1_epi32(shiftval); + __m128i postshift = _mm_set1_epi16((short)shiftval); + + if( (((size_t)S0|(size_t)S1)&15) == 0 ) + for( ; x <= width - 16; x += 16 ) + { + __m128 x0, x1, y0, y1; + __m128i t0, t1, t2; + x0 = _mm_load_ps(S0 + x); + x1 = _mm_load_ps(S0 + x + 4); + y0 = _mm_load_ps(S1 + x); + y1 = _mm_load_ps(S1 + x + 4); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); + + x0 = _mm_load_ps(S0 + x + 8); + x1 = _mm_load_ps(S0 + x + 12); + y0 = _mm_load_ps(S1 + x + 8); + y1 = _mm_load_ps(S1 + x + 12); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); + + _mm_storeu_si128( (__m128i*)(dst + x), t0); + _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); + } + else + for( ; x <= width - 16; x += 16 ) + { + __m128 x0, x1, y0, y1; + __m128i t0, t1, t2; + x0 = _mm_loadu_ps(S0 + x); + x1 = _mm_loadu_ps(S0 + x + 4); + y0 = _mm_loadu_ps(S1 + x); + y1 = _mm_loadu_ps(S1 + x + 4); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); + + x0 = _mm_loadu_ps(S0 + x + 8); + x1 = _mm_loadu_ps(S0 + x + 12); + y0 = _mm_loadu_ps(S1 + x + 8); + y1 = _mm_loadu_ps(S1 + x + 12); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); + t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); + + _mm_storeu_si128( (__m128i*)(dst + x), t0); + _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); + } + + for( ; x < width - 4; x += 4 ) + { + __m128 x0, y0; + __m128i t0; + x0 = _mm_loadu_ps(S0 + x); + y0 = _mm_loadu_ps(S1 + x); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift); + _mm_storel_epi64( (__m128i*)(dst + x), t0); + } + + return x; + } +}; + +typedef VResizeLinearVec_32f16 VResizeLinearVec_32f16u; +typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s; + +struct VResizeLinearVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE) ) + return 0; + + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + float* dst = (float*)_dst; + int x = 0; + + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); + + if( (((size_t)S0|(size_t)S1)&15) == 0 ) + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1; + x0 = _mm_load_ps(S0 + 
x); + x1 = _mm_load_ps(S0 + x + 4); + y0 = _mm_load_ps(S1 + x); + y1 = _mm_load_ps(S1 + x + 4); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + + _mm_storeu_ps( dst + x, x0); + _mm_storeu_ps( dst + x + 4, x1); + } + else + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1; + x0 = _mm_loadu_ps(S0 + x); + x1 = _mm_loadu_ps(S0 + x + 4); + y0 = _mm_loadu_ps(S1 + x); + y1 = _mm_loadu_ps(S1 + x + 4); + + x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); + x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); + + _mm_storeu_ps( dst + x, x0); + _mm_storeu_ps( dst + x + 4, x1); + } + + return x; + } +}; + + +struct VResizeCubicVec_32s8u +{ + int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE2) ) + return 0; + + const int** src = (const int**)_src; + const short* beta = (const short*)_beta; + const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + int x = 0; + float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); + __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale), + b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale); + + if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) + for( ; x <= width - 8; x += 8 ) + { + __m128i x0, x1, y0, y1; + __m128 s0, s1, f0, f1; + x0 = _mm_load_si128((const __m128i*)(S0 + x)); + x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_load_si128((const __m128i*)(S1 + x)); + y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); + + s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); + s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = _mm_load_si128((const __m128i*)(S2 + x)); + x1 = _mm_load_si128((const __m128i*)(S2 + x + 4)); + y0 = _mm_load_si128((const __m128i*)(S3 + x)); + y1 = _mm_load_si128((const __m128i*)(S3 + x + 4)); + + f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = _mm_cvtps_epi32(s0); + x1 = _mm_cvtps_epi32(s1); + + x0 = _mm_packs_epi32(x0, x1); + _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); + } + else + for( ; x <= width - 8; x += 8 ) + { + __m128i x0, x1, y0, y1; + __m128 s0, s1, f0, f1; + x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); + x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); + y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); + y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); + + s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); + s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = _mm_loadu_si128((const __m128i*)(S2 + x)); + x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4)); + y0 = _mm_loadu_si128((const __m128i*)(S3 + x)); + y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4)); + + f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); + f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); + s0 = _mm_add_ps(s0, f0); + s1 = _mm_add_ps(s1, f1); + + x0 = 
_mm_cvtps_epi32(s0); + x1 = _mm_cvtps_epi32(s1); + + x0 = _mm_packs_epi32(x0, x1); + _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); + } + + return x; + } +}; + + +template struct VResizeCubicVec_32f16 +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE2) ) + return 0; + + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + ushort* dst = (ushort*)_dst; + int x = 0; + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), + b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); + __m128i preshift = _mm_set1_epi32(shiftval); + __m128i postshift = _mm_set1_epi16((short)shiftval); + + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1, s0, s1; + __m128i t0, t1; + x0 = _mm_loadu_ps(S0 + x); + x1 = _mm_loadu_ps(S0 + x + 4); + y0 = _mm_loadu_ps(S1 + x); + y1 = _mm_loadu_ps(S1 + x + 4); + + s0 = _mm_mul_ps(x0, b0); + s1 = _mm_mul_ps(x1, b0); + y0 = _mm_mul_ps(y0, b1); + y1 = _mm_mul_ps(y1, b1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); + + x0 = _mm_loadu_ps(S2 + x); + x1 = _mm_loadu_ps(S2 + x + 4); + y0 = _mm_loadu_ps(S3 + x); + y1 = _mm_loadu_ps(S3 + x + 4); + + x0 = _mm_mul_ps(x0, b2); + x1 = _mm_mul_ps(x1, b2); + y0 = _mm_mul_ps(y0, b3); + y1 = _mm_mul_ps(y1, b3); + s0 = _mm_add_ps(s0, x0); + s1 = _mm_add_ps(s1, x1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); + + t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift); + t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift); + + t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift); + _mm_storeu_si128( (__m128i*)(dst + x), t0); + } + + return x; + } +}; + +typedef VResizeCubicVec_32f16 VResizeCubicVec_32f16u; +typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s; + +struct VResizeCubicVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + if( !checkHardwareSupport(CV_CPU_SSE) ) + return 0; + + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + float* dst = (float*)_dst; + int x = 0; + __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), + b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + __m128 x0, x1, y0, y1, s0, s1; + x0 = _mm_loadu_ps(S0 + x); + x1 = _mm_loadu_ps(S0 + x + 4); + y0 = _mm_loadu_ps(S1 + x); + y1 = _mm_loadu_ps(S1 + x + 4); + + s0 = _mm_mul_ps(x0, b0); + s1 = _mm_mul_ps(x1, b0); + y0 = _mm_mul_ps(y0, b1); + y1 = _mm_mul_ps(y1, b1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); + + x0 = _mm_loadu_ps(S2 + x); + x1 = _mm_loadu_ps(S2 + x + 4); + y0 = _mm_loadu_ps(S3 + x); + y1 = _mm_loadu_ps(S3 + x + 4); + + x0 = _mm_mul_ps(x0, b2); + x1 = _mm_mul_ps(x1, b2); + y0 = _mm_mul_ps(y0, b3); + y1 = _mm_mul_ps(y1, b3); + s0 = _mm_add_ps(s0, x0); + s1 = _mm_add_ps(s1, x1); + s0 = _mm_add_ps(s0, y0); + s1 = _mm_add_ps(s1, y1); + + _mm_storeu_ps( dst + x, s0); + _mm_storeu_ps( dst + x + 4, s1); + } + + return x; + } +}; + +#if CV_TRY_SSE4_1 + +struct VResizeLanczos4Vec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + if (CV_CPU_HAS_SUPPORT_SSE4_1) return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width); + else return 0; + } +}; + +#else + +typedef VResizeNoVec VResizeLanczos4Vec_32f16u; + +#endif + +struct 
VResizeLanczos4Vec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + short * dst = (short*)_dst; + int x = 0; + __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), + v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), + v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), + v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); + + for( ; x <= width - 8; x += 8 ) + { + __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); + + __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); + + __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); + __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1)); + } + + return x; + } +}; + + +struct VResizeLanczos4Vec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + float* dst = (float*)_dst; + int x = 0; + + __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), + v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), + v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), + v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); + + for( ; x <= width - 4; x += 4 ) + { + __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); + v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); + + _mm_storeu_ps(dst + x, v_dst); + } + + return x; + } +}; + + +#elif CV_NEON + +struct VResizeLinearVec_32s8u +{ + int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const + { + const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1]; + const short* beta = (const short*)_beta; + int x = 0; + 
int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2); + + for( ; x <= width - 16; x += 16) + { + int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4); + int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4); + + int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); + int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); + + int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), + vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); + v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2); + + v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4); + v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4); + v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4); + v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4); + + v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); + v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); + + int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), + vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); + v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2); + + vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1))); + } + + return x; + } +}; + +struct VResizeLinearVec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + ushort* dst = (ushort*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); + float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLinearVec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + short* dst = (short*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); + float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLinearVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + float* dst = (float*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), 
v_src11 = vld1q_f32(S1 + x + 4); + + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); + } + + return x; + } +}; + +typedef VResizeNoVec VResizeCubicVec_32s8u; + +struct VResizeCubicVec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + ushort* dst = (ushort*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeCubicVec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + short* dst = (short*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeCubicVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + float* dst = (float*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x))); + vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4))); + } + + return x; + } +}; + +struct VResizeLanczos4Vec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + ushort * dst = (ushort*)_dst; + int x = 0; + float32x4_t 
v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), + v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), + v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), + v_b5, vld1q_f32(S5 + x)), + v_b6, vld1q_f32(S6 + x)), + v_b7, vld1q_f32(S7 + x)); + float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); + + v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), + v_b5, vld1q_f32(S5 + x + 4)), + v_b6, vld1q_f32(S6 + x + 4)), + v_b7, vld1q_f32(S7 + x + 4)); + v_dst1 = vaddq_f32(v_dst0, v_dst1); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLanczos4Vec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + short * dst = (short*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), + v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), + v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), + v_b5, vld1q_f32(S5 + x)), + v_b6, vld1q_f32(S6 + x)), + v_b7, vld1q_f32(S7 + x)); + float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); + + v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), + v_b5, vld1q_f32(S5 + x + 4)), + v_b6, vld1q_f32(S6 + x + 4)), + v_b7, vld1q_f32(S7 + x + 4)); + v_dst1 = vaddq_f32(v_dst0, v_dst1); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLanczos4Vec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + float* dst = (float*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), + v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), + v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); + + for( ; x <= width - 4; x += 4 ) + { + float32x4_t v_dst0 = 
vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), + v_b5, vld1q_f32(S5 + x)), + v_b6, vld1q_f32(S6 + x)), + v_b7, vld1q_f32(S7 + x)); + vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1)); + } + + return x; + } +}; + +#else + +typedef VResizeNoVec VResizeLinearVec_32s8u; +typedef VResizeNoVec VResizeLinearVec_32f16u; +typedef VResizeNoVec VResizeLinearVec_32f16s; +typedef VResizeNoVec VResizeLinearVec_32f; + +typedef VResizeNoVec VResizeCubicVec_32s8u; +typedef VResizeNoVec VResizeCubicVec_32f16u; +typedef VResizeNoVec VResizeCubicVec_32f16s; +typedef VResizeNoVec VResizeCubicVec_32f; + +typedef VResizeNoVec VResizeLanczos4Vec_32f16u; +typedef VResizeNoVec VResizeLanczos4Vec_32f16s; +typedef VResizeNoVec VResizeLanczos4Vec_32f; + +#endif + +typedef HResizeNoVec HResizeLinearVec_8u32s; +typedef HResizeNoVec HResizeLinearVec_16u32f; +typedef HResizeNoVec HResizeLinearVec_16s32f; +typedef HResizeNoVec HResizeLinearVec_32f; +typedef HResizeNoVec HResizeLinearVec_64f; + + +template +struct HResizeLinear +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const T** src, WT** dst, int count, + const int* xofs, const AT* alpha, + int swidth, int dwidth, int cn, int xmin, int xmax ) const + { + int dx, k; + VecOp vecOp; + + int dx0 = vecOp((const uchar**)src, (uchar**)dst, count, + xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax ); + + for( k = 0; k <= count - 2; k++ ) + { + const T *S0 = src[k], *S1 = src[k+1]; + WT *D0 = dst[k], *D1 = dst[k+1]; + for( dx = dx0; dx < xmax; dx++ ) + { + int sx = xofs[dx]; + WT a0 = alpha[dx*2], a1 = alpha[dx*2+1]; + WT t0 = S0[sx]*a0 + S0[sx + cn]*a1; + WT t1 = S1[sx]*a0 + S1[sx + cn]*a1; + D0[dx] = t0; D1[dx] = t1; + } + + for( ; dx < dwidth; dx++ ) + { + int sx = xofs[dx]; + D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE); + } + } + + for( ; k < count; k++ ) + { + const T *S = src[k]; + WT *D = dst[k]; + for( dx = 0; dx < xmax; dx++ ) + { + int sx = xofs[dx]; + D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1]; + } + + for( ; dx < dwidth; dx++ ) + D[dx] = WT(S[xofs[dx]]*ONE); + } + } +}; + + +template +struct VResizeLinear +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const WT** src, T* dst, const AT* beta, int width ) const + { + WT b0 = beta[0], b1 = beta[1]; + const WT *S0 = src[0], *S1 = src[1]; + CastOp castOp; + VecOp vecOp; + + int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + WT t0, t1; + t0 = S0[x]*b0 + S1[x]*b1; + t1 = S0[x+1]*b0 + S1[x+1]*b1; + dst[x] = castOp(t0); dst[x+1] = castOp(t1); + t0 = S0[x+2]*b0 + S1[x+2]*b1; + t1 = S0[x+3]*b0 + S1[x+3]*b1; + dst[x+2] = castOp(t0); dst[x+3] = castOp(t1); + } + #endif + for( ; x < width; x++ ) + dst[x] = castOp(S0[x]*b0 + S1[x]*b1); + } +}; + +template<> +struct VResizeLinear, VResizeLinearVec_32s8u> +{ + typedef uchar value_type; + typedef int buf_type; + typedef short alpha_type; + + void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const + { + alpha_type b0 = beta[0], b1 = beta[1]; + const buf_type *S0 = src[0], *S1 = src[1]; + VResizeLinearVec_32s8u vecOp; + + int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { 
+ dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2); + dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2); + dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2); + dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2); + } + #endif + for( ; x < width; x++ ) + dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2); + } +}; + + +template +struct HResizeCubic +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const T** src, WT** dst, int count, + const int* xofs, const AT* alpha, + int swidth, int dwidth, int cn, int xmin, int xmax ) const + { + for( int k = 0; k < count; k++ ) + { + const T *S = src[k]; + WT *D = dst[k]; + int dx = 0, limit = xmin; + for(;;) + { + for( ; dx < limit; dx++, alpha += 4 ) + { + int j, sx = xofs[dx] - cn; + WT v = 0; + for( j = 0; j < 4; j++ ) + { + int sxj = sx + j*cn; + if( (unsigned)sxj >= (unsigned)swidth ) + { + while( sxj < 0 ) + sxj += cn; + while( sxj >= swidth ) + sxj -= cn; + } + v += S[sxj]*alpha[j]; + } + D[dx] = v; + } + if( limit == dwidth ) + break; + for( ; dx < xmax; dx++, alpha += 4 ) + { + int sx = xofs[dx]; + D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] + + S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3]; + } + limit = dwidth; + } + alpha -= dwidth*4; + } + } +}; + + +template +struct VResizeCubic +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const WT** src, T* dst, const AT* beta, int width ) const + { + WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; + const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + CastOp castOp; + VecOp vecOp; + + int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); + for( ; x < width; x++ ) + dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3); + } +}; + + +template +struct HResizeLanczos4 +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const T** src, WT** dst, int count, + const int* xofs, const AT* alpha, + int swidth, int dwidth, int cn, int xmin, int xmax ) const + { + for( int k = 0; k < count; k++ ) + { + const T *S = src[k]; + WT *D = dst[k]; + int dx = 0, limit = xmin; + for(;;) + { + for( ; dx < limit; dx++, alpha += 8 ) + { + int j, sx = xofs[dx] - cn*3; + WT v = 0; + for( j = 0; j < 8; j++ ) + { + int sxj = sx + j*cn; + if( (unsigned)sxj >= (unsigned)swidth ) + { + while( sxj < 0 ) + sxj += cn; + while( sxj >= swidth ) + sxj -= cn; + } + v += S[sxj]*alpha[j]; + } + D[dx] = v; + } + if( limit == dwidth ) + break; + for( ; dx < xmax; dx++, alpha += 8 ) + { + int sx = xofs[dx]; + D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] + + S[sx-cn]*alpha[2] + S[sx]*alpha[3] + + S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] + + S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7]; + } + limit = dwidth; + } + alpha -= dwidth*8; + } + } +}; + + +template +struct VResizeLanczos4 +{ + typedef T value_type; + typedef WT buf_type; + typedef AT alpha_type; + + void operator()(const WT** src, T* dst, const AT* beta, int width ) const + { + CastOp castOp; + VecOp vecOp; + int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + WT b = beta[0]; + const WT* S = src[0]; + WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b; + + for( int k = 1; k < 8; k++ ) + { + b = beta[k]; S = src[k]; + s0 += 
S[x]*b; s1 += S[x+1]*b; + s2 += S[x+2]*b; s3 += S[x+3]*b; + } + + dst[x] = castOp(s0); dst[x+1] = castOp(s1); + dst[x+2] = castOp(s2); dst[x+3] = castOp(s3); + } + #endif + for( ; x < width; x++ ) + { + dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] + + src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] + + src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]); + } + } +}; + + +static inline int clip(int x, int a, int b) +{ + return x >= a ? (x < b ? x : b-1) : a; +} + +static const int MAX_ESIZE=16; + +template +class resizeGeneric_Invoker : + public ParallelLoopBody +{ +public: + typedef typename HResize::value_type T; + typedef typename HResize::buf_type WT; + typedef typename HResize::alpha_type AT; + + resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs, + const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize, + int _ksize, int _xmin, int _xmax) : + ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs), + alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize), + ksize(_ksize), xmin(_xmin), xmax(_xmax) + { + CV_Assert(ksize <= MAX_ESIZE); + } + + virtual void operator() (const Range& range) const + { + int dy, cn = src.channels(); + HResize hresize; + VResize vresize; + + int bufstep = (int)alignSize(dsize.width, 16); + AutoBuffer _buffer(bufstep*ksize); + const T* srows[MAX_ESIZE]={0}; + WT* rows[MAX_ESIZE]={0}; + int prev_sy[MAX_ESIZE]; + + for(int k = 0; k < ksize; k++ ) + { + prev_sy[k] = -1; + rows[k] = (WT*)_buffer + bufstep*k; + } + + const AT* beta = _beta + ksize * range.start; + + for( dy = range.start; dy < range.end; dy++, beta += ksize ) + { + int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2; + + for(int k = 0; k < ksize; k++ ) + { + int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height); + for( k1 = std::max(k1, k); k1 < ksize; k1++ ) + { + if( k1 < MAX_ESIZE && sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it. + { + if( k1 > k ) + memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) ); + break; + } + } + if( k1 == ksize ) + k0 = std::min(k0, k); // remember the first row that needs to be computed + srows[k] = src.template ptr(sy); + prev_sy[k] = sy; + } + + if( k0 < ksize ) + hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha), + ssize.width, dsize.width, cn, xmin, xmax ); + vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width ); + } + } + +private: + Mat src; + Mat dst; + const int* xofs, *yofs; + const AT* alpha, *_beta; + Size ssize, dsize; + const int ksize, xmin, xmax; + + resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&); +}; + +template +static void resizeGeneric_( const Mat& src, Mat& dst, + const int* xofs, const void* _alpha, + const int* yofs, const void* _beta, + int xmin, int xmax, int ksize ) +{ + typedef typename HResize::alpha_type AT; + + const AT* beta = (const AT*)_beta; + Size ssize = src.size(), dsize = dst.size(); + int cn = src.channels(); + ssize.width *= cn; + dsize.width *= cn; + xmin *= cn; + xmax *= cn; + // image resize is a separable operation. 
In case of not too strong + + Range range(0, dsize.height); + resizeGeneric_Invoker invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta, + ssize, dsize, ksize, xmin, xmax); + parallel_for_(range, invoker, dst.total()/(double)(1<<16)); +} + +template +struct ResizeAreaFastNoVec +{ + ResizeAreaFastNoVec(int, int) { } + ResizeAreaFastNoVec(int, int, int, int) { } + int operator() (const T*, T*, int) const + { return 0; } +}; + +#if CV_NEON + +class ResizeAreaFastVec_SIMD_8u +{ +public: + ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : + cn(_cn), step(_step) + { + } + + int operator() (const uchar* S, uchar* D, int w) const + { + int dx = 0; + const uchar* S0 = S, * S1 = S0 + step; + + uint16x8_t v_2 = vdupq_n_u16(2); + + if (cn == 1) + { + for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16) + { + uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1); + + uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1])); + v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1]))); + v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2); + + uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1])); + v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1]))); + v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2); + + vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1); + + uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0)); + uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0)); + uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1)); + uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1)); + + uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)), + vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10))); + uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)), + vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11))); + uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2); + + vst1_u8(D, vmovn_u16(v_dst)); + } + } + + return dx; + } + +private: + int cn, step; +}; + +class ResizeAreaFastVec_SIMD_16u +{ +public: + ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : + cn(_cn), step(_step) + { + } + + int operator() (const ushort * S, ushort * D, int w) const + { + int dx = 0; + const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step); + + uint32x4_t v_2 = vdupq_n_u32(2); + + if (cn == 1) + { + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1); + + uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1])); + v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1]))); + v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2); + + uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1])); + v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1]))); + v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2); + + vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1); + uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)), + 
vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1))); + vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2))); + } + } + + return dx; + } + +private: + int cn, step; +}; + +class ResizeAreaFastVec_SIMD_16s +{ +public: + ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : + cn(_cn), step(_step) + { + } + + int operator() (const short * S, short * D, int w) const + { + int dx = 0; + const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step); + + int32x4_t v_2 = vdupq_n_s32(2); + + if (cn == 1) + { + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1); + + int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1])); + v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1]))); + v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2); + + int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1])); + v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1]))); + v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2); + + vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1))); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1); + int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)), + vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1))); + vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2))); + } + } + + return dx; + } + +private: + int cn, step; +}; + +struct ResizeAreaFastVec_SIMD_32f +{ + ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : + cn(_cn), step(_step) + { + fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); + } + + int operator() (const float * S, float * D, int w) const + { + if (!fast_mode) + return 0; + + const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); + int dx = 0; + + float32x4_t v_025 = vdupq_n_f32(0.25f); + + if (cn == 1) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1); + + float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]); + float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]); + + vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4)); + float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4)); + + vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); + } + } + + return dx; + } + +private: + int cn; + bool fast_mode; + int step; +}; + +#elif CV_SSE2 + +class ResizeAreaFastVec_SIMD_8u +{ +public: + ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : + cn(_cn), step(_step) + { + use_simd = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const uchar* S, uchar* D, int w) const + { + if (!use_simd) + return 0; + + int dx = 0; + const uchar* S0 = S; + const uchar* S1 = S0 + step; + __m128i zero = _mm_setzero_si128(); + __m128i delta2 = _mm_set1_epi16(2); + + if (cn == 1) + { + __m128i masklow = _mm_set1_epi16(0x00ff); + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); + __m128i 
s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow)); + s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2); + s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); + + _mm_storel_epi64((__m128i*)D, s0); + } + } + else if (cn == 3) + for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); + __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero); + __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); + __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero); + + __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6)); + __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6)); + s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); + s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + + s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6)); + s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6)); + s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); + s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); + _mm_storel_epi64((__m128i*)(D+3), s0); + } + else + { + CV_Assert(cn == 4); + int v[] = { 0, 0, -1, -1 }; + __m128i mask = _mm_loadu_si128((const __m128i*)v); + + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); + __m128i r0_16h = _mm_unpackhi_epi8(r0, zero); + __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); + __m128i r1_16h = _mm_unpackhi_epi8(r1, zero); + + __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8)); + __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8)); + s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); + __m128i res0 = _mm_srli_epi16(s0, 2); + + s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8)); + s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8)); + s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); + __m128i res1 = _mm_srli_epi16(s0, 2); + s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0), + _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero); + _mm_storel_epi64((__m128i*)(D), s0); + } + } + + return dx; + } + +private: + int cn; + bool use_simd; + int step; +}; + +class ResizeAreaFastVec_SIMD_16u +{ +public: + ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : + cn(_cn), step(_step) + { + use_simd = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const ushort* S, ushort* D, int w) const + { + if (!use_simd) + return 0; + + int dx = 0; + const ushort* S0 = (const ushort*)S; + const ushort* S1 = (const ushort*)((const uchar*)(S) + step); + __m128i masklow = _mm_set1_epi32(0x0000ffff); + __m128i zero = _mm_setzero_si128(); + __m128i delta2 = _mm_set1_epi32(2); + +#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero) + + if (cn == 1) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow)); + __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow)); + s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); + s0 = _mm_srli_epi32(s0, 2); + s0 = _mm_packus_epi32(s0, zero); + + _mm_storel_epi64((__m128i*)D, s0); + } + } + else if (cn == 3) + for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D 
+= 3) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_16l = _mm_unpacklo_epi16(r0, zero); + __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero); + __m128i r1_16l = _mm_unpacklo_epi16(r1, zero); + __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero); + + __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); + __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); + s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); + s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + else + { + CV_Assert(cn == 4); + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_32l = _mm_unpacklo_epi16(r0, zero); + __m128i r0_32h = _mm_unpackhi_epi16(r0, zero); + __m128i r1_32l = _mm_unpacklo_epi16(r1, zero); + __m128i r1_32h = _mm_unpackhi_epi16(r1, zero); + + __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); + __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); + s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); + s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + } + +#undef _mm_packus_epi32 + + return dx; + } + +private: + int cn; + int step; + bool use_simd; +}; + +class ResizeAreaFastVec_SIMD_16s +{ +public: + ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : + cn(_cn), step(_step) + { + use_simd = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const short* S, short* D, int w) const + { + if (!use_simd) + return 0; + + int dx = 0; + const short* S0 = (const short*)S; + const short* S1 = (const short*)((const uchar*)(S) + step); + __m128i masklow = _mm_set1_epi32(0x0000ffff); + __m128i zero = _mm_setzero_si128(); + __m128i delta2 = _mm_set1_epi32(2); + + if (cn == 1) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16), + _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16)); + __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16), + _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16)); + s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); + s0 = _mm_srai_epi32(s0, 2); + s0 = _mm_packs_epi32(s0, zero); + + _mm_storel_epi64((__m128i*)D, s0); + } + } + else if (cn == 3) + for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); + __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16); + __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); + __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16); + + __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); + __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); + s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); + s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + else + { + CV_Assert(cn == 4); + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); + __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16); + __m128i r1_32l = 
_mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); + __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16); + + __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); + __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); + s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); + s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + } + + return dx; + } + +private: + int cn; + int step; + bool use_simd; +}; + +struct ResizeAreaFastVec_SIMD_32f +{ + ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : + cn(_cn), step(_step) + { + fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); + fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const float * S, float * D, int w) const + { + if (!fast_mode) + return 0; + + const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); + int dx = 0; + + __m128 v_025 = _mm_set1_ps(0.25f); + + if (cn == 1) + { + const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), + v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4); + + __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo), + _mm_shuffle_ps(v_row00, v_row01, shuffle_hi)); + __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo), + _mm_shuffle_ps(v_row10, v_row11, shuffle_hi)); + + _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4)); + __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4)); + + _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + } + } + + return dx; + } + +private: + int cn; + bool fast_mode; + int step; +}; + +#else + +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_8u; +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16u; +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16s; +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_32f; + +#endif + +template +struct ResizeAreaFastVec +{ + ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) : + scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step) + { + fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); + } + + int operator() (const T* S, T* D, int w) const + { + if (!fast_mode) + return 0; + + const T* nextS = (const T*)((const uchar*)S + step); + int dx = vecOp(S, D, w); + + if (cn == 1) + for( ; dx < w; ++dx ) + { + int index = dx*2; + D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2); + } + else if (cn == 3) + for( ; dx < w; dx += 3 ) + { + int index = dx*2; + D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2); + D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2); + D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2); + } + else + { + CV_Assert(cn == 4); + for( ; dx < w; dx += 4 ) + { + int index = dx*2; + D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2); + D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2); + D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2); + D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2); 
+ } + } + + return dx; + } + +private: + int scale_x, scale_y; + int cn; + bool fast_mode; + int step; + SIMDVecOp vecOp; +}; + +template +class resizeAreaFast_Invoker : + public ParallelLoopBody +{ +public: + resizeAreaFast_Invoker(const Mat &_src, Mat &_dst, + int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) : + ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x), + scale_y(_scale_y), ofs(_ofs), xofs(_xofs) + { + } + + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int cn = src.channels(); + int area = scale_x*scale_y; + float scale = 1.f/(area); + int dwidth1 = (ssize.width/scale_x)*cn; + dsize.width *= cn; + ssize.width *= cn; + int dy, dx, k = 0; + + VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/); + + for( dy = range.start; dy < range.end; dy++ ) + { + T* D = (T*)(dst.data + dst.step*dy); + int sy0 = dy*scale_y; + int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0; + + if( sy0 >= ssize.height ) + { + for( dx = 0; dx < dsize.width; dx++ ) + D[dx] = 0; + continue; + } + + dx = vop(src.template ptr(sy0), D, w); + for( ; dx < w; dx++ ) + { + const T* S = src.template ptr(sy0) + xofs[dx]; + WT sum = 0; + k = 0; + #if CV_ENABLE_UNROLLED + for( ; k <= area - 4; k += 4 ) + sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]]; + #endif + for( ; k < area; k++ ) + sum += S[ofs[k]]; + + D[dx] = saturate_cast(sum * scale); + } + + for( ; dx < dsize.width; dx++ ) + { + WT sum = 0; + int count = 0, sx0 = xofs[dx]; + if( sx0 >= ssize.width ) + D[dx] = 0; + + for( int sy = 0; sy < scale_y; sy++ ) + { + if( sy0 + sy >= ssize.height ) + break; + const T* S = src.template ptr(sy0 + sy) + sx0; + for( int sx = 0; sx < scale_x*cn; sx += cn ) + { + if( sx0 + sx >= ssize.width ) + break; + sum += S[sx]; + count++; + } + } + + D[dx] = saturate_cast((float)sum/count); + } + } + } + +private: + Mat src; + Mat dst; + int scale_x, scale_y; + const int *ofs, *xofs; +}; + +template +static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs, + int scale_x, int scale_y ) +{ + Range range(0, dst.rows); + resizeAreaFast_Invoker invoker(src, dst, scale_x, + scale_y, ofs, xofs); + parallel_for_(range, invoker, dst.total()/(double)(1<<16)); +} + +struct DecimateAlpha +{ + int si, di; + float alpha; +}; + + +template class ResizeArea_Invoker : + public ParallelLoopBody +{ +public: + ResizeArea_Invoker( const Mat& _src, Mat& _dst, + const DecimateAlpha* _xtab, int _xtab_size, + const DecimateAlpha* _ytab, int _ytab_size, + const int* _tabofs ) + { + src = &_src; + dst = &_dst; + xtab0 = _xtab; + xtab_size0 = _xtab_size; + ytab = _ytab; + ytab_size = _ytab_size; + tabofs = _tabofs; + } + + virtual void operator() (const Range& range) const + { + Size dsize = dst->size(); + int cn = dst->channels(); + dsize.width *= cn; + AutoBuffer _buffer(dsize.width*2); + const DecimateAlpha* xtab = xtab0; + int xtab_size = xtab_size0; + WT *buf = _buffer, *sum = buf + dsize.width; + int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di; + + for( dx = 0; dx < dsize.width; dx++ ) + sum[dx] = (WT)0; + + for( j = j_start; j < j_end; j++ ) + { + WT beta = ytab[j].alpha; + int dy = ytab[j].di; + int sy = ytab[j].si; + + { + const T* S = src->template ptr(sy); + for( dx = 0; dx < dsize.width; dx++ ) + buf[dx] = (WT)0; + + if( cn == 1 ) + for( k = 0; k < xtab_size; k++ ) + { + int dxn = xtab[k].di; + WT alpha = xtab[k].alpha; + buf[dxn] += S[xtab[k].si]*alpha; + } 
+ else if( cn == 2 ) + for( k = 0; k < xtab_size; k++ ) + { + int sxn = xtab[k].si; + int dxn = xtab[k].di; + WT alpha = xtab[k].alpha; + WT t0 = buf[dxn] + S[sxn]*alpha; + WT t1 = buf[dxn+1] + S[sxn+1]*alpha; + buf[dxn] = t0; buf[dxn+1] = t1; + } + else if( cn == 3 ) + for( k = 0; k < xtab_size; k++ ) + { + int sxn = xtab[k].si; + int dxn = xtab[k].di; + WT alpha = xtab[k].alpha; + WT t0 = buf[dxn] + S[sxn]*alpha; + WT t1 = buf[dxn+1] + S[sxn+1]*alpha; + WT t2 = buf[dxn+2] + S[sxn+2]*alpha; + buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2; + } + else if( cn == 4 ) + { + for( k = 0; k < xtab_size; k++ ) + { + int sxn = xtab[k].si; + int dxn = xtab[k].di; + WT alpha = xtab[k].alpha; + WT t0 = buf[dxn] + S[sxn]*alpha; + WT t1 = buf[dxn+1] + S[sxn+1]*alpha; + buf[dxn] = t0; buf[dxn+1] = t1; + t0 = buf[dxn+2] + S[sxn+2]*alpha; + t1 = buf[dxn+3] + S[sxn+3]*alpha; + buf[dxn+2] = t0; buf[dxn+3] = t1; + } + } + else + { + for( k = 0; k < xtab_size; k++ ) + { + int sxn = xtab[k].si; + int dxn = xtab[k].di; + WT alpha = xtab[k].alpha; + for( int c = 0; c < cn; c++ ) + buf[dxn + c] += S[sxn + c]*alpha; + } + } + } + + if( dy != prev_dy ) + { + T* D = dst->template ptr(prev_dy); + + for( dx = 0; dx < dsize.width; dx++ ) + { + D[dx] = saturate_cast(sum[dx]); + sum[dx] = beta*buf[dx]; + } + prev_dy = dy; + } + else + { + for( dx = 0; dx < dsize.width; dx++ ) + sum[dx] += beta*buf[dx]; + } + } + + { + T* D = dst->template ptr(prev_dy); + for( dx = 0; dx < dsize.width; dx++ ) + D[dx] = saturate_cast(sum[dx]); + } + } + +private: + const Mat* src; + Mat* dst; + const DecimateAlpha* xtab0; + const DecimateAlpha* ytab; + int xtab_size0, ytab_size; + const int* tabofs; +}; + + +template +static void resizeArea_( const Mat& src, Mat& dst, + const DecimateAlpha* xtab, int xtab_size, + const DecimateAlpha* ytab, int ytab_size, + const int* tabofs ) +{ + parallel_for_(Range(0, dst.rows), + ResizeArea_Invoker(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs), + dst.total()/((double)(1 << 16))); +} + + +typedef void (*ResizeFunc)( const Mat& src, Mat& dst, + const int* xofs, const void* alpha, + const int* yofs, const void* beta, + int xmin, int xmax, int ksize ); + +typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst, + const int* ofs, const int *xofs, + int scale_x, int scale_y ); + +typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst, + const DecimateAlpha* xtab, int xtab_size, + const DecimateAlpha* ytab, int ytab_size, + const int* yofs); + + +static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab ) +{ + int k = 0; + for(int dx = 0; dx < dsize; dx++ ) + { + double fsx1 = dx * scale; + double fsx2 = fsx1 + scale; + double cellWidth = std::min(scale, ssize - fsx1); + + int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); + + sx2 = std::min(sx2, ssize - 1); + sx1 = std::min(sx1, sx2); + + if( sx1 - fsx1 > 1e-3 ) + { + assert( k < ssize*2 ); + tab[k].di = dx * cn; + tab[k].si = (sx1 - 1) * cn; + tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth); + } + + for(int sx = sx1; sx < sx2; sx++ ) + { + assert( k < ssize*2 ); + tab[k].di = dx * cn; + tab[k].si = sx * cn; + tab[k++].alpha = float(1.0 / cellWidth); + } + + if( fsx2 - sx2 > 1e-3 ) + { + assert( k < ssize*2 ); + tab[k].di = dx * cn; + tab[k].si = sx2 * cn; + tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); + } + } + return k; +} + +#ifdef HAVE_OPENCL +static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab, + float * const alpha_tab, int 
* const ofs_tab) +{ + int k = 0, dx = 0; + for ( ; dx < dsize; dx++) + { + ofs_tab[dx] = k; + + double fsx1 = dx * scale; + double fsx2 = fsx1 + scale; + double cellWidth = std::min(scale, ssize - fsx1); + + int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); + + sx2 = std::min(sx2, ssize - 1); + sx1 = std::min(sx1, sx2); + + if (sx1 - fsx1 > 1e-3) + { + map_tab[k] = sx1 - 1; + alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth); + } + + for (int sx = sx1; sx < sx2; sx++) + { + map_tab[k] = sx; + alpha_tab[k++] = float(1.0 / cellWidth); + } + + if (fsx2 - sx2 > 1e-3) + { + map_tab[k] = sx2; + alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); + } + } + ofs_tab[dx] = k; +} + +static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, + double fx, double fy, int interpolation) +{ + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + + double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy; + float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy; + int iscale_x = saturate_cast(inv_fx), iscale_y = saturate_cast(inv_fx); + bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON && + std::abs(inv_fy - iscale_y) < DBL_EPSILON; + + // in case of scale_x && scale_y is equal to 2 + // INTER_AREA (fast) also is equal to INTER_LINEAR + if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) + /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower + + if( !(cn <= 4 && + (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || + (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) ) + return false; + + UMat src = _src.getUMat(); + _dst.create(dsize, type); + UMat dst = _dst.getUMat(); + + Size ssize = src.size(); + ocl::Kernel k; + size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows }; + + ocl::Image2D srcImage; + + // See if this could be done with a sampler. We stick with integer + // datatypes because the observed error is low. + bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() && + ocl::Image2D::canCreateAlias(src) && depth <= 4 && + ocl::Image2D::isFormatSupported(depth, cn, true) && + src.offset==0); + if (useSampler) + { + int wdepth = std::max(depth, CV_32S); + char buf[2][32]; + cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s " + "-D convertToDT=%s -D cn=%d", + depth, ocl::typeToStr(type), ocl::typeToStr(depth), + ocl::convertTypeStr(wdepth, depth, cn, buf[1]), + cn); + k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts); + + if (k.empty()) + useSampler = false; + else + { + // Convert the input into an OpenCL image type, using normalized channel data types + // and aliasing the UMat. 
+ srcImage = ocl::Image2D(src, true, true); + k.args(srcImage, ocl::KernelArg::WriteOnly(dst), + (float)inv_fx, (float)inv_fy); + } + } + + if (interpolation == INTER_LINEAR && !useSampler) + { + char buf[2][32]; + + // integer path is slower because of CPU part, so it's disabled + if (depth == CV_8U && ((void)0, 0)) + { + AutoBuffer _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2)); + int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width; + short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2; + float fxx, fyy; + int sx, sy; + + for (int dx = 0; dx < dsize.width; dx++) + { + fxx = (float)((dx+0.5)*inv_fx - 0.5); + sx = cvFloor(fxx); + fxx -= sx; + + if (sx < 0) + fxx = 0, sx = 0; + + if (sx >= ssize.width-1) + fxx = 0, sx = ssize.width-1; + + xofs[dx] = sx; + ialpha[dx*2 + 0] = saturate_cast((1.f - fxx) * INTER_RESIZE_COEF_SCALE); + ialpha[dx*2 + 1] = saturate_cast(fxx * INTER_RESIZE_COEF_SCALE); + } + + for (int dy = 0; dy < dsize.height; dy++) + { + fyy = (float)((dy+0.5)*inv_fy - 0.5); + sy = cvFloor(fyy); + fyy -= sy; + + yofs[dy] = sy; + ibeta[dy*2 + 0] = saturate_cast((1.f - fyy) * INTER_RESIZE_COEF_SCALE); + ibeta[dy*2 + 1] = saturate_cast(fyy * INTER_RESIZE_COEF_SCALE); + } + + int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); + UMat coeffs; + Mat(1, static_cast(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs); + + k.create("resizeLN", ocl::imgproc::resize_oclsrc, + format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s " + "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " + "-D INTER_RESIZE_COEF_BITS=%d", + depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), + ocl::convertTypeStr(depth, wdepth, cn, buf[0]), + ocl::convertTypeStr(wdepth, depth, cn, buf[1]), + cn, INTER_RESIZE_COEF_BITS)); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), + ocl::KernelArg::PtrReadOnly(coeffs)); + } + else + { + int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); + k.create("resizeLN", ocl::imgproc::resize_oclsrc, + format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s " + "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " + "-D INTER_RESIZE_COEF_BITS=%d", + depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), + ocl::convertTypeStr(depth, wdepth, cn, buf[0]), + ocl::convertTypeStr(wdepth, depth, cn, buf[1]), + cn, INTER_RESIZE_COEF_BITS)); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), + (float)inv_fx, (float)inv_fy); + } + } + else if (interpolation == INTER_NEAREST) + { + k.create("resizeNN", ocl::imgproc::resize_oclsrc, + format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d", + ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn)); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), + (float)inv_fx, (float)inv_fy); + } + else if (interpolation == INTER_AREA) + { + int wdepth = std::max(depth, is_area_fast ? 
CV_32S : CV_32F); + int wtype = CV_MAKE_TYPE(wdepth, cn); + + char cvt[2][40]; + String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d", + ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), + ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn); + + UMat alphaOcl, tabofsOcl, mapOcl; + UMat dmap, smap; + + if (is_area_fast) + { + int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn); + buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST" + " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff", + ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]), + ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]), + iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y)); + + k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption); + if (k.empty()) + return false; + } + else + { + buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0])); + k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption); + if (k.empty()) + return false; + + int xytab_size = (ssize.width + ssize.height) << 1; + int tabofs_size = dsize.height + dsize.width + 2; + + AutoBuffer _xymap_tab(xytab_size), _xyofs_tab(tabofs_size); + AutoBuffer _xyalpha_tab(xytab_size); + int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1); + float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1); + int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1; + + ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab); + ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab); + + // loading precomputed arrays to GPU + Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl); + Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl); + Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl); + } + + ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst); + + if (is_area_fast) + k.args(srcarg, dstarg); + else + k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl), + ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl)); + + return k.run(2, globalsize, NULL, false); + } + + return k.run(2, globalsize, 0, false); +} + +#endif + +#ifdef HAVE_IPP +#define IPP_RESIZE_PARALLEL 1 + +#ifdef HAVE_IPP_IW +class ipp_resizeParallel: public ParallelLoopBody +{ +public: + ipp_resizeParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok): + m_src(src), m_dst(dst), m_ok(ok) {} + ~ipp_resizeParallel() + { + } + + void Init(IppiInterpolationType inter) + { + iwiResize.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, inter, ::ipp::IwiResizeParams(0, 0, 0.75, 4), ippBorderRepl); + + m_ok = true; + } + + virtual void operator() (const Range& range) const + { + CV_INSTRUMENT_REGION_IPP() + + if(!m_ok) + return; + + try + { + ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); + CV_INSTRUMENT_FUN_IPP(iwiResize, m_src, m_dst, ippBorderRepl, tile); + } + catch(::ipp::IwException) + { + m_ok = false; + return; + } + } +private: + ::ipp::IwiImage &m_src; + ::ipp::IwiImage &m_dst; + + mutable ::ipp::IwiResize iwiResize; + + volatile bool &m_ok; + const ipp_resizeParallel& operator= (const ipp_resizeParallel&); +}; + +class ipp_resizeAffineParallel: public 
ParallelLoopBody +{ +public: + ipp_resizeAffineParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, bool &ok): + m_src(src), m_dst(dst), m_ok(ok) {} + ~ipp_resizeAffineParallel() + { + } + + void Init(IppiInterpolationType inter, double scaleX, double scaleY) + { + double shift = (inter == ippNearest)?-1e-10:-0.5; + double coeffs[2][3] = { + {scaleX, 0, shift+0.5*scaleX}, + {0, scaleY, shift+0.5*scaleY} + }; + + iwiWarpAffine.InitAlloc(m_src.m_size, m_dst.m_size, m_src.m_dataType, m_src.m_channels, coeffs, iwTransForward, inter, ::ipp::IwiWarpAffineParams(0, 0, 0.75), ippBorderRepl); + + m_ok = true; + } + + virtual void operator() (const Range& range) const + { + CV_INSTRUMENT_REGION_IPP() + + if(!m_ok) + return; + + try + { + ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); + CV_INSTRUMENT_FUN_IPP(iwiWarpAffine, m_src, m_dst, tile); + } + catch(::ipp::IwException) + { + m_ok = false; + return; + } + } +private: + ::ipp::IwiImage &m_src; + ::ipp::IwiImage &m_dst; + + mutable ::ipp::IwiWarpAffine iwiWarpAffine; + + volatile bool &m_ok; + const ipp_resizeAffineParallel& operator= (const ipp_resizeAffineParallel&); +}; +#endif + +static bool ipp_resize(const uchar * src_data, size_t src_step, int src_width, int src_height, + uchar * dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, + int depth, int channels, int interpolation) +{ +#ifdef HAVE_IPP_IW + CV_INSTRUMENT_REGION_IPP() + + IppDataType ippDataType = ippiGetDataType(depth); + IppiInterpolationType ippInter = ippiGetInterpolation(interpolation); + if(ippInter < 0) + return false; + + // Resize which doesn't match OpenCV exactly + if (!cv::ipp::useIPP_NE()) + { + if (ippInter == ippNearest || ippInter == ippSuper || (ippDataType == ipp8u && ippInter == ippLinear)) + return false; + } + + if(ippInter != ippLinear && ippDataType == ipp64f) + return false; + +#if IPP_VERSION_X100 < 201801 + // Degradations on int^2 linear downscale + if (ippDataType != ipp64f && ippInter == ippLinear && inv_scale_x < 1 && inv_scale_y < 1) // if downscale + { + int scale_x = (int)(1 / inv_scale_x); + int scale_y = (int)(1 / inv_scale_y); + if (1 / inv_scale_x - scale_x < DBL_EPSILON && 1 / inv_scale_y - scale_y < DBL_EPSILON) // if integer + { + if (!(scale_x&(scale_x - 1)) && !(scale_y&(scale_y - 1))) // if power of 2 + return false; + } + } +#endif + + bool affine = false; + const double IPP_RESIZE_EPS = (depth == CV_64F)?0:1e-10; + double ex = fabs((double)dst_width / src_width - inv_scale_x) / inv_scale_x; + double ey = fabs((double)dst_height / src_height - inv_scale_y) / inv_scale_y; + + // Use affine transform resize to allow sub-pixel accuracy + if(ex > IPP_RESIZE_EPS || ey > IPP_RESIZE_EPS) + affine = true; + + // Affine doesn't support Lanczos and Super interpolations + if(affine && (ippInter == ippLanczos || ippInter == ippSuper)) + return false; + + try + { + ::ipp::IwiImage iwSrc(::ipp::IwiSize(src_width, src_height), ippDataType, channels, 0, (void*)src_data, src_step); + ::ipp::IwiImage iwDst(::ipp::IwiSize(dst_width, dst_height), ippDataType, channels, 0, (void*)dst_data, dst_step); + + bool ok; + int threads = ippiSuggestThreadsNum(iwDst, 1+((double)(src_width*src_height)/(dst_width*dst_height))); + Range range(0, dst_height); + ipp_resizeParallel invokerGeneral(iwSrc, iwDst, ok); + ipp_resizeAffineParallel invokerAffine(iwSrc, iwDst, ok); + ParallelLoopBody *pInvoker = NULL; + if(affine) + { + pInvoker = &invokerAffine; + 
invokerAffine.Init(ippInter, inv_scale_x, inv_scale_y); + } + else + { + pInvoker = &invokerGeneral; + invokerGeneral.Init(ippInter); + } + + if(IPP_RESIZE_PARALLEL && threads > 1) + parallel_for_(range, *pInvoker, threads*4); + else + pInvoker->operator()(range); + + if(!ok) + return false; + } + catch(::ipp::IwException) + { + return false; + } + return true; +#else + CV_UNUSED(src_data); CV_UNUSED(src_step); CV_UNUSED(src_width); CV_UNUSED(src_height); CV_UNUSED(dst_data); CV_UNUSED(dst_step); + CV_UNUSED(dst_width); CV_UNUSED(dst_height); CV_UNUSED(inv_scale_x); CV_UNUSED(inv_scale_y); CV_UNUSED(depth); + CV_UNUSED(channels); CV_UNUSED(interpolation); + return false; +#endif +} +#endif + +//================================================================================================== + +namespace hal { + +void resize(int src_type, + const uchar * src_data, size_t src_step, int src_width, int src_height, + uchar * dst_data, size_t dst_step, int dst_width, int dst_height, + double inv_scale_x, double inv_scale_y, int interpolation) +{ + CV_INSTRUMENT_REGION() + + CV_Assert((dst_width * dst_height > 0) || (inv_scale_x > 0 && inv_scale_y > 0)); + if (inv_scale_x < DBL_EPSILON || inv_scale_y < DBL_EPSILON) + { + inv_scale_x = static_cast(dst_width) / src_width; + inv_scale_y = static_cast(dst_height) / src_height; + } + + CALL_HAL(resize, cv_hal_resize, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation); + + int depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type); + Size dsize = Size(saturate_cast(src_width*inv_scale_x), + saturate_cast(src_height*inv_scale_y)); + CV_Assert( dsize.area() > 0 ); + + CV_IPP_RUN_FAST(ipp_resize(src_data, src_step, src_width, src_height, dst_data, dst_step, dsize.width, dsize.height, inv_scale_x, inv_scale_y, depth, cn, interpolation)) + + static ResizeFunc linear_tab[] = + { + resizeGeneric_< + HResizeLinear, + VResizeLinear, + VResizeLinearVec_32s8u> >, + 0, + resizeGeneric_< + HResizeLinear, + VResizeLinear, + VResizeLinearVec_32f16u> >, + resizeGeneric_< + HResizeLinear, + VResizeLinear, + VResizeLinearVec_32f16s> >, + 0, + resizeGeneric_< + HResizeLinear, + VResizeLinear, + VResizeLinearVec_32f> >, + resizeGeneric_< + HResizeLinear, + VResizeLinear, + VResizeNoVec> >, + 0 + }; + + static ResizeFunc cubic_tab[] = + { + resizeGeneric_< + HResizeCubic, + VResizeCubic, + VResizeCubicVec_32s8u> >, + 0, + resizeGeneric_< + HResizeCubic, + VResizeCubic, + VResizeCubicVec_32f16u> >, + resizeGeneric_< + HResizeCubic, + VResizeCubic, + VResizeCubicVec_32f16s> >, + 0, + resizeGeneric_< + HResizeCubic, + VResizeCubic, + VResizeCubicVec_32f> >, + resizeGeneric_< + HResizeCubic, + VResizeCubic, + VResizeNoVec> >, + 0 + }; + + static ResizeFunc lanczos4_tab[] = + { + resizeGeneric_, + VResizeLanczos4, + VResizeNoVec> >, + 0, + resizeGeneric_, + VResizeLanczos4, + VResizeLanczos4Vec_32f16u> >, + resizeGeneric_, + VResizeLanczos4, + VResizeLanczos4Vec_32f16s> >, + 0, + resizeGeneric_, + VResizeLanczos4, + VResizeLanczos4Vec_32f> >, + resizeGeneric_, + VResizeLanczos4, + VResizeNoVec> >, + 0 + }; + + static ResizeAreaFastFunc areafast_tab[] = + { + resizeAreaFast_ >, + 0, + resizeAreaFast_ >, + resizeAreaFast_ >, + 0, + resizeAreaFast_, + resizeAreaFast_ >, + 0 + }; + + static ResizeAreaFunc area_tab[] = + { + resizeArea_, 0, resizeArea_, + resizeArea_, 0, resizeArea_, + resizeArea_, 0 + }; + + double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y; + + int iscale_x = 
saturate_cast(scale_x); + int iscale_y = saturate_cast(scale_y); + + bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON && + std::abs(scale_y - iscale_y) < DBL_EPSILON; + + Mat src(Size(src_width, src_height), src_type, const_cast(src_data), src_step); + Mat dst(dsize, src_type, dst_data, dst_step); + + if( interpolation == INTER_NEAREST ) + { + resizeNN( src, dst, inv_scale_x, inv_scale_y ); + return; + } + + int k, sx, sy, dx, dy; + + + { + // in case of scale_x && scale_y is equal to 2 + // INTER_AREA (fast) also is equal to INTER_LINEAR + if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) + interpolation = INTER_AREA; + + // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1). + // In other cases it is emulated using some variant of bilinear interpolation + if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 ) + { + if( is_area_fast ) + { + int area = iscale_x*iscale_y; + size_t srcstep = src_step / src.elemSize1(); + AutoBuffer _ofs(area + dsize.width*cn); + int* ofs = _ofs; + int* xofs = ofs + area; + ResizeAreaFastFunc func = areafast_tab[depth]; + CV_Assert( func != 0 ); + + for( sy = 0, k = 0; sy < iscale_y; sy++ ) + for( sx = 0; sx < iscale_x; sx++ ) + ofs[k++] = (int)(sy*srcstep + sx*cn); + + for( dx = 0; dx < dsize.width; dx++ ) + { + int j = dx * cn; + sx = iscale_x * j; + for( k = 0; k < cn; k++ ) + xofs[j + k] = sx + k; + } + + func( src, dst, ofs, xofs, iscale_x, iscale_y ); + return; + } + + ResizeAreaFunc func = area_tab[depth]; + CV_Assert( func != 0 && cn <= 4 ); + + AutoBuffer _xytab((src_width + src_height)*2); + DecimateAlpha* xtab = _xytab, *ytab = xtab + src_width*2; + + int xtab_size = computeResizeAreaTab(src_width, dsize.width, cn, scale_x, xtab); + int ytab_size = computeResizeAreaTab(src_height, dsize.height, 1, scale_y, ytab); + + AutoBuffer _tabofs(dsize.height + 1); + int* tabofs = _tabofs; + for( k = 0, dy = 0; k < ytab_size; k++ ) + { + if( k == 0 || ytab[k].di != ytab[k-1].di ) + { + assert( ytab[k].di == dy ); + tabofs[dy++] = k; + } + } + tabofs[dy] = ytab_size; + + func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs ); + return; + } + } + + int xmin = 0, xmax = dsize.width, width = dsize.width*cn; + bool area_mode = interpolation == INTER_AREA; + bool fixpt = depth == CV_8U; + float fx, fy; + ResizeFunc func=0; + int ksize=0, ksize2; + if( interpolation == INTER_CUBIC ) + ksize = 4, func = cubic_tab[depth]; + else if( interpolation == INTER_LANCZOS4 ) + ksize = 8, func = lanczos4_tab[depth]; + else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA ) + ksize = 2, func = linear_tab[depth]; + else + CV_Error( CV_StsBadArg, "Unknown interpolation method" ); + ksize2 = ksize/2; + + CV_Assert( func != 0 ); + + AutoBuffer _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize)); + int* xofs = (int*)(uchar*)_buffer; + int* yofs = xofs + width; + float* alpha = (float*)(yofs + dsize.height); + short* ialpha = (short*)alpha; + float* beta = alpha + width*ksize; + short* ibeta = ialpha + width*ksize; + float cbuf[MAX_ESIZE]; + + for( dx = 0; dx < dsize.width; dx++ ) + { + if( !area_mode ) + { + fx = (float)((dx+0.5)*scale_x - 0.5); + sx = cvFloor(fx); + fx -= sx; + } + else + { + sx = cvFloor(dx*scale_x); + fx = (float)((dx+1) - (sx+1)*inv_scale_x); + fx = fx <= 0 ? 
0.f : fx - cvFloor(fx); + } + + if( sx < ksize2-1 ) + { + xmin = dx+1; + if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) + fx = 0, sx = 0; + } + + if( sx + ksize2 >= src_width ) + { + xmax = std::min( xmax, dx ); + if( sx >= src_width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) + fx = 0, sx = src_width-1; + } + + for( k = 0, sx *= cn; k < cn; k++ ) + xofs[dx*cn + k] = sx + k; + + if( interpolation == INTER_CUBIC ) + interpolateCubic( fx, cbuf ); + else if( interpolation == INTER_LANCZOS4 ) + interpolateLanczos4( fx, cbuf ); + else + { + cbuf[0] = 1.f - fx; + cbuf[1] = fx; + } + if( fixpt ) + { + for( k = 0; k < ksize; k++ ) + ialpha[dx*cn*ksize + k] = saturate_cast(cbuf[k]*INTER_RESIZE_COEF_SCALE); + for( ; k < cn*ksize; k++ ) + ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize]; + } + else + { + for( k = 0; k < ksize; k++ ) + alpha[dx*cn*ksize + k] = cbuf[k]; + for( ; k < cn*ksize; k++ ) + alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize]; + } + } + + for( dy = 0; dy < dsize.height; dy++ ) + { + if( !area_mode ) + { + fy = (float)((dy+0.5)*scale_y - 0.5); + sy = cvFloor(fy); + fy -= sy; + } + else + { + sy = cvFloor(dy*scale_y); + fy = (float)((dy+1) - (sy+1)*inv_scale_y); + fy = fy <= 0 ? 0.f : fy - cvFloor(fy); + } + + yofs[dy] = sy; + if( interpolation == INTER_CUBIC ) + interpolateCubic( fy, cbuf ); + else if( interpolation == INTER_LANCZOS4 ) + interpolateLanczos4( fy, cbuf ); + else + { + cbuf[0] = 1.f - fy; + cbuf[1] = fy; + } + + if( fixpt ) + { + for( k = 0; k < ksize; k++ ) + ibeta[dy*ksize + k] = saturate_cast(cbuf[k]*INTER_RESIZE_COEF_SCALE); + } + else + { + for( k = 0; k < ksize; k++ ) + beta[dy*ksize + k] = cbuf[k]; + } + } + + func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs, + fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize ); +} + +} // cv::hal:: +} // cv:: + +//================================================================================================== + +void cv::resize( InputArray _src, OutputArray _dst, Size dsize, + double inv_scale_x, double inv_scale_y, int interpolation ) +{ + CV_INSTRUMENT_REGION() + + Size ssize = _src.size(); + + CV_Assert( ssize.width > 0 && ssize.height > 0 ); + CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) ); + if( dsize.area() == 0 ) + { + dsize = Size(saturate_cast(ssize.width*inv_scale_x), + saturate_cast(ssize.height*inv_scale_y)); + CV_Assert( dsize.area() > 0 ); + } + else + { + inv_scale_x = (double)dsize.width/ssize.width; + inv_scale_y = (double)dsize.height/ssize.height; + } + + CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10, + ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation)) + + Mat src = _src.getMat(); + _dst.create(dsize, src.type()); + Mat dst = _dst.getMat(); + + if (dsize == ssize) + { + // Source and destination are of same size. Use simple copy. + src.copyTo(dst); + return; + } + + hal::resize(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, inv_scale_x, inv_scale_y, interpolation); +} + + +CV_IMPL void +cvResize( const CvArr* srcarr, CvArr* dstarr, int method ) +{ + cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); + CV_Assert( src.type() == dst.type() ); + cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols, + (double)dst.rows/src.rows, method ); +} + +/* End of file. 
*/ diff --git a/modules/imgproc/src/resize.hpp b/modules/imgproc/src/resize.hpp new file mode 100644 index 0000000000..8533306bd4 --- /dev/null +++ b/modules/imgproc/src/resize.hpp @@ -0,0 +1,75 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. +// +// */ + +#ifndef OPENCV_IMGPROC_RESIZE_HPP +#define OPENCV_IMGPROC_RESIZE_HPP +#include "precomp.hpp" + +namespace cv +{ +namespace opt_AVX2 +{ +#if CV_TRY_AVX2 +void resizeNN2_AVX2(const Range&, const Mat&, Mat&, int*, int, double); +void resizeNN4_AVX2(const Range&, const Mat&, Mat&, int*, int, double); +#endif +} + +namespace opt_SSE4_1 +{ +#if CV_TRY_SSE4_1 +void resizeNN2_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); +void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double); + +int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width); +#endif +} +} +#endif +/* End of file. 
*/ diff --git a/modules/imgproc/src/resize.sse4_1.cpp b/modules/imgproc/src/resize.sse4_1.cpp new file mode 100644 index 0000000000..edcefcb9e7 --- /dev/null +++ b/modules/imgproc/src/resize.sse4_1.cpp @@ -0,0 +1,233 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* //////////////////////////////////////////////////////////////////// +// +// Geometrical transforms on images and matrices: rotation, zoom etc. 
+// +// */ + +#include "precomp.hpp" +#include "resize.hpp" + +namespace cv +{ +namespace opt_SSE4_1 +{ + +class resizeNNInvokerSSE2 : + public ParallelLoopBody +{ +public: + resizeNNInvokerSSE2(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } + +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=SSE4.2 +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int sseWidth = width - (width & 0x7); + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); + for(x = 0; x < sseWidth; x += 8) + { + ushort imm = *(ushort*)(S + x_ofs[x + 0]); + pixels = _mm_insert_epi16(pixels, imm, 0); + imm = *(ushort*)(S + x_ofs[x + 1]); + pixels = _mm_insert_epi16(pixels, imm, 1); + imm = *(ushort*)(S + x_ofs[x + 2]); + pixels = _mm_insert_epi16(pixels, imm, 2); + imm = *(ushort*)(S + x_ofs[x + 3]); + pixels = _mm_insert_epi16(pixels, imm, 3); + imm = *(ushort*)(S + x_ofs[x + 4]); + pixels = _mm_insert_epi16(pixels, imm, 4); + imm = *(ushort*)(S + x_ofs[x + 5]); + pixels = _mm_insert_epi16(pixels, imm, 5); + imm = *(ushort*)(S + x_ofs[x + 6]); + pixels = _mm_insert_epi16(pixels, imm, 6); + imm = *(ushort*)(S + x_ofs[x + 7]); + pixels = _mm_insert_epi16(pixels, imm, 7); + _mm_storeu_si128((__m128i*)D, pixels); + D += 16; + } + for(; x < width; x++) + { + *(ushort*)(Dstart + x*2) = *(ushort*)(S + x_ofs[x]); + } + } + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerSSE2(const resizeNNInvokerSSE2&); + resizeNNInvokerSSE2& operator=(const resizeNNInvokerSSE2&); +}; + +class resizeNNInvokerSSE4 : + public ParallelLoopBody +{ +public: + resizeNNInvokerSSE4(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : + ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), + ify(_ify) + { + } +#if defined(__INTEL_COMPILER) +#pragma optimization_parameter target_arch=SSE4.2 +#endif + virtual void operator() (const Range& range) const + { + Size ssize = src.size(), dsize = dst.size(); + int y, x; + int width = dsize.width; + int sseWidth = width - (width & 0x3); + for(y = range.start; y < range.end; y++) + { + uchar* D = dst.data + dst.step*y; + uchar* Dstart = D; + int sy = std::min(cvFloor(y*ify), ssize.height-1); + const uchar* S = src.data + sy*src.step; + __m128i CV_DECL_ALIGNED(64) pixels = _mm_set1_epi16(0); + for(x = 0; x < sseWidth; x += 4) + { + int imm = *(int*)(S + x_ofs[x + 0]); + pixels = _mm_insert_epi32(pixels, imm, 0); + imm = *(int*)(S + x_ofs[x + 1]); + pixels = _mm_insert_epi32(pixels, imm, 1); + imm = *(int*)(S + x_ofs[x + 2]); + pixels = _mm_insert_epi32(pixels, imm, 2); + imm = *(int*)(S + x_ofs[x + 3]); + pixels = _mm_insert_epi32(pixels, imm, 3); + _mm_storeu_si128((__m128i*)D, pixels); + D += 16; + } + for(; x < width; x++) + { + *(int*)(Dstart + x*4) = *(int*)(S + x_ofs[x]); + } + } + } + +private: + const Mat src; + Mat dst; + int* x_ofs, pix_size4; + double ify; + + resizeNNInvokerSSE4(const resizeNNInvokerSSE4&); + resizeNNInvokerSSE4& operator=(const resizeNNInvokerSSE4&); +}; + +void resizeNN2_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int 
pix_size4, double ify) +{ + resizeNNInvokerSSE2 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +void resizeNN4_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs, int pix_size4, double ify) +{ + resizeNNInvokerSSE4 invoker(src, dst, x_ofs, pix_size4, ify); + parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); +} + +int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width) +{ + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], + *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; + short * dst = (short*)_dst; + int x = 0; + __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), + v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), + v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), + v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); + + for (; x <= width - 8; x += 8) + { + __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); + v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); + + __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); + v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); + + __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); + __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1)); + } + + return x; +} + +} +} +/* End of file. */
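Reviewer note, illustrative only and not part of the patch: the short C++ sketch below shows how the public cv::resize entry point ends up on the code paths touched above. The matrix sizes and pixel values are arbitrary; the interpolation flags and the dispatch behaviour they trigger are taken from the hal::resize logic in this diff.

#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat src(480, 640, CV_8UC3, cv::Scalar(64, 128, 255));
    cv::Mat dst;

    // INTER_NEAREST goes through resizeNN which, per this patch, can dispatch to
    // the resizeNN2_* / resizeNN4_* SSE4.1 and AVX2 invokers declared in
    // resize.hpp for 2- and 4-byte pixels.
    cv::resize(src, dst, cv::Size(), 2.0, 2.0, cv::INTER_NEAREST);

    // An exact 2x2 downscale requested as INTER_LINEAR is rerouted to the fast
    // INTER_AREA path (ResizeAreaFastVec), since the two are equivalent for that
    // scale.
    cv::resize(src, dst, cv::Size(), 0.5, 0.5, cv::INTER_LINEAR);

    // A general non-integer downscale with INTER_AREA builds DecimateAlpha tables
    // via computeResizeAreaTab and runs ResizeArea_Invoker.
    cv::resize(src, dst, cv::Size(300, 200), 0, 0, cv::INTER_AREA);

    // Other factors with INTER_LINEAR, INTER_CUBIC or INTER_LANCZOS4 use the
    // separable resizeGeneric_ path: a horizontal pass into a row buffer, then a
    // vertical pass weighted by the per-row beta coefficients.
    cv::resize(src, dst, cv::Size(), 1.7, 1.3, cv::INTER_LANCZOS4);
    return 0;
}

Note also that when the requested destination size equals the source size, cv::resize short-circuits to a plain copy and never reaches hal::resize.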